In [None]:
# Encoding non-numeric (categorical) data
# 1. By using LabelEncoder
# 2. By using OneHotEncoder
# Most ML models (KNN, logistic regression, etc.) only work with numbers, so categorical values must be encoded first



In [None]:
# 1. LabelEncoder: replaces every unique category in a column with an integer code.
# The codes look ordered (0 < 1 < 2), so this is only appropriate when such an order is meaningful.
# It handles exactly ONE column (typically the target) per call.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Small demo frame with two text columns; only "Sports" is encoded below.
sample_data = {
    "Sports": ['Soccer', 'Tennis', 'Basketball'],
    "Schools": ['Colleges', 'Universities', 'Polytechniques'],
}
df = pd.DataFrame(sample_data)
print("Before encoding\n", df)

# Create the encoder
le = LabelEncoder()

# Learn the categories of "Sports" and convert them to integer codes in one step,
# then store the codes next to the original column.
sports_codes = le.fit_transform(df['Sports'])
df['Sports_Encoded'] = sports_codes

print("\nAfter encoding\n", df)


In [None]:
#  1(ii)  OrdinalEncoder: the multi-column counterpart of LabelEncoder

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Demo frame with three text columns, all of which are encoded below.
sample_data = {
    "Sports": ['Soccer', 'Tennis', 'Basketball'],
    "Schools": ['Colleges', 'Universities', 'Polytechniques'],
    "Volume": ['Height', 'Width', 'Length'],
}
df = pd.DataFrame(sample_data)
print("Before encoding\n", df)

# Name the input columns once and derive the output column names from them,
# so the two lists always stay in the same order.
source_cols = ['Sports', 'Schools', 'Volume']
encoded_cols = [col + '_Encoded' for col in source_cols]

# A single fit_transform call encodes every listed column.
oe = OrdinalEncoder()
df[encoded_cols] = oe.fit_transform(df[source_cols])
print("After encoding\n", df)

In [None]:
# One-Hot Encoding (OHE)
# Turns a categorical column with values like  A, B, C into multiple binary columns (0/1)
# Use it for nominal categories(no order) so the model doesn't assume numeric order

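# There are two ways to do OHE. The first is pandas.get_dummies(df, columns=[columns you want to encode], drop_first=False)
# (drop_first=False keeps the full dummy set; drop_first=True drops the first category of each encoded column)
# The second method is using scikit-learn's OneHotEncoder()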


In [None]:
#  pandas.get_dummies()

import pandas as pd

# Two categorical columns ("Team", "City") plus one numeric column ("Score").
sample_data = {
    "Team": ["A", "B", "C", "A"],
    "City": ["Lagos", "Abuja", "Lagos", "Kano"],
    "Score": [10, 12, 9, 11],
}
df = pd.DataFrame(sample_data)

print(df)

# Only these columns are expanded into indicator columns; "Score" passes through unchanged.
dummy_cols = ["Team", "City"]

# drop_first=False: keep one indicator column per category (the full dummy set).
df_dummy1 = pd.get_dummies(df, columns=dummy_cols, drop_first=False)

# drop_first=True: omit the first category of each column (here "A" and "Abuja");
# a row with all remaining indicators off belongs to that omitted reference category.
df_dummy2 = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

print(df_dummy1)
print(df_dummy2)



In [None]:
# Using OneHotEncoder from sklearn.preprocessing

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Step 1: get the dataset (built inline here; it could equally come from a CSV, Excel file, etc.)
colors = ["Red", "Blue", "Green", "Blue", "Red"]
df = pd.DataFrame({"Color": colors})

print(df, "\n")

# Step 2: create the encoder with its default settings
encoder = OneHotEncoder()

# Step 3: learn the categories and encode the column as 0/1 indicators.
# The double brackets pass a one-column DataFrame (2-D input) instead of a Series.
color_frame = df[["Color"]]
encoded = encoder.fit_transform(color_frame)

# Step 4: turn the encoded result into a regular array so it can be printed in full
dense_codes = encoded.toarray()
print(dense_codes)  # one row per sample, one 0/1 column per distinct color

In [None]:
# Building on the simplest OneHotEncoder example: turn the encoded array into a
# DataFrame with readable column names and attach it to the original data.
# Ex 1
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Build the dataset
time_units = ["Seconds", "Minutes", "Hours", "Days", "Weeks", "Months", "Years"]
df = pd.DataFrame({"Time": time_units})
print("Before OneHotEncoding:")
print(df)

# Create the encoder
encoder = OneHotEncoder()

# Fit the encoder on the "Time" column and encode it in one step
encoded = encoder.fit_transform(df[["Time"]])

# Wrap the encoded values in a DataFrame; get_feature_names_out supplies the
# generated column names for the "Time" feature.
feature_names = encoder.get_feature_names_out(["Time"])
encoded_df = pd.DataFrame(encoded.toarray(), columns=feature_names)

# Place the encoded columns side by side with the original column
df_encoded = pd.concat([df, encoded_df], axis=1)

print("After OneHotEncoding:")
print(df_encoded)
    



In [None]:
# Ex 2
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Example dataset
family_members = ["Father", "Mother", "Children", "Siblings", "Aunts", "Uncles"]
df = pd.DataFrame({"Family": family_members})
#print("Before OneHotEncoder: ")
#print(df)

# Create the encoder
encoder = OneHotEncoder()

# Fit on the "Family" column and encode it
encoded = encoder.fit_transform(df[["Family"]])

# Convert the encoded result to an array, then to a DataFrame with generated column names
feature_names = encoder.get_feature_names_out(["Family"])
encoded_df = pd.DataFrame(encoded.toarray(), columns=feature_names)
print(encoded_df)  # contains only the new encoded columns

# Join the original frame and the encoded columns side by side.
# axis=1 is required: the default (axis=0) stacks the frames vertically, and any
# column that exists in only one of them is then filled with NaN.
df_encoded = pd.concat([df, encoded_df], axis=1)
print("After OneHotEncoder:")
print(df_encoded)


In [None]:
# OneHotEncoding multiple columns
# Ex 3

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

excel_file = r"C:\Users\adead\Downloads\excel dataset for practice\Netflix-Movies-Sample-Data.xlsx"
# header=5 uses the 6th row of the sheet as the column names (rows are counted from 0).
df = pd.read_excel(excel_file, header=5)
# The first column loads as NaN only (per the original note on this file), so drop it by position.
df = df.drop(df.columns[0], axis=1)
print("Before encoding:\n")
print(df.head(20))

# Columns to one-hot encode, listed once so the fit call and the generated
# column names cannot drift apart.
# NOTE(review): "Name" and "Duration" presumably have many distinct values, which
# yields one indicator column per value -- confirm that is intended.
categorical_cols = ["Name", "Age Rating", "Duration", "Category"]

# Initialize OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the selected columns
encoded = encoder.fit_transform(df[categorical_cols])
print(encoded)

# Convert the encoded result to a DataFrame. Reusing df.index keeps the rows aligned
# with the original frame, because pd.concat(axis=1) matches rows by index label.
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)
print(encoded_df)

# Concatenate the original DataFrame (df) with encoded_df, side by side
df_encoded = pd.concat([df, encoded_df], axis=1)
print("After encoding:\n")
print(df_encoded)


In [None]:
# OneHotEncoding multiple columns
# Ex 4

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the Excel file
excel_file = r"C:\Users\adead\Downloads\excel dataset for practice\Healthcare-Insurance-Sample-Data.xlsx"
# header=5 uses the 6th row of the sheet as the column names (rows are counted from 0).
ins = pd.read_excel(excel_file, header=5)
# Drop the first column by position, the same clean-up applied to the other sample workbooks.
ins = ins.drop(ins.columns[0], axis=1)

print("Before encoding:\n")
print(ins.head(20))

# Text (string) columns to one-hot encode, listed once so the fit call and the
# generated column names cannot drift apart.
categorical_cols = ["Name", "Gender", "Smoking Status", "Location"]

# Initialize OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the selected columns
encoded = encoder.fit_transform(ins[categorical_cols])

# Convert the encoded result to a DataFrame. Reusing ins.index keeps the rows aligned
# with the original frame, because pd.concat(axis=1) matches rows by index label.
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=ins.index,
)
print(encoded_df)

# Concatenate the original DataFrame (ins) with the new encoded_df, side by side
df_encoded = pd.concat([ins, encoded_df], axis=1)
print("After encoding:\n")
print(df_encoded)

In [None]:
# OneHotEncoding multiple columns with drop='first' and sparse_output=False
# Ex 5

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the Excel file
excel_file = r"C:\Users\adead\Downloads\excel dataset for practice\Healthcare-Insurance-Sample-Data.xlsx"
# header=5 uses the 6th row of the sheet as the column names (rows are counted from 0).
ins = pd.read_excel(excel_file, header=5)
# Drop the first column by position, the same clean-up applied to the other sample workbooks.
ins = ins.drop(ins.columns[0], axis=1)

print("Before encoding:\n")
#print(ins.head(20))
print(ins.shape)

# Text (string) columns to one-hot encode, listed once so the fit call and the
# generated column names cannot drift apart.
categorical_cols = ["Name", "Gender", "Smoking Status", "Location"]

# drop='first' omits the first category of every column;
# sparse_output=False makes the encoder return a dense NumPy array directly.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform the selected columns
encoded = encoder.fit_transform(ins[categorical_cols])

# The result is already a dense array, so no .toarray() call is needed here.
# Reusing ins.index keeps the rows aligned with the original frame, because
# pd.concat(axis=1) matches rows by index label.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=ins.index,
)
#print(encoded_df)

# Concatenate the original DataFrame (ins) with the new encoded_df, side by side
df_encoded = pd.concat([ins, encoded_df], axis=1)
print("After encoding:\n")
print(df_encoded)

In [8]:
# OneHotEncoding multiple columns with drop='first' and sparse_output=False
# Ex 6

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read the workbook and discard its first column, which is not part of the data.
excel = r"C:\Users\adead\Downloads\excel dataset for practice\2022-FIFA-World-Cup-Performance-Sample-Data.xlsx"
# header=5 takes the column names from the 6th row of the sheet (rows are counted from 0).
excel_file = pd.read_excel(excel, header=5)
cup = excel_file.drop(columns=excel_file.columns[0])
print("Before OneHotEncoding:\n")
print(cup.head())
#print(cup.columns)

# Columns to encode. The trailing spaces in 'Player Name ' and 'Club ' are deliberate:
# they match the column labels exactly as they are loaded from the sheet.
categorical_cols = ['Player Name ', 'Position', 'Player DOB', 'Club ']

# drop='first' omits the first category of every column;
# sparse_output=False makes the encoder return a dense NumPy array.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit on the selected columns and encode them (the result is a dense array, not a sparse matrix)
selected = cup[categorical_cols]
encoded = encoder.fit_transform(selected)

# Wrap the array in a DataFrame, using the encoder's generated column names
feature_names = encoder.get_feature_names_out(categorical_cols)
encoded_df = pd.DataFrame(encoded, columns=feature_names)
#print(encoded_df)

# Attach the encoded columns to the original data, side by side
df_encoded = pd.concat([cup, encoded_df], axis=1)
print("After OneHotEncoding:\n")
print(df_encoded.head())

Before OneHotEncoding:

         Player Name  Position  Jersey Number Player DOB            Club   \
0    Nicolas Otamendi       DF             19 1989-07-13          Benfica   
1        Marcos Acuna       DF              8 1991-03-16          Sevilla   
2  Nicolas Tagliafico       DF              3 1991-05-06             Lyon   
3     German Pezzella       DF              6 1991-02-25       Real Betis   
4       Nahuel Molina       DF             26 1998-02-11  Atletico Madrid   

    Appearances  Goals Scored   Assists Provided   Dribbles per 90 Min  \
0             7              0                  1                 0.33   
1             6              0                  0                 1.45   
2             6              0                  0                 0.48   
3             3              0                  0                 0.00   
4             7              1                  1                 0.32   

   Interceptions per 90 Min  Tackles per 90 Min  Total Duels Won per