# One-hot encoding

In [19]:
import pandas as pd

# Sample dataset with a categorical column
data = {'Color': ['Red', 'Blue', 'Green', 'Red', 'Green']}
df = pd.DataFrame(data)

# one-hot encoding using Pandas
one_hot_encoded = pd.get_dummies(df, columns=['Color'])

# Convert boolean values to integers (0 and 1)
one_hot_encoded = one_hot_encoded.astype(int)

one_hot_encoded

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


# Label Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

# Sample dataset with a categorical column
data = {'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']}
df = pd.DataFrame(data)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Size' column
df['Size_encoded'] = label_encoder.fit_transform(df['Size'])

df

Unnamed: 0,Size,Size_encoded
0,Small,2
1,Medium,1
2,Large,0
3,Medium,1
4,Small,2


# Ordinal encoding

In [21]:

import pandas as pd
import category_encoders as ce

# Sample dataset with an ordinal categorical column
data = {'Education_Level': ['High School', 'Bachelor\'s', 'Master\'s', 'Bachelor\'s', 'High School']}
df = pd.DataFrame(data)

# Define the order of categories
education_order = ['High School', 'Bachelor\'s', 'Master\'s']

# Initialize the OrdinalEncoder with specified order
ordinal_encoder = ce.OrdinalEncoder(mapping=[{'col': 'Education_Level', 'mapping': {level: index for index, level in enumerate(education_order)}}])

# Fit and transform the DataFrame
df_encoded = ordinal_encoder.fit_transform(df)

# Display the DataFrame with ordinal encoding
df_encoded


Unnamed: 0,Education_Level
0,0
1,1
2,2
3,1
4,0


# Count Encoding

In [22]:

import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

# Generate a dummy dataset with categorical variables
data = {
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Red', 'Blue', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small', 'Small', 'Medium'],
    'Label': [1, 0, 1, 1, 0, 0, 1]
}

df = pd.DataFrame(data)

# Split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the CountEncoder
count_encoder = ce.CountEncoder()

# Fit the encoder on the training data
count_encoder.fit(train_df[['Color', 'Size']])

print('before encoding')
print(train_df)

# Transform both the training and test datasets
train_encoded = count_encoder.transform(train_df[['Color', 'Size']])
test_encoded = count_encoder.transform(test_df[['Color', 'Size']])

# Display the encoded datasets
print("Training Data (After Count Encoding):\n", train_encoded)
'''
print("\nTest Data (After Count Encoding):\n", test_encoded)
'''

before encoding
   Color    Size  Label
5   Blue   Small      0
2  Green   Large      1
4    Red   Small      0
3    Red  Medium      1
6  Green  Medium      1
Training Data (After Count Encoding):
    Color  Size
5      1     2
2      2     1
4      2     2
3      2     2
6      2     2


'\nprint("\nTest Data (After Count Encoding):\n", test_encoded)\n'

In [23]:
print("data before encoding")
print(df)

df_encoded = count_encoder.transform(df[['Color', 'Size']])

print("data before encoding")
print(df_encoded)

data before encoding
   Color    Size  Label
0    Red   Small      1
1   Blue  Medium      0
2  Green   Large      1
3    Red  Medium      1
4    Red   Small      0
5   Blue   Small      0
6  Green  Medium      1
data before encoding
   Color  Size
0      2     2
1      1     2
2      2     1
3      2     2
4      2     2
5      1     2
6      2     2


# Target encoding

In [24]:
from sklearn.model_selection import train_test_split
import category_encoders as ce

# Generate a dummy dataset with categorical variables
data = {
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Red', 'Blue', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small', 'Small', 'Medium'],
    'Label': [1, 0, 1, 1, 0, 0, 1]
}

df = pd.DataFrame(data)

# Split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the MeanEncoder
mean_encoder = ce.TargetEncoder()

# Fit the encoder on the training data
mean_encoder.fit(train_df[['Color', 'Size']], train_df['Label'])

print('before encoding')
print(train_df)

# Transform both the training and test datasets
train_encoded = mean_encoder.transform(train_df[['Color', 'Size']])
test_encoded = mean_encoder.transform(test_df[['Color', 'Size']])

# Display the encoded datasets
print("Training Data (After Mean Encoding):\n", train_encoded)
'''
print("\nTest Data (After Mean Encoding):\n", test_encoded)
'''

before encoding
   Color    Size  Label
5   Blue   Small      0
2  Green   Large      1
4    Red   Small      0
3    Red  Medium      1
6  Green  Medium      1
Training Data (After Mean Encoding):
       Color      Size
5  0.521935  0.514889
2  0.656740  0.652043
4  0.585815  0.514889
3  0.585815  0.656740
6  0.656740  0.656740


'\nprint("\nTest Data (After Mean Encoding):\n", test_encoded)\n'

# Leave one out encoding

In [25]:
import category_encoders as ce
import pandas as pd
from sklearn.model_selection import train_test_split

# Sample dataset (replace this with your own dataset)
data = {
    'Category1': ['A', 'B', 'A', 'B', 'A', 'A', 'B', 'B'],
    'Category2': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'X', 'Y'],
    'Target': [1, 0, 1, 0, 1, 1, 0, 0]
}

df = pd.DataFrame(data)

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Specify categorical columns for LOO encoding
categorical_columns = ['Category1', 'Category2']

# Initialize LOO encoder
loo_encoder = ce.leave_one_out.LeaveOneOutEncoder(cols=categorical_columns)

print('before encoding')
print(train_df)

# Fit and transform on the training data
train_encoded = loo_encoder.fit_transform(train_df, train_df['Target'])

# Transform the test data using the encoder fitted on the training data
test_encoded = loo_encoder.transform(test_df)

# Display the results
print("Encoded Training Data:")
print(train_encoded)

'''
print("\nEncoded Test Data:")
print(test_encoded)
'''

before encoding
  Category1 Category2  Target
0         A         X       1
7         B         Y       0
2         A         X       1
4         A         X       1
3         B         Y       0
6         B         X       0
Encoded Training Data:
   Category1  Category2  Target
0        1.0   0.666667       1
7        0.0   0.000000       0
2        1.0   0.666667       1
4        1.0   0.666667       1
3        0.0   0.000000       0
6        0.0   1.000000       0


'\nprint("\nEncoded Test Data:")\nprint(test_encoded)\n'

# Catboost Encoder

In [26]:
import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder

# Sample training and testing datasets with a categorical column
train_data = pd.DataFrame({'Category': ['A', 'B', 'C', 'A', 'B'], 'Target': [1, 0, 1, 0, 1]})
test_data = pd.DataFrame({'Category': ['A', 'C', 'B']})

# Initialize the CatBoostEncoder
catboost_encoder = CatBoostEncoder()

# Fit and transform the training data
train_encoded = catboost_encoder.fit_transform(train_data['Category'], train_data['Target'])

# Transform the testing data
test_encoded = catboost_encoder.transform(test_data['Category'])

train_encoded

Unnamed: 0,Category
0,0.6
1,0.6
2,0.6
3,0.8
4,0.3
