<a href="https://colab.research.google.com/github/Aleeshbah11/Data-Cleaning/blob/main/FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering
Feature engineering in ML consists of four main steps:
1. Feature Creation
2. Transformations
3. Feature Extraction
4. Feature Selection

# BINNING

Binning, also called discretization, is a feature engineering technique in which continuous numerical data is divided into discrete intervals (bins or categories). It helps reduce noise and can improve performance in some models.
Use it when your data is highly skewed (to reduce variability), when working with decision trees (categorical data may help), or when creating categories in domain-specific cases (e.g., age groups).

### Equal Width Binning
Divides data into bins of equal size (width).

for example;
The dataset is divided into 3 equal-width bins.
The labels ("Low", "Medium", "High") assign categories to each bin.

In [None]:
import numpy as np
import pandas as pd

# Sample continuous data: 10 integers drawn uniformly from 1..99.
# A seeded generator keeps the sample (and therefore the printed bins)
# identical on every "Restart & Run All".
rng = np.random.default_rng(42)
data = {'value': rng.integers(1, 100, size=10)}
df = pd.DataFrame(data)

# Apply equal-width binning (3 bins): pd.cut splits the observed value range
# into three intervals of the same width and labels each row with its interval.
df['equal_width_bins'] = pd.cut(df['value'], bins=3, labels=['Low', 'Medium', 'High'])

print(df)


In [None]:
# Equal-frequency (quantile) binning: pd.qcut places the cut points at the
# terciles of 'value', so the three labelled bins hold roughly the same
# number of rows.
df['equal_freq_bins'] = pd.qcut(
    df['value'],
    q=3,
    labels=['Low', 'Medium', 'High'],
)

print(df)


In [None]:
# Custom binning with hand-picked edges.
# pd.cut closes intervals on the right by default, so these edges mean
# (0, 30] -> Low, (30, 60] -> Medium, (60, 100] -> High.
labels = ['Low', 'Medium', 'High']
bins = [0, 30, 60, 100]

# Tag every value with the label of the interval it falls into
df['custom_bins'] = pd.cut(x=df['value'], bins=bins, labels=labels)

print(df)


## ONE HOT ENCODING
- It is used to convert categorical data into numerical data.
- Why? Because most ML models cannot digest categories directly; the input needs to be numeric.
- It is used for nominal categories, where the rank of the categories does not matter.
- One-hot encoding creates a binary column for each category and assigns 1 or 0 according to the category's presence or absence in that row.

### Manual
- Let's code the technique manually first to grasp the concept.


In [None]:
import numpy as np
import pandas as pd

# Sample categorical data
categories = ['Red', 'Blue', 'Green', 'Blue', 'Red']

# Unique categories in first-seen order. dict.fromkeys de-duplicates while
# keeping insertion order; list(set(...)) ordered the strings by their
# per-process randomized hash, so the column order could change between
# kernel restarts.
unique_categories = list(dict.fromkeys(categories))

# Column position of each category, looked up once instead of calling
# list.index() for every row.
column_of = {category: col for col, category in enumerate(unique_categories)}

# Creating one-hot encoding manually: one row per sample, one column per
# category, with a single 1 marking the sample's category.
one_hot_encoded = np.zeros((len(categories), len(unique_categories)))

for i, category in enumerate(categories):
    one_hot_encoded[i, column_of[category]] = 1

# Converting to DataFrame
encoded_df = pd.DataFrame(one_hot_encoded, columns=unique_categories)
print(categories)
print(encoded_df)



['Red', 'Blue', 'Green', 'Blue', 'Red']
   Red  Blue  Green
0  1.0   0.0    0.0
1  0.0   1.0    0.0
2  0.0   0.0    1.0
3  0.0   1.0    0.0
4  1.0   0.0    0.0


### SKLEARN
- Using scikit-learn's `OneHotEncoder` for one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder


# OneHotEncoder works on 2-D input, so every sample is wrapped in its own row.
data = [['Red'], ['Blue'], ['Green'], ['Blue'], ['Red']]

# sparse_output=False asks for a dense array, which can be handed straight
# to a DataFrame.
encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(data)

# Label the columns with the names the fitted encoder derives from 'Color'
encoded_df = pd.DataFrame(
    data=encoded_array,
    columns=encoder.get_feature_names_out(['Color']),
)

print(encoded_df)


   Color_Blue  Color_Green  Color_Red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         1.0          0.0        0.0
4         0.0          0.0        1.0


### Pandas
- using pandas.get_dummies()

In [None]:
import pandas as pd

# Five samples of a single nominal feature
data = {'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red']}
df = pd.DataFrame.from_dict(data)

# get_dummies replaces 'Color' with one indicator column per distinct value,
# named '<column>_<value>'.
encoded_df = pd.get_dummies(data=df, columns=['Color'])

print(encoded_df)


   Color_Blue  Color_Green  Color_Red
0       False        False       True
1        True        False      False
2       False         True      False
3        True        False      False
4       False        False       True


In [None]:
import pandas as pd

# Seven samples covering five distinct colours
data = {'color': ['red', 'blue', 'green', 'purple', 'yellow', 'red', 'blue']}
df = pd.DataFrame.from_dict(data)

# One indicator column per distinct colour, in sorted order of the values
encoded_df = pd.get_dummies(data=df, columns=['color'])
print(encoded_df)

   color_blue  color_green  color_purple  color_red  color_yellow
0       False        False         False       True         False
1        True        False         False      False         False
2       False         True         False      False         False
3       False        False          True      False         False
4       False        False         False      False          True
5       False        False         False       True         False
6        True        False         False      False         False


In [None]:
import pandas as pd

data = ['red', 'blue', 'green']

# Name the column explicitly. A DataFrame built from a bare list gets the
# integer column label 0, and get_dummies then produces cryptic names such as
# '0_blue' instead of 'color_blue'.
df = pd.DataFrame(data, columns=['color'])

# One indicator column per distinct value, prefixed with the column name
encoded_df = pd.get_dummies(df, columns=['color'])
print(encoded_df)


   0_blue  0_green  0_red
0   False    False   True
1    True    False  False
2   False     True  False


## Target Encoding

### Using Pandas

In [None]:
import pandas as pd

# Five cities, each appearing twice, with a binary target variable
data = {
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] * 2,
    'Target': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0],
}

df = pd.DataFrame(data)

# Average target per city; for a 0/1 target this is the city's positive rate
city_target_mean = df['Target'].groupby(df['City']).mean()

# Replace every city by its average target value
df['City_Encoded'] = df['City'].map(city_target_mean)

print(df)

          City  Target  City_Encoded
0     New York       1           1.0
1  Los Angeles       0           0.5
2      Chicago       1           0.5
3      Houston       0           0.5
4      Phoenix       1           0.5
5     New York       1           1.0
6  Los Angeles       1           0.5
7      Chicago       0           0.5
8      Houston       1           0.5
9      Phoenix       0           0.5


### Using TargetEncoder from the category_encoders Library

In [None]:
!pip install category_encoders

import pandas as pd
import category_encoders as ce

# Same toy data as the pandas version: five cities twice, binary target
df = pd.DataFrame({
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'] * 2,
    'Target': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0],
})

# Initialize the target encoder for the 'City' column, then fit it on
# (feature, target) and store the encoded values next to the originals
target_encoder = ce.TargetEncoder(cols=['City'])
df['City_Encoded'] = target_encoder.fit_transform(
    df['City'],
    df['Target'],
)

print(df)


Collecting category_encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.0
          City  Target  City_Encoded
0     New York       1      0.656740
1  Los Angeles       0      0.585815
2      Chicago       1      0.585815
3      Houston       0      0.585815
4      Phoenix       1      0.585815
5     New York       1      0.656740
6  Los Angeles       1      0.585815
7      Chicago       0      0.585815
8      Houston       1      0.585815
9      Phoenix       0      0.585815


### Handling Target Leakage with Cross-Validation

In [None]:
from sklearn.model_selection import KFold

# Sample dataset
df = pd.DataFrame({'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
                            'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
                   'Target': [1, 0, 1, 0, 1, 1, 1, 0, 1, 0]})

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start from a float column. Initialising with the integer 0 creates an int64
# column, and writing fold means into it raises pandas' incompatible-dtype
# FutureWarning.
df['City_Encoded'] = float('nan')

for train_idx, val_idx in kf.split(df):
    # Statistics come from the training part of the fold only, so a row's own
    # target never contributes to its encoding.
    train_df = df.iloc[train_idx]
    city_target_mean = train_df.groupby('City')['Target'].mean()

    # Fallback for cities that do not occur in this training fold; without it
    # those rows would be left as NaN.
    fold_prior = train_df['Target'].mean()

    # KFold yields positions; translate them to index labels before using .loc
    val_labels = df.index[val_idx]
    df.loc[val_labels, 'City_Encoded'] = (
        df.loc[val_labels, 'City'].map(city_target_mean).fillna(fold_prior)
    )

print(df)


          City  Target  City_Encoded
0     New York       1           NaN
1  Los Angeles       0           1.0
2      Chicago       1           NaN
3      Houston       0           1.0
4      Phoenix       1           NaN
5     New York       1           NaN
6  Los Angeles       1           0.0
7      Chicago       0           NaN
8      Houston       1           0.0
9      Phoenix       0           NaN


## Mean Encoding for Ordinal Categories

Best for:

Ordinal categorical features (e.g., education level, customer ratings).
Tree-based models (XGBoost, LightGBM, Random Forest).

Avoid when:

Using linear models, as it introduces dependencies.
The dataset is small, since mean encoding can lead to overfitting.

In [None]:
import pandas as pd

# Sample dataset
# Ten people with an education level and a salary (the target variable)
data = {
    'Education_Level': ['High School', 'Bachelor', 'Master', 'PhD', 'High School',
                        'Bachelor', 'Master', 'PhD', 'High School', 'Bachelor'],
    'Salary': [40, 60, 80, 100, 45, 65, 85, 105, 38, 62],
}

df = pd.DataFrame(data)

# Average salary of each education level
education_mean = df['Salary'].groupby(df['Education_Level']).mean()

# Replace every level by the average salary of its group
df['Education_Encoded'] = df['Education_Level'].map(education_mean)

print(df)


  Education_Level  Salary  Education_Encoded
0     High School      40          41.000000
1        Bachelor      60          62.333333
2          Master      80          82.500000
3             PhD     100         102.500000
4     High School      45          41.000000
5        Bachelor      65          62.333333
6          Master      85          82.500000
7             PhD     105         102.500000
8     High School      38          41.000000
9        Bachelor      62          62.333333


### Handling Mean Encoding with category_encoders

In [None]:
import category_encoders as ce

# Target (mean) encoder restricted to the education column
mean_encoder = ce.TargetEncoder(cols=['Education_Level'])

# Fit on (feature, target) and write the encoded values into the frame
df['Education_Encoded'] = mean_encoder.fit_transform(
    df['Education_Level'],
    df['Salary'],
)

print(df)


  Education_Level  Salary  Education_Encoded
0     High School      40          63.829438
1        Bachelor      60          67.124697
2          Master      80          70.056840
3             PhD     100          72.893862
4     High School      45          63.829438
5        Bachelor      65          67.124697
6          Master      85          70.056840
7             PhD     105          72.893862
8     High School      38          63.829438
9        Bachelor      62          67.124697


###  Preventing Data Leakage Using Cross-Validation

A risk of mean encoding is data leakage, where the target values influence the encoding in a way that inflates model performance. To mitigate this, compute the encoding with cross-validation: each row is encoded using statistics from the other folds only.

In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Start from a float column. Initialising with the integer 0 creates an int64
# column, and writing fold means into it raises pandas' incompatible-dtype
# FutureWarning.
df['Education_Encoded'] = float('nan')

for train_idx, val_idx in kf.split(df):
    # Statistics come from the training part of the fold only, so a row's own
    # salary never contributes to its encoding.
    train_df = df.iloc[train_idx]
    mean_values = train_df.groupby('Education_Level')['Salary'].mean()

    # Fallback for levels that do not occur in this training fold; without it
    # those rows would be left as NaN.
    fold_prior = train_df['Salary'].mean()

    # KFold yields positions; translate them to index labels before using .loc
    val_labels = df.index[val_idx]
    df.loc[val_labels, 'Education_Encoded'] = (
        df.loc[val_labels, 'Education_Level'].map(mean_values).fillna(fold_prior)
    )

print(df)


  Education_Level  Salary  Education_Encoded
0     High School      40               41.5
1        Bachelor      60               63.5
2          Master      80               85.0
3             PhD     100              105.0
4     High School      45               39.0
5        Bachelor      65               61.0
6          Master      85               80.0
7             PhD     105              100.0
8     High School      38               42.5
9        Bachelor      62               62.5


  df.loc[val_idx, 'Education_Encoded'] = df.loc[val_idx, 'Education_Level'].map(mean_values)


## Practice

### One-Hot Encoding

Best for nominal data (unordered categories). Drawback: it increases the number of columns significantly for high-cardinality categories.

In [None]:
import pandas as pd

# Five samples of one nominal feature
df = pd.DataFrame(data={'Color': ['Red', 'Blue', 'Green', 'Red', 'Green']})

# One indicator column per distinct colour, named 'Color_<value>'
df_encoded = pd.get_dummies(data=df, columns=['Color'])
print(df_encoded)


   Color_Blue  Color_Green  Color_Red
0       False        False       True
1        True        False      False
2       False         True      False
3       False        False       True
4       False         True      False


### Label Encoding

Best for ordinal data (ordered categories). Drawback: it introduces artificial numerical relationships when applied to nominal categories.

In [None]:
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame(data={'Size': ['Small', 'Medium', 'Large', 'Small', 'Large']})

# LabelEncoder numbers the distinct labels in sorted order
# (Large=0, Medium=1, Small=2), not in size order.
encoder = LabelEncoder()
encoder.fit(df['Size'])
df['Size_Encoded'] = encoder.transform(df['Size'])
print(df)


     Size  Size_Encoded
0   Small             2
1  Medium             1
2   Large             0
3   Small             2
4   Large             0


###  Ordinal Encoding

Best for ordinal categories (e.g., education level). Requires the correct ordering of values to be specified.

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame(data={'Education': ['High School', 'Bachelor', 'Master', 'PhD', 'Bachelor']})

# The explicit category list fixes the rank of every level:
# High School=0, Bachelor=1, Master=2, PhD=3.
encoder = OrdinalEncoder(
    categories=[['High School', 'Bachelor', 'Master', 'PhD']],
)

# The encoder expects 2-D input, hence the double brackets
df['Education_Encoded'] = encoder.fit_transform(df[['Education']])
print(df)


     Education  Education_Encoded
0  High School                0.0
1     Bachelor                1.0
2       Master                2.0
3          PhD                3.0
4     Bachelor                1.0


### Target Encoding (Mean Encoding)

Best for high-cardinality features. Can cause data leakage if not handled properly.

In [None]:
import pandas as pd
import category_encoders as ce

# Three cities with a numeric target (price)
df = pd.DataFrame({
    'City': ['NY', 'LA', 'SF', 'NY', 'LA'],
    'Price': [100, 200, 300, 150, 250],
})

# Fit the target encoder on (feature, target) and keep the encoded column
encoder = ce.TargetEncoder(cols=['City'])
df['City_Encoded'] = encoder.fit_transform(
    df['City'],
    df['Price'],
)
print(df)


  City  Price  City_Encoded
0   NY    100    189.361170
1   LA    200    203.546277
2   SF    300    213.010847
3   NY    150    189.361170
4   LA    250    203.546277


### Frequency Encoding

Best for high-cardinality features. May lose information if several categories have similar frequencies, because they then receive the same code.

In [None]:
df = pd.DataFrame(data={'City': ['NY', 'LA', 'SF', 'NY', 'LA']})

# Replace each city by its relative frequency (occurrences / number of rows)
df['City_Encoded'] = df['City'].map(df['City'].value_counts(normalize=True))
print(df)


  City  City_Encoded
0   NY           0.4
1   LA           0.4
2   SF           0.2
3   NY           0.4
4   LA           0.4


### Binary Encoding
Best for reducing dimensionality while keeping a unique representation for each category. It still increases the number of columns, but by less than one-hot encoding.

In [None]:
import category_encoders as ce

# Five distinct categories
df = pd.DataFrame(data={'Category': ['A', 'B', 'C', 'D', 'E']})

# Binary-encode the 'Category' column and show the resulting frame
encoder = ce.BinaryEncoder(cols=['Category'])
df_encoded = encoder.fit_transform(df)
print(df_encoded)


   Category_0  Category_1  Category_2
0           0           0           1
1           0           1           0
2           0           1           1
3           1           0           0
4           1           0           1


### Hash Encoding (Hashing Trick)
Best for high-cardinality categorical data.
Drawback: collisions are possible, where two different categories map to the same value.

In [None]:
# Hash the 'Category' column into 3 output columns (n_components = number of
# hash bins); with fewer bins than categories some categories share a bin.
encoder = ce.HashingEncoder(
    cols=['Category'],
    n_components=3,
)
df_encoded = encoder.fit_transform(df)
print(df_encoded)


   col_0  col_1  col_2
0      0      1      0
1      0      1      0
2      0      1      0
3      0      0      1
4      0      1      0


###  Weight of Evidence (WoE) Encoding
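Best for binary classification problems. Works only when the target variable is binary.

$$\text{WoE} = \ln\left(\frac{P(\text{Good})}{P(\text{Bad})}\right)$$

- $P(\text{Good})$ = percentage of positive target (e.g., Salary > 75000)
- $P(\text{Bad})$ = percentage of negative target (e.g., Salary <= 75000)

With this orientation, a category dominated by positive targets gets a positive WoE and a category dominated by negative targets gets a negative WoE.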

In [None]:
import numpy as np

df = pd.DataFrame({'Feature': ['A', 'B', 'C', 'A', 'B', 'C'], 'Target': [1, 0, 1, 1, 0, 0]})

# Compute WoE
# Per-category number of positives ('sum') and number of rows ('count')
woe_df = df.groupby('Feature')['Target'].agg(['sum', 'count'])
positives = woe_df['sum']
negatives = woe_df['count'] - woe_df['sum']

# WoE compares the category's share of ALL positives with its share of ALL
# negatives. The previous formula skipped the division by the class totals,
# which gives smoothed per-category log-odds and only coincides with WoE when
# both classes are equally frequent (as in this sample).
# The +1 / +2 smoothing keeps the logarithm finite when a category has no
# positives or no negatives.
good_share = (positives + 1) / (positives.sum() + 2)
bad_share = (negatives + 1) / (negatives.sum() + 2)
woe_df['WoE'] = np.log(good_share / bad_share)

# Map to original data
df['Feature_WoE'] = df['Feature'].map(woe_df['WoE'])
print(df)


  Feature  Target  Feature_WoE
0       A       1     1.098612
1       B       0    -1.098612
2       C       1     0.000000
3       A       1     1.098612
4       B       0    -1.098612
5       C       0     0.000000


## Task

Encode the "City" column in a dataset using multiple encoding techniques

In [None]:
import pandas as pd

# Sample dataset
# Eight rows, five distinct cities; 'Salary' is the target variable
data = {
    'City': [
        'New York', 'Los Angeles', 'Chicago', 'Houston',
        'Phoenix', 'New York', 'Chicago', 'Houston',
    ],
    'Salary': [70000, 80000, 75000, 72000, 68000, 73000, 77000, 71000],
}

df = pd.DataFrame(data=data)
print(df)


          City  Salary
0     New York   70000
1  Los Angeles   80000
2      Chicago   75000
3      Houston   72000
4      Phoenix   68000
5     New York   73000
6      Chicago   77000
7      Houston   71000


In [None]:
# One-hot encoding (OHE)
# Trade-off: one extra column per distinct city, so memory use grows with
# the number of categories.
df_ohe = pd.get_dummies(data=df, columns=['City'])
print(df_ohe)


   Salary  City_Chicago  City_Houston  City_Los Angeles  City_New York  \
0   70000         False         False             False           True   
1   80000         False         False              True          False   
2   75000          True         False             False          False   
3   72000         False          True             False          False   
4   68000         False         False             False          False   
5   73000         False         False             False           True   
6   77000          True         False             False          False   
7   71000         False          True             False          False   

   City_Phoenix  
0         False  
1         False  
2         False  
3         False  
4          True  
5         False  
6         False  
7         False  


In [None]:
#Label Encoding

#It creates an artificial order between cities.

from sklearn.preprocessing import LabelEncoder

# Learn the distinct city labels, then store their integer codes next to the
# original column.
encoder = LabelEncoder()
encoder.fit(df['City'])
df['City_Label'] = encoder.transform(df['City'])
print(df)


          City  Salary  City_Label
0     New York   70000           3
1  Los Angeles   80000           2
2      Chicago   75000           0
3      Houston   72000           1
4      Phoenix   68000           4
5     New York   73000           3
6      Chicago   77000           0
7      Houston   71000           1


In [None]:
# Target Encoding (Mean Encoding)

# If not done correctly, it may lead to data leakage

import category_encoders as ce

# Fit the target encoder on (city, salary) and keep the encoded column
target_encoder = ce.TargetEncoder(cols=['City'])
df['City_Target'] = target_encoder.fit_transform(
    df['City'],
    df['Salary'],
)
print(df)


          City  Salary  City_Label   City_Target
0     New York   70000           3  73001.760636
1  Los Angeles   80000           2  74128.232202
2      Chicago   75000           0  73640.090428
3      Houston   72000           1  73001.760636
4      Phoenix   68000           4  72566.930510
5     New York   73000           3  73001.760636
6      Chicago   77000           0  73640.090428
7      Houston   71000           1  73001.760636


In [None]:
# Frequency encoding
# Adds a single column no matter how many distinct cities exist, which makes
# it a good fit for high-cardinality features.
df['City_Freq'] = df['City'].map(df['City'].value_counts(normalize=True))
print(df)


          City  Salary  City_Label   City_Target  City_Freq
0     New York   70000           3  73001.760636      0.250
1  Los Angeles   80000           2  74128.232202      0.125
2      Chicago   75000           0  73640.090428      0.250
3      Houston   72000           1  73001.760636      0.250
4      Phoenix   68000           4  72566.930510      0.125
5     New York   73000           3  73001.760636      0.250
6      Chicago   77000           0  73640.090428      0.250
7      Houston   71000           1  73001.760636      0.250


In [None]:
# Binary Encoding

import category_encoders as ce
import pandas as pd

# Sample dataset: eight rows, five distinct cities
df = pd.DataFrame({
    'City': [
        'New York', 'Los Angeles', 'Chicago', 'Houston',
        'Phoenix', 'New York', 'Chicago', 'Houston',
    ],
})

# Binary-encode the 'City' column
encoder = ce.BinaryEncoder(cols=['City'])
df_binary = encoder.fit_transform(df)

print(df_binary)


   City_0  City_1  City_2
0       0       0       1
1       0       1       0
2       0       1       1
3       1       0       0
4       1       0       1
5       0       0       1
6       0       1       1
7       1       0       0


In [None]:
# Implement WoE Encoding

import numpy as np

# Sample dataset with a binary target
df = pd.DataFrame({
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'New York', 'Chicago', 'Houston'],
    'Salary': [1, 0, 1, 0, 1, 1, 0, 0]  # Binary target (1 = High Salary, 0 = Low Salary)
})

# Compute WoE
# Per-city number of high-salary rows ('sum') and number of rows ('count')
woe_df = df.groupby('City')['Salary'].agg(['sum', 'count'])
positives = woe_df['sum']
negatives = woe_df['count'] - woe_df['sum']

# WoE compares the city's share of ALL positives with its share of ALL
# negatives. The previous formula skipped the division by the class totals,
# which gives smoothed per-city log-odds and only coincides with WoE when both
# classes are equally frequent (as in this sample).
# The +1 / +2 smoothing keeps the logarithm finite when a city has no
# positives or no negatives.
good_share = (positives + 1) / (positives.sum() + 2)
bad_share = (negatives + 1) / (negatives.sum() + 2)
woe_df['WoE'] = np.log(good_share / bad_share)

# Map WoE to original dataset
df['City_WoE'] = df['City'].map(woe_df['WoE'])

print(df)


          City  Salary  City_WoE
0     New York       1  1.098612
1  Los Angeles       0 -0.693147
2      Chicago       1  0.000000
3      Houston       0 -1.098612
4      Phoenix       1  0.693147
5     New York       1  1.098612
6      Chicago       0  0.000000
7      Houston       0 -1.098612
