# There are multiple types of Feature Engineering Methods.
1. Normalization
2. Binning

Notes:
  1. Normalization: Converting the numerical value into a standard range.
  2. Binning: Converting numerical values into buckets (ranges).
  3. Converting raw data into data that is usable for model building is called Feature Engineering (sometimes also referred to as Feature Extraction or, more loosely, Data Ingestion).

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Read the California housing training CSV (path is relative to the working
# directory) into a DataFrame, then preview its first two rows.
data = pd.read_csv('california_housing_train.csv')
data.head(n=2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0


In [14]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Separate the predictor columns (x) from the target column (y).
x = data.drop(columns='median_house_value')
y = data['median_house_value']

# Standardise each predictor column with StandardScaler.
# fit_transform hands back a plain array by default, so the column names and
# the row index are re-attached explicitly. Passing index=x.index keeps the
# scaled frame aligned row-for-row with y even when data does not carry a
# default 0..n-1 index (for example after rows have been filtered out).
scaler_std = StandardScaler()
x_standard_scaler = pd.DataFrame(
    scaler_std.fit_transform(x),
    columns=x.columns,
    index=x.index,
)
x_standard_scaler.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,2.619365,-0.67152,-1.079671,1.361695,1.764204,-0.361184,-0.075998,-1.252543
1,2.539569,-0.573264,-0.761872,2.296608,3.230441,-0.261865,-0.099404,-1.081483
2,2.494683,-0.905463,-0.920772,-0.882462,-0.866956,-0.955354,-0.999252,-1.170105
3,2.489696,-0.928857,-1.159121,-0.524186,-0.48023,-0.796793,-0.715774,-0.3626
4,2.489696,-0.961609,-0.682422,-0.545747,-0.506328,-0.70183,-0.622148,-1.026454


## MinMaxScaler
#### 1. Scales data to a fixed range, usually [0, 1]
#### 2. $x' = \dfrac{x - x_{min}}{x_{max} - x_{min}}$
#### 3. Compresses all values within a fixed range
#### 4. Typically between 0 and 1 (can be changed using feature_range)
#### 5. Very sensitive to outliers — a single extreme value stretches the range and squeezes the remaining values together
#### 6. Best for algorithms where bounded input is needed (e.g., Neural Networks, KNN, Gradient Descent-based methods)

In [15]:
# Rescale each predictor column with MinMaxScaler (default feature_range,
# i.e. [0, 1]).
# fit_transform hands back a plain array by default, so the column names and
# the row index are re-attached explicitly. Passing index=x.index keeps the
# scaled frame aligned row-for-row with x (and the target) even when x does
# not carry a default 0..n-1 index.
minmax_scaler = MinMaxScaler()
x_minmax_scaler = pd.DataFrame(
    minmax_scaler.fit_transform(x),
    columns=x.columns,
    index=x.index,
)
x_minmax_scaler.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,1.0,0.175345,0.27451,0.147885,0.198945,0.028364,0.077454,0.06853
1,0.984064,0.197662,0.352941,0.201608,0.294848,0.031559,0.075974,0.09104
2,0.9751,0.12221,0.313725,0.018927,0.026847,0.009249,0.019076,0.079378
3,0.974104,0.116897,0.254902,0.039515,0.052142,0.01435,0.037,0.185639
4,0.974104,0.109458,0.372549,0.038276,0.050435,0.017405,0.042921,0.098281


## Binning
#### 1. It converts continuous values into discrete numerical ranges (bins)
#### 2. Types of binning:
#### 3. Equal-width: divides the value range into intervals of equal width -> pd.cut()
#### 4. Equal-frequency: puts an (approximately) equal number of samples in each bin -> pd.qcut()
#### 5. Custom: user-defined thresholds -> pd.cut() with explicit bin edges

In [16]:
# Two-column subset of data; it is not used again within this cell.
cols_to_bin = data[['median_income', 'housing_median_age']]

# Equal-width binning: pd.cut with an integer `bins` splits the observed
# median_income range into 4 intervals of equal width and tags each row with
# the label of the interval it falls into.
data['Income_bin_width'] = pd.cut(
    data['median_income'],
    bins=4,
    labels=['low', 'medium', 'High', 'Very High'],
)

In [17]:
# Preview the first two rows to confirm the new Income_bin_width column.
data.head(n=2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,Income_bin_width
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,low
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,low


In [20]:
# Rows per equal-width bin. value_counts orders by descending count (its
# defaults, spelled out here), not by the category order of the labels.
data['Income_bin_width'].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
Income_bin_width,Unnamed: 1_level_1
low,10811
medium,5500
High,578
Very High,111


In [23]:
# Two-column subset of data; it is not used again within this cell.
cols_to_bin = data[['median_income', 'housing_median_age']]

# Equal-frequency binning: pd.qcut with q=4 cuts median_income at its
# quartiles, so each of the four labelled bins receives roughly the same
# number of rows.
data['Income_bin_freq'] = pd.qcut(
    data['median_income'],
    q=4,
    labels=['low', 'medium', 'High', 'Very High'],
)

# Rows per quantile bin, kept in a variable and displayed as the cell output.
qcut_freq = data['Income_bin_freq'].value_counts()
qcut_freq

Unnamed: 0_level_0,count
Income_bin_freq,Unnamed: 1_level_1
medium,4251
low,4250
Very High,4250
High,4249


# Custom Bins

In [24]:
# Custom bins: hand-picked median_income thresholds.
# The top edge is open-ended (np.inf) instead of the observed maximum:
#   * the edges stay strictly increasing for any data (using the observed
#     maximum makes pd.cut raise ValueError whenever that maximum is <= 12);
#   * the same edges can be reused on other data without values above this
#     dataset's maximum falling outside every bin and becoming NaN.
# pd.cut uses right-closed intervals by default, i.e. (0, 3], (3, 6], ...,
# (12, inf), so a value of exactly 0 is not assigned to any bin.
bins = [0, 3, 6, 9, 12, np.inf]
labels = ['very low', 'low', 'medium', 'High', 'Very High']
data['Income_bin_custom'] = pd.cut(data['median_income'], bins=bins, labels=labels)

In [25]:
# Rows per custom bin. value_counts orders by descending count (its
# defaults, spelled out here), not by the category order of the labels.
data['Income_bin_custom'].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
Income_bin_custom,Unnamed: 1_level_1
low,8948
very low,6077
medium,1633
High,248
Very High,94
