# Feature Engineering 

In [23]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder


# Handling Numerical Data

In [2]:
# Toy feature arrays for the numerical-data examples.
# feature_1 and feature_2 are single-column (5, 1) arrays; feature_3 has two
# columns; feature_4 repeats the same row three times.
feature_1 = np.array([[value] for value in (-500.5, -100.1, 0, 100.1, 900.9)])
feature_2 = np.array([[value] for value in (-1000.1, -200.2, 500.5, 600.6, 9000.9)])
feature_3 = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])
feature_4 = np.array([[2, 3]] * 3)

In [6]:
# Standardise feature_1 with StandardScaler, then print the result together
# with its mean and standard deviation as a sanity check.
scaler = StandardScaler()  # create the scaler object
scaler.fit(feature_1)      # learn the scaling statistics from the data
standardized = scaler.transform(feature_1)
print(standardized, '\n')
print(f"mean: {standardized.mean()}")
print(f"standard deviation : {standardized.std()}")

[[-1.26687088]
 [-0.39316683]
 [-0.17474081]
 [ 0.0436852 ]
 [ 1.79109332]] 

mean: 0.0
standard deviation : 1.0


In [8]:
# Rescale feature_2 with MinMaxScaler.
# feature_range sets the (min, max) interval the values are mapped onto.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(feature_2)
scaled = scaler.transform(feature_2)
scaled

array([[0.        ],
       [0.079982  ],
       [0.150045  ],
       [0.16005399],
       [1.        ]])

In [10]:
# Scale the two columns of feature_3 with RobustScaler (default settings).
robust = RobustScaler().fit(feature_3)
robust_scaled = robust.transform(feature_3)
robust_scaled

array([[-1.88679245e+00, -1.71597633e-01],
       [-7.54716981e-01,  0.00000000e+00],
       [ 0.00000000e+00,  9.94082840e-01],
       [ 2.45283019e-01,  1.83431953e+00],
       [ 1.77358491e+01, -5.91715976e-03]])

In [13]:
# Load the diamond dataset used for practising feature engineering.

# Location of the CSV. Defaults to the original local path, but can be
# overridden with the DIAMONDS_CSV environment variable so the notebook runs
# on other machines without editing this cell.
DIAMONDS_CSV = os.environ.get("DIAMONDS_CSV", "C:\\Datasets\\diamonds.csv")

# Read the file and drop the columns this exercise does not keep. The drop is
# chained (no inplace=True) so `diamond` is built in a single expression.
# NOTE(review): 'Unnamed: 0' is presumably the row index written out when the
# CSV was saved -- confirm against the file.
diamond = pd.read_csv(DIAMONDS_CSV).drop(columns=['Unnamed: 0', 'x', 'y', 'z'])
diamond.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335


In [14]:
# Separate the target column from the predictors.
y = diamond['cut']                     # target vector
X = diamond.drop('cut', axis='columns')  # feature matrix: every column except the target

In [28]:
# Feature engineering on the diamond predictors: min-max scale the numeric
# columns and one-hot encode the categorical ones.
#
# The untransformed predictors are rebuilt from `diamond` on every run instead
# of mutating `X` in place. That keeps this cell idempotent: re-running it does
# not pick up the one-hot columns as "numeric" features or hand the encoder an
# empty column selection.
# NOTE(review): the scaler and encoder are fit on the full dataset; if a
# train/test split follows, consider fitting them on the training part only.
features_raw = diamond.drop(columns=['cut'])

# feature engineering: numerical feature
scaler = MinMaxScaler()
numerical_features = features_raw.select_dtypes(include=['int64', 'float64']).columns
scaled = scaler.fit_transform(features_raw[numerical_features])
# wrap the scaled array in a DataFrame aligned with the original rows/columns
scaled_df = pd.DataFrame(scaled, columns=numerical_features, index=features_raw.index)

# feature engineering: categorical feature
encoder = OneHotEncoder(sparse_output=False)  # dense ndarray output
cat_feature = features_raw.select_dtypes(include=['object']).columns
encoded = encoder.fit_transform(features_raw[cat_feature])
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=features_raw.index,
)

# Assemble the final feature matrix: the non-categorical columns in their
# original order (numeric ones replaced by their scaled values), followed by
# the one-hot columns.
X = features_raw.drop(columns=cat_feature)
X[numerical_features] = scaled_df
X = pd.concat([X, encoded_df], axis=1)
X

Unnamed: 0,carat,depth,table,price,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.006237,0.513889,0.230769,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.002079,0.466667,0.346154,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.006237,0.386111,0.423077,0.000054,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.018711,0.538889,0.288462,0.000433,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.022869,0.563889,0.288462,0.000487,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.108108,0.494444,0.269231,0.131427,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
53936,0.108108,0.558333,0.230769,0.131427,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
53937,0.103950,0.550000,0.326923,0.131427,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
53938,0.137214,0.500000,0.288462,0.131427,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
