In [21]:
# Day 87 - Advanced Feature Engineering
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# ---------------- Sample Dataset ----------------
data = {
    'Area': [1200, 1500, np.nan, 1800, 2400, 3000, 3500, np.nan, 4000, 4200],
    'Bedrooms': [2, 3, 3, 4, 4, 5, 4, 3, 5, 6],
    'Location': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C', 'B', 'A'],
    'Price': [200, 250, 240, 310, 360, 400, 420, 390, 450, 500]
}
df = pd.DataFrame(data)
print("Original Data:\n", df)

Original Data:
      Area  Bedrooms Location  Price
0  1200.0         2        A    200
1  1500.0         3        B    250
2     NaN         3        A    240
3  1800.0         4        C    310
4  2400.0         4        B    360
5  3000.0         5        A    400
6  3500.0         4        C    420
7     NaN         3        C    390
8  4000.0         5        B    450
9  4200.0         6        A    500


In [22]:
# ---------------- 1️⃣ Handling Missing Values ----------------
df['Area'] = df['Area'].fillna(df['Area'].mean())

In [23]:
# ---------------- 2️⃣ Encoding Categorical Variables ----------------
label_encoder = LabelEncoder()
df['Location_Label'] = label_encoder.fit_transform(df['Location'])

# One-hot encoding (alternative) - this will drop the 'Location' column
df = pd.get_dummies(df, columns=['Location'], drop_first=True)

In [24]:
# ---------------- 3️⃣ Feature Scaling ----------------
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['Area', 'Bedrooms']])
df[['Area_Scaled', 'Bedrooms_Scaled']] = scaled_features

In [25]:
# ---------------- 4️⃣ Feature Interaction ----------------
poly = PolynomialFeatures(degree=2, include_bias=False)
interaction_features = poly.fit_transform(df[['Area_Scaled', 'Bedrooms_Scaled']])
interaction_df = pd.DataFrame(
    interaction_features, 
    columns=poly.get_feature_names_out(['Area_Scaled', 'Bedrooms_Scaled'])
)
df = pd.concat([df, interaction_df], axis=1)

In [26]:
# ---------------- 5️⃣ Power Transformation (Normalization) ----------------
pt = PowerTransformer()
area_bed_transformed = pt.fit_transform(df[['Area', 'Bedrooms']])
df['Area_BoxCox'] = area_bed_transformed[:, 0]
df['Bedrooms_BoxCox'] = area_bed_transformed[:, 1]

In [27]:

# ---------------- 6️⃣ Feature Extraction with PCA ----------------
X = df[['Area_Scaled', 'Bedrooms_Scaled', 'Area_Scaled^2', 'Bedrooms_Scaled^2']]
pca = PCA(n_components=2)
pca_features = pca.fit_transform(X)
df['PCA1'] = pca_features[:, 0]
df['PCA2'] = pca_features[:, 1]

In [28]:
# ---------------- Model Comparison ----------------
X = df[['Area', 'Bedrooms', 'Location_Label']]
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model before Feature Engineering
lr1 = LinearRegression()
lr1.fit(X_train, y_train)
pred1 = lr1.predict(X_test)
print("\nR2 Score before Feature Engineering:", round(r2_score(y_test, pred1), 3))

# Model after Feature Engineering
X_fe = df[['Area_Scaled', 'Bedrooms_Scaled', 'Area_Scaled Bedrooms_Scaled', 'PCA1', 'PCA2']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_fe, y, test_size=0.2, random_state=42)

lr2 = LinearRegression()
lr2.fit(X_train2, y_train2)
pred2 = lr2.predict(X_test2)
print("R2 Score after Feature Engineering:", round(r2_score(y_test2, pred2), 3))


R2 Score before Feature Engineering: 0.978
R2 Score after Feature Engineering: 0.862


In [29]:
# ---------------- Output ----------------
print("\nTransformed DataFrame:\n", df.head())
print("\nDataFrame shape:", df.shape)
print("\nAll columns:", df.columns.tolist())


Transformed DataFrame:
      Area  Bedrooms  Price  Location_Label  Location_B  Location_C  \
0  1200.0         2    200               0       False       False   
1  1500.0         3    250               1        True       False   
2  2700.0         3    240               0       False       False   
3  1800.0         4    310               2       False        True   
4  2400.0         4    360               1        True       False   

   Area_Scaled  Bedrooms_Scaled  Area_Scaled  Bedrooms_Scaled  Area_Scaled^2  \
0    -1.558783        -1.672857    -1.558783        -1.672857       2.429806   
1    -1.247027        -0.792406    -1.247027        -0.792406       1.555076   
2     0.000000        -0.792406     0.000000        -0.792406       0.000000   
3    -0.935270         0.088045    -0.935270         0.088045       0.874730   
4    -0.311757         0.088045    -0.311757         0.088045       0.097192   

   Area_Scaled Bedrooms_Scaled  Bedrooms_Scaled^2  Area_BoxCox  \
0      