# Advanced House Price Prediction - Feature Engineering

This notebook implements comprehensive feature engineering techniques for the house price prediction dataset including:
1. Advanced Missing Value Treatment
2. Feature Scaling and Transformation
3. Feature Creation and Interaction
4. Categorical Encoding
5. Feature Selection


In [295]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from scipy.stats import skew

%matplotlib inline
pd.set_option('display.max_columns', None)

In [297]:
# Load the dataset

df = pd.read_csv('test.csv')
print('Dataset Shape:', df.shape)
df.head()

Dataset Shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


## 1. Advanced Missing Value Treatment

In [300]:
def analyze_missing_values(df):
    missing = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])
    missing['Percentage'] = (missing['Missing Values'] / len(df)) * 100
    return missing[missing['Missing Values'] > 0].sort_values('Percentage', ascending=False)

print("Missing Value Analysis:")
missing_analysis = analyze_missing_values(df)
print(missing_analysis)

Missing Value Analysis:
              Missing Values  Percentage
PoolQC                  1456   99.794380
MiscFeature             1408   96.504455
Alley                   1352   92.666210
Fence                   1169   80.123372
MasVnrType               894   61.274846
FireplaceQu              730   50.034270
LotFrontage              227   15.558602
GarageCond                78    5.346127
GarageYrBlt               78    5.346127
GarageQual                78    5.346127
GarageFinish              78    5.346127
GarageType                76    5.209047
BsmtCond                  45    3.084304
BsmtExposure              44    3.015764
BsmtQual                  44    3.015764
BsmtFinType1              42    2.878684
BsmtFinType2              42    2.878684
MasVnrArea                15    1.028101
MSZoning                   4    0.274160
BsmtFullBath               2    0.137080
BsmtHalfBath               2    0.137080
Functional                 2    0.137080
Utilities                  2    0

In [302]:
# Advanced missing value imputation
def handle_missing_values(df):
    df_processed = df.copy()
    # Categorical variables
    categorical_features = df_processed.select_dtypes(include=['object']).columns
    for feature in categorical_features:
        if df_processed[feature].isnull().sum() > 0:
            #Create missing indicator
            df_processed[feature + '_missing'] = df_processed[feature].isnull().astype(int)
            #Fill with mode for categorical
            df_processed[feature].fillna(df_processed[feature].mode()[0], inplace=True)
            
            # Numerical variables
            numerical_features = df_processed.select_dtypes(include=['int64', 'float64']).columns
            for feature in numerical_features:
                if df_processed[feature].isnull().sum() > 0:
                    # Create missing indicator
                    df_processed[feature + '_missing'] = df_processed[feature].isnull().astype(int)
                     # Fill with median for numerical
                    df_processed[feature].fillna(df_processed[feature].median(), inplace=True)

    return df_processed
df = handle_missing_values(df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[feature].fillna(df_processed[feature].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[feature].fillna(df_processed[feature].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becau

In [304]:
handle_missing_values(df)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,MSZoning_missing,LotFrontage_missing,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing,Alley_missing,Utilities_missing,Exterior1st_missing,Exterior2nd_missing,MasVnrType_missing,BsmtQual_missing,BsmtCond_missing,BsmtExposure_missing,BsmtFinType1_missing,BsmtFinType2_missing,KitchenQual_missing,Functional_missing,FireplaceQu_missing,GarageType_missing,GarageFinish_missing,GarageQual_missing,GarageCond_missing,PoolQC_missing,Fence_missing,MiscFeature_missing,SaleType_missing
0,1461,20,RH,80.0,11622,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,Gd,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0
1,1462,20,RL,81.0,14267,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Gd,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,Ex,MnPrv,Gar2,12500,6,2010,WD,Normal,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0
2,1463,60,RL,74.0,13830,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,Ex,MnPrv,Shed,0,3,2010,WD,Normal,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,1464,60,RL,78.0,9978,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
4,1465,120,RL,43.0,5005,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,BrkFace,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Gd,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,Ex,MnPrv,Shed,0,1,2010,WD,Normal,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1970,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,0,0,0,0,0,0,Ex,MnPrv,Shed,0,6,2006,WD,Normal,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0
1455,2916,160,RM,21.0,1894,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1970,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,Gd,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0,24,0,0,0,0,Ex,MnPrv,Shed,0,4,2006,WD,Abnorml,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0
1456,2917,20,RL,160.0,20000,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1960,1996,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,SBrkr,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,474,0,0,0,0,0,Ex,MnPrv,Shed,0,9,2006,WD,Abnorml,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
1457,2918,85,RL,62.0,10441,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1992,1992,Gable,CompShg,HdBoard,Wd Shng,BrkFace,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,SBrkr,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,80,32,0,0,0,0,Ex,MnPrv,Shed,700,7,2006,WD,Normal,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0


## 2. Feature Scaling and Transformation

In [310]:
# Handle skewed numerical features\n",
def handle_skewed_features(df, threshold=0.5):
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
    skewed_features = []
    
    for feature in numerical_features:
        if feature != 'Id':
            skewness = skew(df[feature].dropna())
            
            if abs(skewness) > threshold:
                skewed_features.append(feature)
                df[feature] = np.log1p(df[feature])

    print(f"Applied log transformation to {len(skewed_features)} skewed features")
    
    return df

df = handle_skewed_features(df)

Applied log transformation to 48 skewed features


In [312]:
handle_skewed_features(df)

Applied log transformation to 47 skewed features


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,MSZoning_missing,LotFrontage_missing,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing,Alley_missing,Utilities_missing,Exterior1st_missing,Exterior2nd_missing,MasVnrType_missing,BsmtQual_missing,BsmtCond_missing,BsmtExposure_missing,BsmtFinType1_missing,BsmtFinType2_missing,KitchenQual_missing,Functional_missing,FireplaceQu_missing,GarageType_missing,GarageFinish_missing,GarageQual_missing,GarageCond_missing,PoolQC_missing,Fence_missing,MiscFeature_missing,SaleType_missing
0,1461,3.044522,RH,0.987819,1.205379,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1.147286,1961,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Rec,1.087618,LwQ,1.025279,1.060353,1.115792,GasA,TA,Y,SBrkr,6.799056,0.000000,0.0,6.799056,0.000000,0.000000,1,0.000000,2,0.423036,TA,1.791759,Typ,0.000000,Gd,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,4.948760,0.000000,0.0,0.0,1.014191,0.0,Ex,MnPrv,Shed,0.000000,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0
1,1462,3.044522,RL,0.988664,1.211232,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1.147230,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,1.738947,TA,TA,CBlock,TA,TA,No,ALQ,1.117695,Unf,0.000000,1.080845,1.132457,GasA,TA,Y,SBrkr,7.192934,0.000000,0.0,7.192934,0.000000,0.000000,1,0.423036,3,0.423036,Gd,1.945910,Typ,0.000000,Gd,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,5.976351,3.610918,0.0,0.0,0.000000,0.0,Ex,MnPrv,Gar2,1.207475,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.000000,0.0
2,1463,4.110874,RL,0.982453,1.210354,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1.147958,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,PConc,Gd,TA,No,GLQ,1.111170,Unf,0.000000,1.022293,1.117920,GasA,Gd,Y,SBrkr,6.834109,6.553933,0.0,7.396335,0.000000,0.000000,2,0.423036,3,0.423036,TA,1.945910,Typ,0.693147,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,5.361292,3.555348,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,3,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0
3,1464,4.110874,RL,0.986087,1.200927,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1.147977,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,1.397363,TA,TA,PConc,TA,TA,No,GLQ,1.099192,Unf,0.000000,1.069712,1.117830,GasA,Ex,Y,SBrkr,6.831954,6.520621,0.0,7.380879,0.000000,0.000000,2,0.423036,3,0.423036,Gd,2.079442,Typ,0.693147,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,5.888878,3.610918,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0
4,1465,4.795791,RL,0.942082,1.179647,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1.147866,1992,Gable,CompShg,HdBoard,HdBoard,BrkFace,0.000000,Gd,TA,PConc,Gd,TA,No,ALQ,1.058977,Unf,0.000000,1.121709,1.130976,GasA,Ex,Y,SBrkr,7.155396,0.000000,0.0,7.155396,0.000000,0.000000,2,0.000000,2,0.423036,Gd,1.791759,Typ,0.000000,Gd,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0.000000,4.418841,0.0,0.0,1.025279,0.0,Ex,MnPrv,Shed,0.000000,1,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,5.081404,RM,0.879129,1.146811,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1.147456,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Unf,0.000000,Unf,0.000000,1.094766,1.094766,GasA,Gd,Y,SBrkr,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,TA,1.791759,Typ,0.000000,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,0.000000,0.000000,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,6,2006,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.0
1455,2916,5.081404,RM,0.879129,1.145997,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1.147456,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Rec,1.056722,Unf,0.000000,1.064766,1.094766,GasA,TA,Y,SBrkr,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,TA,1.945910,Typ,0.000000,Gd,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0.000000,3.218876,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,4,2006,WD,Abnorml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0
1456,2917,3.044522,RL,1.031488,1.220561,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1.147267,1996,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,ALQ,1.129201,Unf,0.000000,0.000000,1.129201,GasA,Ex,Y,SBrkr,7.110696,0.000000,0.0,7.110696,0.693147,0.000000,1,0.000000,4,0.423036,TA,2.079442,Typ,0.693147,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,6.163315,0.000000,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,9,2006,WD,Abnorml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0
1457,2918,4.454347,RL,0.969893,1.202260,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1.147866,1992,Gable,CompShg,HdBoard,Wd Shng,BrkFace,0.000000,TA,TA,PConc,Gd,TA,Av,GLQ,1.071688,Unf,0.000000,1.097121,1.117194,GasA,TA,Y,SBrkr,6.878326,0.000000,0.0,6.878326,0.000000,0.423036,1,0.000000,3,0.423036,TA,1.945910,Typ,0.000000,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,4.394449,3.496508,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,1.105879,7,2006,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.000000,0.000000,0.0


## 3. Feature Creation and Interaction

In [316]:
# Create new features
    
def create_new_features(df):
    # Total square footage
    df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
    # Total bathrooms
    df['TotalBathrooms'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath'])
    # Total porch area
    df['TotalPorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    # House age and remodel age
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    # Overall quality squared (to capture exponential effect)
    df['OverallQual2'] = df['OverallQual'] ** 2
    # Interaction features
    df['QualityArea'] = df['OverallQual'] * df['GrLivArea']
    return df
    
df = create_new_features(df)

In [318]:
create_new_features(df)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,MSZoning_missing,LotFrontage_missing,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing,Alley_missing,Utilities_missing,Exterior1st_missing,Exterior2nd_missing,MasVnrType_missing,BsmtQual_missing,BsmtCond_missing,BsmtExposure_missing,BsmtFinType1_missing,BsmtFinType2_missing,KitchenQual_missing,Functional_missing,FireplaceQu_missing,GarageType_missing,GarageFinish_missing,GarageQual_missing,GarageCond_missing,PoolQC_missing,Fence_missing,MiscFeature_missing,SaleType_missing,TotalSF,TotalBathrooms,TotalPorchSF,HouseAge,RemodAge,OverallQual2,QualityArea
0,1461,3.044522,RH,0.987819,1.205379,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1.147286,1961,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Rec,1.087618,LwQ,1.025279,1.060353,1.115792,GasA,TA,Y,SBrkr,6.799056,0.000000,0.0,6.799056,0.000000,0.000000,1,0.000000,2,0.423036,TA,1.791759,Typ,0.000000,Gd,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,4.948760,0.000000,0.0,0.0,1.014191,0.0,Ex,MnPrv,Shed,0.000000,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0,7.914848,1.000000,1.014191,2008.852714,49,25,33.995279
1,1462,3.044522,RL,0.988664,1.211232,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1.147230,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,1.738947,TA,TA,CBlock,TA,TA,No,ALQ,1.117695,Unf,0.000000,1.080845,1.132457,GasA,TA,Y,SBrkr,7.192934,0.000000,0.0,7.192934,0.000000,0.000000,1,0.423036,3,0.423036,Gd,1.945910,Typ,0.000000,Gd,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,5.976351,3.610918,0.0,0.0,0.000000,0.0,Ex,MnPrv,Gar2,1.207475,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.000000,0.0,8.325391,1.211518,3.610918,2008.852770,52,36,43.157605
2,1463,4.110874,RL,0.982453,1.210354,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1.147958,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,PConc,Gd,TA,No,GLQ,1.111170,Unf,0.000000,1.022293,1.117920,GasA,Gd,Y,SBrkr,6.834109,6.553933,0.0,7.396335,0.000000,0.000000,2,0.423036,3,0.423036,TA,1.945910,Typ,0.693147,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,5.361292,3.555348,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,3,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0,14.505963,2.211518,3.555348,2008.852042,12,25,36.981676
3,1464,4.110874,RL,0.986087,1.200927,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1.147977,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,1.397363,TA,TA,PConc,TA,TA,No,GLQ,1.099192,Unf,0.000000,1.069712,1.117830,GasA,Ex,Y,SBrkr,6.831954,6.520621,0.0,7.380879,0.000000,0.000000,2,0.423036,3,0.423036,Gd,2.079442,Typ,0.693147,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,5.888878,3.610918,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,14.470405,2.211518,3.610918,2008.852023,12,36,44.285274
4,1465,4.795791,RL,0.942082,1.179647,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1.147866,1992,Gable,CompShg,HdBoard,HdBoard,BrkFace,0.000000,Gd,TA,PConc,Gd,TA,No,ALQ,1.058977,Unf,0.000000,1.121709,1.130976,GasA,Ex,Y,SBrkr,7.155396,0.000000,0.0,7.155396,0.000000,0.000000,2,0.000000,2,0.423036,Gd,1.791759,Typ,0.000000,Gd,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0.000000,4.418841,0.0,0.0,1.025279,0.0,Ex,MnPrv,Shed,0.000000,1,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,8.286372,2.000000,5.444120,2008.852134,18,64,57.243170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,5.081404,RM,0.879129,1.146811,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1.147456,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Unf,0.000000,Unf,0.000000,1.094766,1.094766,GasA,Gd,Y,SBrkr,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,TA,1.791759,Typ,0.000000,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,0.000000,0.000000,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,6,2006,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.0,13.703664,1.211518,0.000000,2004.852544,36,16,27.986726
1455,2916,5.081404,RM,0.879129,1.145997,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1.147456,1970,Gable,CompShg,CemntBd,CmentBd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,Rec,1.056722,Unf,0.000000,1.064766,1.094766,GasA,TA,Y,SBrkr,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,TA,1.945910,Typ,0.000000,Gd,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0.000000,3.218876,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,4,2006,WD,Abnorml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,13.703664,1.211518,3.218876,2004.852544,36,16,27.986726
1456,2917,3.044522,RL,1.031488,1.220561,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1.147267,1996,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.000000,TA,TA,CBlock,TA,TA,No,ALQ,1.129201,Unf,0.000000,0.000000,1.129201,GasA,Ex,Y,SBrkr,7.110696,0.000000,0.0,7.110696,0.693147,0.000000,1,0.000000,4,0.423036,TA,2.079442,Typ,0.693147,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,6.163315,0.000000,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,0.000000,9,2006,WD,Abnorml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,8.239897,1.693147,0.000000,2004.852733,10,25,35.553481
1457,2918,4.454347,RL,0.969893,1.202260,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1.147866,1992,Gable,CompShg,HdBoard,Wd Shng,BrkFace,0.000000,TA,TA,PConc,Gd,TA,Av,GLQ,1.071688,Unf,0.000000,1.097121,1.117194,GasA,TA,Y,SBrkr,6.878326,0.000000,0.0,6.878326,0.000000,0.423036,1,0.000000,3,0.423036,TA,1.945910,Typ,0.000000,Gd,Attchd,1979.0,Unf,0.0,0.0,TA,TA,Y,4.394449,3.496508,0.0,0.0,0.000000,0.0,Ex,MnPrv,Shed,1.105879,7,2006,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.000000,0.000000,0.0,7.995521,1.211518,3.496508,2004.852134,14,25,34.391632


In [320]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,MSZoning_missing,LotFrontage_missing,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing,Alley_missing,Utilities_missing,Exterior1st_missing,Exterior2nd_missing,MasVnrType_missing,BsmtQual_missing,BsmtCond_missing,BsmtExposure_missing,BsmtFinType1_missing,BsmtFinType2_missing,KitchenQual_missing,Functional_missing,FireplaceQu_missing,GarageType_missing,GarageFinish_missing,GarageQual_missing,GarageCond_missing,PoolQC_missing,Fence_missing,MiscFeature_missing,SaleType_missing,TotalSF,TotalBathrooms,TotalPorchSF,HouseAge,RemodAge,OverallQual2,QualityArea
0,1461,3.044522,RH,0.987819,1.205379,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1.147286,1961,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.0,TA,TA,CBlock,TA,TA,No,Rec,1.087618,LwQ,1.025279,1.060353,1.115792,GasA,TA,Y,SBrkr,6.799056,0.0,0.0,6.799056,0.0,0.0,1,0.0,2,0.423036,TA,1.791759,Typ,0.0,Gd,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,4.94876,0.0,0.0,0.0,1.014191,0.0,Ex,MnPrv,Shed,0.0,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.423036,0.0,0.423036,0.0,7.914848,1.0,1.014191,2008.852714,49,25,33.995279
1,1462,3.044522,RL,0.988664,1.211232,Pave,Grvl,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1.14723,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,1.738947,TA,TA,CBlock,TA,TA,No,ALQ,1.117695,Unf,0.0,1.080845,1.132457,GasA,TA,Y,SBrkr,7.192934,0.0,0.0,7.192934,0.0,0.0,1,0.423036,3,0.423036,Gd,1.94591,Typ,0.0,Gd,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,5.976351,3.610918,0.0,0.0,0.0,0.0,Ex,MnPrv,Gar2,1.207475,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.423036,0.423036,0.0,0.0,8.325391,1.211518,3.610918,2008.85277,52,36,43.157605
2,1463,4.110874,RL,0.982453,1.210354,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1.147958,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,0.0,TA,TA,PConc,Gd,TA,No,GLQ,1.11117,Unf,0.0,1.022293,1.11792,GasA,Gd,Y,SBrkr,6.834109,6.553933,0.0,7.396335,0.0,0.0,2,0.423036,3,0.423036,TA,1.94591,Typ,0.693147,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,5.361292,3.555348,0.0,0.0,0.0,0.0,Ex,MnPrv,Shed,0.0,3,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.423036,0.0,0.423036,0.0,14.505963,2.211518,3.555348,2008.852042,12,25,36.981676
3,1464,4.110874,RL,0.986087,1.200927,Pave,Grvl,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1.147977,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,1.397363,TA,TA,PConc,TA,TA,No,GLQ,1.099192,Unf,0.0,1.069712,1.11783,GasA,Ex,Y,SBrkr,6.831954,6.520621,0.0,7.380879,0.0,0.0,2,0.423036,3,0.423036,Gd,2.079442,Typ,0.693147,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,5.888878,3.610918,0.0,0.0,0.0,0.0,Ex,MnPrv,Shed,0.0,6,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.423036,0.423036,0.423036,0.0,14.470405,2.211518,3.610918,2008.852023,12,36,44.285274
4,1465,4.795791,RL,0.942082,1.179647,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1.147866,1992,Gable,CompShg,HdBoard,HdBoard,BrkFace,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,1.058977,Unf,0.0,1.121709,1.130976,GasA,Ex,Y,SBrkr,7.155396,0.0,0.0,7.155396,0.0,0.0,2,0.0,2,0.423036,Gd,1.791759,Typ,0.0,Gd,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0.0,4.418841,0.0,0.0,1.025279,0.0,Ex,MnPrv,Shed,0.0,1,2010,WD,Normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.423036,0.423036,0.423036,0.0,8.286372,2.0,5.44412,2008.852134,18,64,57.24317


## 4. Categorical Encoding

In [323]:
def encode_categorical_features(df):
    # Encode categorical features
    categorical_features = df.select_dtypes(include=['object']).columns
    # Ordinal encoding for quality and condition features
    quality_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    quality_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond','HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
    
    for feature in quality_features:
        if feature in df.columns:
            df[feature] = df[feature].map(quality_mapping)
    # One-hot encoding for remaining categorical features        
    remaining_categorical = [f for f in categorical_features if f not in quality_features]
    
    df = pd.get_dummies(df, columns=remaining_categorical, drop_first=True)
    
    return df

df = encode_categorical_features(df)

In [325]:
encode_categorical_features(df)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning_missing,LotFrontage_missing,MasVnrArea_missing,BsmtFinSF1_missing,BsmtFinSF2_missing,BsmtUnfSF_missing,TotalBsmtSF_missing,BsmtFullBath_missing,BsmtHalfBath_missing,GarageYrBlt_missing,GarageCars_missing,GarageArea_missing,Alley_missing,Utilities_missing,Exterior1st_missing,Exterior2nd_missing,MasVnrType_missing,BsmtQual_missing,BsmtCond_missing,BsmtExposure_missing,BsmtFinType1_missing,BsmtFinType2_missing,KitchenQual_missing,Functional_missing,FireplaceQu_missing,GarageType_missing,GarageFinish_missing,GarageQual_missing,GarageCond_missing,PoolQC_missing,Fence_missing,MiscFeature_missing,SaleType_missing,TotalSF,TotalBathrooms,TotalPorchSF,HouseAge,RemodAge,OverallQual2,QualityArea,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasW,Heating_Grav,Heating_Wall,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_P,PavedDrive_Y,PoolQC_Gd,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Othr,MiscFeature_Shed,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,3.044522,0.987819,1.205379,5,6,1.147286,1961,0.000000,,,,,1.087618,1.025279,1.060353,1.115792,,6.799056,0.000000,0.0,6.799056,0.000000,0.000000,1,0.000000,2,0.423036,,1.791759,0.000000,,1961.0,1.0,730.0,,,4.948760,0.000000,0.0,0.0,1.014191,0.0,0.000000,6,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0,7.914848,1.000000,1.014191,2008.852714,49,25,33.995279,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1,1462,3.044522,0.988664,1.211232,6,6,1.147230,1958,1.738947,,,,,1.117695,0.000000,1.080845,1.132457,,7.192934,0.000000,0.0,7.192934,0.000000,0.000000,1,0.423036,3,0.423036,,1.945910,0.000000,,1958.0,1.0,312.0,,,5.976351,3.610918,0.0,0.0,0.000000,0.0,1.207475,6,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.000000,0.0,8.325391,1.211518,3.610918,2008.852770,52,36,43.157605,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False
2,1463,4.110874,0.982453,1.210354,5,5,1.147958,1998,0.000000,,,,,1.111170,0.000000,1.022293,1.117920,,6.834109,6.553933,0.0,7.396335,0.000000,0.000000,2,0.423036,3,0.423036,,1.945910,0.693147,,1997.0,2.0,482.0,,,5.361292,3.555348,0.0,0.0,0.000000,0.0,0.000000,3,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.000000,0.423036,0.0,14.505963,2.211518,3.555348,2008.852042,12,25,36.981676,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
3,1464,4.110874,0.986087,1.200927,6,6,1.147977,1998,1.397363,,,,,1.099192,0.000000,1.069712,1.117830,,6.831954,6.520621,0.0,7.380879,0.000000,0.000000,2,0.423036,3,0.423036,,2.079442,0.693147,,1998.0,2.0,470.0,,,5.888878,3.610918,0.0,0.0,0.000000,0.0,0.000000,6,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,14.470405,2.211518,3.610918,2008.852023,12,36,44.285274,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
4,1465,4.795791,0.942082,1.179647,8,5,1.147866,1992,0.000000,,,,,1.058977,0.000000,1.121709,1.130976,,7.155396,0.000000,0.0,7.155396,0.000000,0.000000,2,0.000000,2,0.423036,,1.791759,0.000000,,1992.0,2.0,506.0,,,0.000000,4.418841,0.0,0.0,1.025279,0.0,0.000000,1,2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,8.286372,2.000000,5.444120,2008.852134,18,64,57.243170,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,5.081404,0.879129,1.146811,4,7,1.147456,1970,0.000000,,,,,0.000000,0.000000,1.094766,1.094766,,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,,1.791759,0.000000,,1979.0,0.0,0.0,,,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,6,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.423036,0.0,13.703664,1.211518,0.000000,2004.852544,36,16,27.986726,False,False,False,True,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False
1455,2916,5.081404,0.879129,1.145997,4,5,1.147456,1970,0.000000,,,,,1.056722,0.000000,1.064766,1.094766,,6.304449,6.304449,0.0,6.996681,0.000000,0.000000,1,0.423036,3,0.423036,,1.945910,0.000000,,1970.0,1.0,286.0,,,0.000000,3.218876,0.0,0.0,0.000000,0.0,0.000000,4,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,13.703664,1.211518,3.218876,2004.852544,36,16,27.986726,False,False,False,True,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
1456,2917,3.044522,1.031488,1.220561,5,7,1.147267,1996,0.000000,,,,,1.129201,0.000000,0.000000,1.129201,,7.110696,0.000000,0.0,7.110696,0.693147,0.000000,1,0.000000,4,0.423036,,2.079442,0.693147,,1960.0,2.0,576.0,,,6.163315,0.000000,0.0,0.0,0.000000,0.0,0.000000,9,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.423036,0.423036,0.423036,0.0,8.239897,1.693147,0.000000,2004.852733,10,25,35.553481,False,False,True,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
1457,2918,4.454347,0.969893,1.202260,5,5,1.147866,1992,0.000000,,,,,1.071688,0.000000,1.097121,1.117194,,6.878326,0.000000,0.0,6.878326,0.000000,0.423036,1,0.000000,3,0.423036,,1.945910,0.000000,,1979.0,0.0,0.0,,,4.394449,3.496508,0.0,0.0,0.000000,0.0,1.105879,7,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423036,0.0,0.0,0.423036,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.423036,0.423036,0.423036,0.423036,0.423036,0.000000,0.000000,0.0,7.995521,1.211518,3.496508,2004.852134,14,25,34.391632,False,False,True,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False


## 5. Feature Selection

In [328]:
##Remove low variance features\n",
  

def remove_low_variance(df, threshold=0.01):
    variance = df.var()
    low_variance_features = variance[variance < threshold].index
    df_filtered = df.drop(columns=low_variance_features)
    print(f"Removed {len(low_variance_features)} low variance features")
    
    return df_filtered

df = remove_low_variance(df)

Removed 84 low variance features


In [330]:
# Save processed dataset

df.to_csv('test_processed.csv', index=False)
print("Final dataset shape:", df.shape)
print("Feature engineering complete! Processed data saved to 'test_processed.csv' ")

Final dataset shape: (1459, 161)
Feature engineering complete! Processed data saved to 'test_processed.csv' 
