# FEATURE ENGINEERING

We will be performing all the below steps in Feature Engineering

1. Handle Missing values
2. Temporal variables
3. Categorical variables: remove rare labels
4. Standarise the values of the variables to the same range

In [618]:
# import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Display all the columns of the dataframe

pd.pandas.set_option('display.max_columns',None)

In [619]:
# load the data

# Set the path to the raw data folder
raw_data_path = 'C:\\Users\\prath\\Advanced-House-Price-Prediction\\data\\raw\\'

# Load the train.csv file into a pandas DataFrame
df_train = pd.read_csv(raw_data_path + 'train.csv')

## Handling missing values

### Handling numerical missing values

In [620]:
# Numerical variables the contains missing values

numerical_with_nan=[feature for feature in df_train.columns if df_train[feature].isnull().sum()>1 and df_train[feature].dtypes!='O']

# Print the numerical NaN variables and percentage of missing values

for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(df_train[feature].isnull().mean(),4)))

LotFrontage: 0.1774% missing value
MasVnrArea: 0.0055% missing value
GarageYrBlt: 0.0555% missing value


In [621]:
# Replacing the numerical Missing Values

for feature in numerical_with_nan:
    # We will replace by using median since there are outliers
    median_value=df_train[feature].median()
    
    # create a new feature to capture nan values
    df_train[feature+'nan']=np.where(df_train[feature].isnull(),1,0)
    df_train[feature].fillna(median_value,inplace=True)
    
df_train[numerical_with_nan].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

### Handling Temporal features

In [622]:
# Temporal features - datetime variables

for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
       
    df_train[feature]=df_train['YrSold']-df_train[feature]

df_train[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


### Logarithmic Transformation

In [623]:
# Using log transformation to reduce the skewness of data

num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

for feature in num_features:
    df_train[feature]=np.log(df_train[feature])

### Handling categorical missing values

In [624]:
# Categorical features which are missing

features_nan=[feature for feature in df_train.columns if df_train[feature].isnull().sum()>1 and df_train[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(df_train[feature].isnull().mean(),4)))

Alley: 0.9377% missing values
MasVnrType: 0.0055% missing values
BsmtQual: 0.0253% missing values
BsmtCond: 0.0253% missing values
BsmtExposure: 0.026% missing values
BsmtFinType1: 0.0253% missing values
BsmtFinType2: 0.026% missing values
FireplaceQu: 0.4726% missing values
GarageType: 0.0555% missing values
GarageFinish: 0.0555% missing values
GarageQual: 0.0555% missing values
GarageCond: 0.0555% missing values
PoolQC: 0.9952% missing values
Fence: 0.8075% missing values
MiscFeature: 0.963% missing values


In [625]:
# Replace missing value with a new label/category

def replace_cat_feature(df_train,features_nan):
    data=df_train.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

df_train=replace_cat_feature(df_train,features_nan)

df_train[features_nan].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

### Handling rare categorical features

* We will remove categorical variables that are present less than 1% of the observations

In [626]:
# list of categorical features
categorical_features=[feature for feature in df_train.columns if df_train[feature].dtype=='O']

In [627]:
# Replacing categories with less than 1%

for feature in categorical_features:
    temp=df_train.groupby(feature)['SalePrice'].count()/len(df_train)
    temp_df=temp[temp>0.01].index
    df_train[feature]=np.where(df_train[feature].isin(temp_df),df_train[feature],'Rare_var')

In [628]:
df_train.head(50)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,4.174387,9.041922,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,5,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,6.75227,854,0,7.444249,1,0,2,1,3,1,Gd,8,Typ,0,Missing,Attchd,5.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,Missing,Missing,Missing,0,2,2008,WD,Normal,12.247694,0,0,0
1,2,20,RL,4.382027,9.169518,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Rare_var,Feedr,Norm,1Fam,1Story,6,8,31,31,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,7.140453,0,0,7.140453,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,31.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,Missing,Missing,Missing,0,5,2007,WD,Normal,12.109011,0,0,0
2,3,60,RL,4.219508,9.328123,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,7,6,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,6.824374,866,0,7.487734,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,7.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,Missing,Missing,Missing,0,9,2008,WD,Normal,12.317167,0,0,0
3,4,70,RL,4.094345,9.164296,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,91,36,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,6.867974,756,0,7.448334,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,8.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,11.849398,0,0,0
4,5,60,RL,4.430817,9.565214,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,8,8,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,7.04316,1053,0,7.695303,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,8.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,Missing,Missing,Missing,0,12,2008,WD,Normal,12.429216,0,0,0
5,6,50,RL,4.442651,9.554993,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,16,14,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,Rare_var,Gd,TA,No,GLQ,732,Unf,0,64,796,GasA,Ex,Y,SBrkr,6.679599,566,0,7.216709,1,0,1,1,1,1,TA,5,Typ,0,Missing,Attchd,16.0,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,Missing,MnPrv,Shed,700,10,2009,WD,Normal,11.8706,0,0,0
6,7,20,RL,4.317488,9.218705,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,1Story,8,5,3,2,Gable,CompShg,VinylSd,VinylSd,Stone,186.0,Gd,TA,PConc,Ex,TA,Av,GLQ,1369,Unf,0,317,1686,GasA,Ex,Y,SBrkr,7.434848,0,0,7.434848,1,0,2,0,3,1,Gd,7,Typ,1,Gd,Attchd,3.0,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,Missing,Missing,Missing,0,8,2007,WD,Normal,12.634603,0,0,0
7,8,60,RL,4.234107,9.247829,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,Norm,1Fam,2Story,7,6,36,36,Gable,CompShg,HdBoard,HdBoard,Stone,240.0,TA,TA,CBlock,Gd,TA,Mn,ALQ,859,BLQ,32,216,1107,GasA,Ex,Y,SBrkr,7.009409,983,0,7.644919,1,0,2,1,3,1,TA,7,Typ,2,TA,Attchd,36.0,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,Missing,Missing,Shed,350,11,2009,WD,Normal,12.206073,1,0,0
8,9,50,RM,3.931826,8.719317,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,Norm,1Fam,1.5Fin,7,5,77,58,Gable,CompShg,BrkFace,Wd Shng,,0.0,TA,TA,BrkTil,TA,TA,No,Unf,0,Unf,0,952,952,GasA,Gd,Y,FuseF,6.929517,752,0,7.480992,0,0,2,0,2,2,TA,8,Min1,2,TA,Detchd,77.0,Unf,2,468,Fa,TA,Y,90,0,205,0,0,0,Missing,Missing,Missing,0,4,2008,WD,Abnorml,11.77452,0,0,0
9,10,190,RL,3.912023,8.911934,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,Rare_var,2fmCon,Rare_var,5,6,69,58,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,BrkTil,TA,TA,No,GLQ,851,Unf,0,140,991,GasA,Ex,Y,SBrkr,6.981935,0,0,6.981935,1,0,1,0,2,2,TA,5,Typ,2,TA,Attchd,69.0,RFn,1,205,Rare_var,TA,Y,0,4,0,0,0,0,Missing,Missing,Missing,0,1,2008,WD,Normal,11.67844,0,0,0


## Feature Scalling

In [629]:
# encode categorical features based on their relationship with the target variable into numerical.

for feature in categorical_features:
    labels_ordered=df_train.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    df_train[feature]=df_train[feature].map(labels_ordered)

df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,3,4.174387,9.041922,1,2,0,1,1,0,0,14,2,1,3,5,7,5,5,5,0,0,10,10,2,196.0,2,3,4,3,3,1,6,706,5,0,150,856,2,4,1,3,6.752270,854,0,7.444249,1,0,2,1,3,1,2,8,4,0,1,4,5.0,2,2,548,2,3,2,0,61,0,0,0,0,0,4,2,0,2,2008,2,3,12.247694,0,0,0
1,2,20,3,4.382027,9.169518,1,2,0,1,1,2,0,11,1,1,3,3,6,8,31,31,0,0,4,3,1,0.0,1,3,2,3,3,4,4,978,5,0,284,1262,2,4,1,3,7.140453,0,0,7.140453,0,1,2,0,3,1,1,6,4,1,3,4,31.0,2,2,460,2,3,2,298,0,0,0,0,0,0,4,2,0,5,2007,2,3,12.109011,0,0,0
2,3,60,3,4.219508,9.328123,1,2,1,1,1,0,0,14,2,1,3,5,7,5,7,6,0,0,10,10,2,162.0,2,3,4,3,3,2,6,486,5,0,434,920,2,4,1,3,6.824374,866,0,7.487734,1,0,2,1,3,1,2,6,4,1,3,4,7.0,2,2,608,2,3,2,0,42,0,0,0,0,0,4,2,0,9,2008,2,3,12.317167,0,0,0
3,4,70,3,4.094345,9.164296,1,2,1,1,1,1,0,16,2,1,3,5,7,5,91,36,0,0,2,4,1,0.0,1,3,1,2,4,1,4,216,5,0,540,756,2,3,1,3,6.867974,756,0,7.448334,1,0,1,0,3,1,2,7,4,1,4,2,8.0,1,3,642,2,3,2,0,35,272,0,0,0,0,4,2,0,2,2006,2,0,11.849398,0,0,0
4,5,60,3,4.430817,9.565214,1,2,1,1,1,2,0,22,2,1,3,5,8,5,8,8,0,0,10,10,2,350.0,2,3,4,3,3,3,6,655,5,0,490,1145,2,4,1,3,7.043160,1053,0,7.695303,1,0,2,1,4,1,2,9,4,1,3,4,8.0,2,3,836,2,3,2,192,84,0,0,0,0,0,4,2,0,12,2008,2,3,12.429216,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,4.127134,8.976768,1,2,0,1,1,0,0,13,2,1,3,5,6,5,8,7,0,0,10,10,1,0.0,1,3,4,3,3,1,5,0,5,0,953,953,2,4,1,3,6.859615,694,0,7.406711,0,0,2,1,3,1,1,7,4,1,3,4,8.0,2,2,460,2,3,2,0,40,0,0,0,0,0,4,2,0,8,2007,2,3,12.072541,0,0,0
1456,1457,20,3,4.442651,9.486076,1,2,0,1,1,0,0,12,2,1,3,3,6,6,32,22,0,0,7,7,4,119.0,1,3,2,3,3,1,4,790,2,163,589,1542,2,2,1,3,7.636752,0,0,7.636752,1,0,2,0,3,1,1,7,3,2,3,4,32.0,1,2,500,2,3,2,349,0,0,0,0,0,0,2,2,0,2,2010,2,3,12.254863,0,0,0
1457,1458,70,3,4.189655,9.109636,1,2,0,1,1,0,0,16,2,1,3,5,7,9,69,4,0,0,9,9,1,0.0,3,2,3,2,4,1,6,275,5,0,877,1152,2,4,1,3,7.080026,1152,0,7.757906,0,0,2,0,4,1,2,9,4,2,4,4,69.0,2,1,252,2,3,2,0,60,0,0,0,0,0,3,1,2500,5,2010,2,3,12.493130,0,0,0
1458,1459,20,3,4.219508,9.181632,1,2,0,1,1,0,0,8,2,1,3,3,5,6,60,14,2,0,4,3,1,0.0,1,3,2,2,3,2,6,49,2,1029,0,1078,2,3,1,2,6.982863,0,0,6.982863,1,0,1,0,2,1,2,5,4,0,1,4,60.0,1,1,240,2,3,2,366,0,112,0,0,0,0,4,2,0,4,2010,2,3,11.864462,0,0,0


In [630]:
from sklearn.preprocessing import MinMaxScaler

feature_scale=[feature for feature in df_train.columns if feature not in ['Id','SalePrice']]
dataframe = df_train

def scale_features(dataframe, feature_scale):
    """
    Scales the features in a pandas DataFrame using MinMaxScaler from scikit-learn.

    Parameters:
    dataframe (pandas DataFrame): The input DataFrame to scale.
    feature_scale (list): The list of feature names to scale.

    Returns:
     An array having same number of rows as the original dataset, but the values in each column will be scaled to between 0 and 1.
    """
    # Create a scaler object
    scaler = MinMaxScaler()
    
    # Fit the scaler to the specified features
    scaler.fit(dataframe[feature_scale])

    # Scale the specified features in the DataFrame
    dataframe[feature_scale] = scaler.transform(dataframe[feature_scale])

    return dataframe
scale_features(dataframe,feature_scale)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.500,0.036765,0.098361,0.0,0.0,1.0,1.0,0.50,0.122500,0.666667,1.000000,1.00,0.75,0.75,0.25,1.000000,0.125089,0.833333,0.000000,0.064212,0.140098,1.0,1.00,1.0,1.000000,0.356155,0.413559,0.0,0.577712,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.500000,1.00,0.000000,0.2,0.8,0.046729,0.666667,0.50,0.386460,0.666667,1.0,1.0,0.000000,0.111517,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.090909,0.50,0.666667,0.75,12.247694,0.0,0.0,0.0
1,2,0.000000,0.75,0.495064,0.391317,1.0,1.0,0.000000,0.333333,1.0,0.50,0.0,0.500000,0.2,1.0,0.75,0.6,0.555556,0.875,0.227941,0.524590,0.0,0.0,0.4,0.3,0.25,0.000000,0.333333,1.000000,0.50,0.75,0.75,1.00,0.666667,0.173281,0.833333,0.000000,0.121575,0.206547,1.0,1.00,1.0,1.000000,0.503056,0.000000,0.0,0.470245,0.000000,0.5,0.666667,0.0,0.375,0.333333,0.333333,0.333333,1.00,0.333333,0.6,0.8,0.289720,0.666667,0.50,0.324401,0.666667,1.0,1.0,0.347725,0.000000,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.363636,0.25,0.666667,0.75,12.109011,0.0,0.0,0.0
2,3,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,1.0,0.00,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.500,0.051471,0.114754,0.0,0.0,1.0,1.0,0.50,0.101250,0.666667,1.000000,1.00,0.75,0.75,0.50,1.000000,0.086109,0.833333,0.000000,0.185788,0.150573,1.0,1.00,1.0,1.000000,0.383441,0.419370,0.0,0.593095,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,1.00,0.333333,0.6,0.8,0.065421,0.666667,0.50,0.428773,0.666667,1.0,1.0,0.000000,0.076782,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.727273,0.50,0.666667,0.75,12.317167,0.0,0.0,0.0
3,4,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,1.0,0.25,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,0.500,0.669118,0.606557,0.0,0.0,0.2,0.4,0.25,0.000000,0.333333,1.000000,0.25,0.50,1.00,0.25,0.666667,0.038271,0.833333,0.000000,0.231164,0.123732,1.0,0.75,1.0,1.000000,0.399941,0.366102,0.0,0.579157,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,1.00,0.333333,0.8,0.4,0.074766,0.333333,0.75,0.452750,0.666667,1.0,1.0,0.000000,0.063985,0.492754,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.090909,0.00,0.666667,0.00,11.849398,0.0,0.0,0.0
4,5,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,1.0,0.50,0.0,1.000000,0.4,1.0,0.75,1.0,0.777778,0.500,0.058824,0.147541,0.0,0.0,1.0,1.0,0.50,0.218750,0.666667,1.000000,1.00,0.75,0.75,0.75,1.000000,0.116052,0.833333,0.000000,0.209760,0.187398,1.0,1.00,1.0,1.000000,0.466237,0.509927,0.0,0.666523,0.333333,0.0,0.666667,0.5,0.500,0.333333,0.666667,0.583333,1.00,0.333333,0.6,0.8,0.074766,0.666667,0.75,0.589563,0.666667,1.0,1.0,0.224037,0.153565,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,1.000000,0.50,0.666667,0.75,12.429216,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.235294,0.75,0.400718,0.353592,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.590909,0.4,1.0,0.75,1.0,0.555556,0.500,0.058824,0.131148,0.0,0.0,1.0,1.0,0.25,0.000000,0.333333,1.000000,1.00,0.75,0.75,0.25,0.833333,0.000000,0.833333,0.000000,0.407962,0.155974,1.0,1.00,1.0,1.000000,0.396777,0.336077,0.0,0.564433,0.000000,0.0,0.666667,0.5,0.375,0.333333,0.333333,0.416667,1.00,0.333333,0.6,0.8,0.074766,0.666667,0.50,0.324401,0.666667,1.0,1.0,0.000000,0.073126,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.636364,0.25,0.666667,0.75,12.072541,0.0,0.0,0.0
1456,1457,0.000000,0.75,0.517503,0.453273,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.545455,0.4,1.0,0.75,0.6,0.555556,0.625,0.235294,0.377049,0.0,0.0,0.7,0.7,1.00,0.074375,0.333333,1.000000,0.50,0.75,0.75,0.25,0.666667,0.139972,0.333333,0.110583,0.252140,0.252373,1.0,0.50,1.0,1.000000,0.690872,0.000000,0.0,0.645810,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,0.75,0.666667,0.6,0.8,0.299065,0.333333,0.50,0.352609,0.666667,1.0,1.0,0.407235,0.000000,0.000000,0.0,0.0,0.0,0.0,0.50,1.0,0.00000,0.090909,1.00,0.666667,0.75,12.254863,0.0,0.0,0.0
1457,1458,0.294118,0.75,0.423859,0.379597,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,1.000,0.507353,0.081967,0.0,0.0,0.9,0.9,0.25,0.000000,1.000000,0.666667,0.75,0.50,1.00,0.25,1.000000,0.048724,0.833333,0.000000,0.375428,0.188543,1.0,1.00,1.0,1.000000,0.480189,0.557869,0.0,0.688669,0.000000,0.0,0.666667,0.0,0.500,0.333333,0.666667,0.583333,1.00,0.666667,0.8,0.8,0.644860,0.666667,0.25,0.177715,0.666667,1.0,1.0,0.000000,0.109689,0.000000,0.0,0.0,0.0,0.0,0.75,0.5,0.16129,0.363636,1.00,0.666667,0.75,12.493130,0.0,0.0,0.0
1458,1459,0.000000,0.75,0.434909,0.393688,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.363636,0.4,1.0,0.75,0.6,0.444444,0.625,0.441176,0.245902,1.0,0.0,0.4,0.3,0.25,0.000000,0.333333,1.000000,0.50,0.50,0.75,0.50,1.000000,0.008682,0.333333,0.698100,0.000000,0.176432,1.0,0.75,1.0,0.666667,0.443419,0.000000,0.0,0.414497,0.333333,0.0,0.333333,0.0,0.250,0.333333,0.666667,0.250000,1.00,0.000000,0.2,0.8,0.560748,0.333333,0.25,0.169252,0.666667,1.0,1.0,0.427071,0.000000,0.202899,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.272727,1.00,0.666667,0.75,11.864462,0.0,0.0,0.0


# Feature Selection

In [631]:
# load modules

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [632]:
dataframe

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.500,0.036765,0.098361,0.0,0.0,1.0,1.0,0.50,0.122500,0.666667,1.000000,1.00,0.75,0.75,0.25,1.000000,0.125089,0.833333,0.000000,0.064212,0.140098,1.0,1.00,1.0,1.000000,0.356155,0.413559,0.0,0.577712,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.500000,1.00,0.000000,0.2,0.8,0.046729,0.666667,0.50,0.386460,0.666667,1.0,1.0,0.000000,0.111517,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.090909,0.50,0.666667,0.75,12.247694,0.0,0.0,0.0
1,2,0.000000,0.75,0.495064,0.391317,1.0,1.0,0.000000,0.333333,1.0,0.50,0.0,0.500000,0.2,1.0,0.75,0.6,0.555556,0.875,0.227941,0.524590,0.0,0.0,0.4,0.3,0.25,0.000000,0.333333,1.000000,0.50,0.75,0.75,1.00,0.666667,0.173281,0.833333,0.000000,0.121575,0.206547,1.0,1.00,1.0,1.000000,0.503056,0.000000,0.0,0.470245,0.000000,0.5,0.666667,0.0,0.375,0.333333,0.333333,0.333333,1.00,0.333333,0.6,0.8,0.289720,0.666667,0.50,0.324401,0.666667,1.0,1.0,0.347725,0.000000,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.363636,0.25,0.666667,0.75,12.109011,0.0,0.0,0.0
2,3,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,1.0,0.00,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.500,0.051471,0.114754,0.0,0.0,1.0,1.0,0.50,0.101250,0.666667,1.000000,1.00,0.75,0.75,0.50,1.000000,0.086109,0.833333,0.000000,0.185788,0.150573,1.0,1.00,1.0,1.000000,0.383441,0.419370,0.0,0.593095,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,1.00,0.333333,0.6,0.8,0.065421,0.666667,0.50,0.428773,0.666667,1.0,1.0,0.000000,0.076782,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.727273,0.50,0.666667,0.75,12.317167,0.0,0.0,0.0
3,4,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,1.0,0.25,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,0.500,0.669118,0.606557,0.0,0.0,0.2,0.4,0.25,0.000000,0.333333,1.000000,0.25,0.50,1.00,0.25,0.666667,0.038271,0.833333,0.000000,0.231164,0.123732,1.0,0.75,1.0,1.000000,0.399941,0.366102,0.0,0.579157,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,1.00,0.333333,0.8,0.4,0.074766,0.333333,0.75,0.452750,0.666667,1.0,1.0,0.000000,0.063985,0.492754,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.090909,0.00,0.666667,0.00,11.849398,0.0,0.0,0.0
4,5,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,1.0,0.50,0.0,1.000000,0.4,1.0,0.75,1.0,0.777778,0.500,0.058824,0.147541,0.0,0.0,1.0,1.0,0.50,0.218750,0.666667,1.000000,1.00,0.75,0.75,0.75,1.000000,0.116052,0.833333,0.000000,0.209760,0.187398,1.0,1.00,1.0,1.000000,0.466237,0.509927,0.0,0.666523,0.333333,0.0,0.666667,0.5,0.500,0.333333,0.666667,0.583333,1.00,0.333333,0.6,0.8,0.074766,0.666667,0.75,0.589563,0.666667,1.0,1.0,0.224037,0.153565,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,1.000000,0.50,0.666667,0.75,12.429216,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,0.235294,0.75,0.400718,0.353592,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.590909,0.4,1.0,0.75,1.0,0.555556,0.500,0.058824,0.131148,0.0,0.0,1.0,1.0,0.25,0.000000,0.333333,1.000000,1.00,0.75,0.75,0.25,0.833333,0.000000,0.833333,0.000000,0.407962,0.155974,1.0,1.00,1.0,1.000000,0.396777,0.336077,0.0,0.564433,0.000000,0.0,0.666667,0.5,0.375,0.333333,0.333333,0.416667,1.00,0.333333,0.6,0.8,0.074766,0.666667,0.50,0.324401,0.666667,1.0,1.0,0.000000,0.073126,0.000000,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.636364,0.25,0.666667,0.75,12.072541,0.0,0.0,0.0
1456,1457,0.000000,0.75,0.517503,0.453273,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.545455,0.4,1.0,0.75,0.6,0.555556,0.625,0.235294,0.377049,0.0,0.0,0.7,0.7,1.00,0.074375,0.333333,1.000000,0.50,0.75,0.75,0.25,0.666667,0.139972,0.333333,0.110583,0.252140,0.252373,1.0,0.50,1.0,1.000000,0.690872,0.000000,0.0,0.645810,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,0.75,0.666667,0.6,0.8,0.299065,0.333333,0.50,0.352609,0.666667,1.0,1.0,0.407235,0.000000,0.000000,0.0,0.0,0.0,0.0,0.50,1.0,0.00000,0.090909,1.00,0.666667,0.75,12.254863,0.0,0.0,0.0
1457,1458,0.294118,0.75,0.423859,0.379597,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,1.000,0.507353,0.081967,0.0,0.0,0.9,0.9,0.25,0.000000,1.000000,0.666667,0.75,0.50,1.00,0.25,1.000000,0.048724,0.833333,0.000000,0.375428,0.188543,1.0,1.00,1.0,1.000000,0.480189,0.557869,0.0,0.688669,0.000000,0.0,0.666667,0.0,0.500,0.333333,0.666667,0.583333,1.00,0.666667,0.8,0.8,0.644860,0.666667,0.25,0.177715,0.666667,1.0,1.0,0.000000,0.109689,0.000000,0.0,0.0,0.0,0.0,0.75,0.5,0.16129,0.363636,1.00,0.666667,0.75,12.493130,0.0,0.0,0.0
1458,1459,0.000000,0.75,0.434909,0.393688,1.0,1.0,0.000000,0.333333,1.0,0.00,0.0,0.363636,0.4,1.0,0.75,0.6,0.444444,0.625,0.441176,0.245902,1.0,0.0,0.4,0.3,0.25,0.000000,0.333333,1.000000,0.50,0.50,0.75,0.50,1.000000,0.008682,0.333333,0.698100,0.000000,0.176432,1.0,0.75,1.0,0.666667,0.443419,0.000000,0.0,0.414497,0.333333,0.0,0.333333,0.0,0.250,0.333333,0.666667,0.250000,1.00,0.000000,0.2,0.8,0.560748,0.333333,0.25,0.169252,0.666667,1.0,1.0,0.427071,0.000000,0.202899,0.0,0.0,0.0,0.0,1.00,1.0,0.00000,0.272727,1.00,0.666667,0.75,11.864462,0.0,0.0,0.0


In [633]:
# split dataset into independent and dependent features

x_train = dataframe.drop(columns=['SalePrice','Id'], axis=1)  # Independent features
y_train = dataframe.SalePrice               # Dependent feature
y_train.name = 'SalePrice'

In [635]:
#Apply Feature Selection

# select a suitable alpha (equivalent of penalty), The bigger the alpha the less features that will be selected.

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
feature_sel_model.fit(x_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [637]:
feature_sel_model.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [639]:
# Print the number of total and selected features

# list of the selected features
selected_feat = x_train.columns[(feature_sel_model.get_support())]

# let's print some stats
print('total features: {}'.format((x_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
x_train = x_train[selected_feat]

total features: 82
selected features: 21


In [659]:
x_train

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.235294,0.75,0.636364,0.666667,0.098361,0.0,0.75,0.25,1.00,1.0,0.356155,0.577712,0.333333,0.666667,0.000000,0.2,0.8,0.666667,0.50,1.0,0.75
1,0.000000,0.75,0.500000,0.555556,0.524590,0.0,0.75,1.00,1.00,1.0,0.503056,0.470245,0.000000,0.333333,0.333333,0.6,0.8,0.666667,0.50,1.0,0.75
2,0.235294,0.75,0.636364,0.666667,0.114754,0.0,0.75,0.50,1.00,1.0,0.383441,0.593095,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.50,1.0,0.75
3,0.294118,0.75,0.727273,0.666667,0.606557,0.0,0.50,0.25,0.75,1.0,0.399941,0.579157,0.333333,0.666667,0.333333,0.8,0.4,0.333333,0.75,1.0,0.00
4,0.235294,0.75,1.000000,0.777778,0.147541,0.0,0.75,0.75,1.00,1.0,0.466237,0.666523,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.75,1.0,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.235294,0.75,0.590909,0.555556,0.131148,0.0,0.75,0.25,1.00,1.0,0.396777,0.564433,0.000000,0.333333,0.333333,0.6,0.8,0.666667,0.50,1.0,0.75
1456,0.000000,0.75,0.545455,0.555556,0.377049,0.0,0.75,0.25,0.50,1.0,0.690872,0.645810,0.333333,0.333333,0.666667,0.6,0.8,0.333333,0.50,1.0,0.75
1457,0.294118,0.75,0.727273,0.666667,0.081967,0.0,0.50,0.25,1.00,1.0,0.480189,0.688669,0.000000,0.666667,0.666667,0.8,0.8,0.666667,0.25,1.0,0.75
1458,0.000000,0.75,0.363636,0.444444,0.245902,1.0,0.50,0.50,0.75,1.0,0.443419,0.414497,0.333333,0.666667,0.000000,0.2,0.8,0.333333,0.25,1.0,0.75


# Feature engineering, scalling and selection for test data

In [642]:
# load the data

# Set the path to the raw data folder
raw_data_path = 'C:\\Users\\prath\\Advanced-House-Price-Prediction\\data\\raw\\'

# Load the train.csv file into a pandas DataFrame
df_test = pd.read_csv(raw_data_path + 'test.csv')

In [643]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


### Handling numerical missing values

In [644]:
# Numerical variables the contains missing values

numerical_with_nan=[feature for feature in df_test.columns if df_test[feature].isnull().sum()>1 and df_test[feature].dtypes!='O']

# Print the numerical NaN variables and percentage of missing values

for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(df_test[feature].isnull().mean(),4)))

LotFrontage: 0.1556% missing value
MasVnrArea: 0.0103% missing value
BsmtFullBath: 0.0014% missing value
BsmtHalfBath: 0.0014% missing value
GarageYrBlt: 0.0535% missing value


In [645]:
# Replacing the numerical Missing Values

for feature in numerical_with_nan:
    # We will replace by using median since there are outliers
    median_value=df_test[feature].median()
    
    # create a new feature to capture nan values
    df_test[feature+'nan']=np.where(df_test[feature].isnull(),1,0)
    df_test[feature].fillna(median_value,inplace=True)
    
df_test[numerical_with_nan].isnull().sum()

LotFrontage     0
MasVnrArea      0
BsmtFullBath    0
BsmtHalfBath    0
GarageYrBlt     0
dtype: int64

### Handling Temporal features

In [646]:
# Temporal features - datetime variables

for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
       
    df_test[feature]=df_test['YrSold']-df_test[feature]

df_test[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,49,49,49.0
1,52,52,52.0
2,13,12,13.0
3,12,12,12.0
4,18,18,18.0


### Logarithmic Transformation

In [647]:
# Using log transformation to reduce the skewness of data

num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']

for feature in num_features:
    df_test[feature]=np.log(df_test[feature])

### Handling categorical missing values

In [648]:
# Categorical features which are missing

features_nan=[feature for feature in df_test.columns if df_test[feature].isnull().sum()>1 and df_test[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(df_test[feature].isnull().mean(),4)))

MSZoning: 0.0027% missing values
Alley: 0.9267% missing values
Utilities: 0.0014% missing values
MasVnrType: 0.011% missing values
BsmtQual: 0.0302% missing values
BsmtCond: 0.0308% missing values
BsmtExposure: 0.0302% missing values
BsmtFinType1: 0.0288% missing values
BsmtFinType2: 0.0288% missing values
Functional: 0.0014% missing values
FireplaceQu: 0.5003% missing values
GarageType: 0.0521% missing values
GarageFinish: 0.0535% missing values
GarageQual: 0.0535% missing values
GarageCond: 0.0535% missing values
PoolQC: 0.9979% missing values
Fence: 0.8012% missing values
MiscFeature: 0.965% missing values


In [649]:
# Replace missing value with a new label/category

def replace_cat_feature(df_test,features_nan):
    data=df_test.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

df_test=replace_cat_feature(df_test,features_nan)

df_test[features_nan].isnull().sum()

MSZoning        0
Alley           0
Utilities       0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Functional      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

## Feature Scalling

In [650]:
# encode categorical features based on their relationship with the target variable into numerical.

for feature in categorical_features:
    # Use the mapping obtained from the training dataset to map the categories in the test dataset
    df_test[feature] = df_test[feature].map(labels_ordered).fillna(0)
    # Calculate the mode of the feature in the training dataset
    default_value = df_train[feature].mode()[0]
    # Replace any categories that are not present in the training dataset with the mode value
    df_test[feature] = np.where(df_test[feature]==0, default_value, df_test[feature])




In [651]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,20,0.75,4.382027,9.360655,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,5,6,49,49,0.0,0.0,1.0,1.0,0.25,0.0,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,468.0,0.833333,144.0,270.0,882.0,1.0,1.0,1.0,1.0,6.79794,0,0,6.79794,0.0,0.0,1,0,2,1,0.333333,5,1.0,0,0.2,0.8,49.0,0.333333,1.0,730.0,0.666667,1.0,1.0,140,0,0,0,120,0,0.0,1.0,1.0,0,6,2010,0.666667,3.0,0,0,0,0,0
1,1462,20,0.75,4.394449,9.565704,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,6,6,52,52,0.0,0.0,1.0,1.0,0.25,108.0,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,923.0,0.833333,0.0,406.0,1329.0,1.0,1.0,1.0,1.0,7.192182,0,0,7.192182,0.0,0.0,1,1,3,1,0.333333,6,1.0,0,0.2,0.8,52.0,0.333333,1.0,312.0,0.666667,1.0,1.0,393,36,0,0,0,0,0.0,1.0,1.0,12500,6,2010,0.666667,3.0,0,0,0,0,0
2,1463,60,0.75,4.304065,9.534595,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,5,5,13,12,0.0,0.0,1.0,1.0,0.25,0.0,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,791.0,0.833333,0.0,137.0,928.0,1.0,1.0,1.0,1.0,6.833032,701,0,7.395722,0.0,0.0,2,1,3,1,0.333333,6,1.0,1,0.2,0.8,13.0,0.333333,2.0,482.0,0.666667,1.0,1.0,212,34,0,0,0,0,0.0,1.0,1.0,0,3,2010,0.666667,3.0,0,0,0,0,0
3,1464,60,0.75,4.356709,9.208138,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,6,6,12,12,0.0,0.0,1.0,1.0,0.25,20.0,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,602.0,0.833333,0.0,324.0,926.0,1.0,1.0,1.0,1.0,6.830874,678,0,7.380256,0.0,0.0,2,1,3,1,0.333333,7,1.0,1,0.2,0.8,12.0,0.333333,2.0,470.0,0.666667,1.0,1.0,360,36,0,0,0,0,0.0,1.0,1.0,0,6,2010,0.666667,3.0,0,0,0,0,0
4,1465,120,0.75,3.7612,8.518193,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,8,5,18,18,0.0,0.0,1.0,1.0,0.25,0.0,0.333333,1.0,1.0,0.5,0.75,0.25,0.833333,263.0,0.833333,0.0,1017.0,1280.0,1.0,1.0,1.0,1.0,7.154615,0,0,7.154615,0.0,0.0,2,0,2,1,0.333333,5,1.0,0,0.2,0.8,18.0,0.333333,2.0,506.0,0.666667,1.0,1.0,0,82,0,0,144,0,0.0,1.0,1.0,0,1,2010,0.666667,3.0,0,0,0,0,0


In [652]:
from sklearn.preprocessing import MinMaxScaler

feature_scale=[feature for feature in df_test.columns if feature not in ['Id']]
dataframe = df_test

def scale_features(dataframe, feature_scale):
    """
    Scales the features in a pandas DataFrame using MinMaxScaler from scikit-learn.

    Parameters:
    dataframe (pandas DataFrame): The input DataFrame to scale.
    feature_scale (list): The list of feature names to scale.

    Returns:
     An array having same number of rows as the original dataset, but the values in each column will be scaled to between 0 and 1.
    """
    # Create a scaler object
    scaler = MinMaxScaler()
    
    # Fit the scaler to the specified features
    scaler.fit(dataframe[feature_scale])

    # Scale the specified features in the DataFrame
    dataframe[feature_scale] = scaler.transform(dataframe[feature_scale])

    return dataframe
scale_features(dataframe,feature_scale)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,0.000000,0.0,0.593445,0.566360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.625,0.384615,0.822581,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116708,0.0,0.094364,0.126168,0.173111,0.0,0.0,0.0,0.0,0.312253,0.000000,0.0,0.312253,0.000000,0.0,0.25,0.0,0.333333,0.5,0.0,0.166667,0.0,0.00,0.0,0.0,0.792994,0.0,0.2,0.490591,0.0,0.0,0.0,0.098315,0.000000,0.0,0.0,0.208333,0.0,0.0,0.0,0.0,0.000000,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
1,1462,0.000000,0.0,0.598957,0.622527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555556,0.625,0.407692,0.870968,0.0,0.0,0.0,0.0,0.0,0.083721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230175,0.0,0.000000,0.189720,0.260844,0.0,0.0,0.0,0.0,0.468253,0.000000,0.0,0.468253,0.000000,0.0,0.25,0.5,0.500000,0.5,0.0,0.250000,0.0,0.00,0.0,0.0,0.802548,0.0,0.2,0.209677,0.0,0.0,0.0,0.275983,0.048518,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.735294,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.0,0.558854,0.614005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.500,0.107692,0.225806,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197257,0.0,0.000000,0.064019,0.182139,0.0,0.0,0.0,0.0,0.326139,0.376477,0.0,0.548792,0.000000,0.0,0.50,0.5,0.500000,0.5,0.0,0.250000,0.0,0.25,0.0,0.0,0.678344,0.0,0.4,0.323925,0.0,0.0,0.0,0.148876,0.045822,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.181818,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.0,0.582212,0.524583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555556,0.625,0.100000,0.225806,0.0,0.0,0.0,0.0,0.0,0.015504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150125,0.0,0.000000,0.151402,0.181747,0.0,0.0,0.0,0.0,0.325285,0.364125,0.0,0.542672,0.000000,0.0,0.50,0.5,0.500000,0.5,0.0,0.333333,0.0,0.25,0.0,0.0,0.675159,0.0,0.4,0.315860,0.0,0.0,0.0,0.252809,0.048518,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.0,0.317987,0.335596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.500,0.146154,0.322581,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065586,0.0,0.000000,0.475234,0.251227,0.0,0.0,0.0,0.0,0.453388,0.000000,0.0,0.453388,0.000000,0.0,0.50,0.0,0.333333,0.5,0.0,0.166667,0.0,0.00,0.0,0.0,0.694268,0.0,0.4,0.340054,0.0,0.0,0.0,0.000000,0.110512,0.0,0.0,0.250000,0.0,0.0,0.0,0.0,0.000000,0.000000,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,0.823529,0.0,0.000000,0.075426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.750,0.284615,0.612903,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.255140,0.107164,0.0,0.0,0.0,0.0,0.116257,0.293233,0.0,0.390532,0.000000,0.0,0.25,0.5,0.500000,0.5,0.0,0.166667,0.0,0.00,0.0,0.0,0.722930,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.454545,0.0,0.0,0.692308,0.0,0.0,0.0,0.0,1.0
1455,2916,0.823529,0.0,0.000000,0.069418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.500,0.284615,0.612903,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062843,0.0,0.000000,0.137383,0.107164,0.0,0.0,0.0,0.0,0.116257,0.293233,0.0,0.390532,0.000000,0.0,0.25,0.5,0.500000,0.5,0.0,0.250000,0.0,0.00,0.0,0.0,0.751592,0.0,0.2,0.192204,0.0,0.0,0.0,0.000000,0.032345,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.272727,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1456,2917,0.000000,0.0,0.900992,0.715051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.750,0.361538,0.193548,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305237,0.0,0.000000,0.000000,0.240236,0.0,0.0,0.0,0.0,0.435686,0.000000,0.0,0.435686,0.333333,0.0,0.25,0.0,0.666667,0.5,0.0,0.333333,0.0,0.25,0.0,0.0,0.783439,0.0,0.4,0.387097,0.0,0.0,0.0,0.332865,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.727273,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1457,2918,0.382353,0.0,0.480351,0.537007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.500,0.115385,0.258065,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084040,0.0,0.000000,0.268692,0.178999,0.0,0.0,0.0,0.0,0.343654,0.000000,0.0,0.343654,0.000000,0.5,0.25,0.0,0.500000,0.5,0.0,0.250000,0.0,0.00,0.0,0.0,0.722930,0.0,0.0,0.000000,0.0,0.0,0.0,0.056180,0.043127,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.041176,0.545455,0.0,0.0,0.692308,0.0,0.0,0.0,0.0,1.0


# Feature Selection

In [653]:
# load modules

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [654]:
x_test = dataframe.drop(columns=['Id'], axis=1)  # Independent features

In [655]:
x_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,0.0,0.0,0.593445,0.56636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.625,0.384615,0.822581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116708,0.0,0.094364,0.126168,0.173111,0.0,0.0,0.0,0.0,0.312253,0.0,0.0,0.312253,0.0,0.0,0.25,0.0,0.333333,0.5,0.0,0.166667,0.0,0.0,0.0,0.0,0.792994,0.0,0.2,0.490591,0.0,0.0,0.0,0.098315,0.0,0.0,0.0,0.208333,0.0,0.0,0.0,0.0,0.0,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.598957,0.622527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555556,0.625,0.407692,0.870968,0.0,0.0,0.0,0.0,0.0,0.083721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230175,0.0,0.0,0.18972,0.260844,0.0,0.0,0.0,0.0,0.468253,0.0,0.0,0.468253,0.0,0.0,0.25,0.5,0.5,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.802548,0.0,0.2,0.209677,0.0,0.0,0.0,0.275983,0.048518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.735294,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
2,0.235294,0.0,0.558854,0.614005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.5,0.107692,0.225806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197257,0.0,0.0,0.064019,0.182139,0.0,0.0,0.0,0.0,0.326139,0.376477,0.0,0.548792,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.25,0.0,0.25,0.0,0.0,0.678344,0.0,0.4,0.323925,0.0,0.0,0.0,0.148876,0.045822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
3,0.235294,0.0,0.582212,0.524583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555556,0.625,0.1,0.225806,0.0,0.0,0.0,0.0,0.0,0.015504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150125,0.0,0.0,0.151402,0.181747,0.0,0.0,0.0,0.0,0.325285,0.364125,0.0,0.542672,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.333333,0.0,0.25,0.0,0.0,0.675159,0.0,0.4,0.31586,0.0,0.0,0.0,0.252809,0.048518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454545,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0
4,0.588235,0.0,0.317987,0.335596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.5,0.146154,0.322581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065586,0.0,0.0,0.475234,0.251227,0.0,0.0,0.0,0.0,0.453388,0.0,0.0,0.453388,0.0,0.0,0.5,0.0,0.333333,0.5,0.0,0.166667,0.0,0.0,0.0,0.0,0.694268,0.0,0.4,0.340054,0.0,0.0,0.0,0.0,0.110512,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.692308,0.0,0.0,0.0,0.0,0.0


In [656]:
# drop all the other columns
x_test = x_test.drop(columns=x_test.columns.difference(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'YearRemodAdd',
       'RoofStyle', 'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir',
       '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive','SaleCondition']))

In [657]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   float64
 1   MSZoning       1459 non-null   float64
 2   Neighborhood   1459 non-null   float64
 3   OverallQual    1459 non-null   float64
 4   YearRemodAdd   1459 non-null   float64
 5   RoofStyle      1459 non-null   float64
 6   BsmtQual       1459 non-null   float64
 7   BsmtExposure   1459 non-null   float64
 8   HeatingQC      1459 non-null   float64
 9   CentralAir     1459 non-null   float64
 10  1stFlrSF       1459 non-null   float64
 11  GrLivArea      1459 non-null   float64
 12  BsmtFullBath   1459 non-null   float64
 13  KitchenQual    1459 non-null   float64
 14  Fireplaces     1459 non-null   float64
 15  FireplaceQu    1459 non-null   float64
 16  GarageType     1459 non-null   float64
 17  GarageFinish   1459 non-null   float64
 18  GarageCa

In [660]:
# save to CSV file
x_train.to_csv('x_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
x_test.to_csv('x_test.csv', index=False)