In [33]:
import pandas as pd

#Loading the data from EDA
df =pd.read_csv('../cleaned_data.csv')

df.shape

(1354, 76)

In [34]:
#Creating a copy of the Data
df_cleaned_copy = df.copy()

In [35]:
#Identifying which column are non-numeric
categorical_cols = df_cleaned_copy.select_dtypes(include = ['object']).columns
print(f'Categorical Columns: {categorical_cols}')

Categorical Columns: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


In [36]:
#Converting categorical columns into numerical features
df_encoded = pd.get_dummies(df_cleaned_copy, drop_first = True)
df_encoded

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,False,False,False,False,True,False,False,False,True,False
1350,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,False,False,False,False,True,False,False,False,True,False
1351,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,False,False,False,False,True,False,False,False,True,False
1352,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,False,False,False,False,True,False,False,False,True,False


In [37]:
#Inspecting the modified dataset to verify the changes
print(df_encoded.head())
print(df_encoded.info())

   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLI  SaleType_ConLw  \
0          2003       196.0         706  ...           False           False   
1          1976         0.0         978  ...           False           False   
2          2002       162.0         486  ...           False           False   
3          1970         0.0         216  ...           False           False   
4          2000       350.0         655  ...           False           False   

   SaleType_New  SaleType_Oth  SaleTyp

In [38]:
#Recalculating Correlation
correlation = df_encoded.corr()['SalePrice'].sort_values(ascending = True)
print(correlation)

ExterQual_TA        -0.603218
KitchenQual_TA      -0.523705
BsmtQual_TA         -0.464746
GarageFinish_Unf    -0.413046
GarageType_Detchd   -0.359254
                       ...   
TotalBsmtSF          0.637076
GarageCars           0.643244
GrLivArea            0.717293
OverallQual          0.802588
SalePrice            1.000000
Name: SalePrice, Length: 227, dtype: float64


In [39]:
#Selecting the most relevant features. Keeping the strong correlation and dropping the weakly correlated feature

#Setting the correlated threshold(absolute_value >= 0.3)
important_feature = correlation[abs(correlation) >= 0.3].index
df_selected = df_encoded[important_feature]

#Checking the final selected features
print(f'Selected Features: {list(df_selected.columns)}')
print(df_selected.head())

Selected Features: ['ExterQual_TA', 'KitchenQual_TA', 'BsmtQual_TA', 'GarageFinish_Unf', 'GarageType_Detchd', 'Foundation_CBlock', 'HeatingQC_TA', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', '2ndFlrSF', 'WoodDeckSF', 'HalfBath', 'BsmtExposure_Gd', 'KitchenQual_Gd', 'OpenPorchSF', 'GarageType_Attchd', 'LotFrontage', 'FireplaceQu_Gd', 'SaleCondition_Partial', 'SaleType_New', 'Neighborhood_NridgHt', 'BsmtFinSF1', 'BsmtFinType1_GLQ', 'ExterQual_Gd', 'Fireplaces', 'GarageYrBlt', 'MasVnrArea', 'Foundation_PConc', 'YearRemodAdd', 'TotRmsAbvGrd', 'YearBuilt', 'FullBath', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GarageCars', 'GrLivArea', 'OverallQual', 'SalePrice']
   ExterQual_TA  KitchenQual_TA  BsmtQual_TA  GarageFinish_Unf  \
0         False           False        False             False   
1          True            True        False             False   
2         False           False        False             False   
3          True           False         True              True   
4 

In [40]:
#Splitting the data into features and target variable

#Separate features(X) and target variable(y)
X = df_selected.drop(columns = ['SalePrice'])
y = df_selected['SalePrice']

#Checking the columns in X and y
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (1354, 38)
y shape: (1354,)


In [41]:
from sklearn.model_selection import train_test_split

#Splitting the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Checking the shapes of training and testing sets
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (1083, 38)
X_test shape: (271, 38)
y_train shape: (1083,)
y_test shape: (271,)


In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

#step 1: Initialize the Linear Regression model
model = LinearRegression()

#step 2: Train the model using the training data
model.fit(X_train, y_train)

#step 3: Make predictions on the test set
y_pred = model.predict(X_test)

#step 4: Evaluate the model's performance
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

#step 5: printing the results
print(f'R² Score: {r2:.4f}')
print(f'Root Mean Squared Error(RMSE): {rmse:.4f}')

R² Score: 0.8700
Root Mean Squared Error(RMSE): 26098.9423


In [43]:
#Checking the mean or median of y_test to compare with RMSE value
print(f"Mean SalePrice: {y_test.mean():.2f}")
print(f"Median SalePrice: {y_test.median():.2f}")

Mean SalePrice: 176601.87
Median SalePrice: 160000.00


In [44]:
#Analyzing the features increase or decrease the house price

#Creating a Dataframe of features and their coefficients
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Coefficient':model.coef_})

#Sorting the absolute coefficient values
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by = 'Abs_Coefficient', ascending = False)

#Dropping the extra column for clarity
feature_importance = feature_importance.drop(columns = ['Abs_Coefficient'])

#Displaying the important features
print(feature_importance)

                  Feature   Coefficient
0            ExterQual_TA -39128.462686
23           ExterQual_Gd -36604.543323
19           SaleType_New  35185.029386
12        BsmtExposure_Gd  22352.867199
1          KitchenQual_TA -19421.439994
13         KitchenQual_Gd -17317.383906
18  SaleCondition_Partial -16298.557252
37            OverallQual  12880.353437
20   Neighborhood_NridgHt   9893.739887
6            HeatingQC_TA  -6519.808274
24             Fireplaces   5822.084293
7     Exterior1st_VinylSd  -4619.499521
8     Exterior2nd_VinylSd   4433.086237
31               FullBath  -4071.058762
4       GarageType_Detchd   3917.159730
27       Foundation_PConc   3219.531265
22       BsmtFinType1_GLQ  -2162.669840
35             GarageCars  -1783.865844
29           TotRmsAbvGrd  -1729.979533
3        GarageFinish_Unf   1569.127154
17         FireplaceQu_Gd   1320.444973
5       Foundation_CBlock   -931.334467
2             BsmtQual_TA   -762.201806
11               HalfBath   -466.060438


In [45]:
#Checking the Data types of X_train
print(X_train.dtypes[X_train.dtypes == 'object'])

# Convert categorical columns to numerical using One-Hot Encoding
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

Series([], dtype: object)


In [46]:
print(X_train.isnull().sum().sum())  # Check total NaN values
print(np.isinf(X_train).sum().sum())  # Check total infinite values

0
0


In [47]:
X_train = X_train.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
X_train = X_train.dropna()  # Drop rows with NaN values

In [48]:
print(X_train.dtypes)

ExterQual_TA                bool
KitchenQual_TA              bool
BsmtQual_TA                 bool
GarageFinish_Unf            bool
GarageType_Detchd           bool
Foundation_CBlock           bool
HeatingQC_TA                bool
Exterior1st_VinylSd         bool
Exterior2nd_VinylSd         bool
2ndFlrSF                   int64
WoodDeckSF                 int64
HalfBath                   int64
BsmtExposure_Gd             bool
KitchenQual_Gd              bool
OpenPorchSF                int64
GarageType_Attchd           bool
LotFrontage              float64
FireplaceQu_Gd              bool
SaleCondition_Partial       bool
SaleType_New                bool
Neighborhood_NridgHt        bool
BsmtFinSF1                 int64
BsmtFinType1_GLQ            bool
ExterQual_Gd                bool
Fireplaces                 int64
GarageYrBlt              float64
MasVnrArea               float64
Foundation_PConc            bool
YearRemodAdd               int64
TotRmsAbvGrd               int64
YearBuilt 

In [49]:
# Convert boolean columns to integers (0 and 1)
X_train_encoded = X_train_encoded.astype(int)

In [50]:
print(X_train_encoded.dtypes.value_counts())

int64    38
Name: count, dtype: int64


In [51]:
#Checking for the Multicollinearity using Variance Inflation Factor(VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded.values, i) for i in range(X_train_encoded.shape[1])]

vif_data = vif_data.sort_values(by='VIF', ascending=False)
print(vif_data)

                  Feature           VIF
25            GarageYrBlt  20431.532216
30              YearBuilt  19673.953144
28           YearRemodAdd  15175.388778
36              GrLivArea   1527.544051
32               1stFlrSF    929.348259
9                2ndFlrSF    183.741173
37            OverallQual     71.842783
29           TotRmsAbvGrd     67.500763
35             GarageCars     40.247103
7     Exterior1st_VinylSd     35.248382
33             GarageArea     35.182682
8     Exterior2nd_VinylSd     34.117781
16            LotFrontage     31.838043
19           SaleType_New     31.511729
18  SaleCondition_Partial     31.408933
34            TotalBsmtSF     30.515112
0            ExterQual_TA     27.995476
31               FullBath     26.795488
23           ExterQual_Gd     13.229957
1          KitchenQual_TA     10.747168
27       Foundation_PConc      9.341218
13         KitchenQual_Gd      8.136335
15      GarageType_Attchd      7.636892
5       Foundation_CBlock      6.468691


In [52]:
#Dropping highle correlated features
X_train_encoded = X_train_encoded.drop(columns=['GarageYrBlt', 'YearRemodAdd'])

In [53]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded.values, i) for i in range(X_train_encoded.shape[1])]

vif_data = vif_data.sort_values(by='VIF', ascending=False)
print(vif_data)

                  Feature          VIF
34              GrLivArea  1440.358027
30               1stFlrSF   885.260247
9                2ndFlrSF   174.891892
28              YearBuilt   148.328681
35            OverallQual    71.313071
27           TotRmsAbvGrd    67.381933
33             GarageCars    39.781044
7     Exterior1st_VinylSd    35.192255
8     Exterior2nd_VinylSd    34.088046
31             GarageArea    33.820896
16            LotFrontage    31.713216
19           SaleType_New    31.507217
18  SaleCondition_Partial    31.399772
32            TotalBsmtSF    30.416267
0            ExterQual_TA    27.936611
29               FullBath    26.486613
23           ExterQual_Gd    13.192473
1          KitchenQual_TA    10.662395
26       Foundation_PConc     8.579787
13         KitchenQual_Gd     8.130180
15      GarageType_Attchd     7.467064
5       Foundation_CBlock     5.931889
4       GarageType_Detchd     5.046296
21             BsmtFinSF1     4.105116
2             BsmtQual_TA

In [54]:
X_train_encoded = X_train_encoded.drop(columns=['1stFlrSF'])

In [55]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns=X_train_encoded.columns)

In [56]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF calculation
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

# Sort VIF data in descending order
vif_data = vif_data.sort_values(by='VIF', ascending=False)

# Print VIF data
print(vif_data)

                  Feature        VIF
19           SaleType_New  28.917927
18  SaleCondition_Partial  28.746279
7     Exterior1st_VinylSd  22.617433
8     Exterior2nd_VinylSd  22.155470
33              GrLivArea  10.928582
0            ExterQual_TA  10.478413
23           ExterQual_Gd   8.835425
9                2ndFlrSF   6.476511
32             GarageCars   6.126482
26       Foundation_PConc   5.765685
30             GarageArea   5.660222
1          KitchenQual_TA   5.308481
28              YearBuilt   5.300029
13         KitchenQual_Gd   4.877262
5       Foundation_CBlock   4.055210
27           TotRmsAbvGrd   3.981409
4       GarageType_Detchd   3.842404
31            TotalBsmtSF   3.840214
34            OverallQual   3.432754
29               FullBath   2.994483
15      GarageType_Attchd   2.986634
2             BsmtQual_TA   2.413984
11               HalfBath   2.267924
3        GarageFinish_Unf   2.160393
22       BsmtFinType1_GLQ   2.085227
21             BsmtFinSF1   1.979425
2

In [57]:
#Dropping the 'SaleType_New' as it has the high VIF values from the above o/p
X_train_encoded = X_train_encoded.drop(columns = ['SaleType_New'])

#Re-scaling the dataset again
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns = X_train_encoded.columns)
X_train_encoded_scaled

Unnamed: 0,ExterQual_TA,KitchenQual_TA,BsmtQual_TA,GarageFinish_Unf,GarageType_Detchd,Foundation_CBlock,HeatingQC_TA,Exterior1st_VinylSd,Exterior2nd_VinylSd,2ndFlrSF,...,MasVnrArea,Foundation_PConc,TotRmsAbvGrd,YearBuilt,FullBath,GarageArea,TotalBsmtSF,GarageCars,GrLivArea,OverallQual
0,-1.291312,-1.008345,-0.897218,-0.836754,-0.593346,-0.859280,-0.647609,-0.748678,-0.733725,-0.776289,...,-0.230689,1.122925,-0.943462,0.927998,0.791275,0.159681,1.547359,0.303827,0.380685,2.135346
1,0.774406,-1.008345,1.114556,1.195095,1.685356,-0.859280,-0.647609,-0.748678,-0.733725,0.513392,...,-0.578353,-0.890532,0.276561,-1.722306,-1.043738,-0.912971,-0.923155,-1.022965,-0.528630,-0.795680
2,0.774406,-1.008345,-0.897218,1.195095,-0.593346,-0.859280,-0.647609,-0.748678,-0.733725,2.706306,...,-0.578353,1.122925,2.716608,0.633519,2.626289,0.577350,1.486328,1.630618,3.378057,0.669833
3,0.774406,0.991724,1.114556,-0.836754,-0.593346,1.163765,1.544142,-0.748678,-0.733725,1.661299,...,-0.578353,-0.890532,1.496585,0.110003,0.791275,0.719738,1.117704,0.303827,2.173561,-0.062924
4,0.774406,0.991724,-0.897218,1.195095,1.685356,1.163765,1.544142,-0.748678,-0.733725,-0.776289,...,-0.578353,-0.890532,-0.943462,0.142722,-1.043738,-0.912971,-0.342145,-1.022965,-1.174461,-0.795680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1078,0.774406,0.991724,1.114556,1.195095,-0.593346,-0.859280,-0.647609,-0.748678,-0.733725,-0.154315,...,-0.578353,-0.890532,0.276561,-1.525987,-1.043738,-0.381391,-0.683915,-1.022965,-0.586081,-0.795680
1079,0.774406,0.991724,1.114556,1.195095,1.685356,1.163765,1.544142,-0.748678,-0.733725,-0.776289,...,-0.323400,-0.890532,0.276561,-0.642553,0.791275,0.264098,2.616613,0.303827,1.226606,-0.795680
1080,0.774406,-1.008345,1.114556,1.195095,1.685356,-0.859280,-0.647609,-0.748678,-0.733725,0.248138,...,-0.578353,-0.890532,0.276561,-1.493268,0.791275,-1.216730,-0.215201,-1.022965,0.180596,-0.062924
1081,0.774406,0.991724,-0.897218,1.195095,-0.593346,1.163765,1.544142,-0.748678,-0.733725,-0.776289,...,0.331366,-0.890532,0.276561,0.306321,0.791275,0.662783,1.518064,0.303827,0.335121,-0.062924


In [58]:
#Recalculating VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] =[variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

#Sorting the VIF
vif_data = vif_data.sort_values(by = 'VIF', ascending = False)
print(vif_data)

                  Feature        VIF
7     Exterior1st_VinylSd  22.615889
8     Exterior2nd_VinylSd  22.155432
32              GrLivArea  10.927048
0            ExterQual_TA  10.443292
22           ExterQual_Gd   8.813069
9                2ndFlrSF   6.476505
31             GarageCars   6.117163
25       Foundation_PConc   5.765642
29             GarageArea   5.655791
27              YearBuilt   5.299995
1          KitchenQual_TA   5.295587
13         KitchenQual_Gd   4.867039
5       Foundation_CBlock   4.050778
26           TotRmsAbvGrd   3.980898
4       GarageType_Detchd   3.841699
30            TotalBsmtSF   3.839957
33            OverallQual   3.432737
28               FullBath   2.994185
15      GarageType_Attchd   2.985419
2             BsmtQual_TA   2.413546
11               HalfBath   2.267752
3        GarageFinish_Unf   2.157654
21       BsmtFinType1_GLQ   2.084299
20             BsmtFinSF1   1.978981
23             Fireplaces   1.935891
17         FireplaceQu_Gd   1.664287
2

In [59]:
# Dropping 'Exterior1st_VinylSd' as it has a high VIF
X_train_encoded = X_train_encoded.drop(columns=['Exterior1st_VinylSd'])

# Re-scaling the dataset again
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns=X_train_encoded.columns)

# Recalculating VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

# Sorting VIF
vif_data = vif_data.sort_values(by='VIF', ascending=False)

# Print updated VIF values
print(vif_data)


                  Feature        VIF
31              GrLivArea  10.921635
0            ExterQual_TA  10.443274
21           ExterQual_Gd   8.812697
8                2ndFlrSF   6.436588
30             GarageCars   6.102119
24       Foundation_PConc   5.764955
28             GarageArea   5.645737
1          KitchenQual_TA   5.295162
26              YearBuilt   5.277217
12         KitchenQual_Gd   4.861927
5       Foundation_CBlock   4.050770
25           TotRmsAbvGrd   3.979532
4       GarageType_Detchd   3.831347
29            TotalBsmtSF   3.822808
32            OverallQual   3.429523
27               FullBath   2.994014
14      GarageType_Attchd   2.983351
2             BsmtQual_TA   2.412639
10               HalfBath   2.260396
3        GarageFinish_Unf   2.157654
20       BsmtFinType1_GLQ   2.079446
19             BsmtFinSF1   1.978888
22             Fireplaces   1.932266
7     Exterior2nd_VinylSd   1.768868
16         FireplaceQu_Gd   1.663599
23             MasVnrArea   1.493347
1

In [60]:
# Dropping 'GrLivArea' as it has a high VIF
X_train_encoded = X_train_encoded.drop(columns=['GrLivArea'])

# Re-scaling the dataset again
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns=X_train_encoded.columns)

# Recalculating VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

# Sorting VIF
vif_data = vif_data.sort_values(by='VIF', ascending=False)

# Print updated VIF values
print(vif_data)

                  Feature        VIF
0            ExterQual_TA  10.436271
21           ExterQual_Gd   8.811149
30             GarageCars   6.087616
24       Foundation_PConc   5.735145
28             GarageArea   5.527344
1          KitchenQual_TA   5.290772
26              YearBuilt   5.056882
12         KitchenQual_Gd   4.861221
8                2ndFlrSF   4.260900
5       Foundation_CBlock   4.047740
4       GarageType_Detchd   3.711741
31            OverallQual   3.407599
14      GarageType_Attchd   2.956920
27               FullBath   2.856110
25           TotRmsAbvGrd   2.828319
29            TotalBsmtSF   2.813546
2             BsmtQual_TA   2.360741
10               HalfBath   2.257475
3        GarageFinish_Unf   2.145131
20       BsmtFinType1_GLQ   2.079047
19             BsmtFinSF1   1.978831
22             Fireplaces   1.848726
7     Exterior2nd_VinylSd   1.757119
16         FireplaceQu_Gd   1.660346
23             MasVnrArea   1.476854
18   Neighborhood_NridgHt   1.450572
6

In [61]:
# Dropping 'ExterQual_TA' as it has a high VIF
X_train_encoded = X_train_encoded.drop(columns=['ExterQual_TA'])

# Re-scaling the dataset again
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns=X_train_encoded.columns)

# Recalculating VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

# Sorting VIF
vif_data = vif_data.sort_values(by='VIF', ascending=False)

# Print updated VIF values
print(vif_data)

                  Feature       VIF
29             GarageCars  6.087041
23       Foundation_PConc  5.735144
27             GarageArea  5.526988
25              YearBuilt  5.051583
0          KitchenQual_TA  4.431491
7                2ndFlrSF  4.259830
11         KitchenQual_Gd  4.254838
4       Foundation_CBlock  4.047393
3       GarageType_Detchd  3.706457
30            OverallQual  3.326407
13      GarageType_Attchd  2.942608
26               FullBath  2.853254
24           TotRmsAbvGrd  2.823055
28            TotalBsmtSF  2.805855
20           ExterQual_Gd  2.378393
1             BsmtQual_TA  2.356247
9                HalfBath  2.257374
2        GarageFinish_Unf  2.144517
19       BsmtFinType1_GLQ  2.077876
18             BsmtFinSF1  1.978115
21             Fireplaces  1.837785
6     Exterior2nd_VinylSd  1.754887
15         FireplaceQu_Gd  1.660049
22             MasVnrArea  1.451629
17   Neighborhood_NridgHt  1.423335
5            HeatingQC_TA  1.392710
16  SaleCondition_Partial  1

In [62]:
# Dropping all high-VIF features at once
high_vif_features = ['GarageCars', 'Foundation_PConc', 'GarageArea', 'YearBuilt']
X_train_encoded = X_train_encoded.drop(columns=high_vif_features)

# Re-scaling the dataset again
X_train_encoded_scaled = pd.DataFrame(scaler.fit_transform(X_train_encoded), columns=X_train_encoded.columns)

# Recalculating VIF
vif_data = pd.DataFrame()
vif_data['Feature'] = X_train_encoded_scaled.columns
vif_data['VIF'] = [variance_inflation_factor(X_train_encoded_scaled.values, i) for i in range(X_train_encoded_scaled.shape[1])]

# Sorting VIF
vif_data = vif_data.sort_values(by='VIF', ascending=False)

# Print updated VIF values
print(vif_data)

                  Feature       VIF
0          KitchenQual_TA  4.400043
11         KitchenQual_Gd  4.214807
7                2ndFlrSF  3.908358
3       GarageType_Detchd  3.224256
26            OverallQual  3.189832
13      GarageType_Attchd  2.750052
23           TotRmsAbvGrd  2.748720
25            TotalBsmtSF  2.683685
24               FullBath  2.680901
20           ExterQual_Gd  2.336171
1             BsmtQual_TA  2.132965
2        GarageFinish_Unf  2.110168
9                HalfBath  2.044401
19       BsmtFinType1_GLQ  2.003850
18             BsmtFinSF1  1.955357
4       Foundation_CBlock  1.929098
21             Fireplaces  1.817581
15         FireplaceQu_Gd  1.649762
6     Exterior2nd_VinylSd  1.582507
22             MasVnrArea  1.414305
17   Neighborhood_NridgHt  1.407352
5            HeatingQC_TA  1.381815
16  SaleCondition_Partial  1.310987
14            LotFrontage  1.275411
10        BsmtExposure_Gd  1.243070
12            OpenPorchSF  1.171876
8              WoodDeckSF  1