In [1]:
import numpy as np
import pandas as pd

## Loading Dataset

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Cleaning Dataset

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
corr_matrix = train.corr()

In [5]:
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [6]:
# dropping all the columns having dtype "object"

train = train.select_dtypes(exclude=['object'])

In [7]:
# dropping all those columns which have less relation with salesprice to avoid overfitting

train.drop(['GarageYrBlt','Id','YearBuilt','BsmtHalfBath','LowQualFinSF','PoolArea','GarageYrBlt','OverallCond',
            'YearRemodAdd','3SsnPorch','WoodDeckSF','EnclosedPorch','BsmtFinSF2','MSSubClass','KitchenAbvGr',
            'ScreenPorch','MoSold','YrSold','MiscVal'], axis=1,inplace=True)

In [8]:
# making one column for halfbathroom and full bathroom and droping other two

train["bath"] = train['HalfBath'] + train["FullBath"]

In [9]:
train.drop(['HalfBath','FullBath'],axis=1, inplace=True)

In [10]:
train["TotalFlrSF"] = train['1stFlrSF'] + train["2ndFlrSF"]

In [11]:
train.drop(['1stFlrSF'],axis=1,inplace=True)

In [12]:
# unfinished surface is a negative point for a house therefore making it negative for correlation matrix

train["BsmtUnFinSf"] =  - train["BsmtUnfSF"]

In [13]:
train.drop(['BsmtUnfSF'],axis=1,inplace=True)

In [14]:
corr_matrix = train.corr()

In [15]:
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice       1.000000
OverallQual     0.790982
TotalFlrSF      0.716883
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
bath            0.568267
TotRmsAbvGrd    0.533723
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.351799
2ndFlrSF        0.319334
OpenPorchSF     0.315856
LotArea         0.263843
BsmtFullBath    0.227122
BedroomAbvGr    0.168213
BsmtUnFinSf    -0.214479
Name: SalePrice, dtype: float64

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1201 non-null   float64
 1   LotArea       1460 non-null   int64  
 2   OverallQual   1460 non-null   int64  
 3   MasVnrArea    1452 non-null   float64
 4   BsmtFinSF1    1460 non-null   int64  
 5   TotalBsmtSF   1460 non-null   int64  
 6   2ndFlrSF      1460 non-null   int64  
 7   GrLivArea     1460 non-null   int64  
 8   BsmtFullBath  1460 non-null   int64  
 9   BedroomAbvGr  1460 non-null   int64  
 10  TotRmsAbvGrd  1460 non-null   int64  
 11  Fireplaces    1460 non-null   int64  
 12  GarageCars    1460 non-null   int64  
 13  GarageArea    1460 non-null   int64  
 14  OpenPorchSF   1460 non-null   int64  
 15  SalePrice     1460 non-null   int64  
 16  bath          1460 non-null   int64  
 17  TotalFlrSF    1460 non-null   int64  
 18  BsmtUnFinSf   1460 non-null 

## Filling NaN Values

In [17]:
mean_value=train['MasVnrArea'].mean()
  
# Replace NaNs in column with the mean of values in the same column
train['MasVnrArea'].fillna(value=mean_value, inplace=True)

In [18]:
mean_value=train['LotFrontage'].mean()
  
# Replace NaNs in column with the mean of values in the same column
train['LotFrontage'].fillna(value=mean_value, inplace=True)

## Preparing model

In [20]:
labels = train['SalePrice']
features = train.drop(['SalePrice'],axis = 1)

In [21]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 30)

## RandomForestRegressor

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,Y_train)

In [23]:
y_predict = rf.predict(X_test)

In [24]:
import math
MSE = np.square(np.subtract(Y_test,y_predict)).mean() 
 
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:")
print(RMSE)

Root Mean Square Error:
37927.851133849006


In [25]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(Y_test, y_predict)

0.11739881771583563

## Linear Regression

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
reg = LinearRegression()
reg.fit(X_train, Y_train)

In [28]:
y_predict = reg.predict(X_test)

In [29]:
mean_absolute_percentage_error(Y_test, y_predict)

0.1319736545762085

## Ridge

In [30]:
from sklearn.linear_model import Ridge

In [31]:
rr = Ridge(alpha=10e7) 
# higher the alpha value, more restriction on the coefficients; low alpha > more generalization,
# in this case linear and ridge regression resembles
rr.fit(X_train, Y_train)

In [32]:
y_predict = rr.predict(X_test)
mean_absolute_percentage_error(Y_test, y_predict)

0.17445764910339207

In [33]:
"""
For SVR
Mean absolute percentage error is - 0.2669

For Decision Tree
Mean absolute percentage error is - 0.1772

For Linear Regression
Mean absolute percentage error is - 0.1319

For Random Forest Regressor
Mean absolute percentage error is - 0.1167

For ridge regression 
Mean absolute percentage error is - 0.1744

linear regression has lowest percentage error but shows overfitting while testing therefore using ridge regression
"""

'\nFor SVR\nMean absolute percentage error is - 0.2669\n\nFor Decision Tree\nMean absolute percentage error is - 0.1772\n\nFor Linear Regression\nMean absolute percentage error is - 0.1319\n\nFor Random Forest Regressor\nMean absolute percentage error is - 0.1167\n\nFor ridge regression \nMean absolute percentage error is - 0.1744\n\nlinear regression has lowest percentage error but shows overfitting while testing therefore using ridge regression\n'

## Testing the model on actual testing data

In [34]:
df1 = pd.read_csv("sample_submission.csv")
df2 = pd.read_csv("test.csv")
test = pd.merge(df1, df2, on='Id')     # merging two dataframes based on the id
test.head(5)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,169277.052498,20,RH,80.0,11622,Pave,,Reg,Lvl,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,187758.393989,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,183583.68357,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,179317.477511,60,RL,78.0,9978,Pave,,IR1,Lvl,...,0,0,,,,0,6,2010,WD,Normal
4,1465,150730.079977,120,RL,43.0,5005,Pave,,IR1,HLS,...,144,0,,,,0,1,2010,WD,Normal


In [35]:
test["bath"] = test['HalfBath'] + test["FullBath"]

In [36]:
test.drop(['HalfBath','FullBath'],axis=1, inplace=True)

In [37]:
test["TotalFlrSF"] = test['1stFlrSF'] + test["2ndFlrSF"]

In [38]:
test.drop(['1stFlrSF'],axis=1,inplace=True)

In [39]:
test["BsmtUnFinSf"] =  - test["BsmtUnfSF"]

In [40]:
test.drop(['BsmtUnfSF'],axis=1,inplace=True)

In [41]:
test = test.select_dtypes(exclude=['object'])

In [42]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   SalePrice      1459 non-null   float64
 2   MSSubClass     1459 non-null   int64  
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   OverallQual    1459 non-null   int64  
 6   OverallCond    1459 non-null   int64  
 7   YearBuilt      1459 non-null   int64  
 8   YearRemodAdd   1459 non-null   int64  
 9   MasVnrArea     1444 non-null   float64
 10  BsmtFinSF1     1458 non-null   float64
 11  BsmtFinSF2     1458 non-null   float64
 12  TotalBsmtSF    1458 non-null   float64
 13  2ndFlrSF       1459 non-null   int64  
 14  LowQualFinSF   1459 non-null   int64  
 15  GrLivArea      1459 non-null   int64  
 16  BsmtFullBath   1457 non-null   float64
 17  BsmtHalfBath   1457 non-null   float64
 18  BedroomA

In [43]:
test.drop(['GarageYrBlt','Id','YearBuilt','BsmtHalfBath','LowQualFinSF','PoolArea','GarageYrBlt','OverallCond',
            'YearRemodAdd','3SsnPorch','WoodDeckSF','EnclosedPorch','BsmtFinSF2','MSSubClass','KitchenAbvGr',
            'ScreenPorch','MoSold','YrSold','MiscVal'], axis=1,inplace=True)


In [44]:
test = test.fillna(test.mean())     # filling all the NaN values with the mean of their respective columns

In [47]:
label = test['SalePrice']
feature = test.drop(['SalePrice'],axis = 1)

In [48]:
prediction = rr.predict(feature)

In [49]:
mean_absolute_percentage_error(label, prediction)

0.19690533384952116