In [1]:
import numpy as np
import pandas as pd

## Loading Dataset

In [2]:
# Read the training data from train.csv (path relative to the working directory)
# and display the resulting frame as the cell output.
train = pd.read_csv("train.csv")
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## Cleaning Dataset

In [3]:
# Print each column's dtype and non-null count to see where values are missing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Pairwise correlations between the numeric columns.
# `train` still contains object (string) columns at this point; selecting the
# numeric ones explicitly keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() no longer skips non-numeric columns and raises instead.
corr_matrix = train.select_dtypes(include="number").corr()

In [5]:
# Rank every numeric column by its correlation with SalePrice, strongest positive first.
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [6]:
# Keep only the non-object columns; every column with dtype "object" is discarded.
train = train.select_dtypes(exclude="object")

In [7]:
# Drop the identifier column and the numeric columns chosen for removal to keep
# the feature set small.
# NOTE(review): YearBuilt (0.52) and YearRemodAdd (0.51) are among the stronger
# correlates with SalePrice in the ranking printed earlier, yet they are dropped
# here — confirm this is intended.
# Each label is listed once (the original list repeated 'GarageYrBlt'), and the
# result is assigned back instead of mutating the frame in place.
columns_to_drop = [
    'GarageYrBlt', 'Id', 'YearBuilt', 'BsmtHalfBath', 'LowQualFinSF', 'PoolArea',
    'OverallCond', 'YearRemodAdd', '3SsnPorch', 'WoodDeckSF', 'EnclosedPorch',
    'BsmtFinSF2', 'MSSubClass', 'KitchenAbvGr', 'ScreenPorch', 'MoSold', 'YrSold',
    'MiscVal', 'LotArea',
]
train = train.drop(columns=columns_to_drop)

In [8]:
# Combine the half-bath and full-bath counts into a single "bath" column
# (each half bath contributes 1 to the total, the same as a full bath).
train["bath"] = train["HalfBath"].add(train["FullBath"])

In [9]:
# The two source columns are now summarised by "bath"; remove them.
train.drop(columns=["HalfBath", "FullBath"], inplace=True)

In [10]:
# Total floor area = first-floor area + second-floor area.
train["TotalFlrSF"] = train["1stFlrSF"].add(train["2ndFlrSF"])

In [11]:
# Remove the first-floor area column (2ndFlrSF is kept alongside TotalFlrSF).
train.drop(columns=["1stFlrSF"], inplace=True)

In [12]:
# Unfinished basement area is treated as a drawback, so it is stored with its
# sign flipped (as BsmtUnFinSf) before the correlations are recomputed.
train["BsmtUnFinSf"] = train["BsmtUnfSF"].mul(-1)

In [13]:
# The original (positive) column is replaced by its negated copy; remove it.
train.drop(columns=["BsmtUnfSF"], inplace=True)

In [14]:
# Recompute the correlation matrix now that columns have been dropped and the
# engineered columns (bath, TotalFlrSF, BsmtUnFinSf) have been added.
corr_matrix = train.corr()

In [15]:
# Correlation of each remaining column with SalePrice, strongest positive first.
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice       1.000000
OverallQual     0.790982
TotalFlrSF      0.716883
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
bath            0.568267
TotRmsAbvGrd    0.533723
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.351799
2ndFlrSF        0.319334
OpenPorchSF     0.315856
BsmtFullBath    0.227122
BedroomAbvGr    0.168213
BsmtUnFinSf    -0.214479
Name: SalePrice, dtype: float64

In [16]:
# Skewness of every column, ascending, to decide which ones to log-transform.
train.skew().sort_values()

BsmtUnFinSf    -0.920268
GarageCars     -0.342549
GarageArea      0.179981
BedroomAbvGr    0.211790
OverallQual     0.216944
bath            0.257103
BsmtFullBath    0.596067
Fireplaces      0.649565
TotRmsAbvGrd    0.676341
2ndFlrSF        0.813030
TotalFlrSF      1.330470
GrLivArea       1.366560
TotalBsmtSF     1.524255
BsmtFinSF1      1.685503
SalePrice       1.882876
LotFrontage     2.163569
OpenPorchSF     2.364342
MasVnrArea      2.669084
dtype: float64

In [17]:
# Replace four of the most right-skewed feature columns with log(x + 1) to reduce their skew.
# Missing values stay missing; they are imputed in a later cell.
for skewed_column in ('MasVnrArea', 'OpenPorchSF', 'BsmtFinSF1', 'LotFrontage'):
    train[skewed_column] = np.log(train[skewed_column] + 1)

In [18]:
# Re-check the skewness after the log transform.
train.skew().sort_values()

BsmtUnFinSf    -0.920268
LotFrontage    -0.728728
BsmtFinSF1     -0.618410
GarageCars     -0.342549
OpenPorchSF    -0.023397
GarageArea      0.179981
BedroomAbvGr    0.211790
OverallQual     0.216944
bath            0.257103
MasVnrArea      0.494418
BsmtFullBath    0.596067
Fireplaces      0.649565
TotRmsAbvGrd    0.676341
2ndFlrSF        0.813030
TotalFlrSF      1.330470
GrLivArea       1.366560
TotalBsmtSF     1.524255
SalePrice       1.882876
dtype: float64

## Filling NaN Values

In [19]:
# Impute missing MasVnrArea values with the column mean.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is chained in-place mutation, which
# is deprecated in pandas 2.x and leaves `train` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed over all rows (before the train/test split
# further down) and on the already log-transformed values.
mean_value = train['MasVnrArea'].mean()
train['MasVnrArea'] = train['MasVnrArea'].fillna(mean_value)

In [20]:
# Impute missing LotFrontage values with the column mean.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is chained in-place mutation, which
# is deprecated in pandas 2.x and leaves `train` unchanged under Copy-on-Write.
# NOTE(review): the mean is computed over all rows (before the train/test split
# further down) and on the already log-transformed values.
mean_value = train['LotFrontage'].mean()
train['LotFrontage'] = train['LotFrontage'].fillna(mean_value)

## Preparing model

In [21]:
# Separate the target (SalePrice) from the predictor columns.
features = train.drop(columns=['SalePrice'])
labels = train['SalePrice']

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=32,
)

In [23]:
## Standardizing the dataset
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same scaling to
# both the training and the held-out split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Checking for best model

In [24]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVR
import warnings

In [25]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, passed together with ``true`` to the scikit-learn
        metric functions.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed for its square root, so compute it once here
    # (the original computed it twice and left one copy unused).
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [26]:
# Candidate regressors, each fitted on the same scaled training split and scored
# on both splits. The two randomised ensembles get a fixed random_state so the
# comparison gives the same numbers after a kernel restart.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=170, max_depth=13, random_state=32),
    # "XGBRegressor": XGBRegressor(), 
    # "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=32)
}
model_list = []   # model names, in evaluation order
r2_list = []      # test-set R2 score of each model, aligned with model_list

# Iterate over name/estimator pairs directly instead of indexing into
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 35764.6831
- Mean Absolute Error: 23174.2330
- R2 Score: 0.7959
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 40636.1177
- Mean Absolute Error: 24896.6221
- R2 Score: 0.7447


Lasso
Model performance for Training set
- Root Mean Squared Error: 35764.6838
- Mean Absolute Error: 23173.2747
- R2 Score: 0.7959
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 40635.5572
- Mean Absolute Error: 24895.6762
- R2 Score: 0.7447


Ridge
Model performance for Training set
- Root Mean Squared Error: 35765.1513
- Mean Absolute Error: 23168.0241
- R2 Score: 0.7959
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 40628.4680
- Mean Absolute Error: 24895.9975
- R2 Score: 0.7448


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 28788.6054
- Mean Absolute Erro

In [27]:
# Tabulate the test-set R2 score of every model, best first.
model_scores = pd.DataFrame(
    list(zip(model_list, r2_list)),
    columns=['Model Name', 'R2_Score'],
)
model_scores.sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
4,Random Forest Regressor,0.793176
5,AdaBoost Regressor,0.755765
2,Ridge,0.744765
1,Lasso,0.744676
0,Linear Regression,0.744669
3,K-Neighbors Regressor,0.721981


## Using Random Forest Regressor

In [28]:
# model5 = RandomForestRegressor(n_estimators=170,max_depth=13,min_samples_leaf=1,min_samples_split=3)
# model5 = model5.fit(X_train, y_train)
# y_pred = model5.predict(X_test)
# score = r2_score(y_test, y_pred)*100
# print(" Accuracy of the model 5 is %.2f" %score)

# model3 = RandomForestRegressor(n_estimators=170,max_depth=13,min_samples_leaf=2,min_samples_split=3)
# model3 = model3.fit(X_train, y_train)
# y_pred = model3.predict(X_test)
# score = r2_score(y_test, y_pred)*100
# print(" Accuracy of the model 3 is %.2f" %score)


# model4 = RandomForestRegressor(n_estimators=130,max_depth=15)
# model4 = model4.fit(X_train, y_train)
# y_pred = model4.predict(X_test)
# score = r2_score(y_test, y_pred)*100
# print(" Accuracy of the model 4 is %.2f" %score)

# Checked with different parameters

# Final model: a random forest with scikit-learn's default hyper-parameters.
# random_state is fixed so the fitted forest, and every prediction made with it
# later in the notebook, is the same after a kernel restart.
model1 = RandomForestRegressor(random_state=32)
model1 = model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
# r2_score is the coefficient of determination, not an accuracy; it is reported
# here multiplied by 100, and the message now says so.
score = r2_score(y_test, y_pred)*100
print(" R2 score (x100) of the model 1 is %.2f" %score)


 Accuracy of the model 1 is 78.90


## Testing the model on actual testing data

In [29]:
# Load the submission template and the test features, then join them row by row
# on the shared Id column so each test row carries a SalePrice value.
# NOTE(review): the SalePrice column here comes from sample_submission.csv and is
# used as the ground-truth label further down. Presumably that file holds
# placeholder predictions rather than observed sale prices — TODO confirm.
df1 = pd.read_csv("sample_submission.csv")
df2 = pd.read_csv("test.csv")
test = df1.merge(df2, on='Id')
test.head(5)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,169277.052498,20,RH,80.0,11622,Pave,,Reg,Lvl,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,187758.393989,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,183583.68357,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,179317.477511,60,RL,78.0,9978,Pave,,IR1,Lvl,...,0,0,,,,0,6,2010,WD,Normal
4,1465,150730.079977,120,RL,43.0,5005,Pave,,IR1,HLS,...,144,0,,,,0,1,2010,WD,Normal


In [30]:
# Same "bath" feature as built for the training frame: half baths + full baths.
test["bath"] = test["HalfBath"].add(test["FullBath"])

In [31]:
# The two source columns are now summarised by "bath"; remove them.
test.drop(columns=["HalfBath", "FullBath"], inplace=True)

In [32]:
# Total floor area = first-floor area + second-floor area.
test["TotalFlrSF"] = test["1stFlrSF"].add(test["2ndFlrSF"])

In [33]:
# Remove the first-floor area column (2ndFlrSF is kept alongside TotalFlrSF).
test.drop(columns=["1stFlrSF"], inplace=True)

In [34]:
# Store the unfinished basement area with its sign flipped, as done for the training frame.
test["BsmtUnFinSf"] = test["BsmtUnfSF"].mul(-1)

In [35]:
# The original (positive) column is replaced by its negated copy; remove it.
test.drop(columns=["BsmtUnfSF"], inplace=True)

In [36]:
# Keep only the non-object columns, matching what was done to the training frame.
test = test.select_dtypes(exclude="object")

In [37]:
# Drop the same identifier / unused numeric columns that were removed from the
# training frame, so both frames end up with the same feature columns.
# Each label is listed once (the original list repeated 'GarageYrBlt'), and the
# result is assigned back instead of mutating the frame in place.
columns_to_drop = [
    'GarageYrBlt', 'Id', 'YearBuilt', 'BsmtHalfBath', 'LowQualFinSF', 'PoolArea',
    'OverallCond', 'YearRemodAdd', '3SsnPorch', 'WoodDeckSF', 'EnclosedPorch',
    'BsmtFinSF2', 'MSSubClass', 'KitchenAbvGr', 'ScreenPorch', 'MoSold', 'YrSold',
    'MiscVal', 'LotArea',
]
test = test.drop(columns=columns_to_drop)


In [38]:
# Fill every missing value with the mean of its own column in the test frame.
# NOTE(review): this differs from the training pipeline, which imputed only
# MasVnrArea and LotFrontage, used training-frame means, and did so after the
# log transform; here the fill happens on raw values before the log transform.
column_means = test.mean()
test = test.fillna(column_means)

In [39]:
# Separate the SalePrice column (used as the reference values) from the predictors.
feature = test.drop(columns=['SalePrice'])
label = test['SalePrice']

In [40]:
# Apply the same log(x + 1) transform to the four columns that were transformed
# in the training frame.
for skewed_column in ('MasVnrArea', 'OpenPorchSF', 'BsmtFinSF1', 'LotFrontage'):
    feature[skewed_column] = np.log(feature[skewed_column] + 1)

In [41]:
# Skewness of the test features after the log transform, for comparison with the training frame.
feature.skew().sort_values()

LotFrontage    -1.131332
BsmtUnFinSf    -0.920231
BsmtFinSF1     -0.619259
GarageCars     -0.107178
OpenPorchSF    -0.060337
OverallQual     0.181196
GarageArea      0.300342
BedroomAbvGr    0.436623
MasVnrArea      0.529334
bath            0.584101
BsmtFullBath    0.650143
TotalBsmtSF     0.813868
Fireplaces      0.819858
TotRmsAbvGrd    0.842597
2ndFlrSF        0.912883
GrLivArea       1.130402
TotalFlrSF      1.142634
dtype: float64

In [42]:
# Scale the test features with the scaler that was fitted on the training split
# (it is not re-fitted here).
# NOTE(review): this relies on `feature` having the same columns, in the same
# order, as the frame the scaler was fitted on — confirm.
feature = scaler.transform(feature)

In [43]:
# Predict SalePrice for the scaled test features with the fitted random forest.
prediction = model1.predict(feature)

In [44]:
# Mean absolute percentage error of the predictions relative to `label`.
# NOTE(review): `label` is the SalePrice column merged in from
# sample_submission.csv; if that file holds placeholder predictions rather than
# observed prices (TODO confirm), this number does not measure real predictive error.
mean_absolute_percentage_error(label, prediction)

0.29459743498242585