In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('new_train.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,MSZoning,Street,LandContour,Utilities,LandSlope,Neighborhood,Condition1,Condition2,BldgType,...,TotalRooms,TotalBathRooms,Functional,Fireplaces,GarageType,GarageYrBlt,GarageQual,SaleType,SaleCondition,SalePrice
0,0,RL,Pave,Lvl,AllPub,Gtl,CollgCr,Norm,Norm,1Fam,...,8,4,Typ,0,Attchd,2003.0,TA,WD,Normal,208500
1,1,RL,Pave,Lvl,AllPub,Gtl,Veenker,Feedr,Norm,1Fam,...,6,3,Typ,1,Attchd,1976.0,TA,WD,Normal,181500
2,2,RL,Pave,Lvl,AllPub,Gtl,CollgCr,Norm,Norm,1Fam,...,6,4,Typ,1,Attchd,2001.0,TA,WD,Normal,223500
3,3,RL,Pave,Lvl,AllPub,Gtl,Crawfor,Norm,Norm,1Fam,...,7,2,Typ,1,Detchd,1998.0,TA,WD,Abnorml,140000
4,4,RL,Pave,Lvl,AllPub,Gtl,NoRidge,Norm,Norm,1Fam,...,9,4,Typ,1,Attchd,2000.0,TA,WD,Normal,250000
5,5,RL,Pave,Lvl,AllPub,Gtl,Mitchel,Norm,Norm,1Fam,...,5,3,Typ,0,Attchd,1993.0,TA,WD,Normal,143000
6,6,RL,Pave,Lvl,AllPub,Gtl,Somerst,Norm,Norm,1Fam,...,7,3,Typ,1,Attchd,2004.0,TA,WD,Normal,307000
7,7,RL,Pave,Lvl,AllPub,Gtl,NWAmes,PosN,Norm,1Fam,...,7,4,Typ,2,Attchd,1973.0,TA,WD,Normal,200000
8,8,RM,Pave,Lvl,AllPub,Gtl,OldTown,Artery,Norm,1Fam,...,8,2,Min1,2,Detchd,1931.0,Fa,WD,Abnorml,129900
9,9,RL,Pave,Lvl,AllPub,Gtl,BrkSide,Artery,Artery,2fmCon,...,5,2,Typ,2,Attchd,1939.0,Gd,WD,Normal,118000


In [3]:
df.drop(['Unnamed: 0', 'AgeBeforeRemodel'], axis=1, inplace = True)

In [4]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

print(X.shape)
print(y.shape)

(1460, 31)
(1460,)


In [5]:
X = pd.get_dummies(X)
print(X.shape)

(1460, 148)


### Removing the last column of each dummy categorical data to avoid the dummy variable trap

In [6]:
X.drop(labels=['MSZoning_RM', 'Street_Pave', 'LandContour_Lvl', 'Utilities_NoSeWa', 'LandSlope_Sev', 'Neighborhood_Veenker',
               'Condition1_RRNn', 'Condition2_RRNn', 'BldgType_TwnhsE', 'HouseStyle_SLvl', 'Foundation_Wood', 'BsmtCond_TA',
               'Heating_Wall', 'HeatingQC_TA', 'CentralAir_Y', 'Electrical_SBrkr', 'KitchenQual_TA', 'Functional_Typ',
               'GarageType_NG', 'GarageQual_TA', 'SaleType_WD', 'SaleCondition_Partial'], axis=1, inplace=True)
print(X.shape)

(1460, 126)


### Importing the necessary packages from scikit-learn library

In [7]:
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error

### Importing models I'll be using and testing with

In [19]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

### Data normalizatiin and test train splitting

In [9]:
X.head(2)

Unnamed: 0,TotalHouseSizeSF,OverallQual,OverallCond,YearRemodAdd,TotalKitchens,TotalRooms,TotalBathRooms,Fireplaces,GarageYrBlt,MSZoning_C (all),...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal
0,3175,7,5,2003,1,8,4,0,2003.0,0,...,0,0,0,0,0,0,0,0,0,1
1,3282,6,8,1976,1,6,3,1,1976.0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
X = X.values
y = y.values

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#data normilization
n = Normalizer()

#fitting the normalizer to the data and transforming it
X_train[:, 0:9] = n.fit_transform(X_train[:,0:9])
X_test[:, 0:9] = n.transform(X_test[:, 0:9])

## Time for model building!

### Decision Tree Regression

In [26]:
dt = DecisionTreeRegressor(random_state=42)

scores = np.mean(cross_val_score(estimator=dt, X=X_train, y=y_train, scoring='neg_mean_squared_log_error', cv=3))
print("avg_mean_squared_log_error: ", np.sqrt(scores*(-1)))

#param tuning
params = [{'criterion':("mse", "friedman_mse", "mae"), 'splitter':('random','best'), 'min_samples_split': range(2,12,2),
          'min_samples_leaf':range(4,20,4), 'max_features': ("auto", "sqrt", "log2", None), 'ccp_alpha': (0.1,0.2,0.3)}]
gs = GridSearchCV(dt, params, scoring='neg_mean_squared_log_error', cv=3)

gs.fit(X_train, y_train)

print("Best Score: ", np.sqrt(gs.best_score_*(-1)))
print("Best parameters: ", gs.best_params_)
print("Best model(estimator): ", gs.best_estimator_)

y_pred = gs.best_estimator_.predict(X_test)
print("rmsle=", np.sqrt(mean_squared_log_error(y_test, y_pred)))

avg_mean_squared_log_error:  0.21025434854957292
Best Score:  0.17941064310585736
Best parameters:  {'ccp_alpha': 0.1, 'criterion': 'mae', 'max_features': 'auto', 'min_samples_leaf': 12, 'min_samples_split': 2, 'splitter': 'best'}
Best model(estimator):  DecisionTreeRegressor(ccp_alpha=0.1, criterion='mae', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=12, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')
rmsle= 0.18994404668390794


### Random Forest Regression

In [27]:
rf = RandomForestRegressor(random_state=42)

#cross validation
scores = np.mean(cross_val_score(estimator=rf, X=X_train, y=y_train,scoring='neg_mean_squared_log_error', cv=3))
print("avg_root_mean_squared_log_error: ", np.sqrt(scores*(-1)))

#param tuning
params = [{'n_estimators':range(100,400,20), 'criterion':('mse','mae'), 'max_features':('auto','sqrt','log2', None)}]
gs = GridSearchCV(rf, params, scoring='neg_mean_squared_log_error', cv=3)

gs.fit(X_train, y_train)

print("Best Score: ", np.sqrt(gs.best_score_*(-1)))
print("Best parameters: ", gs.best_params_)
print("Best model(estimator): ", gs.best_estimator_)

y_pred = gs.best_estimator_.predict(X_test)
print("rmsle=", np.sqrt(mean_squared_log_error(y_test, y_pred)))

avg_mean_squared_log_error:  0.1567630462132498
Best Score:  0.15292365662983917
Best parameters:  {'criterion': 'mse', 'max_features': 'sqrt', 'n_estimators': 100}
Best model(estimator):  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
rmsle= 0.16514168158305967


### Support Vector Regression (SVR)

In [24]:
svr = SVR()

#cross validation
scores = np.mean(cross_val_score(estimator=svr, X=X_train, y=y_train,scoring='neg_mean_squared_log_error', cv=3))
print("avg_root_mean_squared_log_error: ", np.sqrt(scores*(-1)))

#param tuning
params = [{'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'degree': range(3,15,3), 'gamma':('scale','auto')
           ,'epsilon':(0.1,0.3,0.5)}]
gs = GridSearchCV(svr, params, scoring='neg_mean_squared_log_error', cv=3)

gs.fit(X_train, y_train)

print("Best Score: ", np.sqrt(gs.best_score_*(-1)))
print("Best parameters: ", gs.best_params_)
print("Best model(estimator): ", gs.best_estimator_)

y_pred = gs.best_estimator_.predict(X_test)
print("rmsle=", np.sqrt(mean_squared_log_error(y_test, y_pred)))

avg_root_mean_squared_log_error:  0.3908837161069843
Best Score:  0.3893208423356473
Best parameters:  {'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best model(estimator):  SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
rmsle= 0.4294524320148378
