In [1]:
!pip install vecstack



In [2]:
!pip install imblearn



In [3]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score #works
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from imblearn.over_sampling import SMOTE 
from collections import Counter #for Smote, 

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


# Data files: loading and cleaning

In [4]:
# Read the training and test CSV files from the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Report (rows, columns) of each frame, training frame first.
for frame in (train_data, test_data):
    print(frame.shape)

(1460, 81)
(1459, 80)


In [5]:
# Fraction of missing values per training column; columns above 0.7 are the
# candidates to be dropped.
n_train_rows = len(train_data)
train_missing = train_data.isna().sum().div(n_train_rows)
train_missing[train_missing > 0.7]

Alley          0.937671
PoolQC         0.995205
Fence          0.807534
MiscFeature    0.963014
dtype: float64

In [6]:
# Fraction of missing values per test column; show the columns above 0.7.
n_test_rows = len(test_data)
test_missing = test_data.isna().sum().div(n_test_rows)
test_missing[test_missing > 0.7]

Alley          0.926662
PoolQC         0.997944
Fence          0.801234
MiscFeature    0.965045
dtype: float64

In [7]:
# Names of the training columns whose missing fraction exceeds 0.7.
train_data.columns[(train_data.isna().sum() / train_data.shape[0] > 0.7).values]

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [8]:
# Names of the test columns whose missing fraction exceeds 0.7.
test_data.columns[(test_data.isna().sum() / test_data.shape[0] > 0.7).values]

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [9]:
# Remove, in place, the training columns whose missing fraction exceeds 0.7.
sparse_train_cols = train_missing[train_missing > 0.7].index
train_data.drop(columns=sparse_train_cols, inplace=True)

In [10]:
# Remove, in place, the test columns whose missing fraction exceeds 0.7.
sparse_test_cols = test_missing[test_missing > 0.7].index
test_data.drop(columns=sparse_test_cols, inplace=True)

In [11]:
# Preview the first rows of the training frame (the mostly-missing columns have
# already been dropped at this point).
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [12]:
# Number of missing values per training column, restricted to the columns that
# still contain at least one missing value.
train_na_counts = train_data.isna().sum()
train_na_counts[(train_na_counts > 0).values]

LotFrontage     259
MasVnrType        8
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
FireplaceQu     690
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

In [13]:
# Manage the missing values in the train dataset.
#
# Every fill is assigned back to its column. Calling fillna/interpolate with
# inplace=True on a selection only modifies a temporary object:
# train_data[['MasVnrType']] is a new DataFrame, so the original 'None' fill
# never reached train_data, and the single-column variants stop updating the
# parent frame once pandas Copy-on-Write is active.
train_data['LotFrontage'] = train_data['LotFrontage'].interpolate(axis=0)
train_data['MasVnrType'] = train_data['MasVnrType'].fillna('None')

# Drop the rows that lack garage, basement or masonry-veneer information.
# MasVnrType is filled above, so the last dropna now only removes rows whose
# MasVnrArea is still missing.
train_data.dropna(subset=['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'], inplace=True)
train_data.dropna(subset=['BsmtQual', 'BsmtCond', 'BsmtFinType1'], inplace=True)
train_data.dropna(subset=['MasVnrType', 'MasVnrArea'], inplace=True)

# FireplaceQu is missing in 690 of the 1460 training rows, so drop the column.
train_data.drop('FireplaceQu', axis=1, inplace=True)

# Fill the remaining categorical gaps with a fixed category per column.
train_data['BsmtExposure'] = train_data['BsmtExposure'].fillna('No')
train_data['BsmtFinType2'] = train_data['BsmtFinType2'].fillna('Unf')
train_data['Electrical'] = train_data['Electrical'].fillna('SBrkr')

In [14]:
# Number of missing values per test column, restricted to the columns that
# contain at least one missing value.
test_na_counts = test_data.isna().sum()
test_na_counts[(test_na_counts > 0).values]

MSZoning          4
LotFrontage     227
Utilities         2
Exterior1st       1
Exterior2nd       1
MasVnrType       16
MasVnrArea       15
BsmtQual         44
BsmtCond         45
BsmtExposure     44
BsmtFinType1     42
BsmtFinSF1        1
BsmtFinType2     42
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
KitchenQual       1
Functional        2
FireplaceQu     730
GarageType       76
GarageYrBlt      78
GarageFinish     78
GarageCars        1
GarageArea        1
GarageQual       78
GarageCond       78
SaleType          1
dtype: int64

In [15]:
# Fill the gaps in the test frame: carry the previous row's value forward first,
# then fill whatever is still missing (leading rows) from the next row.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling and give
# the same result.
test_data.ffill(inplace=True)
test_data.bfill(inplace=True)

In [16]:
# Feature frames: the training features leave out the target and the Id column,
# the test features leave out the Id column.
trainData_Copy = train_data.drop(columns=['SalePrice', 'Id']).copy()
testData_Copy = test_data.drop(columns='Id').copy()

# Stack train (outer key 0) on top of test (outer key 1) so that both parts get
# exactly the same dummy columns.
combined_Data = pd.concat([trainData_Copy, testData_Copy], keys=[0, 1])

# One-hot encode the categorical features.
combined_Data = pd.get_dummies(combined_Data)

# Split the encoded frame back into its train and test parts via the outer key.
X_train = combined_Data.loc[0]
X_test = combined_Data.loc[1]

# Regression target.
y_train = train_data["SalePrice"]

print(X_train.shape)
print(X_test.head())

print(y_train.shape)

(1341, 274)
   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtFinSF1  BsmtFinSF2  \
0       896         0          0             2       468.0       144.0   
1      1329         0          0             3       923.0         0.0   
2       928       701          0             3       791.0         0.0   
3       926       678          0             3       602.0         0.0   
4      1280         0          0             2       263.0         0.0   

   BsmtFullBath  BsmtHalfBath  BsmtUnfSF  EnclosedPorch  ...  SaleType_ConLD  \
0           0.0           0.0      270.0              0  ...               0   
1           0.0           0.0      406.0              0  ...               0   
2           0.0           0.0      137.0              0  ...               0   
3           0.0           0.0      324.0              0  ...               0   
4           0.0           0.0     1017.0              0  ...               0   

   SaleType_ConLI  SaleType_ConLw  SaleType_New  SaleType_Oth 

## Build various regression models 

### Decision Tree regressor

In [17]:
# Baseline decision tree with default hyperparameters: fit on the training set
# and predict the test set.
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
clf_predict = clf.predict(X_test)

# Save the test predictions keyed by Id.
dt_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': clf_predict})
dt_submission.to_csv('result_decisiontree', index=None)

In [18]:
# Randomized hyperparameter search for the decision tree regressor.
parameters = {
    'min_samples_split': range(10, 100, 10),
    'max_depth': range(1, 20, 2),
}
clf_random = RandomizedSearchCV(clf, parameters, n_iter=15)
clf_random.fit(X_train, y_train)
grid_parm = clf_random.best_params_
print(grid_parm)

# Refit a fresh decision tree with the best parameters found by the search.
clf = DecisionTreeRegressor(**grid_parm)
clf.fit(X_train, y_train)
clf_predict = clf.predict(X_test)

# Save the tuned model's test predictions keyed by Id.
dt_best_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': clf_predict})
dt_best_submission.to_csv('result_decisiontree_best', index=None)

{'min_samples_split': 40, 'max_depth': 15}


### Random Forest Regressor

In [19]:
# Baseline random forest with default hyperparameters.
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

# Save the test predictions keyed by Id.
rf_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': rfc_predict})
rf_submission.to_csv('result_randomforest', index=None)

In [20]:
# Randomized hyperparameter search for the random forest.
parameters = {
    'n_estimators': range(50, 150, 20),
    'min_samples_split': range(10, 100, 10),
    'max_depth': range(1, 20, 2),
}
rfc_random = RandomizedSearchCV(rfc, parameters, n_iter=15)
rfc_random.fit(X_train, y_train)
grid_parm_rfc = rfc_random.best_params_
print(grid_parm_rfc)

# Construct a random forest from the best parameters and predict the test set.
rfc = RandomForestRegressor(**grid_parm_rfc)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

# Save the tuned model's test predictions keyed by Id.
rf_best_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': rfc_predict})
rf_best_submission.to_csv('result_randomforest_best', index=None)

{'n_estimators': 70, 'min_samples_split': 20, 'max_depth': 15}


### Multi layer perceptron regressor

In [21]:
# Baseline multi-layer perceptron regressor with default hyperparameters.
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
mlp_predict = mlp.predict(X_test)

# Save the test predictions keyed by Id.
mlp_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': mlp_predict})
mlp_submission.to_csv('result_mlpregressor', index=None)

In [22]:
# Randomized hyperparameter search for the MLP regressor.
parameters = {
    'hidden_layer_sizes': range(100, 500, 10),
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.005],
}
mlp_random = RandomizedSearchCV(mlp, parameters, n_iter=15)
mlp_random.fit(X_train, y_train)
grid_parm_mlp = mlp_random.best_params_
print(grid_parm_mlp)

# Construct an MLP regressor from the best parameters and predict the test set.
mlp = MLPRegressor(**grid_parm_mlp)
mlp.fit(X_train, y_train)
mlp_predict = mlp.predict(X_test)

# Save the tuned model's test predictions keyed by Id.
mlp_best_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': mlp_predict})
mlp_best_submission.to_csv('result_mlpregressor_best', index=None)

{'learning_rate_init': 0.01, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 340, 'activation': 'identity'}


### Support Vector Regressor

In [23]:
# Baseline support vector regressor with default hyperparameters.
svr = SVR()
svr.fit(X_train, y_train)
svr_predict = svr.predict(X_test)

# Save the test predictions keyed by Id.
svr_submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': svr_predict})
svr_submission.to_csv('result_svmregressor', index=None)

In [None]:
# Hyperparameter tuning for Support Vector Regressor.
# 'precomputed' is deliberately not among the kernel choices: that kernel
# expects X to be a square kernel (Gram) matrix rather than the feature matrix
# X_train used here, so every candidate that sampled it would fail to fit.
# 'degree' only influences the 'poly' kernel.
parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': range(3, 12, 1),
}
svr_random = RandomizedSearchCV(svr, parameters, n_iter=15)
svr_random.fit(X_train, y_train)
grid_parm_svr = svr_random.best_params_
print(grid_parm_svr)

# Construct Support Vector Regressor using the best parameters
svr = SVR(**grid_parm_svr)
svr.fit(X_train, y_train)
svr_predict = svr.predict(X_test)

# Save the tuned model's test predictions keyed by Id.
pd.DataFrame({'Id': test_data.Id, 'SalePrice': svr_predict}).to_csv('result_svmregressor_best', index=None)

### Gradient Descent Regressor

In [None]:
# Baseline (stochastic) gradient descent regressor with default hyperparameters.
# The unused `search_grid` dict was removed: its keys ('n_estimators',
# 'learning_rate' as a number) were never passed to any search in this cell.
abc = SGDRegressor()
abc.fit(X_train, y_train)
abc_predict = abc.predict(X_test)

# Save this model's own predictions; the original wrote svr_predict (the SVR
# output) into the gradient-descent result file.
pd.DataFrame({'Id': test_data.Id, 'SalePrice': abc_predict}).to_csv('result_gradientdescent', index=None)

In [None]:
# Hyperparameter tuning for Gradient Descent Regressor
# NOTE(review): 'squared_loss' and 'none' are the spellings used by older
# scikit-learn releases; confirm them against the installed version.
parameters = {
    'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
}
abc_random = RandomizedSearchCV(abc, parameters, n_iter=15)
abc_random.fit(X_train, y_train)
grid_parm_abc = abc_random.best_params_
print(grid_parm_abc)

# Construct Gradient Descent Regressor using the best parameters.
# The best parameters ('loss', 'penalty') belong to SGDRegressor; the original
# passed them to SVR, which does not accept those arguments.
abc = SGDRegressor(**grid_parm_abc)
abc.fit(X_train, y_train)
abc_predict = abc.predict(X_test)

# Save the tuned model's test predictions keyed by Id.
pd.DataFrame({'Id': test_data.Id, 'SalePrice': abc_predict}).to_csv('result_gradientdescent_best', index=None)

## Ensemble method: stacking models

In [None]:
print("Ensemble Methods Predictions using Gradient Descent Regressor, Random Forest Regressor and Decision Tree Regressor\n")

# First-level models, each built from the best parameters found by its search.
models = [SGDRegressor(**grid_parm_abc), RandomForestRegressor(**grid_parm_rfc), DecisionTreeRegressor(**grid_parm)]

# Build the out-of-fold predictions (S_Train) and the bagged test predictions
# (S_Test) that feed the second-level model.
# Fixes relative to the original call:
#   * X_res / y_res are not defined anywhere in the visible notebook; the
#     training data is X_train / y_train.
#   * SalePrice is a continuous target, so this is a regression task:
#     regression=True, no stratified folds, and no accuracy_score (a
#     classification metric). metric=None lets vecstack use its default
#     metric for the chosen task.
S_Train, S_Test = stacking(models,
                           X_train, y_train, X_test,
                           regression = True,
                           mode = 'oof_pred_bag',
                           needs_proba = False,
                           save_dir = None,
                           metric = None,
                           n_folds = 4,
                           stratified = False,
                           shuffle = True,
                           random_state = 0,
                           verbose = 2)

In [None]:
# Stacking - second level: fit a Gradient Descent Regressor on the first-level
# predictions (S_Train) and predict the test set from S_Test.
model = SGDRegressor()

# The second-level model is trained against the training targets; y_test was
# never defined in this notebook, y_train is.
model = model.fit(S_Train, y_train)
y_pred = model.predict(S_Test)

# Write the predictions computed just above (the original referenced an
# undefined name, model_predict).
pd.DataFrame({'Id': test_data.Id, 'SalePrice': y_pred}).to_csv('result_stacking', index=None)