# Implementing different kinds of Regressors

## Getting Started

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import Imputer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

## Handling NaN values in categorical and normal values

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train_float = train.select_dtypes(exclude=['object']).copy()
test_float = test.select_dtypes(exclude=['object']).copy()
train_category = train.select_dtypes(include=['object']).copy()
test_category = test.select_dtypes(include=['object']).copy()

In [4]:
Y = train_float.SalePrice
train_float = train_float.drop(['Id','SalePrice'],axis=1)
test_float = test_float.drop(['Id'],axis=1)

In [5]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(train_float)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [6]:
Xtrain_float = imp.transform(train_float)
Xtest_float = imp.transform(test_float)

In [7]:
cols_with_missing = [col for col in train_category.columns 
                                 if train_category[col].isnull().any()]

In [8]:
Xtrain_category = train_category.drop(cols_with_missing, axis=1)
Xtest_category  = test_category.drop(cols_with_missing, axis=1)

In [9]:
Xtrain_category.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
       'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [10]:
Xtest_category.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
       'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [11]:
encoder = ce.BackwardDifferenceEncoder(cols=['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
       'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
       'SaleType', 'SaleCondition'])

In [12]:
Xtrain_encoded = encoder.fit_transform(Xtrain_category)
Xtest_encoded = encoder.transform(Xtest_category)

In [13]:
Xtrain_float = pd.DataFrame(Xtrain_float, columns=train_float.columns)
Xtest_float = pd.DataFrame(Xtest_float, columns=test_float.columns)

## Scaling data

In [14]:
min_max_scaler = preprocessing.MinMaxScaler()
Xtrain_float = pd.DataFrame(min_max_scaler.fit_transform(Xtrain_float), columns = Xtrain_float.columns)
Xtest_float = pd.DataFrame(min_max_scaler.transform(Xtest_float), columns=Xtest_float.columns)

In [15]:
Xtrain = pd.concat([Xtrain_float, Xtrain_encoded], axis=1)
Xtest = pd.concat([Xtest_float, Xtest_encoded], axis=1)

In [16]:
Xtest.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,col_SaleType_6,col_SaleType_7,col_SaleType_8,col_SaleType_9,col_SaleCondition_0,col_SaleCondition_1,col_SaleCondition_2,col_SaleCondition_3,col_SaleCondition_4,col_SaleCondition_5
0,0.0,0.202055,0.048246,0.444444,0.625,0.644928,0.183333,0.0,0.08292,0.097693,...,-0.4,-0.3,-0.2,-0.1,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667
1,0.0,0.205479,0.060609,0.555556,0.625,0.623188,0.133333,0.0675,0.163536,0.0,...,-0.4,-0.3,-0.2,-0.1,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667
2,0.235294,0.181507,0.058566,0.444444,0.5,0.905797,0.8,0.0,0.140149,0.0,...,-0.4,-0.3,-0.2,-0.1,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667
3,0.235294,0.195205,0.040562,0.555556,0.625,0.913043,0.8,0.0125,0.106662,0.0,...,-0.4,-0.3,-0.2,-0.1,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667
4,0.588235,0.075342,0.017318,0.777778,0.5,0.869565,0.7,0.0,0.046598,0.0,...,-0.4,-0.3,-0.2,-0.1,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667


In [17]:
Xtest = Xtest.drop(['col_Functional_7', 'col_SaleType_9', 'col_KitchenQual_4', 'col_MSZoning_5'], axis=1)

In [18]:
Xtrain = Xtrain.drop(['col_RoofMatl_4', 'col_HouseStyle_7', 'col_Exterior1st_14', 
                      'col_Heating_4', 'col_Heating_5', 'col_RoofMatl_6', 'col_Condition2_7', 
                      'col_RoofMatl_7', 'col_RoofMatl_5', 'col_Condition2_6', 'col_Condition2_5'], axis=1)

# Regression

## XGBoost

In [63]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', learning_rate=0.1, max_depth=3, n_estimators=200)
xg_reg.fit(Xtrain ,Y)
Ypredict = xg_reg.predict(Xtest)
test = pd.read_csv('test.csv')
Ypred = pd.DataFrame({'SalePrice':Ypredict})
prediction = pd.concat([test['Id'], Ypred], axis=1)
prediction.to_csv('predictions_new_2.csv', sep=',', index=False)

In [64]:
prediction.head()

Unnamed: 0,Id,SalePrice
0,1461,116234.007812
1,1462,149810.5625
2,1463,178060.578125
3,1464,179427.5625
4,1465,183026.09375


## SVM Regression

In [61]:
clf = SVR(C=1, epsilon=0.2, kernel='rbf')
clf.fit(Xtrain, Y)
Ypredict = clf.predict(Xtest)
test = pd.read_csv('test.csv')
Ypred = pd.DataFrame({'SalePrice':Ypredict})
prediction = pd.concat([test['Id'], Ypred], axis=1)
prediction.to_csv('predictions_svmr.csv', sep=',', index=False)

In [62]:
prediction.head()

Unnamed: 0,Id,SalePrice
0,1461,162998.62103
1,1462,162995.723687
2,1463,163014.519926
3,1464,163016.96478
4,1465,163011.862574


## Random Forest

In [59]:
regr = RandomForestRegressor(n_estimators=100)
regr.fit(Xtrain, Y)
Ypredict = regr.predict(Xtest)
test = pd.read_csv('test.csv')
Ypred = pd.DataFrame({'SalePrice':Ypredict})
prediction = pd.concat([test['Id'], Ypred], axis=1)
prediction.to_csv('predictions_randomforest.csv', sep=',', index=False)

In [60]:
prediction.head()

Unnamed: 0,Id,SalePrice
0,1461,127830.34
1,1462,151384.25
2,1463,168147.0
3,1464,173938.0
4,1465,200302.18


## AdaBoost

In [57]:
ada = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
ada.fit(Xtrain, Y)
Ypredict = ada.predict(Xtest)
test = pd.read_csv('test.csv')
Ypred = pd.DataFrame({'SalePrice':Ypredict})
prediction = pd.concat([test['Id'], Ypred], axis=1)
prediction.to_csv('predictions_adaBoost.csv', sep=',', index=False)

In [58]:
prediction.head()

Unnamed: 0,Id,SalePrice
0,1461,120614.223404
1,1462,126994.376554
2,1463,168801.125
3,1464,171413.882246
4,1465,227541.858156


## Gradient Boosted Trees

In [55]:
gbt = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=2)
gbt.fit(Xtrain, Y)
Ypredict = gbt.predict(Xtest)
test = pd.read_csv('test.csv')
Ypred = pd.DataFrame({'SalePrice':Ypredict})
prediction = pd.concat([test['Id'], Ypred], axis=1)
prediction.to_csv('predictions_GradientBoostingTree.csv', sep=',', index=False)

In [56]:
prediction.head()

Unnamed: 0,Id,SalePrice
0,1461,118500.147487
1,1462,154107.847331
2,1463,183407.225981
3,1464,187724.752178
4,1465,181244.144983
