In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, ShuffleSplit

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, Imputer, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [3]:
# read the training CSV, using its first column (Id) as the row index
housing = pd.read_csv(
    'data/house-prices-advanced-regression-techniques/train.csv',
    index_col=0,
)
housing.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# 1. Data preprocessing

During the EDA, we identified which variables are likely to be good predictors of SalePrice. Let's select these variables and transform them where necessary.

In [4]:
# predictors kept for modelling: numerical ones first, then categorical ones
num_cols = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
    'BsmtFinSF1',
    'YearRemodAdd',
]
cat_cols = [
    'Neighborhood',
    'ExterQual',
    'BsmtQual',
    'KitchenQual',
    'GarageFinish',
]
# full predictor list, numerical columns followed by categorical columns
all_cols = [*num_cols, *cat_cols]

In [5]:
# Clean dataset of false nans by converting them to categorical value 'None'

# number of missing values per column, then only the columns that have at least one
cols_nan = housing.isnull().sum()
cols_with_nans = cols_nan.loc[cols_nan > 0].index
# every column with nans except the three listed here is treated as a "false nan" column
false_nan_cols = [
    name
    for name in cols_with_nans
    if name not in ['LotFrontage', 'Electrical', 'SalePrice']
]

def false_nan_to_cat(df, cols=None):
    '''
    Return a copy of df in which the nan values of the columns listed in cols
    are replaced by the string 'None'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is NOT modified; a copy is returned instead.
    cols : list-like of column labels, optional
        Columns to clean. When omitted, the module-level list
        `false_nan_cols` is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and columns as df.
    '''
    if cols is None:
        # resolve the default lazily so the function picks up the current list
        cols = false_nan_cols

    # work on a copy: writing into the caller's frame would silently change
    # the raw data that earlier cells loaded and displayed
    cleaned = df.copy()
    for col in cols:
        cleaned.loc[cleaned[col].isnull(), col] = 'None'

    return cleaned

# pass a copy so the raw `housing` frame is not modified and the cleaned frame
# is a separate object rather than an alias of the raw one
housing_no_false_nans = false_nan_to_cat(housing.copy())

In [6]:
# encoder turning categorical (object) features into dummy features

from sklearn.base import BaseEstimator, TransformerMixin

class encoder(BaseEstimator, TransformerMixin):
    '''
    Dummy-encode the object-dtype columns of a DataFrame with a fixed output layout.

    fit records which columns have object dtype and the dummy columns they
    produce; transform rebuilds the dummies and reindexes them to that layout,
    so a category absent from the transformed data becomes an all-zero column
    and a category that was not seen during fit is dropped.
    '''
    def fit(self, X, y=None):
        '''
        Learn the object-dtype columns of X and the resulting dummy column layout.

        y is accepted only for compatibility with the pipeline API and is ignored.
        Returns self.
        '''
        # `np.object` was just an alias of the builtin `object`; the alias was
        # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
        self.cols = X.loc[:, X.dtypes == object].columns
        self.dummy_cols = pd.get_dummies(X, columns=self.cols).columns

        return self

    def transform(self, X):
        '''
        Dummy-encode X and align its columns with the layout learned in fit.

        Returns a DataFrame whose columns are exactly self.dummy_cols.
        '''
        X_transformed = pd.get_dummies(X, columns=self.cols)
        # columns missing from this batch are added filled with 0; extra ones are dropped
        X_transformed = X_transformed.reindex(columns=self.dummy_cols, fill_value=0)

        return X_transformed

In [30]:
# build the feature matrix and the target vector, then hold out 30% of the rows for testing
X = housing_no_false_nans.loc[:, all_cols]
y = housing_no_false_nans['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# report how many missing values remain in the predictors and in the target
print('X nans :', X.isnull().sum().sum())
print('-' * 20)
print('y nans :', y.isnull().sum())

X nans : 0
--------------------
y nans : 0


### Preprocess steps

In [8]:
# boolean mask marking the predictors stored with object dtype
cat_mask = (X.dtypes == 'object')

# transformers placed in front of every estimator below, in execution order;
# the commented entries are optional steps that are currently switched off
preprocess_steps = [
    encoder(),
    # Imputer(strategy='mean'),
    # PolynomialFeatures(degree=2),
    StandardScaler(),
    # RFE(LinearRegression(), 20),
    # SelectKBest(score_func=f_regression, k=20),
    # PCA(n_components=15),
]

### Estimators

In [9]:
# shared preprocessing followed by an ordinary least-squares regressor
lin_reg_steps = [*preprocess_steps, LinearRegression()]
line_reg_pipe = make_pipeline(*lin_reg_steps)

# 5-fold CV scored with negated MSE; flip the sign and take the root of the mean
scores = cross_val_score(
    line_reg_pipe,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
np.sqrt(np.mean(-scores))

335487220356864.88

In [65]:
# same pipeline as before, with a Ridge regressor (alpha=10) as the final step
lin_reg_steps = [*preprocess_steps, Ridge(alpha=10)]
line_reg_pipe = make_pipeline(*lin_reg_steps)

# 5-fold CV scored with negated MSE; flip the sign and take the root of the mean
scores = cross_val_score(
    line_reg_pipe,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
np.sqrt(np.mean(-scores))

37993.597036324085

In [63]:
# shared preprocessing followed by a random forest; the forest is seeded so that
# re-running the cell reproduces the same score (an unseeded forest gives a
# different RMSE on every run)
lin_reg_steps = preprocess_steps + [RandomForestRegressor(random_state=42)]
line_reg_pipe = make_pipeline(*lin_reg_steps)

# 5-fold CV scored with negated MSE; flip the sign and take the root of the mean
scores = cross_val_score(line_reg_pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
np.sqrt((-scores).mean())

33782.206425832381

In [11]:
# 5-fold cross-validation of a k-NN pipeline with the estimator's default scorer.
# Train scores are requested explicitly: scikit-learn >= 0.21 no longer returns
# 'train_score' unless return_train_score=True is passed.
cross_validate(
    make_pipeline(*preprocess_steps, KNeighborsRegressor()),
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)



{'fit_time': array([ 0.01250792,  0.01050687,  0.01000643,  0.01050735,  0.01050735]),
 'score_time': array([ 0.01450992,  0.01300907,  0.01250863,  0.01250815,  0.01250863]),
 'test_score': array([ 0.68839655,  0.65216223,  0.70627154,  0.692996  ,  0.74874456]),
 'train_score': array([ 0.80961799,  0.82135477,  0.82340844,  0.81523599,  0.81168887])}

In [46]:
# candidate regressors, all left at their default hyper-parameters
estimators = [
    KNeighborsRegressor(),
    LinearRegression(),
    Ridge(),
    Lasso(),
    # the two ensembles are stochastic: seed them so the table is reproducible
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    SVR(),
]

# NOTE: the "Accuracy" columns hold whatever cross_validate reports with the
# pipeline's default scorer; the column names are kept for compatibility.
estimators_cols = ['Parameters', 'Train Accuracy Mean', 'Test Accuracy Mean', 'Test Accuracy STD' ,'Fitting Time Mean']

# collect one record per estimator and build the frame once at the end,
# instead of growing it row by row with .loc (which yields object columns)
result_names = []
result_rows = []
for estimator in estimators:

    # make pipeline : preprocess + estimator
    estimator_pipe = make_pipeline(*preprocess_steps, estimator)

    # Cross-validation; train scores must be requested explicitly because
    # scikit-learn >= 0.21 omits 'train_score' by default (KeyError below otherwise)
    cv_results = cross_validate(estimator_pipe, X_train, y_train, cv=5, return_train_score=True)

    # get estimator name
    estimator_name = estimator.__class__.__name__
    result_names.append(estimator_name)
    result_rows.append({
        # parameters of the whole pipeline, stored as text
        'Parameters': str(estimator_pipe.get_params()),
        # mean score on the training folds
        'Train Accuracy Mean': cv_results['train_score'].mean(),
        # mean score on the held-out folds
        'Test Accuracy Mean': cv_results['test_score'].mean(),
        # standard deviation (1x, not 3x) of the held-out fold scores
        'Test Accuracy STD': cv_results['test_score'].std(),
        # mean fitting time over the folds
        'Fitting Time Mean': cv_results['fit_time'].mean(),
    })

estimators_results = pd.DataFrame(
    result_rows,
    index=pd.Index(result_names, name='Estimator Name'),
    columns=estimators_cols,
)

estimators_results



Unnamed: 0_level_0,Parameters,Train Accuracy Mean,Test Accuracy Mean,Test Accuracy STD,Fitting Time Mean
Estimator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNeighborsRegressor,"{'memory': None, 'steps': [('encoder', encoder...",0.816261,0.697714,0.0311814,0.0110074
LinearRegression,"{'memory': None, 'steps': [('encoder', encoder...",0.835582,-2.11511e+19,4.23021e+19,0.0100068
Ridge,"{'memory': None, 'steps': [('encoder', encoder...",0.835585,0.761993,0.156753,0.00920649
Lasso,"{'memory': None, 'steps': [('encoder', encoder...",0.835586,0.761719,0.157224,0.0475319
RandomForestRegressor,"{'memory': None, 'steps': [('encoder', encoder...",0.964153,0.788093,0.119885,0.0481322
GradientBoostingRegressor,"{'memory': None, 'steps': [('encoder', encoder...",0.956191,0.763746,0.184272,0.0988661
SVR,"{'memory': None, 'steps': [('encoder', encoder...",-0.0431795,-0.0462783,0.0227124,0.0569382


In [43]:
# `estimator_pipe` is the variable left over from the estimator comparison loop,
# i.e. the last pipeline built there (the SVR one, as the output below shows);
# this cell therefore only works after that loop has been run.
estimator_pipe.get_params()

{'encoder': encoder(),
 'memory': None,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'steps': [('encoder', encoder()),
  ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('svr',
   SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))],
 'svr': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
   kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 'svr__C': 1.0,
 'svr__cache_size': 200,
 'svr__coef0': 0.0,
 'svr__degree': 3,
 'svr__epsilon': 0.1,
 'svr__gamma': 'auto',
 'svr__kernel': 'rbf',
 'svr__max_iter': -1,
 'svr__shrinking': True,
 'svr__tol': 0.001,
 'svr__verbose': False}