In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#loading the dataset
housing = pd.read_csv('data/house-prices-advanced-regression-techniques/train.csv', index_col=0)
housing.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# 1. Data preprocessing

During the EDA, we discovered which variables could be a good predictor of the SalePrice variable, let's select these variables and transform them when it seems necessary.

In [3]:
num_cols = ['OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt','BsmtFinSF1','YearRemodAdd']
cat_cols = ['Neighborhood', 'ExterQual', 'BsmtQual', 'KitchenQual', 'GarageFinish']
all_cols = num_cols + cat_cols

As we saw in the EDA, a lot of nans are **false** nans (the nan value has information and was controlled during the data collection), let's write our function to transform those nans.

In [4]:
cols_nan = housing.isnull().sum()
cols_with_nans = cols_nan[cols_nan>0].index
false_nan_cols = [col for col in cols_with_nans if col not in ['LotFrontage', 'Electrical', 'SalePrice']]

def false_nan_to_cat(df, cols=false_nan_cols):
    '''
    change nan values for the column in the list cols to the value 'None'
    '''
    for col in cols:
        df.loc[df[col].isnull(), col] = 'None'
    
    return df

Let's begin to write the functions to clean and process the numerical variables.

In [5]:
def cat_to_dummies(df, cols=cat_cols):
    '''
    transform categorical variables for the columns in cols to dummies variable 
    and return the new datafram with dummies variable
    '''
    for col in cols:
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
        df = pd.concat([df.drop(col, axis=1), dummies],axis=1)
    return df

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class preprocess(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_colss=num_cols, cat_cols=cat_cols):
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.all_cols = num_cols + cat_cols
        
    def fit(self, X, y=None):
        # replace false nans
        df = false_nan_to_cat(X)
        
        # select relevant columns and categorical to dummies
        df = df[self.all_cols]
        df = cat_to_dummies(df, cols=self.cat_cols)
        
        self.means = df.mean()
        self.train_cols = df.columns
        return self
    
    def transform(self, X):
        # replace false nans
        df = false_nan_to_cat(X)
        
        # select relevant columns and categorical to dummies
        df = df[self.all_cols]
        df = cat_to_dummies(df, cols=self.cat_cols)
        
        # replace any nan by the meab of the variable
        df = df.fillna(value=self.means)
        
        return df[self.train_cols]

In [7]:
class squarer(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        return
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        for col in self.cols:
            X[col+'sq'] = X[col]**2
        return X

In [17]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

num_cols = ['OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt','BsmtFinSF1','YearRemodAdd']
cat_cols = ['Neighborhood', 'ExterQual', 'BsmtQual', 'KitchenQual', 'GarageFinish']
all_cols = num_cols + cat_cols

preprocessing = Pipeline([
    ('cleaning', preprocess(num_colss=num_cols, cat_cols=cat_cols)),
    ('square', squarer(cols=['OverallQual', 'YearBuilt', 'FullBath'])),
    ('standardize', StandardScaler())
])

X = preprocessing.fit_transform(housing)
y = housing.SalePrice
#y = np.log(housing.SalePrice)

# 2. Model testing

Now that we have a preprocessing pipeline which clean our data, let's see what we get with a few simple model.

In [18]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

lin_reg = LinearRegression()

scores = cross_val_score(lin_reg, X, y, cv=5, scoring='neg_mean_squared_error')
np.sqrt(-np.mean(scores))

33084.365552494179

In [19]:
ridge = Ridge()

scores = cross_val_score(ridge, X, y, cv=5, scoring='neg_mean_squared_error')
np.sqrt(-np.mean(scores))

33209.512583841628

In [20]:
lasso = Lasso()

scores = cross_val_score(lasso, X, y, cv=5, scoring='neg_mean_squared_error')
np.sqrt(-np.mean(scores))



33217.924766194774

## gridsearch Lasso

In [21]:
pipeline_lasso = Pipeline([
    ('lasso', Lasso())
])

param_grid_lasso = {
    'lasso__alpha':[0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
}
grid_search_lasso = GridSearchCV(pipeline_lasso, param_grid=param_grid_lasso, cv=5,scoring='neg_mean_squared_error')
grid_search_lasso.fit(X, y)
print('best score : %.3f' % np.sqrt(-grid_search_lasso.best_score_))
print('best params :', grid_search_lasso.best_params_)





best score : 33216.866
best params : {'lasso__alpha': 0.0001}




## gridsearch Ridge

In [22]:
pipeline_ridge = Pipeline([
    ('ridge', Ridge())
])

param_grid_ridge = {
    'ridge__alpha':[1, 10, 50, 100, 150]
}
grid_search_ridge = GridSearchCV(pipeline_ridge, param_grid=param_grid_ridge, cv=5,scoring='neg_mean_squared_error')
grid_search_ridge.fit(X, y)
print('best score : %.3f' % np.sqrt(-grid_search_ridge.best_score_))
print('best params :', grid_search_ridge.best_params_)

best score : 33128.165
best params : {'ridge__alpha': 10}


## gridsearch RandomForest

In [23]:
from sklearn.ensemble import RandomForestRegressor
pipeline_forest = Pipeline([
    ('forest', RandomForestRegressor())
])

param_grid_forest = {
    'forest__n_estimators':[100,200],
}
grid_search_forest = GridSearchCV(pipeline_forest, param_grid=param_grid_forest, cv=5,scoring='neg_mean_squared_error')
grid_search_forest.fit(X, y)
print('best score : %.3f' % np.sqrt(-grid_search_forest.best_score_))
print('best params :', grid_search_forest.best_params_)

best score : 29856.005
best params : {'forest__n_estimators': 200}


In [24]:
scores = cross_val_score(grid_search_forest.best_estimator_, X, y, scoring='neg_mean_squared_error')
np.sqrt(-scores.mean())

29550.646149727265

## Preparation of submission file

# prediction for competition set
X_comp_test = pd.read_csv('data/house-prices-advanced-regression-techniques/test.csv', index_col=0)
X_comp_test_cleanend = preprocessing.transform(X_comp_test)

y_comp_test_pred = grid_search_forest.best_estimator_.predict(X_comp_test_cleanend)

y_comp_pred_final = pd.DataFrame({'SalePrice':y_comp_test_pred}, index=X_comp_test.index)
y_comp_pred_final = np.exp(y_comp_pred_final)
y_comp_pred_final.to_csv('submission.csv')