# House Pricing

<a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques" target="_blank">Exercice</a>

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

from collections import defaultdict

%matplotlib inline

In [47]:
dataset = pd.read_csv("train.csv")
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

# Cleanup

Filling missing values and converting datatype to ease regression afterward

In [48]:
def fill_dataset(df, encoder=None):
    if encoder is None:
        generate_encoder = True
        encoder = {}
    else:
        generate_encoder = False
    
    for column, type_ in zip(df, df.dtypes):
        if type_ == "object":
            df[column] = df[column].fillna("None")
            if generate_encoder:
                LE = LabelEncoder()
                df[column] = LE.fit_transform(df[column])
                encoder[column] = LE
            else:
                df[column] = encoder[column].transform(df[column])
        else:
            df[column] = df[column].fillna(0)
    return df, encoder

In [49]:
def fill_dataset_2(df):
    for column, type_ in zip(df, df.dtypes):
        if type_ == "object":
            pd.get_dummies(df, prefix=[column])
    return df

In [50]:
#dataset, LabelEnc = fill_dataset(dataset)
#dataset = fill_dataset_2(dataset)
dataset = pd.get_dummies(dataset)

In [51]:
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1          ...            \
0          2003       196.0         706          ...             
1          1976         0.0         978          ...             
2          2002       162.0         486          ...             
3          1970         0.0         216          ...             
4          2000       350.0         655          ...             

   SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \
0               0             0             0            1  

So we only drop 1 entry whiwh is the one with a Nan in Electricity as we don't know how to set it properly. For garage/alley and so on, we can impute some values (0 or None) as that means that they don't have it. A good point is that data are  precise (for example on all inputs regarding garage, everytime we had 81 NaN, this would have been more complex if it was different)

In [52]:
X = dataset.drop(["SalePrice", "Id"], axis=1)
y = dataset["SalePrice"].as_matrix()

In [44]:
# LE = LabelEncoder()
# d = defaultdict(LabelEncoder)
# X = X.apply(lambda x: d[x.name].fit_transform(x))
#X = X.apply(LE.fit_transform)  #https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

In [45]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# Model

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# skf = StratifiedKFold(n_splits=2)  # not enought smaple for lot of split

In [54]:
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [68]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, scoring=rmsle_score)
grid.fit(X_train, y_train)

y_pred = grid.best_estimator_.predict(X_test)
score = rmsle(y_test, y_pred)
print(score)
# cvres = grid.cv_results_
# for mean, param in zip(cvres["mean_score"], cvres["params"]):
#     print(mean, param)



0.308200840128


In [55]:
params= [
    {"l1_ratio": .1},
    {"l1_ratio": .5},
    {"l1_ratio": .7},
    {"l1_ratio": .9},
    {"l1_ratio": .95},
    {"l1_ratio": .99},
    {"l1_ratio": 1}  
]
for param in params:
    regr = ElasticNetCV(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'l1_ratio': 0.1} 	 0.444811815881
{'l1_ratio': 0.5} 	 0.435477165631
{'l1_ratio': 0.7} 	 0.422667675704
{'l1_ratio': 0.9} 	 0.374078203885
{'l1_ratio': 0.95} 	 0.328973275029
{'l1_ratio': 0.99} 	 0.227417964504
{'l1_ratio': 1} 	 0.174654750008


In [56]:
params= [
    {"n_estimators": 3, "max_depth":10},
    {"n_estimators": 5, "max_depth":10},
    {"n_estimators": 10, "max_depth":10},
    {"n_estimators": 20, "max_depth":10},
    {"n_estimators": 50, "max_depth":10},
    {"n_estimators": 100, "max_depth":10},
    {"n_estimators": 200, "max_depth":10}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 3, 'max_depth': 10} 	 0.183874059146
{'n_estimators': 5, 'max_depth': 10} 	 0.169064137971
{'n_estimators': 10, 'max_depth': 10} 	 0.170864998893
{'n_estimators': 20, 'max_depth': 10} 	 0.154809456853
{'n_estimators': 50, 'max_depth': 10} 	 0.154291809947
{'n_estimators': 100, 'max_depth': 10} 	 0.15214046225
{'n_estimators': 200, 'max_depth': 10} 	 0.153522350863


In [57]:
params= [
    {"n_estimators": 75, "max_depth":7},
    {"n_estimators": 75, "max_depth":10},
    {"n_estimators": 75, "max_depth":15},
    {"n_estimators": 75, "max_depth":25},
    {"n_estimators": 75, "max_depth":None}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 75, 'max_depth': 7} 	 0.16174804029
{'n_estimators': 75, 'max_depth': 10} 	 0.153576344696
{'n_estimators': 75, 'max_depth': 15} 	 0.152816230804
{'n_estimators': 75, 'max_depth': 25} 	 0.154361778162
{'n_estimators': 75, 'max_depth': None} 	 0.151297745598


In [58]:
for i in range(1,10):
    regr = GradientBoostingRegressor(max_depth = i)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(score)

0.182595857283
0.15210235878
0.145721328124
0.142467691972
0.140464508244
0.143755555921
0.14482696451
0.155942149753
0.157392277807


In [60]:
params= [
    {"hidden_layer_sizes": (150, 150,)},
    {"hidden_layer_sizes": (150, 300,)},
    {"hidden_layer_sizes": (300, 150, 50, 150, 300), "solver":"lbfgs"},
    {"hidden_layer_sizes": (300, 150,)}
]
for param in params:
    regr = MLPRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)



{'hidden_layer_sizes': (150, 150)} 	 0.372806829784
{'hidden_layer_sizes': (150, 300)} 	 0.342342854942
{'hidden_layer_sizes': (300, 150, 50, 150, 300), 'solver': 'lbfgs'} 	 0.160157987665
{'hidden_layer_sizes': (300, 150)} 	 0.353335316223


In [61]:
params= [
    {"epsilon": 0.1},
    {"epsilon": 0.5},
    {"epsilon": 0.9},
    {"epsilon": 1},
    {"epsilon": 5},
    {"epsilon": 15},
    {"epsilon": 50},
    {"epsilon": 100}
]
for param in params:
    regr = SVR(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'epsilon': 0.1} 	 0.432225352033
{'epsilon': 0.5} 	 0.432225352033
{'epsilon': 0.9} 	 0.432225352033
{'epsilon': 1} 	 0.432225352033
{'epsilon': 5} 	 0.432225059901
{'epsilon': 15} 	 0.432223245908
{'epsilon': 50} 	 0.4322157397
{'epsilon': 100} 	 0.432215144712


# Evaluation

In [159]:
dataset = pd.read_csv("test.csv")
print(dataset.head())
print(dataset.describe())
print(dataset.info())

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities      ...       ScreenPorch PoolArea PoolQC  Fence  \
0         Lvl    AllPub      ...               120        0    NaN  MnPrv   
1         Lvl    AllPub      ...                 0        0    NaN    NaN   
2         Lvl    AllPub      ...                 0        0    NaN  MnPrv   
3         Lvl    AllPub      ...                 0        0    NaN    NaN   
4         HLS    AllPub      ...               144        0    NaN    NaN   

  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  
0         NaN   

In [160]:
X = dataset.drop("Id", axis=1)

In [161]:
dataset = fill_dataset(dataset, LabelEnc)

ValueError: y contains new labels: ['None']

In [None]:
regr = GradientBoostingRegressor(max_depth = 5)
regr.fit(X_train, y_train)
y_pred = regr.predict(X)