# House Pricing

<a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques" target="_blank">Exercice</a>

In [48]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR


%matplotlib inline

In [2]:
dataset = pd.read_csv("train.csv")
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

# Cleanup

Filling missing values and converting datatype to ease regression afterward

In [3]:
dataset["LotFrontage"] = dataset["LotFrontage"].fillna(0).astype(int)
dataset["MasVnrArea"] = dataset["MasVnrArea"].fillna(0).astype(int)
dataset["GarageYrBlt"] = dataset["GarageYrBlt"].fillna("0").astype(int)

dataset["Alley"] = dataset["Alley"].fillna("None")
dataset["MasVnrType"] = dataset["MasVnrType"].fillna("None")
dataset["BsmtQual"] = dataset["BsmtQual"].fillna("None")
dataset["BsmtCond"] = dataset["BsmtCond"].fillna("None")
dataset["BsmtExposure"] = dataset["BsmtExposure"].fillna("None")
dataset["BsmtFinType1"] = dataset["BsmtFinType1"].fillna("None")
dataset["BsmtFinType2"] = dataset["BsmtFinType2"].fillna("None")
dataset["FireplaceQu"] = dataset["FireplaceQu"].fillna("None")
dataset["GarageType"] = dataset["GarageType"].fillna("None")
dataset["GarageFinish"] = dataset["GarageFinish"].fillna("None")
dataset["GarageQual"] = dataset["GarageQual"].fillna("None")
dataset["GarageCond"] = dataset["GarageCond"].fillna("None")
dataset["PoolQC"] = dataset["PoolQC"].fillna("None")
dataset["Fence"] = dataset["Fence"].fillna("None")
dataset["MiscFeature"] = dataset["MiscFeature"].fillna("None")

dataset = dataset[dataset["Electrical"].notnull()]

In [4]:
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL           65     8450   Pave  None      Reg   
1   2          20       RL           80     9600   Pave  None      Reg   
2   3          60       RL           68    11250   Pave  None      IR1   
3   4          70       RL           60     9550   Pave  None      IR1   
4   5          60       RL           84    14260   Pave  None      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0   None  None        None       0   
1         Lvl    AllPub    ...            0   None  None        None       0   
2         Lvl    AllPub    ...            0   None  None        None       0   
3         Lvl    AllPub    ...            0   None  None        None       0   
4         Lvl    AllPub    ...            0   None  None        None       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

So we only drop 1 entry whiwh is the one with a Nan in Electricity as we don't know how to set it properly. For garage/alley and so on, we can impute some values (0 or None) as that means that they don't have it. A good point is that data are  precise (for example on all inputs regarding garage, everytime we had 81 NaN, this would have been more complex if it was different)

In [5]:
print(dataset["MSZoning"].value_counts(dropna = False), "\n")
print(dataset["Street"].value_counts(dropna = False), "\n")
print(dataset["LotShape"].value_counts(dropna = False), "\n")
print(dataset["LandContour"].value_counts(dropna = False), "\n")
print(dataset["Utilities"].value_counts(dropna = False), "\n")
print(dataset["LotConfig"].value_counts(dropna = False), "\n")
print(dataset["LandSlope"].value_counts(dropna = False), "\n")
print(dataset["Neighborhood"].value_counts(dropna = False), "\n")
print(dataset["Condition1"].value_counts(dropna = False), "\n")
print(dataset["Condition2"].value_counts(dropna = False), "\n")
print(dataset["BldgType"].value_counts(dropna = False), "\n")
print(dataset["HouseStyle"].value_counts(dropna = False), "\n")
print(dataset["RoofStyle"].value_counts(dropna = False), "\n")
print(dataset["RoofMatl"].value_counts(dropna = False), "\n")
print(dataset["Exterior1st"].value_counts(dropna = False), "\n")
print(dataset["Exterior2nd"].value_counts(dropna = False), "\n")
print(dataset["MasVnrType"].value_counts(dropna = False), "\n")
print(dataset["ExterQual"].value_counts(dropna = False), "\n")
print(dataset["ExterCond"].value_counts(dropna = False), "\n")
print(dataset["BsmtExposure"].value_counts(dropna = False), "\n")
print(dataset["BsmtFinType1"].value_counts(dropna = False), "\n")
print(dataset["BsmtFinType2"].value_counts(dropna = False), "\n")
print(dataset["Heating"].value_counts(dropna = False), "\n")
print(dataset["HeatingQC"].value_counts(dropna = False), "\n")
print(dataset["CentralAir"].value_counts(dropna = False), "\n")
print(dataset["Electrical"].value_counts(dropna = False), "\n")
print(dataset["KitchenQual"].value_counts(dropna = False), "\n")
print(dataset["Functional"].value_counts(dropna = False), "\n")
print(dataset["FireplaceQu"].value_counts(dropna = False), "\n")
print(dataset["GarageType"].value_counts(dropna = False), "\n")
print(dataset["GarageFinish"].value_counts(dropna = False), "\n")
print(dataset["GarageQual"].value_counts(dropna = False), "\n")
print(dataset["GarageCond"].value_counts(dropna = False), "\n")
print(dataset["PavedDrive"].value_counts(dropna = False), "\n")
print(dataset["PoolQC"].value_counts(dropna = False), "\n")
print(dataset["Fence"].value_counts(dropna = False), "\n")
print(dataset["MiscFeature"].value_counts(dropna = False), "\n")
print(dataset["SaleType"].value_counts(dropna = False), "\n")
print(dataset["SaleCondition"].value_counts(dropna = False), "\n")

RL         1150
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64 

Pave    1453
Grvl       6
Name: Street, dtype: int64 

Reg    924
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64 

Lvl    1310
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64 

AllPub    1458
NoSeWa       1
Name: Utilities, dtype: int64 

Inside     1051
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64 

Gtl    1381
Mod      65
Sev      13
Name: LandSlope, dtype: int64 

NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
IDOTRR      37
Timber      37
ClearCr     28
SWISU       25
StoneBr     25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64 

Norm      1259
Feedr     

In [6]:
X = dataset.drop(["SalePrice"], axis=1)
y = dataset["SalePrice"].as_matrix()

In [7]:
LE = LabelEncoder()
X = X.apply(LE.fit_transform)  #https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

In [8]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[  0.00000000e+00   3.57142857e-01   7.50000000e-01 ...,   5.00000000e-01
    1.00000000e+00   8.00000000e-01]
 [  6.85871056e-04   0.00000000e+00   7.50000000e-01 ...,   2.50000000e-01
    1.00000000e+00   8.00000000e-01]
 [  1.37174211e-03   3.57142857e-01   7.50000000e-01 ...,   5.00000000e-01
    1.00000000e+00   8.00000000e-01]
 ..., 
 [  9.98628258e-01   4.28571429e-01   7.50000000e-01 ...,   1.00000000e+00
    1.00000000e+00   8.00000000e-01]
 [  9.99314129e-01   0.00000000e+00   7.50000000e-01 ...,   1.00000000e+00
    1.00000000e+00   8.00000000e-01]
 [  1.00000000e+00   0.00000000e+00   7.50000000e-01 ...,   5.00000000e-01
    1.00000000e+00   8.00000000e-01]]


# Model

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# skf = StratifiedKFold(n_splits=2)  # not enought smaple for lot of split

In [24]:
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [30]:
params= [
    {"l1_ratio": .1},
    {"l1_ratio": .5},
    {"l1_ratio": .7},
    {"l1_ratio": .9},
    {"l1_ratio": .95},
    {"l1_ratio": .99},
    {"l1_ratio": 1}  
]
for param in params:
    regr = ElasticNetCV(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'l1_ratio': 0.1} 	 0.359135237476
{'l1_ratio': 0.5} 	 0.347303931528
{'l1_ratio': 0.7} 	 0.331263219979
{'l1_ratio': 0.9} 	 0.272797237144
{'l1_ratio': 0.95} 	 0.223379071334
{'l1_ratio': 0.99} 	 0.154733690478
{'l1_ratio': 1} 	 0.166146166472


In [35]:
params= [
    {"n_estimators": 3, "max_depth":10},
    {"n_estimators": 5, "max_depth":10},
    {"n_estimators": 10, "max_depth":10},
    {"n_estimators": 20, "max_depth":10},
    {"n_estimators": 50, "max_depth":10},
    {"n_estimators": 100, "max_depth":10},
    {"n_estimators": 200, "max_depth":10}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 3, 'max_depth': 10} 	 0.168217026328
{'n_estimators': 5, 'max_depth': 10} 	 0.159508007684
{'n_estimators': 10, 'max_depth': 10} 	 0.148828686348
{'n_estimators': 20, 'max_depth': 10} 	 0.147352726344
{'n_estimators': 50, 'max_depth': 10} 	 0.14199321533
{'n_estimators': 100, 'max_depth': 10} 	 0.140842832744
{'n_estimators': 200, 'max_depth': 10} 	 0.141177579936


In [36]:
params= [
    {"n_estimators": 75, "max_depth":7},
    {"n_estimators": 75, "max_depth":10},
    {"n_estimators": 75, "max_depth":15},
    {"n_estimators": 75, "max_depth":25},
    {"n_estimators": 75, "max_depth":None}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 75, 'max_depth': 7} 	 0.146792271133
{'n_estimators': 75, 'max_depth': 10} 	 0.143002403607
{'n_estimators': 75, 'max_depth': 15} 	 0.140269249261
{'n_estimators': 75, 'max_depth': 25} 	 0.140138761655
{'n_estimators': 75, 'max_depth': None} 	 0.13954911185


In [39]:
regr = GradientBoostingRegressor()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
score = rmsle(y_test, y_pred)
print(score)

0.127560603214


In [47]:
params= [
    #{"hidden_layer_sizes": (150, 150,)},
    #{"hidden_layer_sizes": (150, 300,)},
    {"hidden_layer_sizes": (300, 150, 50, 150, 300), "solver":"lbfgs"}
    #{"hidden_layer_sizes": (300, 150,)}
]
for param in params:
    regr = MLPRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'hidden_layer_sizes': (300, 150, 50, 150, 300), 'solver': 'lbfgs'} 	 0.142319613397


In [52]:
params= [
    {"epsilon": 0.1},
    {"epsilon": 0.5},
    {"epsilon": 0.9},
    {"epsilon": 1},
    {"epsilon": 5},
    {"epsilon": 15},
    {"epsilon": 50},
    {"epsilon": 100}
]
for param in params:
    regr = SVR(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'epsilon': 0.1} 	 0.322124580672
{'epsilon': 0.5} 	 0.322124576614
{'epsilon': 0.9} 	 0.322124571131
{'epsilon': 1} 	 0.322124571131
{'epsilon': 5} 	 0.322124571131
{'epsilon': 15} 	 0.322124571131
{'epsilon': 50} 	 0.322124571131
{'epsilon': 100} 	 0.322124571131
