# House Pricing

<a href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques" target="_blank">Exercice</a>

In [141]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

from collections import defaultdict

%matplotlib inline

In [146]:
dataset = pd.read_csv("train.csv")
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

# Cleanup

Filling missing values and converting datatype to ease regression afterward

In [147]:
# dataset["LotFrontage"] = dataset["LotFrontage"].fillna(0).astype(int)
# dataset["MasVnrArea"] = dataset["MasVnrArea"].fillna(0).astype(int)
# dataset["GarageYrBlt"] = dataset["GarageYrBlt"].fillna("0").astype(int)

# dataset["Alley"] = dataset["Alley"].fillna("None")
# dataset["MasVnrType"] = dataset["MasVnrType"].fillna("None")
# dataset["BsmtQual"] = dataset["BsmtQual"].fillna("None")
# dataset["BsmtCond"] = dataset["BsmtCond"].fillna("None")
# dataset["BsmtExposure"] = dataset["BsmtExposure"].fillna("None")
# dataset["BsmtFinType1"] = dataset["BsmtFinType1"].fillna("None")
# dataset["BsmtFinType2"] = dataset["BsmtFinType2"].fillna("None")
# dataset["FireplaceQu"] = dataset["FireplaceQu"].fillna("None")
# dataset["GarageType"] = dataset["GarageType"].fillna("None")
# dataset["GarageFinish"] = dataset["GarageFinish"].fillna("None")
# dataset["GarageQual"] = dataset["GarageQual"].fillna("None")
# dataset["GarageCond"] = dataset["GarageCond"].fillna("None")
# dataset["PoolQC"] = dataset["PoolQC"].fillna("None")
# dataset["Fence"] = dataset["Fence"].fillna("None")
# dataset["MiscFeature"] = dataset["MiscFeature"].fillna("None")

# dataset = dataset[dataset["Electrical"].notnull()]

In [148]:
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

In [149]:
def fill_dataset(df, encoder=None):
    if encoder is None:
        generate_encoder = True
        encoder = {}
    else:
        generate_encoder = False
    
    for column, type_ in zip(df, df.dtypes):
        if type_ == "object":
            df[column] = df[column].fillna("None")
            if generate_encoder:
                LE = LabelEncoder()
                df[column] = LE.fit_transform(df[column])
                encoder[column] = LE
            else:
                df[column] = encoder[column].transform(df[column])
        else:
            df[column] = df[column].fillna(0)
    return df, encoder

dataset, LabelEnc = fill_dataset(dataset)

In [150]:
print(dataset.head())
print(dataset.describe())
print(dataset.info())

   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
0   1          60         3         65.0     8450       1      1         3   
1   2          20         3         80.0     9600       1      1         3   
2   3          60         3         68.0    11250       1      1         0   
3   4          70         3         60.0     9550       1      1         0   
4   5          60         3         84.0    14260       1      1         0   

   LandContour  Utilities    ...      PoolArea  PoolQC  Fence  MiscFeature  \
0            3          0    ...             0       3      4            1   
1            3          0    ...             0       3      4            1   
2            3          0    ...             0       3      4            1   
3            3          0    ...             0       3      4            1   
4            3          0    ...             0       3      4            1   

   MiscVal  MoSold  YrSold  SaleType  SaleCondition  SalePrice

So we only drop 1 entry whiwh is the one with a Nan in Electricity as we don't know how to set it properly. For garage/alley and so on, we can impute some values (0 or None) as that means that they don't have it. A good point is that data are  precise (for example on all inputs regarding garage, everytime we had 81 NaN, this would have been more complex if it was different)

In [151]:
# print(dataset["MSZoning"].value_counts(dropna = False), "\n")
# print(dataset["Street"].value_counts(dropna = False), "\n")
# print(dataset["LotShape"].value_counts(dropna = False), "\n")
# print(dataset["LandContour"].value_counts(dropna = False), "\n")
# print(dataset["Utilities"].value_counts(dropna = False), "\n")
# print(dataset["LotConfig"].value_counts(dropna = False), "\n")
# print(dataset["LandSlope"].value_counts(dropna = False), "\n")
# print(dataset["Neighborhood"].value_counts(dropna = False), "\n")
# print(dataset["Condition1"].value_counts(dropna = False), "\n")
# print(dataset["Condition2"].value_counts(dropna = False), "\n")
# print(dataset["BldgType"].value_counts(dropna = False), "\n")
# print(dataset["HouseStyle"].value_counts(dropna = False), "\n")
# print(dataset["RoofStyle"].value_counts(dropna = False), "\n")
# print(dataset["RoofMatl"].value_counts(dropna = False), "\n")
# print(dataset["Exterior1st"].value_counts(dropna = False), "\n")
# print(dataset["Exterior2nd"].value_counts(dropna = False), "\n")
# print(dataset["MasVnrType"].value_counts(dropna = False), "\n")
# print(dataset["ExterQual"].value_counts(dropna = False), "\n")
# print(dataset["ExterCond"].value_counts(dropna = False), "\n")
# print(dataset["BsmtExposure"].value_counts(dropna = False), "\n")
# print(dataset["BsmtFinType1"].value_counts(dropna = False), "\n")
# print(dataset["BsmtFinType2"].value_counts(dropna = False), "\n")
# print(dataset["Heating"].value_counts(dropna = False), "\n")
# print(dataset["HeatingQC"].value_counts(dropna = False), "\n")
# print(dataset["CentralAir"].value_counts(dropna = False), "\n")
# print(dataset["Electrical"].value_counts(dropna = False), "\n")
# print(dataset["KitchenQual"].value_counts(dropna = False), "\n")
# print(dataset["Functional"].value_counts(dropna = False), "\n")
# print(dataset["FireplaceQu"].value_counts(dropna = False), "\n")
# print(dataset["GarageType"].value_counts(dropna = False), "\n")
# print(dataset["GarageFinish"].value_counts(dropna = False), "\n")
# print(dataset["GarageQual"].value_counts(dropna = False), "\n")
# print(dataset["GarageCond"].value_counts(dropna = False), "\n")
# print(dataset["PavedDrive"].value_counts(dropna = False), "\n")
# print(dataset["PoolQC"].value_counts(dropna = False), "\n")
# print(dataset["Fence"].value_counts(dropna = False), "\n")
# print(dataset["MiscFeature"].value_counts(dropna = False), "\n")
# print(dataset["SaleType"].value_counts(dropna = False), "\n")
# print(dataset["SaleCondition"].value_counts(dropna = False), "\n")
# print(dataset["MSSubClass"].value_counts(dropna = False))

print(dataset["Alley"].value_counts(dropna = False))

1    1369
0      50
2      41
Name: Alley, dtype: int64


In [152]:
X = dataset.drop(["SalePrice", "Id"], axis=1)
y = dataset["SalePrice"].as_matrix()

In [153]:
# LE = LabelEncoder()
# d = defaultdict(LabelEncoder)
# X = X.apply(lambda x: d[x.name].fit_transform(x))
#X = X.apply(LE.fit_transform)  #https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

In [154]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[ 0.23529412  0.75        0.20766773 ...,  0.5         1.          0.8       ]
 [ 0.          0.75        0.25559105 ...,  0.25        1.          0.8       ]
 [ 0.23529412  0.75        0.2172524  ...,  0.5         1.          0.8       ]
 ..., 
 [ 0.29411765  0.75        0.21086262 ...,  1.          1.          0.8       ]
 [ 0.          0.75        0.2172524  ...,  1.          1.          0.8       ]
 [ 0.          0.75        0.23961661 ...,  0.5         1.          0.8       ]]


# Model

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# skf = StratifiedKFold(n_splits=2)  # not enought smaple for lot of split

In [131]:
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [11]:
params= [
    {"l1_ratio": .1},
    {"l1_ratio": .5},
    {"l1_ratio": .7},
    {"l1_ratio": .9},
    {"l1_ratio": .95},
    {"l1_ratio": .99},
    {"l1_ratio": 1}  
]
for param in params:
    regr = ElasticNetCV(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'l1_ratio': 0.1} 	 0.42116459378
{'l1_ratio': 0.5} 	 0.407948939371
{'l1_ratio': 0.7} 	 0.39017197468
{'l1_ratio': 0.9} 	 0.326211385998
{'l1_ratio': 0.95} 	 0.271554422804
{'l1_ratio': 0.99} 	 0.172215815126
{'l1_ratio': 1} 	 0.153035869116


In [33]:
params= [
    {"n_estimators": 3, "max_depth":10},
    {"n_estimators": 5, "max_depth":10},
    {"n_estimators": 10, "max_depth":10},
    {"n_estimators": 20, "max_depth":10},
    {"n_estimators": 50, "max_depth":10},
    {"n_estimators": 100, "max_depth":10},
    {"n_estimators": 200, "max_depth":10}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 3, 'max_depth': 10} 	 0.188970402082
{'n_estimators': 5, 'max_depth': 10} 	 0.180705753327
{'n_estimators': 10, 'max_depth': 10} 	 0.161326678966
{'n_estimators': 20, 'max_depth': 10} 	 0.159093191189
{'n_estimators': 50, 'max_depth': 10} 	 0.152400209402
{'n_estimators': 100, 'max_depth': 10} 	 0.153863049019
{'n_estimators': 200, 'max_depth': 10} 	 0.152751204142


In [34]:
params= [
    {"n_estimators": 75, "max_depth":7},
    {"n_estimators": 75, "max_depth":10},
    {"n_estimators": 75, "max_depth":15},
    {"n_estimators": 75, "max_depth":25},
    {"n_estimators": 75, "max_depth":None}
]
for param in params:
    regr = RandomForestRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'n_estimators': 75, 'max_depth': 7} 	 0.163762827798
{'n_estimators': 75, 'max_depth': 10} 	 0.158058477879
{'n_estimators': 75, 'max_depth': 15} 	 0.150446961222
{'n_estimators': 75, 'max_depth': 25} 	 0.150649126113
{'n_estimators': 75, 'max_depth': None} 	 0.151301483072


In [156]:
for i in range(1,10):
    regr = GradientBoostingRegressor(max_depth = i)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(score)

0.183126593479
0.15442006681
0.144666216709
0.143965755397
0.139519700751
0.14384718289
0.144593727613
0.152991974248
0.159594490297


In [15]:
params= [
    #{"hidden_layer_sizes": (150, 150,)},
    #{"hidden_layer_sizes": (150, 300,)},
    {"hidden_layer_sizes": (300, 150, 50, 150, 300), "solver":"lbfgs"}
    #{"hidden_layer_sizes": (300, 150,)}
]
for param in params:
    regr = MLPRegressor(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'hidden_layer_sizes': (300, 150, 50, 150, 300), 'solver': 'lbfgs'} 	 0.14845606773


In [16]:
params= [
    {"epsilon": 0.1},
    {"epsilon": 0.5},
    {"epsilon": 0.9},
    {"epsilon": 1},
    {"epsilon": 5},
    {"epsilon": 15},
    {"epsilon": 50},
    {"epsilon": 100}
]
for param in params:
    regr = SVR(**param)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    score = rmsle(y_test, y_pred)
    print(param, "\t", score)

{'epsilon': 0.1} 	 0.416429866898
{'epsilon': 0.5} 	 0.41642965724
{'epsilon': 0.9} 	 0.416429447598
{'epsilon': 1} 	 0.416429395189
{'epsilon': 5} 	 0.416427299639
{'epsilon': 15} 	 0.416422067427
{'epsilon': 50} 	 0.416403829645
{'epsilon': 100} 	 0.416377977847


# Evaluation

In [159]:
dataset = pd.read_csv("test.csv")
print(dataset.head())
print(dataset.describe())
print(dataset.info())

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities      ...       ScreenPorch PoolArea PoolQC  Fence  \
0         Lvl    AllPub      ...               120        0    NaN  MnPrv   
1         Lvl    AllPub      ...                 0        0    NaN    NaN   
2         Lvl    AllPub      ...                 0        0    NaN  MnPrv   
3         Lvl    AllPub      ...                 0        0    NaN    NaN   
4         HLS    AllPub      ...               144        0    NaN    NaN   

  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  
0         NaN   

In [160]:
X = dataset.drop("Id", axis=1)

In [161]:
dataset = fill_dataset(dataset, LabelEnc)

ValueError: y contains new labels: ['None']

In [None]:
regr = GradientBoostingRegressor(max_depth = 5)
regr.fit(X_train, y_train)
y_pred = regr.predict(X)