In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.linear_model import LassoLarsCV, LassoLarsCV, ElasticNet, RidgeCV, Ridge, SGDRegressor, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.impute import SimpleImputer

In [3]:
# Columns removed up front; they are not used as predictors in this notebook.
unused_columns = ['Id', 'Street', 'Utilities', 'Condition2', 'LowQualFinSF',
                  'KitchenAbvGr', '3SsnPorch', 'MiscVal', 'PoolArea',
                  'RoofMatl', 'Heating']

# Read the raw training file and discard the unused columns in one chain.
train_set = pd.read_csv("data/train.csv").drop(columns=unused_columns)
print(train_set.shape)

# Keep only the columns that contain no missing value at all.
train_set = train_set.dropna(axis=1)
train_set.head()

(1460, 70)


Unnamed: 0,MSSubClass,MSZoning,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,BldgType,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Reg,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,...,Y,0,61,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Reg,Lvl,FR2,Gtl,Veenker,Feedr,1Fam,...,Y,298,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,IR1,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,...,Y,0,42,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,IR1,Lvl,Corner,Gtl,Crawfor,Norm,1Fam,...,Y,0,35,272,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,IR1,Lvl,FR2,Gtl,NoRidge,Norm,1Fam,...,Y,192,84,0,0,12,2008,WD,Normal,250000


In [3]:
# Separate the target column from the predictors.
X = train_set.drop(columns=["SalePrice"])
y = train_set["SalePrice"]

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# identical on every "Restart & Run All", so the scores printed by the
# following cells are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Dtype-based column selectors: every non-numeric column is treated as
# categorical, every numeric column as numerical.
categorical_features = make_column_selector(dtype_exclude=np.number)
numerical_features = make_column_selector(dtype_include=np.number)

def to_string(x):
    """Wrap ``x`` in a DataFrame and cast every cell to ``str``."""
    frame = pd.DataFrame(x)
    return frame.astype(str)

# Numeric columns: fill gaps with SimpleImputer's default strategy, then
# standardise.
numerical_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

# Categorical columns: `to_string` casts every cell to str, which turns a
# missing value (NaN) into the literal text "nan". The imputer therefore has
# to look for "nan". The former sentinel "NA" never reaches this step, because
# pandas' read_csv parses "NA" as NaN by default, so with "NA" the imputer was
# a silent no-op and missing categories were passed on as their own value.
categorical_pipeline = make_pipeline(
    FunctionTransformer(to_string),
    SimpleImputer(missing_values="nan", strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each column to the matching pipeline according to its dtype.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)

In [7]:
# Candidate regressors, each evaluated behind the shared preprocessor.
# SGDRegressor: squared-error loss is the estimator's default, so the loss is
# not named explicitly -- its string spelling ('squared_loss') was renamed in
# later scikit-learn releases and would fail there.
models = [SGDRegressor(penalty='l2'),
          Ridge(alpha=20),
          RandomForestRegressor(max_features=100, n_estimators=150),
          GradientBoostingRegressor(n_estimators=149)]

# Class names double as display labels here and as estimator names when the
# models are stacked in later cells.
names = [type(estimator).__name__ for estimator in models]

for name, estimator in zip(names, models):
    # The pipeline wraps (does not copy) the estimator, so fitting it also
    # fits the corresponding entry of `models`.
    model = make_pipeline(preprocessor, estimator)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("-"*30)
    print(name)
    # The value is a mean absolute error, so it is labelled as such
    # (it was previously printed under the label "MSLE").
    print("MAE", mean_absolute_error(y_test, y_pred))
    print("R2", r2_score(y_test, y_pred))

------------------------------
SGDRegressor
MSLE 18091.828518823182
R2 0.8803407369578399
------------------------------
Ridge
MSLE 17470.8502605724
R2 0.8934179364888453
------------------------------
RandomForestRegressor
MSLE 16335.188112633181
R2 0.8952091142373366
------------------------------
GradientBoostingRegressor
MSLE 15169.877861661384
R2 0.913570646758202


In [8]:
# Combine every candidate model into one stacked regressor (11-fold CV for the
# meta-features, final estimator left at its default) and score it on the
# held-out rows.
base_estimators = list(zip(names, models))
stacking = make_pipeline(preprocessor, StackingRegressor(base_estimators, cv=11))

stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)

print("Stacking")
print("MSLE", mean_squared_log_error(y_test, y_pred))
print("R2", r2_score(y_test, y_pred))

Stacking
MSLE 0.012510188077977309
R2 0.9175812077042069


In [9]:
# Mean absolute error of the most recent predictions; labelled "MAE" because
# that is the metric actually computed (it was previously printed as "MSLE").
print("MAE", mean_absolute_error(y_test, y_pred))

MSLE 14841.707761663063


In [None]:
# Read the test file and keep exactly the predictor columns of X, in X's order.
test_set = pd.read_csv("data/test.csv")
examen = test_set.loc[:, X.columns]

In [None]:
"""import time

s = time.time()

model = make_pipeline(preprocessor, StackingRegressor([(names[i], models[i]) for i in range(len(models))]))

param_grid = {"stackingregressor__cv": [10, 11, 12]}

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="neg_root_mean_squared_error")

grid.fit(X_train, y_train)
print(grid.best_score_)
print("time", time.time() - s)

model = grid.best_estimator_

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Stacking")
print("MSLE", mean_squared_log_error(y_test, y_pred))    
print("R2", r2_score(y_test, y_pred))

grid.best_params_"""

In [None]:
# Rebuild the stacked regressor (default CV this time) and fit it on every
# labelled row before predicting the test file.
final_estimators = list(zip(names, models))
stacking = make_pipeline(preprocessor, StackingRegressor(final_estimators))
stacking.fit(X, y)

# Pair each predicted price with its house Id and index the table by Id.
predicted_prices = pd.Series(stacking.predict(examen), name='SalePrice')
results = pd.concat([test_set['Id'], predicted_prices], axis=1).set_index("Id")

In [None]:
# Write the Id-indexed predictions to disk.
results_path = "data/results.csv"
results.to_csv(results_path)