In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer, normalize, FunctionTransformer, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LassoLarsCV, LassoLarsCV, ElasticNet, RidgeCV, Ridge, SGDRegressor, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor


In [2]:
train_set = pd.read_csv("data/train.csv")
train_set.dropna(axis=1,thresh=len(train_set) * 90/100, inplace=True)
train_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [3]:
X = train_set.drop(columns=["SalePrice", "Id"])
y = train_set["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [4]:
low_var = []

for serie in X:
    vc = X[serie].value_counts()
    if len(X[serie]) * 94 / 100 < vc.iloc[0]:
        low_var.append(serie)

X.drop(columns=low_var, inplace=True)

In [5]:
numerical_features = make_column_selector(dtype_include=np.number)
numerical_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

categorical_features = make_column_selector(dtype_exclude=np.number)
categorical_pipeline = make_pipeline(FunctionTransformer(lambda x: pd.DataFrame(x).astype(str)),
                                     SimpleImputer(missing_values="NA", strategy="most_frequent"),
                                     OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                   (categorical_pipeline, categorical_features))

In [6]:
models = [Ridge(), RandomForestRegressor(), GradientBoostingRegressor()]

param_grids = [{'alpha': range(1, 40)},
               {'n_estimators': range(60,100,10)},
                {'learning_rate': [0.1, 0.08, 0.04],
                  'subsample'    : [0.6, 0.5],
                  'n_estimators' : [900, 1000, 1100],
                  'max_depth'    : [3, 4]}]

pre = preprocessor.fit_transform(X_train, y_train)

for i, model in enumerate(models):
    grid = GridSearchCV(estimator=model, param_grid = param_grids[i], scoring="neg_root_mean_squared_error")
    grid.fit(pre, y_train)
    models[i] = grid.best_estimator_
    print(grid.best_estimator_)

Ridge(alpha=17)
RandomForestRegressor(n_estimators=60)
GradientBoostingRegressor(learning_rate=0.04, n_estimators=1000, subsample=0.6)


In [7]:
models_save = [Ridge(alpha=19),
          RandomForestRegressor(n_estimators=60),
          GradientBoostingRegressor(learning_rate=0.03, max_depth=4, n_estimators=1000, subsample=0.6)]

names = []
for model in models:
    name = type(model).__name__
    names.append(name)
    model = make_pipeline(preprocessor, model)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("-"*30)
    print(name)
    print("RMSE", mean_squared_error(y_test, y_pred, squared=False))
    print("R2", r2_score(y_test, y_pred))


------------------------------
Ridge
RMSE 24119.92339643343
R2 0.8974011974468389
------------------------------
RandomForestRegressor
RMSE 23029.050249208412
R2 0.9064718167198744
------------------------------
GradientBoostingRegressor
RMSE 20708.15067608488
R2 0.9243736611279045


In [8]:
stacking = make_pipeline(preprocessor, StackingRegressor([(names[i], models[i]) for i in range(len(models))]))
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)

print("StackingRegressor")
print("RMSE", mean_squared_error (y_test, y_pred, squared=False))
print("R2", r2_score(y_test, y_pred))


StackingRegressor
RMSE 19352.5652725807
R2 0.9339508054964868


In [9]:
test_set = pd.read_csv("data/test.csv")
test_preprocessed = test_set[X.columns]

stacking = make_pipeline(preprocessor, StackingRegressor([(names[i], models[i]) for i in range(len(models))]))
stacking.fit(X, y)

results = pd.Series(stacking.predict(test_preprocessed), name='SalePrice')

results = pd.concat([test_set['Id'], results], axis=1)
results.set_index("Id", inplace=True)

In [11]:
results.to_csv("data/results.csv")
