In [None]:
import warnings
warnings.filterwarnings('ignore')
# Imports and global display settings (the training data is loaded in a later cell)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import xgboost as xgb
from xgboost import XGBRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from lightgbm import LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
import csv
import datetime
import os.path
from os import path
from datetime import datetime
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
# Render floats in pandas output with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Run configuration.
# RANDOM_SEED is the seed handed to the random forest further down.
RANDOM_SEED = 42
# NOTE(review): the three switches below are not referenced in the cells shown
# here; presumably they are consumed by preprocessing code elsewhere - confirm.
STACK_MODELLING = True
USE_DUMMY_CAT_FEATURES = True
REMOVE_ONLY_2_Outliers = True

In [None]:
# Load the pre-cleaned feature and target tables.
df_X_train = pd.read_csv('./data/train_X_clean.csv')
df_X_test = pd.read_csv('./data/test_X_clean.csv')
df_y_train = pd.read_csv('./data/train_y_clean.csv')

# Quick visual check of the first row of each table.
display(df_X_train.head(1))
display(df_X_test.head(1))
display(df_y_train.head(1))
print(df_X_train.columns)

# Sanity checks: later cells convert these frames to bare arrays with `.values`,
# which drops the column names, so any mismatch in row count or column order
# would otherwise go unnoticed and silently corrupt training or prediction.
if len(df_X_train) != len(df_y_train):
    raise ValueError(
        "train features and targets have different row counts: {} vs {}".format(
            len(df_X_train), len(df_y_train)))
if list(df_X_train.columns) != list(df_X_test.columns):
    raise ValueError("train and test feature columns differ (names or order)")

In [None]:
# Feature matrix: every column of the training frame, as a plain ndarray.
X = df_X_train.to_numpy()
print(X.shape)

# Target: a single-column frame converts to shape (n, 1). Estimators expect a
# 1-D target; with a column vector some of them (e.g. RidgeCV) also *predict*
# a column vector, which then broadcasts incorrectly against the 1-D output of
# the other models when predictions are combined. Flatten it here.
# A genuinely multi-column target is left untouched.
y = df_y_train.to_numpy()
if y.ndim == 2 and y.shape[1] == 1:
    y = y.ravel()
print(y.shape)

In [None]:
# Cross-validation setup shared by the scoring helper and the *CV linear models.
n_folds = 5

# Shuffled K-fold splitter. The seed comes from the notebook-level RANDOM_SEED
# so that changing the configured seed actually changes the splits.
kfolds = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)


# Error metric
def rmsle(y, y_pred):
    """Return the root of the mean squared error between `y` and `y_pred`.

    No logarithm is applied here, so the result is a root-mean-squared *log*
    error only if both inputs are already on a log scale.
    NOTE(review): the submission cell applies `np.expm1` to predictions, which
    suggests a log1p-transformed target - confirm.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


# Cross-validated scoring helper
def cv_rmse(model, X=X):
    """Return the per-fold RMSE of `model` under the shared `kfolds` splitter.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X : feature matrix; defaults to the module-level `X` as it was when this
        function was defined (default arguments are bound at definition time).

    The target `y` and the splitter `kfolds` are read from module scope when
    the function is called.

    Returns
    -------
    Array of RMSE values, one per cross-validation fold.
    """
    neg_mse_per_fold = cross_val_score(model, X, y, cv=kfolds,
                                       scoring="neg_mean_squared_error")
    return np.sqrt(-neg_mse_per_fold)

# setup models
# Candidate regularisation grids searched by the cross-validated linear models.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# `random_state` is the documented seed parameter of the scikit-learn wrapper.
xgboost = xgb.XGBRegressor(colsample_bytree=0.4, gamma=0, learning_rate=0.03, max_depth=3,
                           min_child_weight=1.5, n_estimators=10000, reg_alpha=0.75,
                           reg_lambda=0.45, subsample=0.6, random_state=RANDOM_SEED)

# Linear models are wrapped in a RobustScaler pipeline.
ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt, cv=kfolds))

# max_iter must be an integer: a float such as 1e7 is rejected by
# scikit-learn's parameter validation in recent releases.
lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=int(1e7), alphas=alphas2,
                              random_state=RANDOM_SEED, cv=kfolds))

elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                                        cv=kfolds, random_state=RANDOM_SEED,
                                        l1_ratio=e_l1ratio))

# Scaled, tuned SVR. The original cell re-assigned `svr` a few lines later to
# a bare `SVR(kernel='rbf', gamma='auto')`, silently discarding this pipeline
# and feeding unscaled features to a scale-sensitive RBF kernel. Only one
# definition is kept.
# NOTE(review): if the unscaled default SVR was the intended model, swap it
# back in here explicitly.
svr = make_pipeline(RobustScaler(),
                    SVR(C=20, epsilon=0.008, gamma=0.0003))

rf = RandomForestRegressor(random_state=RANDOM_SEED)

gbr = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                         max_features='sqrt', min_samples_leaf=15,
                                         min_samples_split=10, loss='huber',
                                         random_state=RANDOM_SEED)

lightgbm = LGBMRegressor(objective='regression', num_leaves=4, learning_rate=0.01, n_estimators=5000,
                         max_bin=200, bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
                         feature_fraction=0.2, feature_fraction_seed=7, verbose=-1)

# Stacked ensemble: the base regressors feed an XGBoost meta-regressor, which
# also sees the original features (use_features_in_secondary=True).
# random_state makes the ensemble's internal shuffled CV split reproducible.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, svr, lightgbm, gbr, xgboost, rf),
                                meta_regressor=xgboost, use_features_in_secondary=True,
                                random_state=RANDOM_SEED)

print('TEST score on CV')

# (message template, estimator) pairs, evaluated in this order.
# NOTE(review): the first label says "Kernel Ridge" although the estimator is
# the RidgeCV pipeline; the text is kept as-is to leave the output unchanged.
_cv_reports = (
    ("\nKernel Ridge score: {:.4f} ({:.4f})\n", ridge),
    ("\nLasso score: {:.4f} ({:.4f})\n", lasso),
    ("\nSVR score: {:.4f} ({:.4f})\n", svr),
    ("\nLightgbm score: {:.4f} ({:.4f})\n", lightgbm),
    ("\nGradientBoosting score: {:.4f} ({:.4f})\n", gbr),
    ("\nXgboost score: {:.4f} ({:.4f})\n", xgboost),
    ("\nRandomForestRegressor score: {:.4f} ({:.4f})\n", rf),
)

# Print mean and standard deviation of the per-fold RMSE for each model,
# followed by a timestamp.
for _template, _estimator in _cv_reports:
    score = cv_rmse(_estimator)
    print(_template.format(score.mean(), score.std()), datetime.now())



def _fit_logged(label, estimator, features, target):
    """Print a timestamp with `label`, then fit `estimator` and return the result of `fit`."""
    print(datetime.now(), label)
    return estimator.fit(features, target)


# Fit the stacked ensemble and every individual model on the full training set.
print('START Fit')
stack_gen_model = _fit_logged('StackingCVRegressor', stack_gen, np.array(X), np.array(y))
lasso_model_full_data = _fit_logged('lasso', lasso, X, y)
ridge_model_full_data = _fit_logged('ridge', ridge, X, y)
svr_model_full_data = _fit_logged('svr', svr, X, y)
gbr_model_full_data = _fit_logged('GradientBoosting', gbr, X, y)
xgb_model_full_data = _fit_logged('xgboost', xgboost, X, y)
lgb_model_full_data = _fit_logged('lightgbm', lightgbm, X, y)
rf_model_full_data = _fit_logged('RandomForestRegressor', rf, X, y)


In [None]:

# Relative blend weight of each fitted model. The values as written sum to
# 0.91, not 1; blend_models_predict normalises by the total so the blend stays
# a true weighted average.
model_weights = {'rf':0.01,'lasso':0.05, 'ridge':0.1, 'svr':0.1, 'gbr':0.2, 'xgb':0.15, 'lgb':0, 'stack':0.3}

def blend_models_predict(X_pred):
    """Return the weighted average of all fitted models' predictions for `X_pred`.

    Parameters
    ----------
    X_pred : feature matrix accepted by each fitted model's `predict`.

    Returns
    -------
    1-D array with one blended prediction per row of `X_pred`.

    Raises
    ------
    ValueError
        If the weights in `model_weights` do not add up to a positive number.

    Notes
    -----
    * Weights are divided by their sum, so weights that do not add up to 1 no
      longer scale the blended prediction up or down.
    * Each model's output is flattened before it is added: a model returning a
      column vector (n, 1) would otherwise broadcast against the (n,) outputs
      of the others into an (n, n) matrix.
    * Models with weight 0 are not evaluated.
    """
    fitted_models = {
        'rf': rf_model_full_data,
        'lasso': lasso_model_full_data,
        'ridge': ridge_model_full_data,
        'svr': svr_model_full_data,
        'gbr': gbr_model_full_data,
        'xgb': xgb_model_full_data,
        'lgb': lgb_model_full_data,
        'stack': stack_gen_model,
    }

    total_weight = sum(model_weights[name] for name in fitted_models)
    if total_weight <= 0:
        raise ValueError("model_weights must sum to a positive value, got {}".format(total_weight))

    blended = 0.0
    for name, model in fitted_models.items():
        weight = model_weights[name]
        if weight == 0:
            continue
        # The stacked model is given a plain ndarray, as in the original call.
        model_input = np.array(X_pred) if name == 'stack' else X_pred
        blended = blended + weight * np.ravel(model.predict(model_input))
    return blended / total_weight


# NOTE(review): the heading announces a score, but only the array shapes are
# printed; the actual score computation below is disabled.
print('RMSLE score on train data:')
for _arr in (X, y):
    print(_arr.shape)
#print(rmsle(y, blend_models_predict(X)))

In [None]:
# Test feature matrix: every column of the test frame, as a plain ndarray.
X_test = df_X_test.iloc[:, 0:].values
print(X_test.shape)
print('Predict submission', datetime.now(),)

# The sample submission supplies the Id column for the output file.
submission = pd.read_csv("./data/sample_submission.csv")
print(submission.columns)
# DataFrame.drop returns a new frame; the original call discarded the result,
# so the placeholder SalePrice column was never actually removed.
submission = submission.drop('SalePrice', axis=1)
print("Shape of submission is {0}".format(submission.shape))
print(submission.columns)

# Final predictions come from the XGBoost model alone; predictions are mapped
# back with expm1 and floored.
# Disabled alternative: np.floor(np.expm1(blend_models_predict(X_test)))
y_test = np.floor(np.expm1(xgb_model_full_data.predict(X_test)))
print("Shape of output is {0}".format(y_test.shape))
print(y_test[0:5])

# pd.concat(axis=1) aligns on the index, so a length mismatch would silently
# pad the result with NaN rows instead of failing. Check explicitly.
if len(y_test) != len(submission):
    raise ValueError(
        "prediction count ({}) does not match sample submission rows ({})".format(
            len(y_test), len(submission)))

dataset = pd.DataFrame({'SalePrice': y_test})
result = pd.concat([submission['Id'], dataset], axis=1, sort=False)
print(result.head())
result.to_csv("submission.csv", index=False)
print('Save submission', datetime.now(),)