In [0]:
import sys
import numpy as np
import pandas as pd

import xgboost as xgb
import lightgbm as lgb

from sklearn.decomposition import PCA

from catboost import CatBoostRegressor

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge

from mlxtend.regressor import StackingCVRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.kernel_ridge import KernelRidge

In [0]:
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [0]:
train   = pd.read_csv('train_cleaning.csv')
test    = pd.read_csv('test_cleaning.csv')
y_train = pd.read_csv('y_train.csv')

In [0]:
y_train = y_train.values.ravel()

In [0]:
# X_train, X_val, Y_train, Y_val = train_test_split(train, y_train, test_size = 0.2, random_state=42)

In [0]:
n_folds = 10

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train)  
    rmse= np.sqrt(-cross_val_score(model, train, y_train, cv = kf, scoring="neg_mean_squared_error"))
    return(rmse)

### Kernel Ridge

In [147]:
KR = make_pipeline(RobustScaler(), KernelRidge(
         alpha= 0.9826,
         kernel= 'polynomial',
         degree= 2,
         coef0= 3.5
        ))           

KR_score = rmsle_cv(KR)
print("Kernel Ridge score: {:.4f} ({:.4f})".format(KR_score.mean(), KR_score.std()))

Kernel Ridge score: 0.1191 (0.0152)


### Elastic Net

In [148]:
ENet = make_pipeline(RobustScaler(), ElasticNet(
        alpha= 0.0002835,
        l1_ratio= 0.9999905,
        random_state= 0 
        ))

ENet_score = rmsle_cv(ENet)
print("ENet score: {:.4f} ({:.4f})".format(ENet_score.mean(), ENet_score.std()))

ENet score: 0.1130 (0.0136)


### Lasso

In [153]:
Lass = make_pipeline(RobustScaler(), Lasso(
        max_iter=3000,
        alpha= 0.000281,
        random_state= 5 
        )) 

Lass_score = rmsle_cv(Lass)
print("Lass score: {:.4f} ({:.4f})".format(Lass_score.mean(), Lass_score.std()))


Lass score: 0.1130 (0.0136)


### Gradient Boosting

In [155]:
Graboost = GradientBoostingRegressor(
    n_estimators= 3000,
    max_depth= 4,
    min_samples_leaf= 2,
    max_features= 'sqrt',
    loss='huber',
    random_state= 0,
    learning_rate= 0.0223,
    min_samples_split= 100 
    )

Graboost_score = rmsle_cv(Graboost)
print("Graboost score: {:.4f} ({:.4f})".format(Graboost_score.mean(), Graboost_score.std()))

Graboost score: 0.1097 (0.0169)


### Ridge

In [161]:
RidgeR = make_pipeline(RobustScaler(), Ridge(
    alpha= 8.8435,
    solver= 'cholesky'
        ))    
                       
RidgeR_score = rmsle_cv(RidgeR)
print("RidgeR score: {:.4f} ({:.4f})".format(RidgeR_score.mean(), RidgeR_score.std()))

RidgeR score: 0.1151 (0.0133)


### XGBoost

In [162]:
XGBoost = xgb.XGBRegressor(
    learning_rate= 0.0225,
    colsample_bytree= 0.1,
    n_estimators= 2500,
    max_depth= 3,
    subsample= 0.6982,
    min_child_weight=1, 
    )

XGBoost_score = rmsle_cv(XGBoost)
print("XGBoost score: {:.4f} ({:.4f})".format(XGBoost_score.mean(), XGBoost_score.std()))

XGBoost score: 0.1094 (0.0158)


### Bayesian Ridge

In [177]:
BRidge = make_pipeline(RobustScaler(), BayesianRidge(
        n_iter= 100,
        alpha_1= 6e-05,
        alpha_2= 0.01,
        lambda_1= 0.01,
        lambda_2= 6e-05,
        tol= 1e-05
        ))
                       
BRidge_score = rmsle_cv(BRidge)
print("BRidge score: {:.4f} ({:.4f})".format(BRidge_score.mean(), BRidge_score.std()))

BRidge score: 0.1152 (0.0133)


### LightGBM

In [183]:
LGB = lgb.LGBMRegressor(
    learning_rate= 0.0166,
    objective= 'regression',
    num_leaves= 30,
    num_iterations= 1500,
    feature_fraction = 0.2175,
    bagging_fraction= .9870,
    min_data_in_leaf= 9,
    max_depth= 68
)

LGB_score = rmsle_cv(LGB)
print("LGB score: {:.4f} ({:.4f})".format(LGB_score.mean(), LGB_score.std()))

LGB score: 0.1175 (0.0165)


### CatBoost

In [181]:
CatB = make_pipeline(RobustScaler(), CatBoostRegressor(
    iterations= 1929,
    depth= 2, 
    learning_rate= 0.0229,
    loss_function= 'RMSE',
    l2_leaf_reg=0,
    rsm= 1.0,
    one_hot_max_size=194
))

CatB_score = rmsle_cv(CatB)
print("CatB score: {:.4f} ({:.4f})".format(CatB_score.mean(), CatB_score.std()))

0:	learn: 11.7582666	total: 9.55ms	remaining: 18.4s
1:	learn: 11.4893127	total: 17.4ms	remaining: 16.7s
2:	learn: 11.2264961	total: 24.6ms	remaining: 15.8s
3:	learn: 10.9697377	total: 32.9ms	remaining: 15.8s
4:	learn: 10.7188648	total: 40.5ms	remaining: 15.6s
5:	learn: 10.4737473	total: 48ms	remaining: 15.4s
6:	learn: 10.2342115	total: 56ms	remaining: 15.4s
7:	learn: 10.0001037	total: 63.6ms	remaining: 15.3s
8:	learn: 9.7714469	total: 70.8ms	remaining: 15.1s
9:	learn: 9.5479483	total: 79.5ms	remaining: 15.3s
10:	learn: 9.3296748	total: 87.3ms	remaining: 15.2s
11:	learn: 9.1164044	total: 94.4ms	remaining: 15.1s
12:	learn: 8.9080139	total: 102ms	remaining: 15s
13:	learn: 8.7044173	total: 114ms	remaining: 15.6s
14:	learn: 8.5054963	total: 122ms	remaining: 15.5s
15:	learn: 8.3110940	total: 129ms	remaining: 15.4s
16:	learn: 8.1211814	total: 137ms	remaining: 15.4s
17:	learn: 7.9356424	total: 144ms	remaining: 15.3s
18:	learn: 7.7543595	total: 151ms	remaining: 15.2s
19:	learn: 7.5772403	total:

### Stacking

In [0]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    
    #cloning models to fit the data
    def fit(self, X_train, Y_train):
        self.models_ = [clone(x) for x in self.models]
        
        #training the cloned models
        for model in self.models_:
            model.fit(X_train, Y_train)
            
        return self
    
    #predicting based on the cloned models
    def predict(self, X_val):
        predictions = np.column_stack([
            model.predict(X_val) for model in self.models_])
        #averaging the predictions
        return np.mean(predictions, axis=1)

In [0]:
averaged_models = AveragingModels(models = (KR, ENet, Lass, Graboost, RidgeR, XGBoost, BRidge, LGB, CatB))

In [185]:
score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

0:	learn: 11.7582666	total: 7.84ms	remaining: 15.1s
1:	learn: 11.4893127	total: 15.4ms	remaining: 14.9s
2:	learn: 11.2264961	total: 22.7ms	remaining: 14.6s
3:	learn: 10.9697377	total: 30.5ms	remaining: 14.7s
4:	learn: 10.7188648	total: 38.2ms	remaining: 14.7s
5:	learn: 10.4737473	total: 45.8ms	remaining: 14.7s
6:	learn: 10.2342115	total: 53.6ms	remaining: 14.7s
7:	learn: 10.0001037	total: 61.7ms	remaining: 14.8s
8:	learn: 9.7714469	total: 68.6ms	remaining: 14.6s
9:	learn: 9.5479483	total: 77.1ms	remaining: 14.8s
10:	learn: 9.3296748	total: 84.8ms	remaining: 14.8s
11:	learn: 9.1164044	total: 92.1ms	remaining: 14.7s
12:	learn: 8.9080139	total: 99.2ms	remaining: 14.6s
13:	learn: 8.7044173	total: 107ms	remaining: 14.7s
14:	learn: 8.5054963	total: 114ms	remaining: 14.6s
15:	learn: 8.3110940	total: 121ms	remaining: 14.5s
16:	learn: 8.1211814	total: 129ms	remaining: 14.5s
17:	learn: 7.9356424	total: 136ms	remaining: 14.4s
18:	learn: 7.7543595	total: 143ms	remaining: 14.4s
19:	learn: 7.5772403

In [186]:
averaged_models.fit(X_train, Y_train)

0:	learn: 11.7547183	total: 7.91ms	remaining: 15.2s
1:	learn: 11.4858211	total: 15.3ms	remaining: 14.7s
2:	learn: 11.2230571	total: 23.7ms	remaining: 15.2s
3:	learn: 10.9663549	total: 30.4ms	remaining: 14.6s
4:	learn: 10.7155454	total: 37.9ms	remaining: 14.6s
5:	learn: 10.4704393	total: 44.8ms	remaining: 14.3s
6:	learn: 10.2309437	total: 52.2ms	remaining: 14.3s
7:	learn: 9.9969963	total: 59.3ms	remaining: 14.2s
8:	learn: 9.7684170	total: 66.6ms	remaining: 14.2s
9:	learn: 9.5450810	total: 74.2ms	remaining: 14.2s
10:	learn: 9.3268263	total: 82.2ms	remaining: 14.3s
11:	learn: 9.1136172	total: 89ms	remaining: 14.2s
12:	learn: 8.9052934	total: 96.1ms	remaining: 14.2s
13:	learn: 8.7017558	total: 103ms	remaining: 14s
14:	learn: 8.5028043	total: 110ms	remaining: 14.1s
15:	learn: 8.3084988	total: 118ms	remaining: 14.1s
16:	learn: 8.1186486	total: 125ms	remaining: 14s
17:	learn: 7.9331483	total: 132ms	remaining: 14s
18:	learn: 7.7518552	total: 140ms	remaining: 14.1s
19:	learn: 7.5747729	total: 1

AveragingModels(models=(Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('kernelridge', KernelRidge(alpha=0.9826, coef0=3.5, degree=2, gamma=None,
      kernel='polynomial', kernel_params=None))]), Pipeline(memo...scaling=True)), ('catboostregressor', <catboost.core.CatBoostRegressor object at 0x7f9e3c3cf160>)])))

In [0]:
finalstack = averaged_models.predict(test)
resultstack = np.exp(finalstack)