In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV, SGDRegressor, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.wrappers.scikit_learn import KerasRegressor
import warnings

warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
trainingset_path = '../dataset/training-set.csv'
testingset_path = '../dataset/testing-set.csv'

In [3]:
train_df_910 = pd.read_csv('../dataset/train_df_0910.csv', encoding='utf-8')
test_df_910 = pd.read_csv('../dataset/test_df_0910.csv', encoding='utf-8')
train_df_912 = pd.read_csv('../dataset/train_claim_df_0912.csv', encoding='utf-8')
test_df_912 = pd.read_csv('../dataset/test_claim_df_0912.csv', encoding='utf-8')
trainingset_df = pd.read_csv(trainingset_path, encoding='utf-8')
testingset_df = pd.read_csv(testingset_path, encoding='utf-8')

In [5]:
train_data_910 = train_df_910.iloc[:, 2:]
train_data_912 = train_df_912.iloc[:, 2:]

train_label = train_df_910.iloc[:, 1]

test_data_910 = test_df_910.iloc[:, 2:]
test_data_912 = test_df_912.iloc[:, 2:]

In [6]:
print(train_data_910.shape, train_data_912.shape, test_data_910.shape, test_data_912.shape, train_label.shape)

(210763, 51) (210763, 90) (140510, 51) (140510, 90) (210763,)


# CV

In [7]:
def cv_method(model, X, y):
#     estimators = []
#     estimators.append(('standardize', StandardScaler()))
#     estimators.append(('model', model))
#     pipeline = Pipeline(estimators)
#     print(pipeline.steps)
    kfold = KFold(n_splits=5, random_state=7).get_n_splits(X.values)
    results = cross_val_score(model, X.values, y.values, cv=kfold, n_jobs=4, scoring='mean_absolute_error')
    return {'mae_mean':-results.mean(), 'mae_std':results.std()}

### models

In [8]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
# KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [9]:
lgb_param = {'boosting_type': 'gbdt',
             'objective': 'regression_l1',
             'max_depth': 12,
             'num_leaves': 80,
             'min_child_samples': 19,
             'min_child_weight': 0.001,
             'bagging_fraction': 0.6,
             'feature_fraction': 1.0,
             'reg_alpha': 0.001,
             'reg_lambda': 0.3,
             'metric': 'mae',
             'verbose': 0,
             'colsample_bytree': 0.6,
             'subsample': 0.6,
             'n_estimators':420}
LGB = make_pipeline(StandardScaler(), lgb.LGBMRegressor(**lgb_param))

In [10]:
GB = make_pipeline(StandardScaler(), GradientBoostingRegressor(n_estimators=2000, learning_rate=0.02,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5))

In [11]:
RF = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=1500, max_depth=7, random_state =5))

In [12]:
xgb_param = {
    'n_estimators': 2402,
    'learning_rate': 0.02,
    'objective': 'reg:linear',
    'subsample': 0.9,#checked
    'colsample_bytree': 0.9,#checked
    'min_child_weight': 1,#checked
    'max_depth': 9,#checked
    'gamma': 0.0,#checked  
    'scale_pos_weight': 1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.1,
}
XGB = make_pipeline(StandardScaler(), XGBRegressor(**xgb_param))

In [13]:
lasso_cv = cv_method(lasso, train_data_912, train_label)
ENet_cv = cv_method(ENet, train_data_912, train_label)

In [14]:
lgb_cv = cv_method(LGB, train_data_912, train_label)

In [15]:
gb_cv = cv_method(GB, train_data_912, train_label)

In [16]:
xgb_cv = cv_method(XGB, train_data_912, train_label)

In [17]:
rf_cv = cv_method(RF,train_data_912, train_label)

In [18]:
print("Lasso score: {:.4f} ({:.4f})\n".format(lasso_cv['mae_mean'], lasso_cv['mae_std']))
print("ENet score: {:.4f} ({:.4f})\n".format(ENet_cv['mae_mean'], ENet_cv['mae_std']))
# print("KRR score: {:.4f} ({:.4f})\n".format(KRR_cv['mae_mean'], KRR_cv['mae_std']))
print("LGB score: {:.4f} ({:.4f})\n".format(lgb_cv['mae_mean'], lgb_cv['mae_std']))
print("GB score: {:.4f} ({:.4f})\n".format(gb_cv['mae_mean'], gb_cv['mae_std']))
print("XGB score: {:.4f} ({:.4f})\n".format(xgb_cv['mae_mean'], xgb_cv['mae_std']))
print("RF score: {:.4f} ({:.4f})\n".format(rf_cv['mae_mean'], rf_cv['mae_std']))

Lasso score: 2199.0124 (360.1978)

ENet score: 2198.2441 (359.9161)

LGB score: 1750.6379 (298.8883)

GB score: 1903.8929 (300.4203)

XGB score: 1998.0624 (321.7138)

RF score: 2015.6454 (324.8090)



In [18]:
print("Lasso score: {:.4f} ({:.4f})\n".format(lasso_cv['mae_mean'], lasso_cv['mae_std']))
print("ENet score: {:.4f} ({:.4f})\n".format(ENet_cv['mae_mean'], ENet_cv['mae_std']))
# print("KRR score: {:.4f} ({:.4f})\n".format(KRR_cv['mae_mean'], KRR_cv['mae_std']))
print("LGB score: {:.4f} ({:.4f})\n".format(lgb_cv['mae_mean'], lgb_cv['mae_std']))
print("GB score: {:.4f} ({:.4f})\n".format(gb_cv['mae_mean'], gb_cv['mae_std']))
print("XGB score: {:.4f} ({:.4f})\n".format(xgb_cv['mae_mean'], xgb_cv['mae_std']))
print("RF score: {:.4f} ({:.4f})\n".format(rf_cv['mae_mean'], rf_cv['mae_std']))

Lasso score: 2425.9923 (303.0153)

ENet score: 2424.1391 (302.7454)

LGB score: 1751.2164 (299.6216)

GB score: 1904.2164 (300.6003)

XGB score: 1997.8895 (322.3094)

RF score: 2015.7044 (324.9064)



In [17]:
lgb_model = lgb_cv['pipe'].fit(train_claim_data, train_label)

In [18]:
y_pred_lgb = lgb_model.predict(test_claim_data)

In [26]:
gb_model = GB.fit(train_claim_data, train_label)

In [27]:
y_pred_gb = gb_model.predict(test_claim_data)

In [28]:
#result to csv
submit = testingset_df.copy()
submit['Next_Premium'] = y_pred_gb
submit.iloc[submit[submit['Next_Premium'] < 0].index, 1] = 0
submit.to_csv('../result_csv/gb_0911_1903.csv', sep=',', index=None)

# stacking method 
#### from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

In [20]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

In [21]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [22]:
averaged_models = AveragingModels((XGB, GB, LGB))

In [None]:
avg_cv = cv_method(averaged_models, train_data_912, train_label)

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (XGB, GB, LGB),
                                                 meta_model = lasso)

#stacked_cv = cv_method(stacked_averaged_models, train_claim_data, train_label)
#print("Stacking Averaged models score: {:.4f} ({:.4f})".format(stacked_cv.mean(), stacked_cv.std()))

In [None]:
stacked_cv = cv_method(stacked_averaged_models, train_data_912, train_label)

In [None]:
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(stacked_cv['mae_mean'], stacked_cv['mae_std']))