In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,LassoCV, Ridge, LassoLarsCV,ElasticNetCV, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

In [2]:
trainingset_path = '../dataset/training-set.csv'
testingset_path = '../dataset/testing-set.csv'

In [3]:
# train_df = pd.read_csv('../dataset/train_df_0903.csv', encoding='utf-8')
# test_df = pd.read_csv('../dataset/test_df_0903.csv', encoding='utf-8')
train_claim_df = pd.read_csv('../dataset/train_claim_df_0912.csv', encoding='utf-8')
test_claim_df = pd.read_csv('../dataset/test_claim_df_0912.csv', encoding='utf-8')
trainingset_df = pd.read_csv(trainingset_path, encoding='utf-8')
testingset_df = pd.read_csv(testingset_path, encoding='utf-8')

In [4]:
# train_data = train_df.iloc[:, 2:]
train_claim_data = train_claim_df.iloc[:, 2:]
train_label = train_claim_df.iloc[:, 1]

# test_data = test_df.iloc[:, 2:]
test_claim_data = test_claim_df.iloc[:, 2:]

In [5]:
features = train_claim_data.columns

In [6]:
print(train_claim_data.shape, test_claim_data.shape, train_label.shape)

(210763, 90) (140510, 90) (210763,)


In [7]:
all_data = pd.concat([train_claim_data, test_claim_data], axis=0)

In [8]:
ss_x = StandardScaler()
ss_x.fit(all_data)
x_train = ss_x.transform(train_claim_data)
test = ss_x.transform(test_claim_data)

In [9]:
ss_y = StandardScaler()
ss_y.fit_transform(train_label.reshape(-1, 1))
y_train = ss_y.transform(train_label.reshape(-1, 1))

In [10]:
x_train.shape

(210763, 90)

In [11]:
test.shape

(140510, 90)

In [12]:
def gridsearch_cv(model, test_param, X, y, cv=5):
    gsearch = GridSearchCV(estimator=model, param_grid=test_param, scoring='neg_mean_absolute_error', n_jobs=4,
                           verbose=1, iid=False, cv=cv)
    gsearch.fit(X, y)
    print('CV Results: ', gsearch.cv_results_)
    print('Best Params: ', gsearch.best_params_)
    print('Best Score: ', gsearch.best_score_)
    return gsearch.best_params_

# GB

In [13]:
GB = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01,
                                   max_depth=10, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [14]:
param_test1 = {
    'n_estimators': range(1000, 2001, 1000),
}
gridsearch_cv(GB, param_test1, x_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 108.7min finished


CV Results:  {'mean_fit_time': array([1607.65699987, 2671.17259998]), 'std_fit_time': array([  3.83373127, 456.91560822]), 'mean_score_time': array([3.40020003, 5.96279998]), 'std_score_time': array([0.23831956, 0.82647303]), 'param_n_estimators': masked_array(data=[1000, 2000],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 1000}, {'n_estimators': 2000}], 'split0_test_score': array([-0.18938551, -0.18912732]), 'split1_test_score': array([-0.30899984, -0.3086217 ]), 'split2_test_score': array([-0.28158136, -0.2811331 ]), 'split3_test_score': array([-0.28418891, -0.28340194]), 'split4_test_score': array([-0.24481206, -0.24476432]), 'mean_test_score': array([-0.26179353, -0.26140968]), 'std_test_score': array([0.04160009, 0.04148121]), 'rank_test_score': array([2, 1]), 'split0_train_score': array([-0.23516164, -0.21912082]), 'split1_train_score': array([-0.21098506, -0.19749697]), 'split2_train_score': array([-0.21629064, -

{'n_estimators': 2000}

# XGB

In [17]:
num_rounds = 5000
params = {
    'eta': 0.01,
    'objective': 'reg:linear',
    'subsample': 0.9,#checked
    'colsample_bytree': 0.9,#checked
    'min_child_weight': 5,#checked
    'max_depth': 9,#checked
    'gamma': 0.4,#checked  
    'scale_pos_weight': 1,
    'reg_alpha': 100,
    'reg_lambda': 1,
}
xgtrain = xgb.DMatrix(x_train, label=train_label)
#求出最佳num_rounds
cvresult = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5, metrics='mae', seed=0,verbose_eval=100,
                  callbacks=[xgb.callback.print_evaluation(show_stdv=False), xgb.callback.early_stop(50)])
num_round_best = cvresult.shape[0] - 1
print('Best round num: ', num_round_best)

[0]	train-mae:5050.64	test-mae:5051.31
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
[0]	train-mae:5050.64+8.39159	test-mae:5051.31+33.7194
[1]	train-mae:5017.15	test-mae:5018.44
[2]	train-mae:4983.93	test-mae:4985.98
[3]	train-mae:4951.14	test-mae:4953.85
[4]	train-mae:4918.71	test-mae:4922.05
[5]	train-mae:4886.59	test-mae:4890.58
[6]	train-mae:4854.74	test-mae:4859.37
[7]	train-mae:4823.26	test-mae:4828.59
[8]	train-mae:4792.1	test-mae:4798.11
[9]	train-mae:4761.25	test-mae:4767.89
[10]	train-mae:4730.7	test-mae:4737.99
[11]	train-mae:4700.48	test-mae:4708.41
[12]	train-mae:4670.53	test-mae:4679.14
[13]	train-mae:4640.85	test-mae:4650.15
[14]	train-mae:4611.54	test-mae:4621.48
[15]	train-mae:4582.48	test-mae:4593.05
[16]	train-mae:4553.77	test-mae:4564.96
[17]	train-mae:4525.28	test-mae:4537.16
[18]	train-mae:4497.11	test-mae:4509.59
[19]	train-mae:4469.22	test-mae:4482.36
[20]	train-mae:4

In [18]:
xgb_model = xgb.XGBRegressor()

In [14]:
# tune max_depth & min_child_weight
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 10, 2)
}
gridsearch_cv(xgb_model, param_test1, x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 21.8min finished


CV Results:  {'mean_fit_time': array([26.26999989, 26.19159999, 26.26139989, 26.49039998, 26.59099989,
       41.49080009, 41.55159993, 41.92099986, 41.25759988, 41.22199993,
       58.38040009, 57.93559999, 57.71699991, 57.52080002, 57.4592001 ,
       75.9678    , 75.80479994, 75.10140004, 74.21339993, 70.98080006]), 'std_fit_time': array([0.28522962, 0.16879057, 0.35546855, 0.29875242, 0.21816505,
       0.23928673, 0.31753209, 0.34810227, 0.2192556 , 0.33631829,
       0.81907527, 0.3526219 , 0.54382902, 0.32343066, 0.48437201,
       0.61287234, 0.23585189, 0.45560235, 0.20133403, 4.00884557]), 'mean_score_time': array([0.1236001 , 0.1296    , 0.12640014, 0.12080002, 0.12140007,
       0.1822    , 0.18380008, 0.17620001, 0.17840009, 0.18140006,
       0.25139995, 0.25319991, 0.24620008, 0.24020004, 0.24619994,
       0.36860003, 0.36059999, 0.35679998, 0.31719999, 0.30300002]), 'std_score_time': array([0.00272761, 0.00808955, 0.00542586, 0.00462166, 0.00458693,
       0.00239992, 

{'max_depth': 9, 'min_child_weight': 5}

In [15]:
# tune gamma
param_test2 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gridsearch_cv(xgb_model, param_test2, x_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  3.1min finished


CV Results:  {'mean_fit_time': array([26.06819992, 26.3283999 , 26.28119998, 26.37299995, 25.30700002]), 'std_fit_time': array([0.16679389, 0.35649435, 0.16876659, 0.15678144, 2.11150924]), 'mean_score_time': array([0.12140002, 0.12320008, 0.11980004, 0.12900009, 0.11859999]), 'std_score_time': array([0.00665129, 0.00872703, 0.00649307, 0.00414722, 0.00480003]), 'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'gamma': 0.0}, {'gamma': 0.1}, {'gamma': 0.2}, {'gamma': 0.3}, {'gamma': 0.4}], 'split0_test_score': array([-0.21829342, -0.21823294, -0.21823294, -0.21823294, -0.21823294]), 'split1_test_score': array([-0.364735  , -0.36451407, -0.36421613, -0.36421613, -0.36421526]), 'split2_test_score': array([-0.32554538, -0.32554538, -0.32554538, -0.32518576, -0.32518576]), 'split3_test_score': array([-0.32103278, -0.32103278, -0.32103278, -0.32103278, -0.32103278])

{'gamma': 0.4}

In [16]:
# tune subsample & colsample_bytree
param_test3 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gridsearch_cv(xgb_model, param_test3, x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:  8.3min finished


CV Results:  {'mean_fit_time': array([22.0190001 , 20.76460013, 19.65159998, 18.54620004, 24.91379995,
       23.69179997, 22.27420006, 21.02259998, 27.6447999 , 26.24760003,
       24.71179991, 23.27399988, 30.1039999 , 28.60940003, 27.01159992,
       25.50099988]), 'std_fit_time': array([0.03653497, 0.08834167, 0.08981446, 0.04667069, 0.05183596,
       0.11615062, 0.056485  , 0.1457418 , 0.10771323, 0.11853532,
       0.1091556 , 0.08392147, 0.12514461, 0.11062845, 0.17033332,
       0.05418109]), 'mean_score_time': array([0.11899996, 0.12579985, 0.12080002, 0.12499995, 0.11440015,
       0.11720004, 0.11820002, 0.11959996, 0.11919999, 0.12080002,
       0.1164001 , 0.11659999, 0.11259999, 0.11539989, 0.11820002,
       0.11160002]), 'std_score_time': array([0.00589915, 0.00495582, 0.00643131, 0.00209756, 0.00471597,
       0.0047917 , 0.00360006, 0.0038782 , 0.00679409, 0.00627376,
       0.00458698, 0.00496375, 0.00595319, 0.0040791 , 0.00462175,
       0.00628019]), 'param_colsa

{'colsample_bytree': 0.9, 'subsample': 0.9}

In [17]:
# tune scale_pos_weight
param_test4 = {
    'scale_pos_weight': [i for i in range(1, 10, 2)]
}
gridsearch_cv(xgb_model, param_test4, x_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  3.4min finished


CV Results:  {'mean_fit_time': array([26.24240003, 29.98719997, 32.47079997, 30.72079997, 28.97379994]), 'std_fit_time': array([0.27409687, 0.21990031, 0.23909364, 0.14695637, 1.44541379]), 'mean_score_time': array([0.127     , 0.11519995, 0.10739999, 0.1158    , 0.1052    ]), 'std_score_time': array([0.00404968, 0.00470746, 0.00467336, 0.00074829, 0.0074403 ]), 'param_scale_pos_weight': masked_array(data=[1, 3, 5, 7, 9],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'scale_pos_weight': 1}, {'scale_pos_weight': 3}, {'scale_pos_weight': 5}, {'scale_pos_weight': 7}, {'scale_pos_weight': 9}], 'split0_test_score': array([-0.21829342, -0.31103141, -0.50174404, -0.60137743, -0.63534568]), 'split1_test_score': array([-0.364735  , -0.44457177, -0.62081916, -0.6937377 , -0.73481528]), 'split2_test_score': array([-0.32554538, -0.39875479, -0.57468383, -0.65507009, -0.69872157]), 'split3_test_score': array([-0.32103278, -0.400

{'scale_pos_weight': 1}

In [18]:
# tune reg_alpha
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100, 1000]
}
gridsearch_cv(xgb_model, param_test5, x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.5min finished


CV Results:  {'mean_fit_time': array([26.04039998, 26.20399995, 26.22939992, 26.16460009, 26.14100003,
       24.63279996]), 'std_fit_time': array([0.15138376, 0.16113488, 0.21881827, 0.26579902, 0.20769496,
       1.94846596]), 'mean_score_time': array([0.12640004, 0.11960006, 0.12080016, 0.12620006, 0.14180007,
       0.13120008]), 'std_score_time': array([0.00135647, 0.00922178, 0.00581038, 0.00591271, 0.00172049,
       0.00386783]), 'param_reg_alpha': masked_array(data=[1e-05, 0.01, 0.1, 1, 100, 1000],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'reg_alpha': 1e-05}, {'reg_alpha': 0.01}, {'reg_alpha': 0.1}, {'reg_alpha': 1}, {'reg_alpha': 100}, {'reg_alpha': 1000}], 'split0_test_score': array([-0.21829342, -0.2182329 , -0.21852022, -0.21749081, -0.21783309,
       -0.21987533]), 'split1_test_score': array([-0.36473498, -0.36471602, -0.3641202 , -0.36411091, -0.36186538,
       -0.37407492]), 'split2_tes

{'reg_alpha': 100}

In [19]:
# tune reg_lambda
param_test6 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100, 1000]
}
gridsearch_cv(xgb_model, param_test6, x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.5min finished


CV Results:  {'mean_fit_time': array([27.07699995, 26.06679993, 26.20240011, 25.97999988, 26.34560008,
       24.66860003]), 'std_fit_time': array([2.21181713, 0.41026884, 0.38347236, 0.10376117, 0.18611248,
       1.77492529]), 'mean_score_time': array([0.12420011, 0.12020006, 0.12499995, 0.12540002, 0.1177999 ,
       0.12539992]), 'std_score_time': array([0.00354406, 0.006911  , 0.00219091, 0.00319997, 0.00271295,
       0.0087315 ]), 'param_reg_lambda': masked_array(data=[1e-05, 0.01, 0.1, 1, 100, 1000],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'reg_lambda': 1e-05}, {'reg_lambda': 0.01}, {'reg_lambda': 0.1}, {'reg_lambda': 1}, {'reg_lambda': 100}, {'reg_lambda': 1000}], 'split0_test_score': array([-0.2186749 , -0.21844286, -0.21835012, -0.21829342, -0.21839783,
       -0.2185533 ]), 'split1_test_score': array([-0.3652651 , -0.36504904, -0.36532218, -0.364735  , -0.36655381,
       -0.36655127]), 'spl

{'reg_lambda': 1}

In [15]:
xgb_best_param = {
    'n_estimators': 2077,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'subsample': 0.9,#checked
    'colsample_bytree': 0.9,#checked
    'min_child_weight': 5,#checked
    'max_depth': 9,#checked
    'gamma': 0.4,#checked  
    'scale_pos_weight': 1,
    'reg_alpha': 100,
    'reg_lambda': 1,
}

In [16]:
xgb_model = XGBRegressor(**xgb_best_param)

### train & predict

In [17]:
print('Training...')
xgb_model.fit(x_train, y_train)
print('Predicting...')
y_pred_xgb = ss_y.inverse_transform(xgb_model.predict(test))

Training...
Predicting...


In [19]:
y_pred_xgb[7]

38.170906

In [21]:
feature_importances = xgb_model.feature_importances_
feature_im_df = pd.DataFrame()
feature_im_df['feature'] = train_claim_data.columns.values
feature_im_df['score'] = feature_importances
feature_im_df.sort_values(by='score', ascending=False)

AttributeError: 'numpy.ndarray' object has no attribute 'feature_importances_'

In [None]:
#result to csv
submit = testingset_df.copy()
submit['Next_Premium'] = best_y_pred_xgb
submit.to_csv('../result_csv/xgb_0821.csv', sep=',', index=None)

### LightGBM

In [15]:
params = {
    'boosting_type': 'gbdt', 
    'objective': 'regression_l1', 

    'learning_rate': 0.01, 
    'num_leaves': 60, 
    'max_depth': 12,
    'min_child_samples': 22,
    'min_child_weight': 0.001,
    'bagging_fraction': 0.6,
    'feature_fraction': 0.9,
    'reg_alpha' : 0.3,
    'reg_lambda' : 0.08,
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    }

In [16]:
data_train = lgb.Dataset(x_train, train_label.reshape(-1), silent=True)
cv_results = lgb.cv(params, data_train, num_boost_round=5000, nfold=5, stratified=False, shuffle=True,
                    metrics='mae',early_stopping_rounds=30, verbose_eval=100, show_stdv=True, seed=0)

print('best n_estimators:', len(cv_results['l1-mean']))
print('best cv score:', cv_results['l1-mean'][-1])

[100]	cv_agg's l1: 2214.18 + 31.057
[200]	cv_agg's l1: 1857.38 + 26.3866
[300]	cv_agg's l1: 1775.54 + 24.9819
[400]	cv_agg's l1: 1755.99 + 24.1993
[500]	cv_agg's l1: 1747.65 + 24.29
[600]	cv_agg's l1: 1742.96 + 24.1118
[700]	cv_agg's l1: 1739.44 + 23.3184
[800]	cv_agg's l1: 1737.06 + 23.1187
[900]	cv_agg's l1: 1735.07 + 22.8065
[1000]	cv_agg's l1: 1733.43 + 23.0343
[1100]	cv_agg's l1: 1732.13 + 23.2639
[1200]	cv_agg's l1: 1730.93 + 23.1224
[1300]	cv_agg's l1: 1730.07 + 23.0668
[1400]	cv_agg's l1: 1729.49 + 22.8201
[1500]	cv_agg's l1: 1729.11 + 22.6329
[1600]	cv_agg's l1: 1728.87 + 22.6087
[1700]	cv_agg's l1: 1728.66 + 22.578
[1800]	cv_agg's l1: 1728.35 + 22.5425
[1900]	cv_agg's l1: 1728.12 + 22.4872
[2000]	cv_agg's l1: 1727.88 + 22.6349
best n_estimators: 2065
best cv score: 1727.753414696174


In [17]:
data_train = lgb.Dataset(x_train, y_train.reshape(-1), silent=True)
cv_results = lgb.cv(params, data_train, num_boost_round=5000, nfold=5, stratified=False, shuffle=True,
                    metrics='mae',early_stopping_rounds=30, verbose_eval=100, show_stdv=True, seed=0)

print('best n_estimators:', len(cv_results['l1-mean']))
print('best cv score:', cv_results['l1-mean'][-1])

[100]	cv_agg's l1: 0.376981 + 0.00380616
[200]	cv_agg's l1: 0.307121 + 0.00419285
[300]	cv_agg's l1: 0.273496 + 0.00377456
[400]	cv_agg's l1: 0.257633 + 0.00355855
[500]	cv_agg's l1: 0.249826 + 0.00349473
[600]	cv_agg's l1: 0.246157 + 0.00337316
[700]	cv_agg's l1: 0.244567 + 0.00332635
[800]	cv_agg's l1: 0.243709 + 0.00335767
[900]	cv_agg's l1: 0.243158 + 0.00335491
[1000]	cv_agg's l1: 0.242725 + 0.00336476
[1100]	cv_agg's l1: 0.242411 + 0.00336147
[1200]	cv_agg's l1: 0.242154 + 0.00333568
[1300]	cv_agg's l1: 0.241972 + 0.00335674
[1400]	cv_agg's l1: 0.241761 + 0.00333566
[1500]	cv_agg's l1: 0.241571 + 0.0032848
[1600]	cv_agg's l1: 0.241402 + 0.00323041
[1700]	cv_agg's l1: 0.241279 + 0.00322403
[1800]	cv_agg's l1: 0.241148 + 0.00321175
[1900]	cv_agg's l1: 0.241032 + 0.00320484
[2000]	cv_agg's l1: 0.240949 + 0.00319885
[2100]	cv_agg's l1: 0.240893 + 0.00317986
[2200]	cv_agg's l1: 0.24085 + 0.00317033
[2300]	cv_agg's l1: 0.240825 + 0.00317458
[2400]	cv_agg's l1: 0.240783 + 0.00316042
[25

In [8]:
lgb_model = lgb.LGBMRegressor(objective='regression_l1',num_leaves=80,
                              learning_rate=0.01, n_estimators=3556, max_depth=10,
                              metric='mae', bagging_fraction = 0.8,feature_fraction = 0.8)

In [18]:
lgb_params_test1={
    'max_depth': range(8,14,2),
    'num_leaves':range(60, 101, 20)
}
gridsearch_cv(lgb_model, lgb_params_test1, x_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed: 346.6min finished


CV Results:  {'mean_fit_time': array([5615.90780005, 6174.16020007, 1463.33099999,  528.69920006,
        603.10019999,  641.28000002,  375.22480006,  406.43220005,
        390.76180005]), 'std_fit_time': array([1876.96401927, 3623.35352585,  518.45975692,   66.28742409,
         24.88822413,   55.9499302 ,   14.46476811,    6.29301033,
         82.79466787]), 'mean_score_time': array([7.86799994, 7.89399996, 9.41560006, 7.43660002, 8.88540001,
       9.46719995, 4.86700001, 5.87959995, 7.10539989]), 'std_score_time': array([0.87029089, 1.32710703, 0.92264965, 0.23580384, 0.45137052,
       1.04880448, 0.0939403 , 0.14040034, 0.46224606]), 'param_max_depth': masked_array(data=[8, 8, 8, 10, 10, 10, 12, 12, 12],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_num_leaves': masked_array(data=[60, 80, 100, 60, 80, 100, 60, 80, 100],
             mask=[False, False, False, False, Fa

{'max_depth': 12, 'num_leaves': 60}

In [19]:
lgb_params_test2={
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight':[0.001, 0.002]
}
gridsearch_cv(lgb_model, lgb_params_test2, x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 81.8min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed: 94.3min finished


CV Results:  {'mean_fit_time': array([391.14120011, 412.37280011, 413.30320001, 417.37760005,
       416.82379999, 416.10320015, 425.54560008, 415.93000002,
       423.0999999 , 359.70760012]), 'std_fit_time': array([47.134584  , 26.33701645,  5.01700946, 27.17706536,  8.63349751,
        8.08007995, 43.9566611 , 31.18252413, 28.60170048, 81.82034599]), 'mean_score_time': array([6.47719994, 6.5244    , 6.5776001 , 6.59399986, 6.63119998,
       6.64979987, 6.76159997, 6.66299996, 6.74559999, 6.56759992]), 'std_score_time': array([0.15213986, 0.18380284, 0.31875737, 0.29050297, 0.25324091,
       0.24390113, 0.11096955, 0.18447996, 0.14485106, 0.29899737]), 'param_min_child_samples': masked_array(data=[18, 18, 19, 19, 20, 20, 21, 21, 22, 22],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[0.001, 0.002, 0.001, 0.002, 0.001, 0.002, 0.

{'min_child_samples': 22, 'min_child_weight': 0.001}

In [20]:
lgb_params_test3={
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}
gridsearch_cv(lgb_model, lgb_params_test3, x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 80.2min
[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed: 236.4min finished


CV Results:  {'mean_fit_time': array([342.44320002, 404.72599993, 403.55899997, 422.48080001,
       429.39860005, 398.23320003, 409.85299983, 406.0750001 ,
       428.60559998, 425.39219999, 398.4152    , 405.6132    ,
       416.64560003, 433.33239989, 434.86839995, 399.38679991,
       410.4012001 , 418.86719999, 433.58280001, 435.80079989,
       402.68119993, 415.35500002, 438.75660005, 445.45739994,
       400.51380014]), 'std_fit_time': array([49.57784846, 23.63690716, 20.83110371, 31.91081148, 27.59595764,
        8.91284754, 20.1580041 , 18.41110356,  6.70708872, 35.28071486,
        9.55351533,  6.99837866,  4.44885646, 31.88607732, 18.11796946,
       11.59695306,  4.33536652,  5.75580335, 13.2083434 , 14.43936776,
        2.89180519,  7.10606793, 10.50653203, 13.65059616, 47.43127607]), 'mean_score_time': array([ 6.88299999,  6.76380005,  6.721     , 10.15219998,  6.67579994,
        6.91920004,  6.85180006,  6.81860003,  6.78260002,  6.72579989,
        7.09520001,  6.8820

{'bagging_fraction': 0.6, 'feature_fraction': 0.9}

In [21]:
lgb_params_test4={
    'feature_fraction': [0.9, 0.92, 0.94, 0.96, 0.98, 1.0,]
}
gridsearch_cv(lgb_model, lgb_params_test4, x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 62.8min finished


CV Results:  {'mean_fit_time': array([431.36000004, 466.99960008, 465.63879991, 461.50819993,
       467.02339997, 417.9092    ]), 'std_fit_time': array([82.87046379, 33.1580309 ,  9.08273575, 11.52106137, 15.46638029,
       86.10697766]), 'mean_score_time': array([6.9835999 , 7.28940005, 6.89400005, 6.89120011, 6.6822    ,
       6.321     ]), 'std_score_time': array([0.19710161, 0.87271271, 0.17856318, 0.29112773, 0.15866237,
       0.59698246]), 'param_feature_fraction': masked_array(data=[0.9, 0.92, 0.94, 0.96, 0.98, 1.0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'feature_fraction': 0.9}, {'feature_fraction': 0.92}, {'feature_fraction': 0.94}, {'feature_fraction': 0.96}, {'feature_fraction': 0.98}, {'feature_fraction': 1.0}], 'split0_test_score': array([-0.16844467, -0.16855637, -0.16859732, -0.16867676, -0.16872855,
       -0.16942412]), 'split1_test_score': array([-0.28824031, -0.28832445, -0.2884

{'feature_fraction': 0.9}

In [None]:
lgb_params_test5={
    'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
}
gridsearch_cv(lgb_model, lgb_params_test5, x_train, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 88.8min


In [18]:
best_lgb_model = lgb.LGBMRegressor(objective='regression_l1',num_leaves=60, min_child_samples=22, min_child_weight=0.001, 
                                   bagging_fraction=0.6, feature_fraction=0.9, 
                                   learning_rate=0.01, n_estimators=2854, max_depth=12, reg_alpha=0.3, reg_lambda=0.08,
                                   metric='mae',subsample=0.8, colsample_bytree=0.8, )

In [19]:
print('Training...')
best_lgb_model.fit(x_train, y_train)
print('Predicting...')
# y_pred_lgb = ss_y.inverse_transform(best_lgb_model.predict(test))
# y_pred_lgb = best_lgb_model.predict(test_claim_data)

Training...
Predicting...


In [None]:
import shap
shap.initjs()

explainer = shap.TreeExplainer(best_lgb_model)
shap_values = explainer.shap_values(x_train)

In [None]:
shap.summary_plot(shap_values, x_train, plot_type="bar")

In [None]:
y_pred_lgb = ss_y.inverse_transform(best_lgb_model.predict(test))

In [None]:
y_pred_lgb

In [27]:
indices = np.argsort(best_lgb_model.feature_importances_)[::-1]
indices

array([14,  8, 15,  2,  1, 17, 16, 23, 30, 19, 24,  4, 18, 64,  7,  0, 13,
       26,  9, 12,  6, 11,  5, 43, 25, 65, 44,  3, 69, 46, 67, 47, 66, 31,
       20, 28, 10, 48, 32, 27, 22, 40, 68, 34, 52, 36, 38, 37, 45, 35, 51,
       49, 33, 42, 41, 21, 59, 63, 62, 61, 60, 29, 58, 57, 56, 55, 53, 50,
       39, 54], dtype=int64)

In [28]:
imp_train_data = train_claim_data.loc[:,list(features[indices][:35])]
imp_test_data = test_claim_data.loc[:,list(features[indices][:35])]

In [29]:
ss_x_imp = StandardScaler()
ss_x_imp.fit_transform(imp_train_data)
x_train_imp = ss_x_imp.transform(imp_train_data)
test_imp = ss_x_imp.transform(imp_test_data)

In [None]:
print('Training...')
best_lgb_model.fit(x_train_imp, y_train)
print('Predicting...')
y_pred_lgb = ss_y.inverse_transform(best_lgb_model.predict(test_imp))

In [None]:
y_pred_lgb

In [30]:
#result to csv
submit = testingset_df.copy()
submit['Next_Premium'] = y_pred_lgb
submit.iloc[submit[submit['Next_Premium'] < 0].index, 1] = 0
submit.to_csv('../result_csv/lgb_0903.csv', sep=',', index=None)

# DNN with Keras

In [39]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [43]:
# define base model
def baseline_model():
    model = Sequential()
    model.add(Dense(13, input_dim=73, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [44]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=200, batch_size=500, verbose=1)

In [45]:
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(estimator, train_claim_data.values, train_label.values, cv=kfold, n_jobs=1)
print("Results: %.2f (%.2f) MAE" % (results.mean(), results.std()))

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Results: -3634.12 (340.29) MAE


In [46]:
# evaluate model with standardized dataset
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=1000, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, train_claim_data, train_label, cv=kfold, n_jobs=1)
print("Standardized: %.2f (%.2f) MAE" % (results.mean(), results.std()))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [47]:
# define wider model
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=73, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [48]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=wider_model, epochs=150, batch_size=1000, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, train_claim_data, train_label, cv=kfold, n_jobs=1)
print("Wider: %.2f (%.2f) MAE" % (results.mean(), results.std()))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [49]:
# define wider model
def deeper_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=73, kernel_initializer='normal', activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(16, kernel_initializer='normal', activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [None]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=deeper_model, epochs=200, batch_size=1000, verbose=1)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, train_claim_data, train_label, cv=kfold, n_jobs=1)
print("Deeper: %.2f (%.2f) MAE" % (results.mean(), results.std()))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
nn_model = pipeline.fit(train_claim_data, train_label)

In [None]:
y_pred_nn = nn_model.predict(test_claim_data)

In [None]:
y_pred_nn[:20]

In [38]:
# #result to csv
submit = testingset_df.copy()
submit['Next_Premium'] = y_pred_lgb
submit.iloc[submit[submit['Next_Premium'] < 0].index, 1] = 0
submit.to_csv('../result_csv/lgb_0829.csv', sep=',', index=None)