In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions

In [2]:
folder = 'final oofs/'
os.listdir(folder)

['all_mean_enc_lgb_oof_test_pred.csv',
 'all_mean_enc_lgb_oof_val_pred.csv',
 'all_mean_enc_user_feat2_lgb_oof_test_pred.csv',
 'all_mean_enc_user_feat2_lgb_oof_val_pred.csv',
 'all_mean_enc_user_feat_lgb_oof_test_pred.csv',
 'all_mean_enc_user_feat_lgb_oof_val_pred.csv',
 'alpha_0001_oof_test_pred.csv',
 'alpha_0001_oof_val_pred.csv',
 'alpha_10_oof_test_pred.csv',
 'alpha_10_oof_val_pred.csv',
 'alpha_160_oof_test_pred.csv',
 'alpha_160_oof_val_pred.csv',
 'alpha_320_oof_test_pred.csv',
 'alpha_320_oof_val_pred.csv',
 'baseline_xgb_oof_test_pred.csv',
 'baseline_xgb_oof_val_pred.csv',
 'catboost1_without_text_oof_test_pred',
 'catboost1_without_text_oof_test_pred.csv',
 'catboost1_without_text_oof_val_pred',
 'catboost1_without_text_oof_val_pred.csv',
 'catboost_oof_test_pred.csv',
 'catboost_oof_val_pred.csv',
 'cat_interact_lgb_oof_test_pred.csv',
 'cat_interact_lgb_oof_val_pred.csv',
 'cls05_lgb_oof_test_pred.csv',
 'cls05_lgb_oof_val_pred.csv',
 'cls0_lgb_oof_test_pred.csv',
 'cl

In [3]:
best_prefixs = [
    'lgb411_tune',
    'lgb411_dart_tune'
    'poisson_lgb',
    'img_meta_xgb',
    'baseline_xgb',
    'mcl_cgb',
    'selftrained_bigru_conv1d_rnn',
    'text_lgb',
    'mlp',
    'rg_alpha_0001',
    'lr_l2_01',
]

configs = [
    {
        'name': 'lgb',
        'prefixs': [
            'lgb411_tune',
            'plants_lgb', #411
            'plants_with_img_meta_nima_fm_geo_active_lgb',
            #'plants_with_img_meta_nima_fm_geo_active_obj_xentropy_lgb',
            #'xentropy_small_lr_lgb', #lgb411_tune
            'xentropy_small_lr_cat_lgb',
            'simple_feature_lgb', #411
            'all_mean_enc_lgb', #411
            'all_mean_enc_user_feat_lgb', #411
            'all_mean_enc_user_feat2_lgb', #411
            'cat_interact_lgb', #411
            'mean_enc_lgb', #411
            'marcus_lgb', #o411
            'fused_text_lgb', #o411
            'mixed_features_text_proprocessing_lgb', #o411,
            'select_dense_features_lgb', #411
            'select_sparse_features_lgb', #411
        ],
    },
    {
        'name': 'lgb_dart',
        'prefixs': [
            'lgb411_dart_tune',
        ],
    },
    {
        'name': 'lgb_pois',
        'prefixs': [
            'poisson_lgb', #o411
        ]
    },
    {
        'name': 'xgb_lg',
        'prefixs': [
            'small_features_v5_xgb', #o411
            'small_features_v4_xgb', #o411
            'nima_features_xgb', #o411
            'img_meta_xgb', #o411
            'img_meta_nima_xgb', #o411
        ]
    },
    {
        'name': 'xgb_dw',
        'prefixs': [
            'baseline_xgb', #o411
        ]
    },
    {
        'name': 'xgb_ranking',
        'prefixs': [
            'ranking_xgb', #o411
        ]
    },
    {
        'name': 'catboost',
        'prefixs': [
            'catboost', #411
            'catboost1_without_text',
            'mcl_cgb',
        ]
    },
    {
        'name': 'rnn',
        'prefixs': [
            'pretrained_bigru_cv1d_rnn', #411
            'pretrained_bigru_attention_rnn', #411
            'pretrained_2gru_rnn', #411
            'selftrained_bigru_conv1d_rnn', #411
        ]
    },
    {
        'name': 'text',
        'prefixs': [
            'text_lgb',#411
            'text_cwb_rg',#411
            'text_fm', #411
            'text_rg', #411
        ]
    },
    {
        'name': 'regression_other',
        'prefixs': [
            'mlp',#411
            'alpha_0001',#411
            'alpha_160',#411
            'alpha_10',#411
            'alpha_320'#411
        ]
    },
    {
        'name': 'classfication_other',
        'prefixs': [
            'lr_l1_05',#411
            'lr_l1_1',#411
            'lr_l2_01',#411
            'lr_l2_1',#411
            'cls05_lgb',
        ]
    },
    {
        'name': 'classfication_0',
        'prefixs': [
            'cls0_lgb',
        ]
    },
    {
        'name': 'multiclass',
        'prefixs': [
            'multiclass_lgb',
        ]
    },
    {
        'name': 'multiclass3',
        'prefixs': [
            'multiclass3_lgb',
        ]
    }
]

In [4]:
train = pd.DataFrame()
test = pd.DataFrame()

In [5]:
from scipy.stats import hmean
from scipy.stats.mstats import gmean

def get_clipped_values(a):
    return np.clip(a, 1e-15, 1.)

inter_columns = []
basic_columns = []
for config in configs:
    columns = []
    print('Processing group: ', config['name'])
    for prefix in config['prefixs']:
        train_f = folder + prefix + '_oof_val_pred.csv'
        test_f = folder + prefix + '_oof_test_pred.csv'
        
        train_df = pd.read_csv(train_f)
        test_df = pd.read_csv(test_f)
        
        if config['name'] == 'multiclass' or config['name'] == 'multiclass3':
            original_cols = train_df.columns.tolist()
            
            for c in original_cols:
                col = prefix+c
                print('Add ', col)
                
                train.loc[:,col] = train_df[c]
                test.loc[:,col] = test_df[c]
        else:
            original_col = train_df.columns.tolist()[0]
            col = prefix
            print('Add ', col)
            columns.append(col)
            basic_columns.append(col)
            if prefix in best_prefixs:
                inter_columns.append(col)

            train.loc[:,col] = train_df[original_col]
            test.loc[:,col] = test_df[original_col]
        
        del train_df, test_df; gc.collect()
    
    # apply feature engineering on intra-group columns
    if len(columns) < 2:
        continue

    for df in [train, test]:    
        df.loc[:, config['name']+'_mean'] = df[columns].mean(axis=1)
        df.loc[:, config['name']+'_med'] = df[columns].median(axis=1)
        df.loc[:, config['name']+'_max'] = df[columns].max(axis=1)
        df.loc[:, config['name']+'_min'] = df[columns].min(axis=1)
        df.loc[:, config['name']+'_std'] = df[columns].std(axis=1)

        '''
        col_len = len(columns)
        for i in range(col_len-1):
            for j in range(i+1, col_len):
                cols = [columns[i], columns[j]]
                feat_name = cols[0]+'_'+cols[1] 
                print('Add ', feat_name, ' statistcs')        
                df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
                df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)

        if col_len < 3:
            continue

        for i in range(col_len-2):
            for j in range(i+1, col_len-1):
                for k in range(j+1, col_len):
                    cols = [columns[i], columns[j], columns[k]]
                    feat_name = cols[0]+'_'+cols[1]+'_'+cols[2]  
                    print('Add ', feat_name, ' statistcs')                    
                    df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
                    df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
                    df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
                    df.loc[:, feat_name+'_med'] = df[cols].median(axis=1)
                    df.loc[:, feat_name+'_std'] = df[cols].std(axis=1)
        '''
        #df = df.astype(np.float32)
        
# apply feature engineering on inter_group columns
for df in [train, test]:    
    df.loc[:, 'inter_group_mean'] = df[inter_columns].mean(axis=1)
    df.loc[:, 'inter_group_med'] = df[inter_columns].median(axis=1)
    df.loc[:, 'inter_group_max'] = df[inter_columns].max(axis=1)
    df.loc[:, 'inter_group_min'] = df[inter_columns].min(axis=1)
    df.loc[:, 'inter_group_std'] = df[inter_columns].std(axis=1)

    col_len = len(inter_columns)
    for i in range(col_len-1):
        for j in range(i+1, col_len):
            cols = [inter_columns[i], inter_columns[j]]
            feat_name = cols[0]+'_'+cols[1]+'_inter'
            print('Add ', feat_name, ' statistcs')        
            df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
            df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
            df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
    
    for i in range(col_len-2):
        for j in range(i+1, col_len-1):
            for k in range(j+1, col_len):
                cols = [inter_columns[i], inter_columns[j], inter_columns[k]]
                feat_name = cols[0]+'_'+cols[1]+'_'+cols[2]+'_inter'  
                print('Add ', feat_name, ' statistcs')                    
                df.loc[:, feat_name+'_mean'] = df[cols].mean(axis=1)
                df.loc[:, feat_name+'_gmean'] = gmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_hmean'] = hmean(get_clipped_values(df[cols].values), axis=1)
                df.loc[:, feat_name+'_med'] = df[cols].median(axis=1)
                df.loc[:, feat_name+'_std'] = df[cols].std(axis=1)  
             
    #df = df.astype(np.float32)

Processing group:  lgb
Add  lgb411_tune
Add  plants_lgb
Add  plants_with_img_meta_nima_fm_geo_active_lgb
Add  xentropy_small_lr_cat_lgb
Add  simple_feature_lgb
Add  all_mean_enc_lgb
Add  all_mean_enc_user_feat_lgb
Add  all_mean_enc_user_feat2_lgb
Add  cat_interact_lgb
Add  mean_enc_lgb
Add  marcus_lgb
Add  fused_text_lgb
Add  mixed_features_text_proprocessing_lgb
Add  select_dense_features_lgb
Add  select_sparse_features_lgb
Processing group:  lgb_dart
Add  lgb411_dart_tune
Processing group:  lgb_pois
Add  poisson_lgb
Processing group:  xgb_lg
Add  small_features_v5_xgb
Add  small_features_v4_xgb
Add  nima_features_xgb
Add  img_meta_xgb
Add  img_meta_nima_xgb
Processing group:  xgb_dw
Add  baseline_xgb
Processing group:  xgb_ranking
Add  ranking_xgb
Processing group:  catboost
Add  catboost
Add  catboost1_without_text
Add  mcl_cgb
Processing group:  rnn
Add  pretrained_bigru_cv1d_rnn
Add  pretrained_bigru_attention_rnn
Add  pretrained_2gru_rnn
Add  selftrained_bigru_conv1d_rnn
Processi

Add  lgb411_tune_baseline_xgb_mcl_cgb_inter  statistcs
Add  lgb411_tune_baseline_xgb_selftrained_bigru_conv1d_rnn_inter  statistcs
Add  lgb411_tune_baseline_xgb_text_lgb_inter  statistcs
Add  lgb411_tune_baseline_xgb_mlp_inter  statistcs
Add  lgb411_tune_baseline_xgb_lr_l2_01_inter  statistcs
Add  lgb411_tune_mcl_cgb_selftrained_bigru_conv1d_rnn_inter  statistcs
Add  lgb411_tune_mcl_cgb_text_lgb_inter  statistcs
Add  lgb411_tune_mcl_cgb_mlp_inter  statistcs
Add  lgb411_tune_mcl_cgb_lr_l2_01_inter  statistcs
Add  lgb411_tune_selftrained_bigru_conv1d_rnn_text_lgb_inter  statistcs
Add  lgb411_tune_selftrained_bigru_conv1d_rnn_mlp_inter  statistcs
Add  lgb411_tune_selftrained_bigru_conv1d_rnn_lr_l2_01_inter  statistcs
Add  lgb411_tune_text_lgb_mlp_inter  statistcs
Add  lgb411_tune_text_lgb_lr_l2_01_inter  statistcs
Add  lgb411_tune_mlp_lr_l2_01_inter  statistcs
Add  img_meta_xgb_baseline_xgb_mcl_cgb_inter  statistcs
Add  img_meta_xgb_baseline_xgb_selftrained_bigru_conv1d_rnn_inter  statist

In [6]:
train.head(3)

Unnamed: 0,lgb411_tune,plants_lgb,plants_with_img_meta_nima_fm_geo_active_lgb,xentropy_small_lr_cat_lgb,simple_feature_lgb,all_mean_enc_lgb,all_mean_enc_user_feat_lgb,all_mean_enc_user_feat2_lgb,cat_interact_lgb,mean_enc_lgb,...,selftrained_bigru_conv1d_rnn_mlp_lr_l2_01_inter_mean,selftrained_bigru_conv1d_rnn_mlp_lr_l2_01_inter_gmean,selftrained_bigru_conv1d_rnn_mlp_lr_l2_01_inter_hmean,selftrained_bigru_conv1d_rnn_mlp_lr_l2_01_inter_med,selftrained_bigru_conv1d_rnn_mlp_lr_l2_01_inter_std,text_lgb_mlp_lr_l2_01_inter_mean,text_lgb_mlp_lr_l2_01_inter_gmean,text_lgb_mlp_lr_l2_01_inter_hmean,text_lgb_mlp_lr_l2_01_inter_med,text_lgb_mlp_lr_l2_01_inter_std
0,0.062362,0.053896,0.07185,0.047786,0.068429,0.054375,0.054969,0.052336,0.065756,0.048808,...,0.026851,0.026702,0.026547,0.028362,0.003379,0.032057,0.03174,0.031446,0.029213,0.005679
1,0.04164,0.057651,0.057721,0.062043,0.040434,0.015163,0.015783,0.017107,0.014746,0.018057,...,0.018156,0.016859,0.015426,0.021763,0.007547,0.02132,0.0189,0.016485,0.021763,0.011623
2,0.031383,0.040794,0.048324,0.052663,0.036796,0.036656,0.034745,0.035856,0.035003,0.041355,...,0.030409,0.028043,0.025421,0.037182,0.013095,0.029793,0.027571,0.025122,0.035334,0.012653


In [7]:
pd.options.display.max_rows=100
pd.options.display.max_columns=100
train[basic_columns].corr()

Unnamed: 0,lgb411_tune,plants_lgb,plants_with_img_meta_nima_fm_geo_active_lgb,xentropy_small_lr_cat_lgb,simple_feature_lgb,all_mean_enc_lgb,all_mean_enc_user_feat_lgb,all_mean_enc_user_feat2_lgb,cat_interact_lgb,mean_enc_lgb,marcus_lgb,fused_text_lgb,mixed_features_text_proprocessing_lgb,select_dense_features_lgb,select_sparse_features_lgb,lgb411_dart_tune,poisson_lgb,small_features_v5_xgb,small_features_v4_xgb,nima_features_xgb,img_meta_xgb,img_meta_nima_xgb,baseline_xgb,ranking_xgb,catboost,catboost1_without_text,mcl_cgb,pretrained_bigru_cv1d_rnn,pretrained_bigru_attention_rnn,pretrained_2gru_rnn,selftrained_bigru_conv1d_rnn,text_lgb,text_cwb_rg,text_fm,text_rg,mlp,alpha_0001,alpha_160,alpha_10,alpha_320,lr_l1_05,lr_l1_1,lr_l2_01,lr_l2_1,cls05_lgb,cls0_lgb
lgb411_tune,1.0,0.967159,0.97699,0.967743,0.905456,0.907395,0.90748,0.899326,0.902366,0.906453,0.919712,0.914794,0.915861,0.906655,0.912361,0.98208,0.985545,0.931473,0.927126,0.935678,0.940697,0.94355,0.977573,0.769518,0.896485,0.896485,0.976032,0.895204,0.89445,0.891942,0.894094,0.823135,0.685161,0.821971,0.823708,0.866779,0.813002,0.847543,0.828779,0.851161,0.813043,0.800982,0.822777,0.795435,0.945198,0.788349
plants_lgb,0.967159,1.0,0.968862,0.95173,0.911633,0.912456,0.91249,0.904745,0.907654,0.91211,0.914469,0.921208,0.921965,0.89615,0.902178,0.966236,0.963319,0.93101,0.926905,0.930084,0.925493,0.924579,0.955797,0.765113,0.890247,0.890247,0.969725,0.897465,0.896696,0.894225,0.897245,0.835321,0.695687,0.834781,0.836749,0.874204,0.824736,0.860072,0.840884,0.863767,0.823825,0.811323,0.833793,0.805571,0.919283,0.781393
plants_with_img_meta_nima_fm_geo_active_lgb,0.97699,0.968862,1.0,0.957623,0.904752,0.906086,0.906017,0.898062,0.900886,0.905318,0.915583,0.914908,0.915626,0.901881,0.907734,0.972245,0.972814,0.924954,0.920816,0.928658,0.931048,0.933453,0.964709,0.768467,0.894999,0.894999,0.976016,0.89199,0.891134,0.888897,0.891522,0.828427,0.690939,0.82795,0.830028,0.867792,0.818223,0.853142,0.834198,0.856787,0.816884,0.804517,0.826794,0.798866,0.931634,0.782935
xentropy_small_lr_cat_lgb,0.967743,0.95173,0.957623,1.0,0.914189,0.917104,0.916845,0.907965,0.911571,0.916972,0.932491,0.925647,0.925489,0.91962,0.924598,0.963631,0.964006,0.923585,0.91947,0.926771,0.928518,0.930603,0.954971,0.758656,0.888887,0.888887,0.961636,0.901173,0.900185,0.897477,0.899517,0.827092,0.68699,0.827854,0.829751,0.872978,0.821121,0.85648,0.837466,0.859861,0.82135,0.810223,0.830989,0.804675,0.919523,0.779242
simple_feature_lgb,0.905456,0.911633,0.904752,0.914189,1.0,0.968871,0.967921,0.957815,0.966615,0.971213,0.956146,0.93653,0.939173,0.933709,0.945776,0.908589,0.90019,0.922071,0.918064,0.921465,0.916744,0.916079,0.893908,0.738648,0.902593,0.902593,0.918213,0.919504,0.918586,0.916392,0.914868,0.869182,0.716797,0.84956,0.853084,0.893489,0.841548,0.880072,0.858496,0.88483,0.83805,0.822719,0.847758,0.814407,0.849272,0.754601
all_mean_enc_lgb,0.907395,0.912456,0.906086,0.917104,0.968871,1.0,0.993119,0.979055,0.973705,0.989717,0.963624,0.939821,0.943974,0.939846,0.953933,0.90964,0.901865,0.925226,0.921064,0.924221,0.919339,0.918438,0.895836,0.737372,0.903455,0.903455,0.918551,0.922844,0.921886,0.919655,0.917843,0.866855,0.710733,0.846129,0.848887,0.896498,0.846315,0.878901,0.860962,0.882902,0.84029,0.826367,0.848981,0.818613,0.851784,0.752769
all_mean_enc_user_feat_lgb,0.90748,0.91249,0.906017,0.916845,0.967921,0.993119,1.0,0.980549,0.972853,0.988447,0.962874,0.940541,0.944687,0.939407,0.953526,0.909536,0.901898,0.925803,0.921642,0.924791,0.919893,0.919009,0.895842,0.737337,0.902602,0.902602,0.918047,0.921836,0.920802,0.918679,0.916663,0.865516,0.709698,0.844977,0.847646,0.896866,0.846989,0.879539,0.861631,0.883506,0.841827,0.827788,0.850257,0.819961,0.852166,0.752736
all_mean_enc_user_feat2_lgb,0.899326,0.904745,0.898062,0.907965,0.957815,0.979055,0.980549,1.0,0.962162,0.974986,0.951025,0.932428,0.936438,0.928221,0.942182,0.902085,0.893906,0.917543,0.913503,0.916518,0.911642,0.91083,0.887875,0.737129,0.892868,0.892868,0.910381,0.914385,0.913422,0.91127,0.909777,0.871939,0.715363,0.851349,0.854074,0.891481,0.843902,0.877224,0.858859,0.881255,0.834829,0.820828,0.84308,0.813033,0.842887,0.752974
cat_interact_lgb,0.902366,0.907654,0.900886,0.911571,0.966615,0.973705,0.972853,0.962162,1.0,0.976066,0.960817,0.933094,0.936761,0.93758,0.950516,0.903797,0.895902,0.920951,0.916776,0.919459,0.914348,0.91316,0.890089,0.734428,0.902092,0.902092,0.913036,0.916719,0.915921,0.913904,0.910945,0.862606,0.710778,0.841915,0.845153,0.891943,0.835859,0.873883,0.852646,0.878525,0.832751,0.817674,0.841905,0.809139,0.845405,0.749832
mean_enc_lgb,0.906453,0.91211,0.905318,0.916972,0.971213,0.989717,0.988447,0.974986,0.976066,1.0,0.962645,0.938839,0.943108,0.937892,0.952404,0.909121,0.900995,0.923394,0.919352,0.9225,0.917717,0.916878,0.894925,0.737832,0.902028,0.902028,0.918151,0.921847,0.92107,0.918656,0.917476,0.869846,0.712374,0.849202,0.851942,0.897334,0.848833,0.881341,0.863476,0.885262,0.84163,0.82767,0.850187,0.819786,0.850715,0.753346


In [8]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 464 columns):
lgb411_tune                                                           float64
plants_lgb                                                            float64
plants_with_img_meta_nima_fm_geo_active_lgb                           float64
xentropy_small_lr_cat_lgb                                             float64
simple_feature_lgb                                                    float64
all_mean_enc_lgb                                                      float64
all_mean_enc_user_feat_lgb                                            float64
all_mean_enc_user_feat2_lgb                                           float64
cat_interact_lgb                                                      float64
mean_enc_lgb                                                          float64
marcus_lgb                                                            float64
fused_text_lgb              

In [9]:
train.shape, test.shape

((1503424, 464), (508438, 464))

In [10]:
print(train.isnull().sum().max()) 
print(test.isnull().sum().max())

0
0


In [11]:
for col in tqdm(train.columns):
    train[col] = train[col].astype(np.float32)
    test[col] = test[col].astype(np.float32)

100%|████████████████████████████████████████████████████████████████████████████| 464/464 [13:47<00:00,  1.78s/it]


In [12]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 464 columns):
lgb411_tune                                                           float32
plants_lgb                                                            float32
plants_with_img_meta_nima_fm_geo_active_lgb                           float32
xentropy_small_lr_cat_lgb                                             float32
simple_feature_lgb                                                    float32
all_mean_enc_lgb                                                      float32
all_mean_enc_user_feat_lgb                                            float32
all_mean_enc_user_feat2_lgb                                           float32
cat_interact_lgb                                                      float32
mean_enc_lgb                                                          float32
marcus_lgb                                                            float32
fused_text_lgb              

In [13]:
with open('meta_train.pickle', 'wb') as handle:
    pickle.dump(train, handle)
    
with open('meta_test.pickle', 'wb') as handle:
    pickle.dump(test, handle)

## Meta Model Tuning

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc; gc.enable()
import pickle
from tqdm import tqdm
import warnings ; warnings.filterwarnings('ignore')
import os
from GridSearcher import data_loader, model_loader, fit_params, get_oof_predictions, clip_rmse

In [15]:
SEED=411
train_y = pd.read_csv("regression_target.csv").deal_probability.values

In [3]:
with open('meta_train.pickle', 'rb') as handle:
    train = pickle.load(handle)
    
with open('meta_test.pickle', 'rb') as handle:
    test = pickle.load(handle)

### Lightgbm-gbdt

In [4]:
ml = model_loader(model_type='lgb')

In [5]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.208859, valid loss:0.209726, loss_diff:0.000867
{'min_split_gain': 0.0} train loss: 0.208673, valid loss:0.210375, loss_diff:0.001702
{'min_split_gain': 0.0} train loss: 0.208943, valid loss:0.209207, loss_diff:0.000264
{'min_split_gain': 0.0} train loss: 0.208888, valid loss:0.209462, loss_diff:0.000574
{'min_split_gain': 0.0} train loss: 0.208679, valid loss:0.210262, loss_diff:0.001583
{'min_split_gain': 0.1} train loss: 0.208846, valid loss:0.209716, loss_diff:0.000869
{'min_split_gain': 0.1} train loss: 0.208672, valid loss:0.210384, loss_diff:0.001713
{'min_split_gain': 0.1} train loss: 0.208957, valid loss:0.209256, loss_diff:0.000299
{'min_split_gain': 0.1} train loss: 0.208903, valid loss:0.209489, loss_diff:0.000586
{'min_split_gain': 0.1} train loss: 0.208678, valid loss:0.210273, loss_diff:0.001595
{'min_split_gain': 0.2} train loss: 0.208872, valid loss:0.209699, loss_diff:0.000827
{'min_split_gain': 0.2} train loss: 0.208675, valid lo

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'min_split_gain': 0.0},0.209807,0.00045
1,{'min_split_gain': 0.1},0.209823,0.000439
2,{'min_split_gain': 0.2},0.209817,0.000461
3,{'min_split_gain': 0.3},0.209853,0.000457
4,{'min_split_gain': 0.4},0.209873,0.000454


In [6]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'colsample_bytree': 0.6} train loss: 0.208955, valid loss:0.209728, loss_diff:0.000773
{'colsample_bytree': 0.6} train loss: 0.208763, valid loss:0.210461, loss_diff:0.001699
{'colsample_bytree': 0.6} train loss: 0.209065, valid loss:0.209305, loss_diff:0.000240
{'colsample_bytree': 0.6} train loss: 0.209010, valid loss:0.209545, loss_diff:0.000536
{'colsample_bytree': 0.6} train loss: 0.208774, valid loss:0.210333, loss_diff:0.001559
{'colsample_bytree': 0.7} train loss: 0.208914, valid loss:0.209713, loss_diff:0.000799
{'colsample_bytree': 0.7} train loss: 0.208749, valid loss:0.210465, loss_diff:0.001716
{'colsample_bytree': 0.7} train loss: 0.209005, valid loss:0.209262, loss_diff:0.000257
{'colsample_bytree': 0.7} train loss: 0.208978, valid loss:0.209517, loss_diff:0.000539
{'colsample_bytree': 0.7} train loss: 0.208781, valid loss:0.210323, loss_diff:0.001543
{'colsample_bytree': 0.8} train loss: 0.208892, valid loss:0.209695, loss_diff:0.000804
{'colsample_bytree': 0.8} train 

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'colsample_bytree': 0.6},0.209875,0.000449
1,{'colsample_bytree': 0.7},0.209856,0.000464
2,{'colsample_bytree': 0.8},0.209846,0.000454
3,{'colsample_bytree': 0.9},0.209827,0.000449
4,{'colsample_bytree': 1.0},0.209807,0.00045


In [7]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}


fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.208898, valid loss:0.209697, loss_diff:0.000800
{'subsample': 0.6} train loss: 0.208723, valid loss:0.210410, loss_diff:0.001687
{'subsample': 0.6} train loss: 0.208999, valid loss:0.209241, loss_diff:0.000241
{'subsample': 0.6} train loss: 0.208936, valid loss:0.209458, loss_diff:0.000522
{'subsample': 0.6} train loss: 0.208718, valid loss:0.210266, loss_diff:0.001548
{'subsample': 0.7} train loss: 0.208874, valid loss:0.209687, loss_diff:0.000814
{'subsample': 0.7} train loss: 0.208664, valid loss:0.210372, loss_diff:0.001707
{'subsample': 0.7} train loss: 0.208947, valid loss:0.209205, loss_diff:0.000258
{'subsample': 0.7} train loss: 0.208899, valid loss:0.209434, loss_diff:0.000534
{'subsample': 0.7} train loss: 0.208697, valid loss:0.210274, loss_diff:0.001577
{'subsample': 0.8} train loss: 0.208858, valid loss:0.209697, loss_diff:0.000838
{'subsample': 0.8} train loss: 0.208686, valid loss:0.210366, loss_diff:0.001681
{'subsample': 0.8} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.209814,0.000454
1,{'subsample': 0.7},0.209794,0.000459
2,{'subsample': 0.8},0.2098,0.000456
3,{'subsample': 0.9},0.209814,0.000446
4,{'subsample': 1.0},0.209807,0.00045


In [8]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.208835, valid loss:0.209693, loss_diff:0.000858
{'reg_alpha': 1.0} train loss: 0.208651, valid loss:0.210392, loss_diff:0.001740
{'reg_alpha': 1.0} train loss: 0.208926, valid loss:0.209249, loss_diff:0.000323
{'reg_alpha': 1.0} train loss: 0.208878, valid loss:0.209455, loss_diff:0.000577
{'reg_alpha': 1.0} train loss: 0.208672, valid loss:0.210265, loss_diff:0.001593
{'reg_alpha': 1.5} train loss: 0.208843, valid loss:0.209639, loss_diff:0.000795
{'reg_alpha': 1.5} train loss: 0.208665, valid loss:0.210401, loss_diff:0.001736
{'reg_alpha': 1.5} train loss: 0.208932, valid loss:0.209257, loss_diff:0.000324
{'reg_alpha': 1.5} train loss: 0.208865, valid loss:0.209422, loss_diff:0.000557
{'reg_alpha': 1.5} train loss: 0.208711, valid loss:0.210298, loss_diff:0.001587
{'reg_alpha': 2.0} train loss: 0.208874, valid loss:0.209687, loss_diff:0.000814
{'reg_alpha': 2.0} train loss: 0.208664, valid loss:0.210372, loss_diff:0.001707
{'reg_alpha': 2.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.209811,0.000447
1,{'reg_alpha': 1.5},0.209803,0.000463
2,{'reg_alpha': 2.0},0.209794,0.000459
3,{'reg_alpha': 2.5},0.209805,0.00045
4,{'reg_alpha': 3.0},0.209785,0.00046


In [9]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[3.0, 3.5, 4.0, 4.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 3.0} train loss: 0.208879, valid loss:0.209649, loss_diff:0.000770
{'reg_alpha': 3.0} train loss: 0.208732, valid loss:0.210378, loss_diff:0.001646
{'reg_alpha': 3.0} train loss: 0.208991, valid loss:0.209223, loss_diff:0.000232
{'reg_alpha': 3.0} train loss: 0.208932, valid loss:0.209408, loss_diff:0.000476
{'reg_alpha': 3.0} train loss: 0.208730, valid loss:0.210266, loss_diff:0.001537
{'reg_alpha': 3.5} train loss: 0.208910, valid loss:0.209681, loss_diff:0.000772
{'reg_alpha': 3.5} train loss: 0.208761, valid loss:0.210416, loss_diff:0.001655
{'reg_alpha': 3.5} train loss: 0.209030, valid loss:0.209252, loss_diff:0.000222
{'reg_alpha': 3.5} train loss: 0.208949, valid loss:0.209423, loss_diff:0.000474
{'reg_alpha': 3.5} train loss: 0.208732, valid loss:0.210279, loss_diff:0.001547
{'reg_alpha': 4.0} train loss: 0.208930, valid loss:0.209668, loss_diff:0.000738
{'reg_alpha': 4.0} train loss: 0.208730, valid loss:0.210366, loss_diff:0.001636
{'reg_alpha': 4.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 3.0},0.209785,0.00046
1,{'reg_alpha': 3.5},0.20981,0.000462
2,{'reg_alpha': 4.0},0.209782,0.000457
3,{'reg_alpha': 4.5},0.209796,0.000456


In [11]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':4.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}
fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.208930, valid loss:0.209668, loss_diff:0.000738
{'reg_lambda': 0.0} train loss: 0.208730, valid loss:0.210366, loss_diff:0.001636
{'reg_lambda': 0.0} train loss: 0.209013, valid loss:0.209200, loss_diff:0.000187
{'reg_lambda': 0.0} train loss: 0.208962, valid loss:0.209422, loss_diff:0.000460
{'reg_lambda': 0.0} train loss: 0.208768, valid loss:0.210253, loss_diff:0.001484
{'reg_lambda': 0.1} train loss: 0.208930, valid loss:0.209670, loss_diff:0.000740
{'reg_lambda': 0.1} train loss: 0.208741, valid loss:0.210401, loss_diff:0.001660
{'reg_lambda': 0.1} train loss: 0.209003, valid loss:0.209192, loss_diff:0.000189
{'reg_lambda': 0.1} train loss: 0.208966, valid loss:0.209417, loss_diff:0.000451
{'reg_lambda': 0.1} train loss: 0.208766, valid loss:0.210260, loss_diff:0.001494
{'reg_lambda': 0.2} train loss: 0.208929, valid loss:0.209669, loss_diff:0.000740
{'reg_lambda': 0.2} train loss: 0.208741, valid loss:0.210401, loss_diff:0.001660
{'reg_lambda': 0

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_lambda': 0.0},0.209782,0.000457
1,{'reg_lambda': 0.1},0.209788,0.00047
2,{'reg_lambda': 0.2},0.209793,0.000476
3,{'reg_lambda': 0.3},0.209789,0.00046
4,{'reg_lambda': 0.4},0.209795,0.000464
5,{'reg_lambda': 0.5},0.209794,0.000462
6,{'reg_lambda': 0.6},0.209809,0.000462
7,{'reg_lambda': 0.7},0.209802,0.000453
8,{'reg_lambda': 0.8},0.209787,0.000461
9,{'reg_lambda': 0.9},0.209775,0.000465


In [None]:
default_params = {
    'boosting_type':'gbdt', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.02, 
    'n_estimators':3000, 
    'min_split_gain':0.1, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.5, 
    'reg_lambda':0.7, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = {
    'early_stopping_rounds': 50,
    'verbose': 100,
    'eval_metric': 'rmse'
}

_, ret_test, _ = get_oof_predictions(train, train_y, test, ml, 
                                     default_params, seed=19, fit_params=fit_param, use_eval_set=True)

In [None]:
test_df = pd.read_csv("test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(ret_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('lgb_meta_no_bagging_exclude_knn.csv')

### Lightgbm-dart

In [6]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.0, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_split_gain': [.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'min_split_gain': 0.0} train loss: 0.213927, valid loss:0.214264, loss_diff:0.000337
{'min_split_gain': 0.0} train loss: 0.213748, valid loss:0.215104, loss_diff:0.001356
{'min_split_gain': 0.0} train loss: 0.214077, valid loss:0.213627, loss_diff:-0.000450
{'min_split_gain': 0.0} train loss: 0.214023, valid loss:0.213778, loss_diff:-0.000245
{'min_split_gain': 0.0} train loss: 0.213804, valid loss:0.214721, loss_diff:0.000917
{'min_split_gain': 0.1} train loss: 0.213920, valid loss:0.214252, loss_diff:0.000331
{'min_split_gain': 0.1} train loss: 0.213755, valid loss:0.215107, loss_diff:0.001352
{'min_split_gain': 0.1} train loss: 0.214055, valid loss:0.213606, loss_diff:-0.000449
{'min_split_gain': 0.1} train loss: 0.214021, valid loss:0.213776, loss_diff:-0.000245
{'min_split_gain': 0.1} train loss: 0.213790, valid loss:0.214729, loss_diff:0.000938
{'min_split_gain': 0.2} train loss: 0.213925, valid loss:0.214274, loss_diff:0.000349
{'min_split_gain': 0.2} train loss: 0.213742, vali

KeyboardInterrupt: 

In [23]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'colsample_bytree': 0.6} train loss: 0.212777, valid loss:0.212907, loss_diff:0.000130
{'colsample_bytree': 0.6} train loss: 0.212682, valid loss:0.213379, loss_diff:0.000696
{'colsample_bytree': 0.6} train loss: 0.212666, valid loss:0.213448, loss_diff:0.000782
{'colsample_bytree': 0.6} train loss: 0.212721, valid loss:0.213123, loss_diff:0.000403
{'colsample_bytree': 0.6} train loss: 0.212778, valid loss:0.212892, loss_diff:0.000114
{'colsample_bytree': 0.7} train loss: 0.212761, valid loss:0.212934, loss_diff:0.000172
{'colsample_bytree': 0.7} train loss: 0.212644, valid loss:0.213358, loss_diff:0.000714
{'colsample_bytree': 0.7} train loss: 0.212643, valid loss:0.213420, loss_diff:0.000777
{'colsample_bytree': 0.7} train loss: 0.212692, valid loss:0.213109, loss_diff:0.000417
{'colsample_bytree': 0.7} train loss: 0.212771, valid loss:0.212889, loss_diff:0.000118
{'colsample_bytree': 0.8} train loss: 0.212740, valid loss:0.212878, loss_diff:0.000138
{'colsample_bytree': 0.8} train 

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'colsample_bytree': 0.6},0.21315,0.000231
1,{'colsample_bytree': 0.7},0.213142,0.000216
2,{'colsample_bytree': 0.8},0.213123,0.000223
3,{'colsample_bytree': 0.9},0.21314,0.000212
4,{'colsample_bytree': 1.0},0.21311,0.000217


In [24]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'subsample': 0.6} train loss: 0.212709, valid loss:0.212853, loss_diff:0.000144
{'subsample': 0.6} train loss: 0.212645, valid loss:0.213329, loss_diff:0.000684
{'subsample': 0.6} train loss: 0.212630, valid loss:0.213384, loss_diff:0.000754
{'subsample': 0.6} train loss: 0.212655, valid loss:0.213066, loss_diff:0.000410
{'subsample': 0.6} train loss: 0.212758, valid loss:0.212853, loss_diff:0.000095
{'subsample': 0.7} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'subsample': 0.7} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'subsample': 0.7} train loss: 0.212620, valid loss:0.213375, loss_diff:0.000755
{'subsample': 0.7} train loss: 0.212664, valid loss:0.213054, loss_diff:0.000390
{'subsample': 0.7} train loss: 0.212737, valid loss:0.212844, loss_diff:0.000107
{'subsample': 0.8} train loss: 0.212731, valid loss:0.212885, loss_diff:0.000154
{'subsample': 0.8} train loss: 0.212635, valid loss:0.213348, loss_diff:0.000713
{'subsample': 0.8} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'subsample': 0.6},0.213097,0.000226
1,{'subsample': 0.7},0.213094,0.000223
2,{'subsample': 0.8},0.213108,0.000221
3,{'subsample': 0.9},0.213126,0.000219
4,{'subsample': 1.0},0.21311,0.000217


In [25]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':2.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_alpha': 1.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_alpha': 1.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_alpha': 1.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_alpha': 1.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_alpha': 1.5} train loss: 0.212732, valid loss:0.212871, loss_diff:0.000139
{'reg_alpha': 1.5} train loss: 0.212631, valid loss:0.213312, loss_diff:0.000681
{'reg_alpha': 1.5} train loss: 0.212605, valid loss:0.213374, loss_diff:0.000770
{'reg_alpha': 1.5} train loss: 0.212669, valid loss:0.213057, loss_diff:0.000388
{'reg_alpha': 1.5} train loss: 0.212743, valid loss:0.212854, loss_diff:0.000111
{'reg_alpha': 2.0} train loss: 0.212711, valid loss:0.212867, loss_diff:0.000157
{'reg_alpha': 2.0} train loss: 0.212632, valid loss:0.213330, loss_diff:0.000698
{'reg_alpha': 2.0} train los

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_alpha': 1.0},0.213086,0.000223
1,{'reg_alpha': 1.5},0.213093,0.000217
2,{'reg_alpha': 2.0},0.213094,0.000223
3,{'reg_alpha': 2.5},0.213104,0.000216
4,{'reg_alpha': 3.0},0.21311,0.000222


In [26]:
default_params = {
    'boosting_type':'dart', 
    'num_leaves':31, 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'min_split_gain':0.3, 
    'min_child_weight':0.001, 
    'min_child_samples':20, 
    'subsample':.7,  
    'colsample_bytree':1., 
    'reg_alpha':1.0, 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212864, loss_diff:0.000162
{'reg_lambda': 0.0} train loss: 0.212614, valid loss:0.213305, loss_diff:0.000691
{'reg_lambda': 0.0} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.0} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.0} train loss: 0.212702, valid loss:0.212821, loss_diff:0.000118
{'reg_lambda': 0.1} train loss: 0.212705, valid loss:0.212876, loss_diff:0.000171
{'reg_lambda': 0.1} train loss: 0.212607, valid loss:0.213300, loss_diff:0.000693
{'reg_lambda': 0.1} train loss: 0.212582, valid loss:0.213371, loss_diff:0.000788
{'reg_lambda': 0.1} train loss: 0.212675, valid loss:0.213072, loss_diff:0.000397
{'reg_lambda': 0.1} train loss: 0.212718, valid loss:0.212836, loss_diff:0.000118
{'reg_lambda': 0.2} train loss: 0.212700, valid loss:0.212859, loss_diff:0.000159
{'reg_lambda': 0.2} train loss: 0.212611, valid loss:0.213313, loss_diff:0.000701
{'reg_lambda': 0

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'reg_lambda': 0.0},0.213086,0.000223
1,{'reg_lambda': 0.1},0.213091,0.000216
2,{'reg_lambda': 0.2},0.213094,0.000223
3,{'reg_lambda': 0.3},0.213092,0.000222
4,{'reg_lambda': 0.4},0.213097,0.000217
5,{'reg_lambda': 0.5},0.213094,0.000217
6,{'reg_lambda': 0.6},0.213097,0.000216
7,{'reg_lambda': 0.7},0.213096,0.000212
8,{'reg_lambda': 0.8},0.213091,0.000222
9,{'reg_lambda': 0.9},0.213083,0.000214


### XGB-gbdt

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'min_child_weight':[0.001, 0.1, 2, 4, 8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'gamma':[.0, .1, .2, .3, .4]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'colsample_bytree':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'subsample':[i/10.0 for i in range(6,11)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_alpha':[1.0, 1.5, 2.0, 2.5, 3.0]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

In [None]:
ml = model_loader(model_type='xgb')

default_params = {
    'booster':'gbtree', 
    'max_depth':5, 
    'learning_rate':0.1, 
    'n_estimators':100, 
    'gamma':0.0, 
    'min_child_weight':0.001,
    'subsample':1.,  
    'colsample_bytree':1., 
    'reg_alpha':2., 
    'reg_lambda':0.0, 
    'random_state':SEED, 
    'n_jobs': 3
}

fit_param = None

try_params = {
    'reg_lambda':[i/10.0 for i in range(0,11,1)]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

## Rigde

In [12]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.209819, valid loss:0.209670, loss_diff:-0.000149
{'alpha': 1} train loss: 0.209629, valid loss:0.210411, loss_diff:0.000782
{'alpha': 1} train loss: 0.209860, valid loss:0.209322, loss_diff:-0.000538
{'alpha': 1} train loss: 0.209857, valid loss:0.209452, loss_diff:-0.000405
{'alpha': 1} train loss: 0.209675, valid loss:0.210227, loss_diff:0.000552
{'alpha': 2} train loss: 0.209827, valid loss:0.209679, loss_diff:-0.000148
{'alpha': 2} train loss: 0.209637, valid loss:0.210420, loss_diff:0.000783
{'alpha': 2} train loss: 0.209869, valid loss:0.209325, loss_diff:-0.000544
{'alpha': 2} train loss: 0.209866, valid loss:0.209459, loss_diff:-0.000406
{'alpha': 2} train loss: 0.209683, valid loss:0.210233, loss_diff:0.000550
{'alpha': 4} train loss: 0.209842, valid loss:0.209697, loss_diff:-0.000146
{'alpha': 4} train loss: 0.209652, valid loss:0.210436, loss_diff:0.000784
{'alpha': 4} train loss: 0.209886, valid loss:0.209334, loss_diff:-0.000552
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.209816,0.000429
1,{'alpha': 2},0.209823,0.00043
2,{'alpha': 4},0.209838,0.000432
3,{'alpha': 8},0.209863,0.000433


In [13]:
ml = model_loader(model_type='rg')

default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[0.05, 0.1, 0.5]
}

fit_params(train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 0.05} train loss: 0.209811, valid loss:0.209662, loss_diff:-0.000149
{'alpha': 0.05} train loss: 0.209621, valid loss:0.210403, loss_diff:0.000782
{'alpha': 0.05} train loss: 0.209850, valid loss:0.209325, loss_diff:-0.000525
{'alpha': 0.05} train loss: 0.209850, valid loss:0.209446, loss_diff:-0.000404
{'alpha': 0.05} train loss: 0.209666, valid loss:0.210224, loss_diff:0.000558
{'alpha': 0.1} train loss: 0.209812, valid loss:0.209662, loss_diff:-0.000149
{'alpha': 0.1} train loss: 0.209622, valid loss:0.210403, loss_diff:0.000781
{'alpha': 0.1} train loss: 0.209850, valid loss:0.209324, loss_diff:-0.000527
{'alpha': 0.1} train loss: 0.209850, valid loss:0.209446, loss_diff:-0.000404
{'alpha': 0.1} train loss: 0.209666, valid loss:0.210224, loss_diff:0.000557
{'alpha': 0.5} train loss: 0.209815, valid loss:0.209666, loss_diff:-0.000149
{'alpha': 0.5} train loss: 0.209625, valid loss:0.210407, loss_diff:0.000781
{'alpha': 0.5} train loss: 0.209855, valid loss:0.209322, loss_d

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 0.05},0.209812,0.000427
1,{'alpha': 0.1},0.209812,0.000427
2,{'alpha': 0.5},0.209813,0.000428


## Bagging + Ultimate Blending

In [16]:
seeds = [19, 23, 37]
config = {
    'lgb_dart':{
        'ml': model_loader(model_type='lgb'),
        'param': {
            'boosting_type':'dart', 
            'num_leaves':31, 
            'max_depth':5, 
            'learning_rate':0.1, 
            'n_estimators':5000, 
            'min_split_gain':0.0, 
            'min_child_weight':0.001, 
            'min_child_samples':20, 
            'subsample':.7,  
            'colsample_bytree':1., 
            'reg_alpha':4.0, 
            'reg_lambda':.9, 
            'random_state':SEED, 
            'n_jobs': 3
        },
        'fit_param': {
            'early_stopping_rounds': 50,
            'verbose': 100,
            'eval_metric': 'rmse'
        }
    },
    'lgb_gbdt':{
        'ml': model_loader(model_type='lgb'),
        'param': {
            'boosting_type':'gbdt', 
            'num_leaves':31, 
            'max_depth':5, 
            'learning_rate':0.02, 
            'n_estimators':5000, 
            'min_split_gain':0.0, 
            'min_child_weight':0.001, 
            'min_child_samples':20, 
            'subsample':.7,  
            'colsample_bytree':1., 
            'reg_alpha':4.0, 
            'reg_lambda':0.9, 
            'random_state':SEED, 
            'n_jobs': 3
        },
        'fit_param': {
            'early_stopping_rounds': 50,
            'verbose': 100,
            'eval_metric': 'rmse'
        }
    },
    'ridge':{
        'ml': model_loader(model_type='rg'),
        'param': {
            'alpha': 0.05, 
            'fit_intercept': True, 
            'normalize': False, 
            'copy_X': True, 
            'max_iter': None, 
            'tol': 0.001, 
            'solver':'auto', 
            'random_state': SEED
        },
        'fit_param': None
    }
}

In [None]:
results = []

for k,v in config.items():
    print('Training & bagging: ', k)
    res = {
        'val_oof': np.zeros((len(train_y),)),
        'test_oof': np.zeros((test.shape[0],))
    }
    
    for seed in seeds:
        print('Training seed =', seed)
        if 'random_state' in v['param']:
            v['param']['random_state'] = seed
            
        oof_val_pred, oof_test_pred, _ = get_oof_predictions(train, train_y, test, v['ml'], 
                                                          v['param'], seed=SEED, fit_params=v['fit_param'], 
                                                          use_eval_set= v['fit_param'] is not None)
        
        res['val_oof'] += oof_val_pred
        res['test_oof'] += oof_test_pred
    
    res['val_oof'] /= len(seeds)
    res['test_oof'] /= len(seeds)
    
    results.append(res)

Training & bagging:  lgb_dart
Training seed = 19
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.211366	valid's rmse: 0.211634
[200]	train's rmse: 0.209535	valid's rmse: 0.210124
[300]	train's rmse: 0.208858	valid's rmse: 0.209686
[400]	train's rmse: 0.208242	valid's rmse: 0.20943
[500]	train's rmse: 0.208029	valid's rmse: 0.209412
[600]	train's rmse: 0.207452	valid's rmse: 0.209224
Early stopping, best iteration is:
[617]	train's rmse: 0.207332	valid's rmse: 0.209192
Fold 1 completed.
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.211195	valid's rmse: 0.212355
[200]	train's rmse: 0.209372	valid's rmse: 0.210822
[300]	train's rmse: 0.208699	valid's rmse: 0.210374
[400]	train's rmse: 0.208082	valid's rmse: 0.210112
[500]	train's rmse: 0.207874	valid's rmse: 0.210065
[600]	train's rmse: 0.207324	valid's rmse: 0.20991
[700]	train's rmse: 0.207115	valid's rmse: 0.209877
[800]	train's rmse: 0.206706	valid's rmse: 0.209

Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.212258	valid's rmse: 0.212027
[200]	train's rmse: 0.210177	valid's rmse: 0.210188
[300]	train's rmse: 0.209526	valid's rmse: 0.209747
[400]	train's rmse: 0.209105	valid's rmse: 0.209511
[500]	train's rmse: 0.208745	valid's rmse: 0.209343
[600]	train's rmse: 0.208451	valid's rmse: 0.209235
[700]	train's rmse: 0.208179	valid's rmse: 0.209152
[800]	train's rmse: 0.207925	valid's rmse: 0.209092
[900]	train's rmse: 0.207683	valid's rmse: 0.209043
[1000]	train's rmse: 0.207456	valid's rmse: 0.209002
[1100]	train's rmse: 0.207241	valid's rmse: 0.208969
[1200]	train's rmse: 0.207026	valid's rmse: 0.208941
[1300]	train's rmse: 0.206814	valid's rmse: 0.208917
[1400]	train's rmse: 0.206614	valid's rmse: 0.208899
[1500]	train's rmse: 0.206414	valid's rmse: 0.208883
[1600]	train's rmse: 0.206213	valid's rmse: 0.208859
[1700]	train's rmse: 0.20602	valid's rmse: 0.208844
[1800]	train's rmse: 0.205824	valid's rmse: 0.2

[800]	train's rmse: 0.207739	valid's rmse: 0.209878
[900]	train's rmse: 0.207506	valid's rmse: 0.209835
[1000]	train's rmse: 0.207275	valid's rmse: 0.209792
[1100]	train's rmse: 0.207064	valid's rmse: 0.209759
[1200]	train's rmse: 0.206853	valid's rmse: 0.209733
[1300]	train's rmse: 0.206648	valid's rmse: 0.20971
[1400]	train's rmse: 0.206446	valid's rmse: 0.209691
[1500]	train's rmse: 0.206246	valid's rmse: 0.209669
[1600]	train's rmse: 0.206051	valid's rmse: 0.209659
[1700]	train's rmse: 0.205865	valid's rmse: 0.209656
[1800]	train's rmse: 0.205675	valid's rmse: 0.209646
[1900]	train's rmse: 0.205492	valid's rmse: 0.20964
[2000]	train's rmse: 0.205308	valid's rmse: 0.20963
[2100]	train's rmse: 0.20512	valid's rmse: 0.209623
[2200]	train's rmse: 0.204941	valid's rmse: 0.209619
Early stopping, best iteration is:
[2218]	train's rmse: 0.204906	valid's rmse: 0.209615
Fold 5 completed.
Training seed = 37
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.21

In [6]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(results[1]['test_oof'],0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('lgb_gbdt_meta_bagging.csv')

In [7]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

new_train['f1'] = results[0]['val_oof']
new_train['f2'] = results[1]['val_oof']
new_train['f3'] = results[2]['val_oof']

new_test['f1'] = results[0]['test_oof']
new_test['f2'] = results[1]['test_oof']
new_test['f3'] = results[2]['test_oof']

ml = model_loader(model_type='rg')
default_params = {
    'alpha': 1.0, 
    'fit_intercept': True, 
    'normalize': False, 
    'copy_X': True, 
    'max_iter': None, 
    'tol': 0.001, 
    'solver':'auto', 
    'random_state': SEED
}

fit_param = None

try_params = {
    'alpha':[1,2,4,8]
}

fit_params(new_train, train_y, ml, default_params, try_params, fit_params=fit_param, seed=SEED, use_eval_set=False)

{'alpha': 1} train loss: 0.208867, valid loss:0.208642, loss_diff:-0.000225
{'alpha': 1} train loss: 0.208712, valid loss:0.209269, loss_diff:0.000557
{'alpha': 1} train loss: 0.208930, valid loss:0.208379, loss_diff:-0.000551
{'alpha': 1} train loss: 0.208890, valid loss:0.208555, loss_diff:-0.000335
{'alpha': 1} train loss: 0.208708, valid loss:0.209281, loss_diff:0.000573
{'alpha': 2} train loss: 0.208868, valid loss:0.208642, loss_diff:-0.000226
{'alpha': 2} train loss: 0.208712, valid loss:0.209270, loss_diff:0.000558
{'alpha': 2} train loss: 0.208931, valid loss:0.208378, loss_diff:-0.000552
{'alpha': 2} train loss: 0.208890, valid loss:0.208555, loss_diff:-0.000335
{'alpha': 2} train loss: 0.208708, valid loss:0.209281, loss_diff:0.000573
{'alpha': 4} train loss: 0.208868, valid loss:0.208643, loss_diff:-0.000226
{'alpha': 4} train loss: 0.208713, valid loss:0.209273, loss_diff:0.000560
{'alpha': 4} train loss: 0.208931, valid loss:0.208377, loss_diff:-0.000554
{'alpha': 4} trai

Unnamed: 0,param,val_loss_mean,val_loss_std
0,{'alpha': 1},0.208825,0.000377
1,{'alpha': 2},0.208825,0.000377
2,{'alpha': 4},0.208826,0.000378
3,{'alpha': 8},0.208827,0.00038


In [11]:
new_train[['f1', 'f2', 'f3']].corr()

Unnamed: 0,f1,f2,f3
f1,1.0,0.994878,0.989767
f2,0.994878,1.0,0.988676
f3,0.989767,0.988676,1.0


In [8]:
_, oof_test_pred, _ = get_oof_predictions( new_train, train_y, new_test, ml, 
                                                      default_params, seed=SEED, fit_params=fit_param, 
                                                      use_eval_set= False)

Fold 1 completed.
Fold 2 completed.
Fold 3 completed.
Fold 4 completed.
Fold 5 completed.


In [10]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(oof_test_pred,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blend_no_xgb_meta_ridge.csv')

In [None]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in tqdm(np.arange(min_w, 1+min_w-min_w*2, min_w)):
    for b in np.arange(min_w, 1-a+min_w-min_w*1, min_w):
        c = 1-a-b
        combined_res = a*results[0]['val_oof'] + \
                       b*results[1]['val_oof'] + \
                       c*results[2]['val_oof']

        score = clip_rmse(train_y, combined_res)
        if best_score is None or score < best_score:
            best_score = score
            print('best score updated: {:.6f}'.format(best_score), ' coefficient=> {}, {}, {}'.format(a, b, c))
            best_blend_test =  a*results[0]['test_oof'] + \
                               b*results[1]['test_oof'] + \
                               c*results[2]['test_oof']

  0%|                                                                                       | 0/98 [00:00<?, ?it/s]

best score updated: 0.209673  coefficient=> 0.01, 0.01, 0.98
best score updated: 0.209658  coefficient=> 0.01, 0.02, 0.97
best score updated: 0.209643  coefficient=> 0.01, 0.03, 0.96
best score updated: 0.209628  coefficient=> 0.01, 0.04, 0.95
best score updated: 0.209614  coefficient=> 0.01, 0.05, 0.94
best score updated: 0.209599  coefficient=> 0.01, 0.060000000000000005, 0.9299999999999999
best score updated: 0.209585  coefficient=> 0.01, 0.06999999999999999, 0.92
best score updated: 0.209571  coefficient=> 0.01, 0.08, 0.91
best score updated: 0.209557  coefficient=> 0.01, 0.09, 0.9
best score updated: 0.209543  coefficient=> 0.01, 0.09999999999999999, 0.89
best score updated: 0.209529  coefficient=> 0.01, 0.11, 0.88
best score updated: 0.209516  coefficient=> 0.01, 0.12, 0.87
best score updated: 0.209502  coefficient=> 0.01, 0.13, 0.86
best score updated: 0.209489  coefficient=> 0.01, 0.14, 0.85
best score updated: 0.209476  coefficient=> 0.01, 0.15000000000000002, 0.84
best score 

 43%|█████████████████████████████████▍                                            | 42/98 [02:36<03:28,  3.72s/it]

In [None]:
best_blend_test = None
best_score = None
min_w = 0.01
for a in np.arange(min_w, 1+min_w-min_w*4, min_w):
    for b in np.arange(min_w, 1-a+min_w-min_w*3, min_w):
        for c in np.arange(min_w, 1-a-b+min_w-min_w*2, min_w):
            for d in np.arange(min_w, 1-a-b-c+min_w-min_w*1, min_w):
                e = 1-a-b-c-d
                combined_res = a*results[0]['val_oof'] + \
                               b*results[1]['val_oof'] + \
                               c*results[2]['val_oof'] + \
                               d*results[3]['val_oof'] + \ 
                               e*results[4]['val_oof']
                
                score = clip_rmse(train_y, combined_res)
                if best_score is None or score < best_score:
                    best_score = score
                    print('best score updated:', best_score)
                    best_blend_test =  a*results[0]['test_oof'] + \
                                       b*results[1]['test_oof'] + \
                                       c*results[2]['test_oof'] + \
                                       d*results[3]['test_oof'] + \ 
                                       e*results[4]['test_oof']

In [None]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])
pd.DataFrame(np.clip(best_blend_test,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('stack_bagging_blend_no_xgb_meta.csv')