In [1]:
# Import library
import pandas as pd
import numpy as np
import os, random, warnings, gc, psutil, datetime
from tqdm import tqdm_notebook, tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import lightgbm as lgbm
from catboost import CatBoostRegressor

from glob import glob
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

# Display options: let wide/long frames print in full instead of being elided.
# The fully qualified 'display.*' keys are used on purpose: the abbreviated
# 'max_columns' / 'max_rows' patterns also match 'styler.render.*' options on
# newer pandas releases and then raise OptionError ("matched multiple keys").
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 300)

# NOTE: this silences every warning category for the whole kernel session,
# including deprecation warnings from pandas / LightGBM.
warnings.filterwarnings('ignore')

# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Use seaborn's 'bright' colour palette for plots drawn after this point.
sns.set_palette('bright')

In [2]:
def seed_everything(seed=0):
    """Seed the global random number generators used by this notebook.

    Parameters
    ----------
    seed : int, default 0
        Value handed to ``random.seed`` and ``np.random.seed`` and stored,
        as a string, in the ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    NOTE(review): setting ``PYTHONHASHSEED`` here is only visible to
    processes started afterwards; the hash seed of the already running
    interpreter is not expected to change -- confirm if hash ordering matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same integer, so apply them in one pass.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the global RNG state once, with the default seed, before any sampling.
seed_everything()

In [3]:
# Load the preprocessed train / test feature frames written by the
# feature-creation step (paths are relative to this notebook's directory).
# NOTE(review): read_pickle deserialises arbitrary Python objects -- only load
# pickle files produced by this project.
train = pd.read_pickle('../Create_Features/preprocessed_train2.pickle')
test = pd.read_pickle('../Create_Features/preprocessed_test2.pickle')

In [4]:
# Hard-coded row count.
# NOTE(review): presumably the number of training rows (train.tail() shows a
# last index of 415422); it is not referenced by any other visible cell.
n_trn = 415423
# Name of the target column; it is dropped from the feature frames and used
# as the label in the modelling cells.
target_col = '18~20_ride'

In [5]:
# Show the last five rows of the training frame as a quick sanity check of
# the loaded columns and index.
train.tail()

Unnamed: 0,18~20_ride,bus_route_id,in_out,latitude,longitude,station_code,dayofweek,weekend,ride_total,takeoff_total,ride_go_to_work,takeoff_go_to_work,dis_jejusi,dis_seoquipo,bus_route_id_station_code,bus_route_id_station_code_weekend,date_fq_enc,station_code_fq_enc,bus_route_id_fq_enc,bus_route_id_station_code_fq_enc,date_bus_route_id_fq_enc,date_station_code_fq_enc,date_bus_route_id_station_code_fq_enc,7~8_ride_date_mean,7~8_ride_date_bus_route_id_mean,8~9_ride_date_mean,8~9_ride_date_bus_route_id_mean,9~10_ride_date_mean,9~10_ride_date_bus_route_id_mean,station_sequence,station_reverse_sequence,weekday,is_national_holiday,getin_total,morning_getin,morning_takeoff,noon_getin,noon_takeoff,station_morning_getin_sum,station_morning_takeoff_sum,bus_route_getin_sum,bus_route_takeoff_sum,station_morning_getin_mean,station_morning_takeoff_mean,bus_route_getin_mean,bus_route_takeoff_mean,kmeans1,kmeans2,regular_commuter_count,afternoon_takeoff,next_bus_time_diff,getin_user_count1_morning,getin_user_count2_morning,takeoff_user_count1_noon,takeoff_user_count2_noon,hourly_rain,prev_daily_rain,hourly_cloud,latlong_second,total_population,man_population,woman_population,avg_time_diff,passengers_in,passengers_out,latitude_rank,longitude_rank,current_day_national_plane,current_day_international_plane,prev_day_national_plane,prev_day_international_plane
415418,0.0,630,0,33.41437,126.26336,1044,0,0,4.0,0.0,0.0,0.0,25.950139,32.38546,31002,53650,15884,725,268,39,9,17,1,1.032108,0.0,0.988353,0.0,0.743075,0.0,5,5,0,0,4.0,4.0,0.0,0.0,0.0,52.0,81.0,10.0,8.0,3.058824,4.764706,1.111111,0.888889,48,6,94.0,0.0,,24.0,,38.0,,0.0,5.2,76,1692,25003.0,13337.0,11666.0,1426.34545,130.434783,113.413043,219468.0,25901.0,61516,5358,69338,6402
415419,0.0,630,0,33.49946,126.51479,1433,0,0,4.0,0.0,0.0,0.0,0.754801,26.927069,31003,53652,15884,1109,268,33,9,23,1,1.032108,0.0,0.988353,0.0,0.743075,0.0,6,4,0,0,4.0,4.0,0.0,0.0,0.0,567.0,7.0,10.0,8.0,24.652174,0.304348,1.111111,0.888889,65,38,627.0,0.0,,490.0,1.0,,,0.0,5.2,76,2626,14906.0,7541.0,7365.0,3103.390585,806.76087,17.782609,466717.5,269985.0,61516,5358,69338,6402
415420,0.0,630,0,33.231,126.26273,2146,0,0,0.0,1.0,0.0,1.0,38.482046,27.717482,31005,53656,15884,295,268,11,9,9,1,1.032108,0.0,0.988353,0.0,0.743075,0.0,7,3,0,0,0.0,0.0,1.0,0.0,0.0,11.0,4.0,10.0,8.0,1.222222,0.444444,1.111111,0.888889,12,6,5.0,0.0,,9.0,1.0,1.0,,0.0,5.2,76,55,23094.0,11431.0,11663.0,3005.568831,11.195652,2.7,9006.0,25388.0,61516,5358,69338,6402
415421,0.0,630,0,33.46483,126.3187,3040,0,0,1.0,0.0,0.0,0.0,19.38,31.873252,31012,53666,15884,185,268,6,9,5,1,1.032108,0.0,0.988353,0.0,0.743075,0.0,8,2,0,0,1.0,1.0,0.0,0.0,0.0,11.0,9.0,10.0,8.0,2.2,1.8,1.111111,0.888889,45,6,9.0,0.0,,12.0,,11.0,,0.0,5.2,76,2070,36550.0,18902.0,17648.0,1475.961464,13.911111,17.130435,276473.0,48222.0,61516,5358,69338,6402
415422,0.0,630,0,33.24873,126.50799,3599,0,0,0.0,4.0,0.0,4.0,27.987818,4.828453,31014,53670,15884,699,268,37,9,15,1,1.032108,0.0,0.988353,0.0,0.743075,0.0,9,1,0,0,0.0,0.0,4.0,0.0,0.0,0.0,38.0,10.0,8.0,0.0,2.533333,1.111111,0.888889,12,6,,0.0,,1.0,,32.0,,0.0,5.2,76,212,14156.0,7119.0,7037.0,2554.831081,21.277778,97.043478,46547.0,244837.0,61516,5358,69338,6402


In [6]:
# Before modeling: separate the feature matrix from the target vector.
# `columns=` replaces the old positional axis argument (`drop(target_col, 1)`),
# which was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
train_set = train.drop(columns=target_col)
test_set = test.drop(columns=target_col)

# Target series, kept aligned (same index) with the frames they came from.
train_label = train[target_col]
test_label = test[target_col]

In [7]:
# Basic LGBM model configuration shared by the cross-validation cells below.
n_splits = 5                # number of CV folds
NUM_BOOST_ROUND = 100000    # upper bound on boosting rounds; early stopping ends training sooner
SEED = 1993                 # default seed (the CV cells overwrite lgbm_param['seed'])

# NOTE(review): LightGBM documents `random_state` as an alias of `seed`; both
# are kept here exactly as in the original configuration -- confirm whether
# one of them can be dropped.
lgbm_param = dict(
    objective='rmse',
    boosting_type='gbdt',
    random_state=SEED,
    learning_rate=0.01,
    subsample=0.7,
    tree_learner='serial',
    colsample_bytree=0.78,
    early_stopping_rounds=50,
    subsample_freq=1,
    reg_lambda=7,
    reg_alpha=5,
    num_leaves=96,
    seed=SEED,
)

In [10]:
# StratifiedKFold on bus_route_id, averaged over several random seeds.
#
# For each of `len_seeds` seeds a fresh shuffled StratifiedKFold (stratified on
# `split_col`) is trained with LightGBM; every run yields out-of-fold (OOF)
# predictions for `train` and fold-averaged predictions for `test`, and the
# runs are then averaged across seeds.
split_col = 'bus_route_id'
len_seeds = 5

# Accumulators for the seed-averaged OOF / test predictions.
outer_stractified_busroute_oof_train = np.zeros(train.shape[0])
outer_stractified_busroute_oof_test = np.zeros(test.shape[0])

for _ in tqdm_notebook(range(len_seeds)):

    # One seed per run drives both the fold shuffling and LightGBM itself.
    seed = random.randint(1, 100000)
    # Loop-invariant for all folds of this run, so set it once here.
    lgbm_param['seed'] = seed

    cv_list = []

    oof_train = np.zeros(train.shape[0])
    final_test = np.zeros(test.shape[0])

    kfolds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_iter = enumerate(kfolds.split(train_set, train_set[split_col]))

    # `total` gives the progress bar a known length (enumerate() has no len()).
    for ind, (trn_ind, val_ind) in tqdm_notebook(fold_iter, total=n_splits):

        # split() yields *positional* indices, so labels are selected with
        # .iloc; plain [] would do label-based lookup and only works while the
        # index happens to be 0..n-1.
        X_train, y_train = train_set.iloc[trn_ind], train_label.iloc[trn_ind]
        X_valid, y_valid = train_set.iloc[val_ind], train_label.iloc[val_ind]

        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)

        # NOTE(review): `verbose_eval` is only accepted by older LightGBM
        # releases; newer ones presumably need
        # callbacks=[lgbm.log_evaluation(100)] instead -- confirm against the
        # installed version before changing.
        model = lgbm.train(lgbm_param, dtrain, NUM_BOOST_ROUND,
                           valid_sets=(dtrain, dvalid),
                           valid_names=('train', 'valid'),
                           categorical_feature=['bus_route_id', 'station_code', 'weekday',
                                                'kmeans1', 'kmeans2',
                                                ],
                           verbose_eval=100)

        valid_pred = model.predict(X_valid)
        test_pred = model.predict(test_set)

        # Each training row falls in exactly one validation fold per run.
        oof_train[val_ind] += valid_pred
        final_test += test_pred

        cv_list.append(sqrt(mean_squared_error(y_valid, valid_pred)))

        print('='*80)

    # Average the test predictions of the folds of this run.
    final_test /= n_splits

    print(f"Average CV : {np.mean(cv_list)}")
    print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")

    outer_stractified_busroute_oof_train += oof_train
    outer_stractified_busroute_oof_test += final_test

# Average across seeds.
outer_stractified_busroute_oof_train /= len_seeds
outer_stractified_busroute_oof_test /= len_seeds

print(f"Overall for OOF: {sqrt(mean_squared_error(train_label, outer_stractified_busroute_oof_train))}")

# PostProcessing: the target cannot be negative, so every prediction that is
# not strictly positive becomes 0. np.where keeps the results as numpy arrays
# (the previous list comprehension turned them into Python lists) and maps the
# same values to 0 as `x if x > 0 else 0` did.
outer_stractified_busroute_oof_train = np.where(
    outer_stractified_busroute_oof_train > 0, outer_stractified_busroute_oof_train, 0)
outer_stractified_busroute_oof_test = np.where(
    outer_stractified_busroute_oof_test > 0, outer_stractified_busroute_oof_test, 0)


print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, outer_stractified_busroute_oof_train))}")

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.89949	valid's rmse: 2.883
[200]	train's rmse: 2.28091	valid's rmse: 2.38208
[300]	train's rmse: 2.04022	valid's rmse: 2.24322
[400]	train's rmse: 1.91365	valid's rmse: 2.19713
[500]	train's rmse: 1.8299	valid's rmse: 2.17688
[600]	train's rmse: 1.76711	valid's rmse: 2.16976
[700]	train's rmse: 1.7171	valid's rmse: 2.16365
[800]	train's rmse: 1.6753	valid's rmse: 2.16161
[900]	train's rmse: 1.6376	valid's rmse: 2.15879
[1000]	train's rmse: 1.60355	valid's rmse: 2.15586
[1100]	train's rmse: 1.57274	valid's rmse: 2.15388
[1200]	train's rmse: 1.54453	valid's rmse: 2.15176
[1300]	train's rmse: 1.51818	valid's rmse: 2.14952
[1400]	train's rmse: 1.494	valid's rmse: 2.1486
[1500]	train's rmse: 1.47151	valid's rmse: 2.147
Early stopping, best iteration is:
[1545]	train's rmse: 1.46203	valid's rmse: 2.14606
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.9065	valid's rmse: 2.8776

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.92161	valid's rmse: 2.80745
[200]	train's rmse: 2.2996	valid's rmse: 2.29905
[300]	train's rmse: 2.06098	valid's rmse: 2.14832
[400]	train's rmse: 1.93202	valid's rmse: 2.08983
[500]	train's rmse: 1.84556	valid's rmse: 2.0634
[600]	train's rmse: 1.78241	valid's rmse: 2.04875
[700]	train's rmse: 1.73142	valid's rmse: 2.04047
[800]	train's rmse: 1.68779	valid's rmse: 2.03558
[900]	train's rmse: 1.64933	valid's rmse: 2.03121
[1000]	train's rmse: 1.61448	valid's rmse: 2.02781
[1100]	train's rmse: 1.58272	valid's rmse: 2.0242
[1200]	train's rmse: 1.55439	valid's rmse: 2.02191
[1300]	train's rmse: 1.52815	valid's rmse: 2.01917
[1400]	train's rmse: 1.50436	valid's rmse: 2.0166
[1500]	train's rmse: 1.4813	valid's rmse: 2.01535
[1600]	train's rmse: 1.46008	valid's rmse: 2.01374
[1700]	train's rmse: 1.44029	valid's rmse: 2.01199
[1800]	train's rmse: 1.42121	valid's rmse: 2.01057
[1900]	train's rmse: 1.4033	valid's

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.88858	valid's rmse: 2.97046
[200]	train's rmse: 2.27393	valid's rmse: 2.44062
[300]	train's rmse: 2.03551	valid's rmse: 2.28262
[400]	train's rmse: 1.91074	valid's rmse: 2.22331
[500]	train's rmse: 1.82769	valid's rmse: 2.19473
[600]	train's rmse: 1.76485	valid's rmse: 2.17993
[700]	train's rmse: 1.71372	valid's rmse: 2.17045
[800]	train's rmse: 1.67014	valid's rmse: 2.16271
[900]	train's rmse: 1.63177	valid's rmse: 2.15784
[1000]	train's rmse: 1.59844	valid's rmse: 2.15423
[1100]	train's rmse: 1.56794	valid's rmse: 2.1504
[1200]	train's rmse: 1.53993	valid's rmse: 2.14837
[1300]	train's rmse: 1.51364	valid's rmse: 2.14713
[1400]	train's rmse: 1.48975	valid's rmse: 2.14633
[1500]	train's rmse: 1.46733	valid's rmse: 2.14416
[1600]	train's rmse: 1.44654	valid's rmse: 2.1416
[1700]	train's rmse: 1.42721	valid's rmse: 2.13948
[1800]	train's rmse: 1.40927	valid's rmse: 2.13869
[1900]	train's rmse: 1.39173	val

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.94344	valid's rmse: 2.71145
[200]	train's rmse: 2.31482	valid's rmse: 2.20626
[300]	train's rmse: 2.07431	valid's rmse: 2.06817
[400]	train's rmse: 1.94485	valid's rmse: 2.01539
[500]	train's rmse: 1.85642	valid's rmse: 1.99351
[600]	train's rmse: 1.79203	valid's rmse: 1.98786
[700]	train's rmse: 1.74003	valid's rmse: 1.98459
[800]	train's rmse: 1.69534	valid's rmse: 1.98274
[900]	train's rmse: 1.65556	valid's rmse: 1.98056
[1000]	train's rmse: 1.62041	valid's rmse: 1.9793
[1100]	train's rmse: 1.5889	valid's rmse: 1.97812
[1200]	train's rmse: 1.56001	valid's rmse: 1.97658
[1300]	train's rmse: 1.53343	valid's rmse: 1.97493
Early stopping, best iteration is:
[1296]	train's rmse: 1.53452	valid's rmse: 1.97479
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.89751	valid's rmse: 3.02038
[200]	train's rmse: 2.27358	valid's rmse: 2.48953
[300]	train's rmse: 2.03471	valid's rmse

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.87543	valid's rmse: 3.06383
[200]	train's rmse: 2.2569	valid's rmse: 2.54574
[300]	train's rmse: 2.02507	valid's rmse: 2.38055
[400]	train's rmse: 1.89937	valid's rmse: 2.31111
[500]	train's rmse: 1.81714	valid's rmse: 2.27683
[600]	train's rmse: 1.75554	valid's rmse: 2.25638
[700]	train's rmse: 1.70569	valid's rmse: 2.24458
[800]	train's rmse: 1.66302	valid's rmse: 2.23732
[900]	train's rmse: 1.62559	valid's rmse: 2.22962
[1000]	train's rmse: 1.59282	valid's rmse: 2.22581
[1100]	train's rmse: 1.56167	valid's rmse: 2.22158
[1200]	train's rmse: 1.53412	valid's rmse: 2.21963
[1300]	train's rmse: 1.50831	valid's rmse: 2.21837
Early stopping, best iteration is:
[1334]	train's rmse: 1.50003	valid's rmse: 2.21735
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 2.90527	valid's rmse: 2.97531
[200]	train's rmse: 2.28406	valid's rmse: 2.42707
[300]	train's rmse: 2.04684	valid's rms

In [12]:
# Out-of-fold file: take the id column of the raw training CSV and overwrite
# its target column with the seed-averaged OOF predictions.
# NOTE(review): this relies on the raw CSV rows being in the same order as
# `train` -- confirm.
df_oof = (
    pd.read_csv('../raw_dataset/train.csv', usecols = ['id','18~20_ride'])
    .assign(**{'18~20_ride': outer_stractified_busroute_oof_train})
)

# NOTE(review): an earlier revision wrote this file to
# '../oof/lgbm_5_seeds_stractified5k_bus_route_id_port.csv'; the active target
# is the parent directory, unlike the 40-seed cell which writes under ../oof/.
df_oof.to_csv('../lgbm_5_seeds_stractified5k_bus_route_id_port.csv', index=False)

# Submission file: fill the sample submission with the averaged test predictions.
df_sub = (
    pd.read_csv('../raw_dataset/submission_sample.csv')
    .assign(**{'18~20_ride': outer_stractified_busroute_oof_test})
)

df_sub.to_csv('../submission/lgbm_5_seeds_stractified5k_bus_route_id_port.csv', index=False)

In [None]:
# StratifiedKFold on bus_route_id, averaged over 40 random seeds.
#
# Same procedure as the 5-seed run: for each seed a fresh shuffled
# StratifiedKFold (stratified on `split_col`) is trained with LightGBM; the
# out-of-fold (OOF) predictions for `train` and the fold-averaged predictions
# for `test` are then averaged across seeds.
split_col = 'bus_route_id'
len_seeds = 40

# Accumulators for the seed-averaged OOF / test predictions.
outer_stractified_busroute_oof_train = np.zeros(train.shape[0])
outer_stractified_busroute_oof_test = np.zeros(test.shape[0])

for _ in tqdm_notebook(range(len_seeds)):

    # One seed per run drives both the fold shuffling and LightGBM itself.
    seed = random.randint(1, 100000)
    # Loop-invariant for all folds of this run, so set it once here.
    lgbm_param['seed'] = seed

    cv_list = []

    oof_train = np.zeros(train.shape[0])
    final_test = np.zeros(test.shape[0])

    kfolds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_iter = enumerate(kfolds.split(train_set, train_set[split_col]))

    # `total` gives the progress bar a known length (enumerate() has no len()).
    for ind, (trn_ind, val_ind) in tqdm_notebook(fold_iter, total=n_splits):

        # split() yields *positional* indices, so labels are selected with
        # .iloc; plain [] would do label-based lookup and only works while the
        # index happens to be 0..n-1.
        X_train, y_train = train_set.iloc[trn_ind], train_label.iloc[trn_ind]
        X_valid, y_valid = train_set.iloc[val_ind], train_label.iloc[val_ind]

        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)

        # NOTE(review): `verbose_eval` is only accepted by older LightGBM
        # releases; newer ones presumably need
        # callbacks=[lgbm.log_evaluation(100)] instead -- confirm against the
        # installed version before changing.
        model = lgbm.train(lgbm_param, dtrain, NUM_BOOST_ROUND,
                           valid_sets=(dtrain, dvalid),
                           valid_names=('train', 'valid'),
                           categorical_feature=['bus_route_id', 'station_code', 'weekday',
                                                'kmeans1', 'kmeans2',
                                                ],
                           verbose_eval=100)

        valid_pred = model.predict(X_valid)
        test_pred = model.predict(test_set)

        # Each training row falls in exactly one validation fold per run.
        oof_train[val_ind] += valid_pred
        final_test += test_pred

        cv_list.append(sqrt(mean_squared_error(y_valid, valid_pred)))

        print('='*80)

    # Average the test predictions of the folds of this run.
    final_test /= n_splits

    print(f"Average CV : {np.mean(cv_list)}")
    print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, oof_train))}")

    outer_stractified_busroute_oof_train += oof_train
    outer_stractified_busroute_oof_test += final_test

# Average across seeds.
outer_stractified_busroute_oof_train /= len_seeds
outer_stractified_busroute_oof_test /= len_seeds

print(f"Overall for OOF: {sqrt(mean_squared_error(train_label, outer_stractified_busroute_oof_train))}")

# PostProcessing: the target cannot be negative, so every prediction that is
# not strictly positive becomes 0. np.where keeps the results as numpy arrays
# (the previous list comprehension turned them into Python lists) and maps the
# same values to 0 as `x if x > 0 else 0` did.
outer_stractified_busroute_oof_train = np.where(
    outer_stractified_busroute_oof_train > 0, outer_stractified_busroute_oof_train, 0)
outer_stractified_busroute_oof_test = np.where(
    outer_stractified_busroute_oof_test > 0, outer_stractified_busroute_oof_test, 0)


print(f"RMSE for OOF: {sqrt(mean_squared_error(train_label, outer_stractified_busroute_oof_train))}")

In [None]:
# Out-of-fold file: take the id column of the raw training CSV and overwrite
# its target column with the 40-seed averaged OOF predictions.
# NOTE(review): this relies on the raw CSV rows being in the same order as
# `train` -- confirm.
df_oof = (
    pd.read_csv('../raw_dataset/train.csv', usecols = ['id','18~20_ride'])
    .assign(**{'18~20_ride': outer_stractified_busroute_oof_train})
)

df_oof.to_csv('../oof/lgbm_40_seeds_stractified5k_bus_route_id.csv', index=False)

# Submission file: fill the sample submission with the averaged test predictions.
df_sub = (
    pd.read_csv('../raw_dataset/submission_sample.csv')
    .assign(**{'18~20_ride': outer_stractified_busroute_oof_test})
)

df_sub.to_csv('../submission/lgbm_40_seeds_stractified5k_bus_route_id.csv', index=False)