In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the train and test metadata tables from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../1_Data/Metadata/train.csv', '../1_Data/Metadata/test.csv')
)

# LightGBM model

In [3]:
# Inspect every column available in the training metadata.
train.columns

Index(['Unnamed: 0', 'gvkey', 'year', 'Filing', 'Date', 'Restate_Int',
       'acominc', 'ap', 'aqc', 'at', 'bkvlps', 'capx', 'ceq', 'ceqt', 'ch',
       'che', 'cogs', 'cstk', 'dltt', 'dp', 'dv', 'dvc', 'dvt', 'ebit',
       'ebitda', 'epsfi', 'epspi', 'gdwl', 'gp', 'intan', 'invt', 'ivst', 'lt',
       'ni', 'ppent', 'pstk', 're', 'rect', 'revt', 'seq', 'tstk', 'dvpsp_f',
       'dvpsx_f', 'au', 'auop', 'auopic', 'Weekday', 'Date_lag', 'Date_diff',
       'Week_num'],
      dtype='object')

In [4]:
# List only the float64 columns; the aggregation cell below builds its
# per-company "mean" features from this same selection.
train.select_dtypes(include=['float64']).columns

Index(['acominc', 'ap', 'aqc', 'at', 'bkvlps', 'capx', 'ceq', 'ceqt', 'ch',
       'che', 'cogs', 'cstk', 'dltt', 'dp', 'dv', 'dvc', 'dvt', 'ebit',
       'ebitda', 'epsfi', 'epspi', 'gdwl', 'gp', 'intan', 'invt', 'ivst', 'lt',
       'ni', 'ppent', 'pstk', 're', 'rect', 'revt', 'seq', 'tstk', 'dvpsp_f',
       'dvpsx_f', 'au', 'auop', 'auopic', 'Date_diff'],
      dtype='object')

In [5]:

def create_new_col(aggs):
    """Flatten an aggregation spec into ``<column>_<statistic>`` labels.

    Parameters
    ----------
    aggs : dict
        Maps a column name to the list of statistic names requested for it.

    Returns
    -------
    list of str
        One label per (column, statistic) pair, following the dict's
        iteration order and, within a column, the order of its statistics.
    """
    labels = []
    for column, stats in aggs.items():
        for stat in stats:
            labels.append(column + "_" + stat)
    return labels
# Statistics computed per company for the calendar-derived columns.
# Date_diff aggregates (max/min/std/mean) are deliberately left out; they were
# disabled in the original experiment.
calendar_stats = ['mean', 'max', 'min']

# aggs1 drives the training aggregation and additionally carries the target
# (Restate_Int, reduced with max); aggs2 is the same spec without the target
# and drives the test aggregation.
aggs1 = {
    'Weekday': list(calendar_stats),
    'Week_num': list(calendar_stats),
    'Restate_Int': ['max'],
}
aggs2 = {
    'Weekday': list(calendar_stats),
    'Week_num': list(calendar_stats),
}

# Columns that must not receive the generic "mean" aggregation below.
initial_feature = ['Weekday','Date_diff','Week_num','Restate_Int','gvkey','year','Date_lag','Date_diff','Filing']
total_feature = train.select_dtypes(include=['float64']).columns

# Every remaining float64 column is summarised by its per-company mean, in both
# specs, so that train and test end up with the same feature columns.
mean_only_columns = [col for col in total_feature if col not in initial_feature]
for feature in mean_only_columns:
    print(feature)
    aggs1[feature] = ['mean']
    aggs2[feature] = ['mean']
print('done')



def _aggregate_per_gvkey(frame, aggs):
    """Group ``frame`` by ``gvkey``, apply ``aggs`` and return flat-named columns.

    The aggregated columns are renamed with ``create_new_col(aggs)`` and
    ``gvkey`` is moved from the index back into a regular column.
    """
    grouped = frame.groupby('gvkey').agg(aggs)
    grouped.columns = create_new_col(aggs)
    return grouped.reset_index()


# Training set: one row per company; the aggregated target is split off into y
# and removed from the feature frame.
train_agg = _aggregate_per_gvkey(train, aggs1)
y = train_agg.pop('Restate_Int_max')

# Test set: same aggregation without the target column.
test_agg = _aggregate_per_gvkey(test, aggs2)

acominc
ap
aqc
at
bkvlps
capx
ceq
ceqt
ch
che
cogs
cstk
dltt
dp
dv
dvc
dvt
ebit
ebitda
epsfi
epspi
gdwl
gp
intan
invt
ivst
lt
ni
ppent
pstk
re
rect
revt
seq
tstk
dvpsp_f
dvpsx_f
au
auop
auopic
done


In [6]:
# Replace missing aggregated values in the training features with 0
# (presumably because the SMOTE step that follows cannot handle NaN — confirm).
train_agg = train_agg.fillna(value=0)

In [7]:
from imblearn.over_sampling import SMOTE

# Balance the classes by synthesising minority-class rows with SMOTE.
#
# NOTE(review): the oversampling happens before the cross-validation split in
# the training cell, so a validation fold can contain synthetic rows that were
# interpolated from rows in the corresponding training folds. The reported CV
# AUC is therefore likely optimistic; consider resampling inside each fold.
# NOTE(review): train_agg still contains the `gvkey` identifier at this point,
# so SMOTE interpolates it like any other numeric feature — confirm intended.
sm = SMOTE(random_state=2)
columns = train_agg.columns

# `fit_sample` was deprecated in imbalanced-learn 0.4 and removed in 0.8 in
# favour of `fit_resample`; prefer the new name and fall back to the old one so
# the cell runs on both old and current releases.
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
train_agg, y = resample(train_agg, y)

# Older releases return NumPy arrays while newer ones return pandas objects.
# Going through np.asarray yields the same frames either way: the original
# feature names on train_agg and a single target column labelled 0.
train_agg = pd.DataFrame(data=np.asarray(train_agg), columns=columns)
y = pd.DataFrame(data=np.asarray(y))

In [8]:
# Preview the first rows of the training features after the SMOTE resampling.
train_agg.head()

Unnamed: 0,gvkey,Weekday_mean,Weekday_max,Weekday_min,Week_num_mean,Week_num_max,Week_num_min,acominc_mean,ap_mean,aqc_mean,...,re_mean,rect_mean,revt_mean,seq_mean,tstk_mean,dvpsp_f_mean,dvpsx_f_mean,au_mean,auop_mean,auopic_mean
0,1004.0,3.0,4.0,0.0,28.8,29.0,28.0,-18.879,104.3742,63.5354,...,303.8758,197.2402,1223.8998,581.2032,91.6036,0.0,0.0,6.0,1.6,1.0
1,1013.0,1.5,4.0,0.0,27.0,52.0,2.0,-11.8,89.35,95.3,...,-542.425,222.775,1307.425,892.3,0.0,0.0,0.0,4.0,2.5,1.0
2,1021.0,3.0,4.0,1.0,39.0,41.0,37.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1034.0,3.0,3.0,3.0,13.0,13.0,13.0,47.852,35.36,0.0,...,-175.285,90.898,553.617,918.078,7.644,0.18,0.18,11.0,1.0,2.0
4,1045.0,3.4,4.0,2.0,8.0,8.0,8.0,-1500.2,1069.8,0.0,...,-4311.0,917.0,21970.8,-1170.2,449.4,0.0,0.0,4.0,2.2,1.0


In [9]:
# Check the dtype of the (resampled) target column.
y.dtypes

0    int64
dtype: object

In [10]:
# Feature matrices: per-company aggregates for training and for the test set.
X_train = train_agg
X_test = test_agg

# Number of training rows and number of feature columns.
num_train, num_feature = X_train.shape

# LightGBM hyper-parameters: binary classification evaluated with AUC, small
# trees (5 leaves) and a very low learning rate, which is why the round budget
# and the early-stopping patience below are so large.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 5,
    'learning_rate': 0.001,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1000,
}


#------------------------------------------------------------
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

NUM_BOOST_ROUND = 50000        # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 5000   # stop once validation AUC stalls for this many rounds
LOG_PERIOD = 1000              # print the validation metric every N rounds


def _early_stopping_kwargs():
    """Return the ``lgb.train`` keyword arguments for early stopping and logging.

    LightGBM 4.0 removed the ``early_stopping_rounds`` and ``verbose_eval``
    arguments of ``lgb.train`` in favour of callbacks. ``lgb.log_evaluation``
    only exists from 3.3 onwards, so its presence is used to decide which API
    to use; older releases keep receiving the original keyword arguments.
    Fresh callback objects are created on every call so no state is shared
    between folds.
    """
    if hasattr(lgb, 'log_evaluation'):
        return {
            'callbacks': [
                lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
                lgb.log_evaluation(period=LOG_PERIOD),
            ]
        }
    return {
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
        'verbose_eval': LOG_PERIOD,
    }


# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows.
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])
feature_name = list(X_train.columns)
fold_importances = []  # one importance frame per fold, concatenated after the loop
print(feature_name)
print('Starting training...')
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y)):
    print('fold {} '.format(n_fold))
    trn_x, trn_y = X_train[feature_name].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X_train[feature_name].iloc[val_idx], y.iloc[val_idx]
    lgb_train = lgb.Dataset(trn_x, trn_y)
    lgb_eval = lgb.Dataset(val_x, val_y)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=NUM_BOOST_ROUND,
                    valid_sets=lgb_eval,
                    **_early_stopping_kwargs())

    # Out-of-fold prediction for the rows held out in this fold.
    oof_preds[val_idx] = gbm.predict(val_x, num_iteration=gbm.best_iteration)
    # Each fold contributes 1/n_splits of the final test prediction.
    sub_preds += gbm.predict(X_test[feature_name], num_iteration=gbm.best_iteration) / folds.n_splits

    # Record the log1p-scaled gain importance of every feature for this fold.
    fold_importance_df = pd.DataFrame({
        "feature": feature_name,
        "importance": np.log1p(gbm.feature_importance(
            importance_type='gain',
            iteration=gbm.best_iteration)),
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)

# Concatenate once instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

['gvkey', 'Weekday_mean', 'Weekday_max', 'Weekday_min', 'Week_num_mean', 'Week_num_max', 'Week_num_min', 'acominc_mean', 'ap_mean', 'aqc_mean', 'at_mean', 'bkvlps_mean', 'capx_mean', 'ceq_mean', 'ceqt_mean', 'ch_mean', 'che_mean', 'cogs_mean', 'cstk_mean', 'dltt_mean', 'dp_mean', 'dv_mean', 'dvc_mean', 'dvt_mean', 'ebit_mean', 'ebitda_mean', 'epsfi_mean', 'epspi_mean', 'gdwl_mean', 'gp_mean', 'intan_mean', 'invt_mean', 'ivst_mean', 'lt_mean', 'ni_mean', 'ppent_mean', 'pstk_mean', 're_mean', 'rect_mean', 'revt_mean', 'seq_mean', 'tstk_mean', 'dvpsp_f_mean', 'dvpsx_f_mean', 'au_mean', 'auop_mean', 'auopic_mean']
Starting training...
fold 0 
Training until validation scores don't improve for 5000 rounds.
[1000]	valid_0's auc: 0.91268
[2000]	valid_0's auc: 0.948423
[3000]	valid_0's auc: 0.959239
[4000]	valid_0's auc: 0.964292
[5000]	valid_0's auc: 0.967594
[6000]	valid_0's auc: 0.970301
[7000]	valid_0's auc: 0.97258
[8000]	valid_0's auc: 0.974522
[9000]	valid_0's auc: 0.975924
[10000]	vali

[18000]	valid_0's auc: 0.967058
[19000]	valid_0's auc: 0.967514
[20000]	valid_0's auc: 0.967813
[21000]	valid_0's auc: 0.968119
[22000]	valid_0's auc: 0.968444
[23000]	valid_0's auc: 0.968835
[24000]	valid_0's auc: 0.969284
[25000]	valid_0's auc: 0.969531
[26000]	valid_0's auc: 0.970045
[27000]	valid_0's auc: 0.970442
[28000]	valid_0's auc: 0.970761
[29000]	valid_0's auc: 0.97108
[30000]	valid_0's auc: 0.971197
[31000]	valid_0's auc: 0.971483
[32000]	valid_0's auc: 0.971678
[33000]	valid_0's auc: 0.972232
[34000]	valid_0's auc: 0.972466
[35000]	valid_0's auc: 0.972863
[36000]	valid_0's auc: 0.97311
[37000]	valid_0's auc: 0.973383
[38000]	valid_0's auc: 0.97354
[39000]	valid_0's auc: 0.973806
[40000]	valid_0's auc: 0.973878
[41000]	valid_0's auc: 0.973956
[42000]	valid_0's auc: 0.974132
[43000]	valid_0's auc: 0.974243
[44000]	valid_0's auc: 0.974288
[45000]	valid_0's auc: 0.974366
[46000]	valid_0's auc: 0.97449
[47000]	valid_0's auc: 0.974672
[48000]	valid_0's auc: 0.974913
[49000]	vali

[3000]	valid_0's auc: 0.943832
[4000]	valid_0's auc: 0.951885
[5000]	valid_0's auc: 0.955731
[6000]	valid_0's auc: 0.959265
[7000]	valid_0's auc: 0.961582
[8000]	valid_0's auc: 0.963241
[9000]	valid_0's auc: 0.964393
[10000]	valid_0's auc: 0.965174
[11000]	valid_0's auc: 0.965909
[12000]	valid_0's auc: 0.966606
[13000]	valid_0's auc: 0.967094
[14000]	valid_0's auc: 0.967663
[15000]	valid_0's auc: 0.968249
[16000]	valid_0's auc: 0.968672
[17000]	valid_0's auc: 0.969056
[18000]	valid_0's auc: 0.969485
[19000]	valid_0's auc: 0.969889
[20000]	valid_0's auc: 0.970214
[21000]	valid_0's auc: 0.970605
[22000]	valid_0's auc: 0.971002
[23000]	valid_0's auc: 0.971268
[24000]	valid_0's auc: 0.971535
[25000]	valid_0's auc: 0.971952
[26000]	valid_0's auc: 0.972264
[27000]	valid_0's auc: 0.972583
[28000]	valid_0's auc: 0.972837
[29000]	valid_0's auc: 0.973097
[30000]	valid_0's auc: 0.973351
[31000]	valid_0's auc: 0.973624
[32000]	valid_0's auc: 0.973852
[33000]	valid_0's auc: 0.973943
[34000]	valid_0

In [25]:
# Persist the out-of-fold and the test predictions as NumPy arrays.
# np.save appends ".npy" when the file name does not already end with it, so
# these end up on disk as "oof_lightgbm.np.npy" and "test_lightgbm.np.npy".
for target_path, predictions in (
    ('../3_metadata/oof_lightgbm.np', oof_preds),
    ('../3_metadata/test_lightgbm.np', sub_preds),
):
    np.save(target_path, predictions)

In [26]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the out-of-fold predictions against the training target.
# Both y and oof_preds cover the SMOTE-resampled rows, so synthetic samples
# are part of this score.
rocauc = roc_auc_score(y, oof_preds)
cv_report = "CV score: {:<8.5f}".format(rocauc)
print(cv_report)

CV score: 0.97760 


In [27]:
# Start from the sample submission and overwrite its target column with the
# fold-averaged test predictions.
# NOTE(review): assumes the sample-submission rows are in the same order as
# test_agg (one row per gvkey, in groupby order) — confirm.
submission = pd.read_csv('../1_Data/Restate_sampleSubmission.csv').assign(
    Restate_Int=sub_preds
)

In [28]:
# Write the submission file without the DataFrame index column.
submission.to_csv('../3_metadata/test_lightgbm.csv',index = False)