In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [16]:
# Load the train / test metadata tables.
train = pd.read_csv('../1_Data/Metadata/train.csv')
test = pd.read_csv('../1_Data/Metadata/test.csv')

# gvkey of every training row whose Restate_Int label equals 1.
is_restated = train['Restate_Int'] == 1
restate_list = train.loc[is_restated, 'gvkey'].values


# LightGBM model

In [3]:
# List every column available in the training frame.
train.columns

Index(['gvkey', 'year', 'Filing', 'Date', 'Restate_Int', 'acominc', 'ap',
       'aqc', 'at', 'bkvlps', 'capx', 'ceq', 'ceqt', 'ch', 'che', 'cogs',
       'cstk', 'dltt', 'dp', 'dv', 'dvc', 'dvt', 'ebit', 'ebitda', 'epsfi',
       'epspi', 'gdwl', 'gp', 'intan', 'invt', 'ivst', 'lt', 'ni', 'ppent',
       'pstk', 're', 'rect', 'revt', 'seq', 'tstk', 'dvpsp_f', 'dvpsx_f', 'au',
       'auop', 'auopic', 'Weekday', 'Date_lag', 'Date_diff', 'Week_num'],
      dtype='object')

In [4]:
# Show which columns are stored as float64.
train.select_dtypes(include=['float64']).columns

Index(['acominc', 'ap', 'aqc', 'at', 'bkvlps', 'capx', 'ceq', 'ceqt', 'ch',
       'che', 'cogs', 'cstk', 'dltt', 'dp', 'dv', 'dvc', 'dvt', 'ebit',
       'ebitda', 'epsfi', 'epspi', 'gdwl', 'gp', 'intan', 'invt', 'ivst', 'lt',
       'ni', 'ppent', 'pstk', 're', 'rect', 'revt', 'seq', 'tstk', 'dvpsp_f',
       'dvpsx_f', 'au', 'auop', 'auopic', 'Date_diff'],
      dtype='object')

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,gvkey,year,Filing,Date,Restate_Int,acominc,ap,aqc,at,bkvlps,...,tstk,dvpsp_f,dvpsx_f,au,auop,auopic,Weekday,Date_lag,Date_diff,Week_num
0,1004,2005,0001104659-05-033688,2005-07-22,0,-13.842,97.002,0.0,978.819,11.5326,...,69.664,0.0,0.0,6.0,1.0,1.0,4,,,29
1,1004,2006,0001104659-06-047248,2006-07-17,0,-13.899,110.239,38.478,1067.633,13.0998,...,79.813,0.0,0.0,6.0,1.0,1.0,0,2005-07-22,-5.0,29
2,1004,2007,0001104659-07-055173,2007-07-20,0,-13.012,99.073,85.21,1362.01,15.0944,...,100.935,0.0,0.0,6.0,1.0,1.0,4,2006-07-17,3.0,29
3,1004,2008,0001047469-08-008126,2008-07-11,0,-23.996,100.651,0.0,1377.511,16.8937,...,103.159,0.0,0.0,6.0,1.0,1.0,4,2007-07-20,-8.0,28
4,1004,2009,0001047469-09-006783,2009-07-16,0,-29.646,114.906,193.989,1501.042,18.9167,...,104.447,0.0,0.0,6.0,4.0,1.0,3,2008-07-11,5.0,29


In [6]:
# Drop the identifier and non-numeric columns (gvkey, year, Filing, Date,
# Date_lag) from both frames so that only model features (plus the label in
# train) remain.
col_del = ['gvkey','year','Filing','Date','Date_lag']
for col in col_del:
    del train[col]
    print('Deleted column {} from train'.format(col))
    del test[col]
    # Report the frame that was actually modified (this message used to say "train").
    print('Deleted column {} from test'.format(col))

Deleted column gvkey from train
Deleted column gvkey from train
Deleted column year from train
Deleted column year from train
Deleted column Filing from train
Deleted column Filing from train
Deleted column Date from train
Deleted column Date from train
Deleted column Date_lag from train
Deleted column Date_lag from train


In [7]:
# Split the label off the feature frame: pop() returns the column and removes
# it from train in one step.
y = train.pop('Restate_Int')

In [8]:
# Imported at the top of the cell so the dependency is visible before use.
from sklearn.model_selection import StratifiedKFold

# Aliases for the prepared frames (no copy is made).
X_train = train
X_test = test

# Number of training rows and number of features.
num_train, num_feature = X_train.shape

# LightGBM parameters: gradient-boosted trees for binary classification,
# evaluated with AUC.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 20,
    'learning_rate': 0.001,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 20,
    'verbose': 1000,
}

# 10-fold stratified CV with a fixed seed so the fold assignment is repeatable.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])
feature_name = list(X_train.columns)
print(feature_name)
print('Starting training...')

# Per-fold importance tables are collected in a list and concatenated once
# after the loop, instead of growing a DataFrame with pd.concat on every fold.
fold_importances = []
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y)):
    print('fold {} '.format(n_fold))
    trn_x, trn_y = X_train[feature_name].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X_train[feature_name].iloc[val_idx], y.iloc[val_idx]
    lgb_train = lgb.Dataset(trn_x, trn_y)
    lgb_eval = lgb.Dataset(val_x, val_y)

    # NOTE(review): best_iteration is selected on the same fold that is later
    # scored, so the reported CV AUC is likely optimistic.
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # appear to be replaced by callbacks in newer LightGBM releases - confirm
    # against the installed version before upgrading.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=50000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5000,
                    verbose_eval=1000)

    # Out-of-fold prediction for the rows held out in this fold.
    oof_preds[val_idx] = gbm.predict(val_x, num_iteration=gbm.best_iteration)

    # Test prediction, averaged over all folds.
    sub_preds += gbm.predict(X_test[feature_name], num_iteration=gbm.best_iteration) / folds.n_splits

    # Gain-based feature importance (log1p-scaled) for this fold; folds are
    # numbered from 1 in the table.
    fold_importance_df = pd.DataFrame({
        "feature": feature_name,
        "importance": np.log1p(gbm.feature_importance(
            importance_type='gain',
            iteration=gbm.best_iteration)),
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)

feature_importance_df = pd.concat(fold_importances, axis=0)

['acominc', 'ap', 'aqc', 'at', 'bkvlps', 'capx', 'ceq', 'ceqt', 'ch', 'che', 'cogs', 'cstk', 'dltt', 'dp', 'dv', 'dvc', 'dvt', 'ebit', 'ebitda', 'epsfi', 'epspi', 'gdwl', 'gp', 'intan', 'invt', 'ivst', 'lt', 'ni', 'ppent', 'pstk', 're', 'rect', 'revt', 'seq', 'tstk', 'dvpsp_f', 'dvpsx_f', 'au', 'auop', 'auopic', 'Weekday', 'Date_diff', 'Week_num']
Starting training...
fold 0 
Training until validation scores don't improve for 5000 rounds.
[1000]	valid_0's auc: 0.675193
[2000]	valid_0's auc: 0.662963
[3000]	valid_0's auc: 0.651679
[4000]	valid_0's auc: 0.651734
[5000]	valid_0's auc: 0.637829
Early stopping, best iteration is:
[79]	valid_0's auc: 0.696733
fold 1 
Training until validation scores don't improve for 5000 rounds.
[1000]	valid_0's auc: 0.665083
[2000]	valid_0's auc: 0.675676
[3000]	valid_0's auc: 0.681136
[4000]	valid_0's auc: 0.678642
[5000]	valid_0's auc: 0.674402
[6000]	valid_0's auc: 0.672363
[7000]	valid_0's auc: 0.673291
Early stopping, best iteration is:
[2700]	valid_0

In [54]:
# Persist the out-of-fold and test predictions as NumPy arrays.
# Note: np.save appends '.npy' when the name does not already end with it,
# so the files on disk end in '.np.npy'.
for out_path, preds in (
    ('../3_metadata/oof_lightgbm_no_agg.np', oof_preds),
    ('../3_metadata/test_lightgbm_no_agg.np', sub_preds),
):
    np.save(out_path, preds)

In [55]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the out-of-fold predictions against the training labels.
rocauc = roc_auc_score(y_true=y, y_score=oof_preds)
cv_report = "CV score: {:<8.5f}".format(rocauc)
print(cv_report)

CV score: 0.63603 


In [56]:
# Start from the sample submission and fill Restate_Int with the
# fold-averaged test predictions.
submission = (
    pd.read_csv('../1_Data/Restate_sampleSubmission.csv')
    .assign(Restate_Int=sub_preds)
)

In [57]:
# Write the submission frame to CSV without the index column.
submission.to_csv('../3_metadata/test_lightgbm_no_agg.csv',index = False)

# Post-processing

In [58]:
# Build the lookup once: set membership is O(1), whereas `x in restate_list`
# on a NumPy array scans the whole array for every row.
_restate_keys = frozenset(restate_list)


def post_processing(x):
    """Return 1 if gvkey ``x`` appears in ``restate_list``, else 0."""
    return 1 if x in _restate_keys else 0


# Flag each submission row whose gvkey appears in restate_list.
submission['boosting'] = submission['gvkey'].apply(post_processing)

In [59]:
# Add the 0/1 boosting flag to the model score (element-wise).
submission['Restate_Int'] = submission['Restate_Int'].add(submission['boosting'])


In [60]:
# Cap the boosted scores at 1.
# The result is assigned back instead of calling clip(inplace=True) on the
# selected column: that chained in-place form acts on an intermediate object
# and leaves `submission` unchanged under pandas Copy-on-Write.
submission['Restate_Int'] = submission['Restate_Int'].clip(upper=1)

In [61]:
# Sanity check: display the largest value in Restate_Int.
submission['Restate_Int'].max()

1.0

In [62]:
# Save only the gvkey and Restate_Int columns of the post-processed submission, without the index.
submission[['gvkey','Restate_Int']].to_csv('../3_metadata/test_lightgbm_no_agg_postprocessing2.csv',index = False)