In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the aggregated train / test tables and the label table in one pass.
train, test, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../1_Data/Metadata/train_agg.csv',
        '../1_Data/Metadata/test_agg.csv',
        '../1_Data/Metadata/y_agg.csv',
    )
)

In [3]:
# Reduce both tables to the 'gvkey' column, then place the label table
# next to the training keys (column-wise concat, aligned on the index).
test = test[['gvkey']]
train = pd.concat([train[['gvkey']], y], axis=1, sort=False)

In [4]:
# Name the two columns, then move the label out of the frame into `y`
# (pop returns the column and removes it from `train` in one step).
train.columns = ['gvkey', 'label']
y = train.pop('label')

In [5]:
# Saved predictions of each base model: out-of-fold arrays for the training
# rows and the matching test-set arrays. Dict order fixes the column order.
oof_prediction_files = {
    'lgbm': '../3_metadata/oof_lightgbm.np.npy',
    'rf': '../3_metadata/oof_RF.np.npy',
    'rf2': '../3_metadata/oof_RF2.np.npy',
    'ridge': '../3_metadata/oof_Ridge.np.npy',
    'ridge2': '../3_metadata/oof_Ridge2.np.npy',
}
test_prediction_files = {
    'lgbm': '../3_metadata/test_lightgbm.np.npy',
    'rf': '../3_metadata/test_RF.np.npy',
    'rf2': '../3_metadata/test_RF2.np.npy',
    'ridge': '../3_metadata/test_Ridge.np.npy',
    'ridge2': '../3_metadata/test_Ridge2.np.npy',
}

# Attach every base model's predictions as one column per model.
for model_column, npy_path in oof_prediction_files.items():
    train[model_column] = np.load(npy_path)

for model_column, npy_path in test_prediction_files.items():
    test[model_column] = np.load(npy_path)




In [6]:
# Preview the first five rows of the stacked training features.
train.head(n=5)

Unnamed: 0,gvkey,lgbm,rf,rf2,ridge,ridge2
0,1004.0,0.045264,0.2,0.387939,0.000113,0.000114
1,1013.0,0.111254,0.34,0.407727,0.000277,0.000277
2,1021.0,0.166657,0.23,0.305808,0.000123,0.000123
3,1034.0,0.016457,0.48,0.394686,0.000113,0.000114
4,1045.0,0.209564,0.62,0.65294,0.000277,0.000277


In [7]:
# Preview the first five rows of the stacked test features.
test.head(n=5)

Unnamed: 0,gvkey,lgbm,rf,rf2,ridge,ridge2
0,1004,0.048633,0.035,0.117944,0.000335,0.000335
1,1045,0.192185,0.629,0.615095,0.000335,0.000335
2,1050,0.008076,0.075,0.142359,0.000335,0.000335
3,1072,0.107215,0.069,0.136787,0.000335,0.000335
4,1075,0.001268,0.048,0.065278,0.000335,0.000335


In [8]:
# Level-2 stacking: fit a LightGBM meta-learner on the base-model predictions
# with 10-fold stratified CV. Produces:
#   oof_preds             - out-of-fold prediction for every training row
#   sub_preds             - test prediction averaged over the folds
#   feature_importance_df - log1p(gain) importance per feature and fold
from sklearn.model_selection import StratifiedKFold

X_train = train
X_test = test

# Number of training rows and number of feature columns.
num_train, num_feature = X_train.shape

# LightGBM parameters for the meta-learner.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 4,
    'learning_rate': 0.001,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1000,
}

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])

# NOTE(review): every column of X_train is used as a feature, which includes
# 'gvkey'. That looks like a row identifier rather than a base-model
# prediction - confirm it is meant to be a model input.
feature_name = list(X_train.columns)
print(feature_name)
print('Starting training...')

# Collect the per-fold importance frames and concatenate once after the loop
# instead of growing a DataFrame inside it.
fold_importances = []

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y)):
    print('fold {} '.format(n_fold))
    trn_x, trn_y = X_train[feature_name].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X_train[feature_name].iloc[val_idx], y.iloc[val_idx]
    lgb_train = lgb.Dataset(trn_x, trn_y)
    # Tie the validation set to the training set so both share one binning.
    lgb_eval = lgb.Dataset(val_x, val_y, reference=lgb_train)

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed in LightGBM 4.0; switch to the lgb.early_stopping and
    # lgb.log_evaluation callbacks when upgrading.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=50000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5000,
                    verbose_eval=1000)

    # Out-of-fold prediction for the rows held out in this fold.
    oof_preds[val_idx] = gbm.predict(val_x, num_iteration=gbm.best_iteration)

    # Test prediction: each fold contributes an equal share of the average.
    sub_preds += gbm.predict(X_test[feature_name], num_iteration=gbm.best_iteration) / folds.n_splits

    # Save the feature importance (log1p of total gain) for this fold.
    fold_importance_df = pd.DataFrame({
        "feature": feature_name,
        "importance": np.log1p(gbm.feature_importance(
            importance_type='gain',
            iteration=gbm.best_iteration)),
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)

feature_importance_df = pd.concat(fold_importances, axis=0)

['gvkey', 'lgbm', 'rf', 'rf2', 'ridge', 'ridge2']
Starting training...
fold 0 
Training until validation scores don't improve for 5000 rounds.
[1000]	valid_0's auc: 0.992214
[2000]	valid_0's auc: 0.993364
[3000]	valid_0's auc: 0.994338
[4000]	valid_0's auc: 0.994484
[5000]	valid_0's auc: 0.994688
[6000]	valid_0's auc: 0.995066
[7000]	valid_0's auc: 0.995351
[8000]	valid_0's auc: 0.995474
[9000]	valid_0's auc: 0.995578
[10000]	valid_0's auc: 0.995656
[11000]	valid_0's auc: 0.995791
[12000]	valid_0's auc: 0.995876
[13000]	valid_0's auc: 0.995869
[14000]	valid_0's auc: 0.995966
[15000]	valid_0's auc: 0.996076
[16000]	valid_0's auc: 0.996173
[17000]	valid_0's auc: 0.996186
[18000]	valid_0's auc: 0.996245
[19000]	valid_0's auc: 0.996297
[20000]	valid_0's auc: 0.996303
[21000]	valid_0's auc: 0.996426
[22000]	valid_0's auc: 0.996452
[23000]	valid_0's auc: 0.996504
[24000]	valid_0's auc: 0.99651
[25000]	valid_0's auc: 0.996478
[26000]	valid_0's auc: 0.996497
[27000]	valid_0's auc: 0.996484
[28

[12000]	valid_0's auc: 0.994794
[13000]	valid_0's auc: 0.994755
Early stopping, best iteration is:
[8882]	valid_0's auc: 0.994976
fold 9 
Training until validation scores don't improve for 5000 rounds.
[1000]	valid_0's auc: 0.979979
[2000]	valid_0's auc: 0.981199
[3000]	valid_0's auc: 0.982452
[4000]	valid_0's auc: 0.984157
[5000]	valid_0's auc: 0.98445
[6000]	valid_0's auc: 0.984587
[7000]	valid_0's auc: 0.985247
[8000]	valid_0's auc: 0.985872
[9000]	valid_0's auc: 0.986334
[10000]	valid_0's auc: 0.986529
[11000]	valid_0's auc: 0.986666
[12000]	valid_0's auc: 0.986822
[13000]	valid_0's auc: 0.98688
[14000]	valid_0's auc: 0.986757
[15000]	valid_0's auc: 0.986822
[16000]	valid_0's auc: 0.98677
[17000]	valid_0's auc: 0.986705
Early stopping, best iteration is:
[12633]	valid_0's auc: 0.986906


In [9]:
# Overall cross-validated ROC AUC, computed on the out-of-fold predictions.
from sklearn.metrics import roc_auc_score

rocauc = roc_auc_score(y, oof_preds)
cv_message = "CV score: {:<8.5f}".format(rocauc)
print(cv_message)

CV score: 0.99201 


In [11]:
# Start from the sample submission and fill in the fold-averaged predictions.
submission = pd.read_csv('../1_Data/Restate_sampleSubmission.csv').assign(
    Restate_Int=sub_preds
)

In [12]:
# Write the stacked predictions to disk, leaving out the DataFrame index.
submission.to_csv(path_or_buf='../5_ensembleoutput/stacking040119.csv', index=False)
