In [68]:
import gc
from datetime import datetime

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc, f1_score
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
seed = 45
#seed =145
% matplotlib inline

In [7]:
# Directory layout: competition data, out-of-fold (OOF) predictions of the
# base models, and the matching test-set prediction files.
PATH = 'F:/AV/WNS'
OOF_PATH = 'F:/AV/WNS/oof'
submission_path = 'F:/AV/WNS/submission'

train_csv = 'train_catboost.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

# One file name per base model. The same name is looked up under OOF_PATH
# and under submission_path.
oof_files = [
    'catboost_10fld_bagtemp_20180916173347.csv',
    'catboost_10fld_20180916082333.csv',
    'catboost_5fld_baggingtemp0.85_20180916070735.csv',
    'xgb_10fld_20180915194445.csv',
    'xgb_10fld_20180915183532.csv',
    'lightgbm_10fld_20180915124747.csv',
    'logreg_20180915130548.csv',
]
test_files = oof_files

# Ground-truth labels for the training rows and the submission template.
Y_true = pd.read_csv(f'{PATH}/{train_csv}')['is_promoted']
submission = pd.read_csv(f'{PATH}/{submit_csv}')

In [24]:
# Side-by-side OOF predictions: one column per base-model file.
oof_df = pd.concat(
    [pd.read_csv(f'{OOF_PATH}/{fname}') for fname in oof_files],
    axis=1,
)
# Matching test-set predictions; the first column of each file is dropped
# (presumably the employee_id column - confirm against the submission files).
test_df = pd.concat(
    [pd.read_csv(f'{submission_path}/{fname}').iloc[:, 1:] for fname in oof_files],
    axis=1,
)

In [28]:
# Sanity check: there must be exactly one label per OOF prediction row.
len(Y_true) == len(oof_df)

True

In [31]:
# Number of training labels.
Y_true.shape

(54808,)

In [30]:
# Rows x base-model columns of the stacked test-set predictions.
test_df.shape

(23490, 7)

In [29]:
# Rows x base-model columns of the stacked OOF predictions.
oof_df.shape

(54808, 7)

In [51]:
# Preview the first rows of the stacked OOF predictions.
oof_df.head()

Unnamed: 0,catboost_10fld_bagtemp_0.5282,catboost_10fld_0.531,catboost_5fld_baggingtemp0.85_0.5275,xgb_10fld_0.5215,xgb_10fld_0.5211,lightgbm_10fld_0.5254,logreg_0.3322
0,0.22945,0.233136,0.197621,0.24068,0.276706,0.251634,0.678898
1,0.015912,0.020193,0.019009,0.001845,0.001657,0.004689,0.143959
2,1.3e-05,9.6e-05,0.0002,0.000501,0.007921,0.001129,0.074057
3,1.2e-05,3.1e-05,3.5e-05,0.000347,0.000353,0.0002,0.102983
4,0.005401,0.002351,0.001812,0.005591,0.002578,0.002808,0.013852


In [38]:
# Give the test frame the same column names as the OOF frame so that a
# feature list built from oof_df.columns also selects columns of test_df.
# This relies on both frames being concatenated from oof_files in the same order.
test_df.columns = oof_df.columns
test_df.head()

Unnamed: 0,catboost_10fld_bagtemp_0.5282,catboost_10fld_0.531,catboost_5fld_baggingtemp0.85_0.5275,xgb_10fld_0.5215,xgb_10fld_0.5211,lightgbm_10fld_0.5254,logreg_0.3322
0,0.126459,0.137725,0.139478,0.19205,0.194406,0.190533,0.580263
1,0.000322,0.000372,0.000424,0.000745,0.001497,0.00185,0.156239
2,2.2e-05,2.9e-05,5.6e-05,0.000366,0.001258,0.00088,0.044435
3,0.000764,0.000878,0.001002,0.001621,0.00216,0.002967,0.027826
4,0.000743,0.000848,0.000984,0.001443,0.001753,0.002334,0.069927


In [66]:
def kfold_lightgbm(train_df,test_df, target,num_folds, stratified = False, debug= False,modelname="lightgbm_0",
                   exclude_cols=('employee_id','index','logreg_0.3322')):
    """Fit a K-fold LightGBM model and collect out-of-fold and test predictions.

    One LGBMClassifier is trained per fold. Its predictions on the held-out
    fold fill the out-of-fold (OOF) vector, and its predictions on ``test_df``
    are averaged over all folds. F1 scores at several fixed probability
    thresholds are printed per fold and for the full OOF vector.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features (here: base-model OOF predictions).
    test_df : pandas.DataFrame
        Test features; must contain every feature column used from ``train_df``.
    target : array-like
        Binary labels aligned *by position* with the rows of ``train_df``.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool
        Use StratifiedKFold when True, plain KFold otherwise.
    debug : bool
        When True nothing is written to disk.
    modelname : str
        Prefix for the written file names and the OOF column name.
    exclude_cols : iterable of str or None
        Columns of ``train_df`` that must not be used as features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(oof_preds, sub_preds)``: OOF probabilities for the training rows
        and fold-averaged probabilities for the test rows.

    Notes
    -----
    Reads the notebook-level ``seed``. When ``debug`` is False it also uses
    ``submission`` (its ``is_promoted`` column is overwritten in place),
    ``submission_path`` and ``OOF_PATH`` to write the two CSV files.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Cross validation splitter
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits= num_folds, shuffle=True, random_state=seed)

    # The splitter yields positional indices. Converting the labels to a plain
    # array makes labels[idx] positional as well; indexing a Series with []
    # is label-based and fails or misaligns when its index is not 0..n-1.
    labels = np.asarray(target)

    # Arrays that accumulate the results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    excluded = set(exclude_cols or ())
    feats = [f for f in train_df.columns if f not in excluded]
    print(feats)

    # (format string, probability threshold) for the per-fold reports
    fold_reports = (
        ('Fold %2d F-score : %.6f', 0.3),
        ('Fold %2d > 0.2 F-score : %.6f', 0.2),
        ('Fold %2d > 0.1 F-score : %.6f', 0.1),
    )

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], labels)):
        train_x, train_y = train_df[feats].iloc[train_idx], labels[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], labels[valid_idx]

        clf = LGBMClassifier(
            nthread=4,
            n_estimators=5000,
            learning_rate=0.05,
            max_depth=3,
#            reg_lambda=1,
            silent=-1,
            verbose=-1, )

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords
        # depend on the installed LightGBM version - confirm before upgrading.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share of the averaged test prediction.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        for fmt, threshold in fold_reports:
            print(fmt % (n_fold + 1, f1_score(valid_y, (oof_preds[valid_idx]>threshold).astype(int))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    for threshold in (0.4, 0.3, 0.2, 0.1):
        print('Full > %s f1 score %.6f' % (threshold, f1_score(labels, (oof_preds>threshold).astype(int))))

    # Write the submission file and the OOF predictions
    if not debug:
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        submission['is_promoted'] = sub_preds
        submission[['employee_id', 'is_promoted']].to_csv(
            '{}/{}_{}.csv'.format(submission_path, modelname, timestamp), index= False)
        # The OOF column name carries the full-data F1 at the 0.3 threshold.
        score = f1_score(labels, (oof_preds>0.3).astype(int))
        oof = pd.DataFrame(oof_preds)
        oof.columns = [str(modelname)+'_'+str(round(score,4))]
        oof.to_csv('{}/{}_{}.csv'.format(OOF_PATH, modelname, timestamp), index=False)
    return oof_preds, sub_preds

In [67]:
# Dry run of the stacker (debug=True: no files are written).
# `gc` is imported with the other modules in the notebook's import cell.
oof = kfold_lightgbm(oof_df,test_df, Y_true,num_folds=10, stratified = True, debug= True,modelname="LGBM_Stack_v1")

Starting LightGBM. Train shape: (54808, 7), test shape: (23490, 7)
['catboost_10fld_bagtemp_0.5282', 'catboost_10fld_0.531', 'catboost_5fld_baggingtemp0.85_0.5275', 'xgb_10fld_0.5215', 'xgb_10fld_0.5211', 'lightgbm_10fld_0.5254']
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.916438	valid_1's auc: 0.91305
[200]	training's auc: 0.91784	valid_1's auc: 0.913143
Early stopping, best iteration is:
[35]	training's auc: 0.916137	valid_1's auc: 0.913204
Fold  1 F-score : 0.540785
Fold  1 > 0.2 F-score : 0.513545
Fold  1 > 0.1 F-score : 0.399801
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.915866	valid_1's auc: 0.919161
[200]	training's auc: 0.917197	valid_1's auc: 0.919064
Early stopping, best iteration is:
[85]	training's auc: 0.915826	valid_1's auc: 0.919225
Fold  2 F-score : 0.544656
Fold  2 > 0.2 F-score : 0.497542
Fold  2 > 0.1 F-score : 0.399057
Training until validation scores don't improve for 200 rounds.