In [None]:
## imports

import numpy as np
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
# Console display limits: shorten long numpy arrays, show up to 100 DataFrame columns.
np.set_printoptions(**{'threshold': 50, 'edgeitems': 20})
pd.options.display.max_columns = 100
from lightgbm import LGBMClassifier
from datetime import datetime
from catboost import CatBoostClassifier, Pool, cv

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
seed = 45
#seed =145
% matplotlib inline
plt.style.use('fivethirtyeight')

In [None]:
### read data (raw files with some category names changed)
PATH='F:/AV/WNS'
train_csv = 'train_catboost.csv'
test_csv = 'test_catboost.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

### load train, test and submission files, in that order
train, test, submission = [
    pd.read_csv(f'{PATH}/{csv_name}') for csv_name in (train_csv, test_csv, submit_csv)
]

shapes = ('train', train.shape, 'test', test.shape, 'submission', submission.shape)
print("Shape of {}:{} {}:{} {}:{}".format(*shapes))

In [None]:
### inspect data: first rows of the training frame

train.head()

In [None]:
### class balance: row counts per value of the target column

train['is_promoted'].value_counts()

## so approx 10% of past employees have been promoted
## (author's reading of the counts above; the cell itself shows counts, not percentages)

In [None]:
### let's check whether any employee appears more than once

train['employee_id'].is_unique

### so all IDs are unique

In [None]:
## null values: count them per column and keep only the columns that have any

null_counts = train.isnull().sum()
null_columns = null_counts[null_counts > 0].index
null_counts[null_columns]

## so 2 columns have null values

In [None]:
### inspect null value columns
a = train[(train.education.isnull())]
_ = train[(train.education.isnull() | train.previous_year_rating.isnull())]

print(a.shape,_.shape)

In [None]:
set(a.index).intersection(set(_.index)) == set(a.index)

### so everywhere where education is not present prev year rating is also not present, but vice-versa is not true

In [None]:
_.head(10)

In [None]:
### check avg % of people promoted with NA in previous ye rating vs without NA

print(np.mean(_['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

In [None]:
### check avg % of people promoted with NA in education vs without NA

print(np.mean(a['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

In [None]:
## Missing education becomes the label 'unknown'; missing previous year rating becomes the sentinel 9999.
## The same fill is applied to train and test.

for frame in (train, test):
    frame['education'] = frame['education'].fillna('unknown')
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(9999)

In [None]:
### merge dataframes for ease of processing
# pop() removes the target column from `train` in place and hands it back.
Y = train.pop('is_promoted').values
# Marker column so the two parts can be separated again after processing.
train['train'], test['train'] = 'train', 'test'
merged = pd.concat([train, test])
merged.shape

In [None]:
cat_cols = [i for i in merged.columns if merged[i].dtypes == 'object']+['KPIs_met >80%','awards_won?','previous_year_rating']
cat_cols

In [None]:
cat_cols.remove('train')
print(cat_cols)
#cat_idx = [merged.columns.get_loc(c) for c in merged.columns if c in cat_cols]

In [None]:
# First rows of the combined train+test frame (includes the 'train' marker column).
merged.head()

In [None]:
### build the CatBoost inputs from the merged frame

# Feature columns: everything except the row identifier and the train/test marker.
# Built with a filter (not list.remove) so the cell can be re-run safely.
tr_cols = [c for c in merged.columns if c not in ('employee_id', 'train')]
print(tr_cols)

# Split the merged frame back into its two parts. drop() returns new frames here,
# which avoids the SettingWithCopyWarning raised by inplace drops on a slice.
tr = merged[merged['train']=='train'].drop('train',axis=1)
te = merged[merged['train']=='test'].drop('train',axis=1)

X_train = tr[tr_cols].copy()
Y_train = Y
X_test = te[tr_cols].copy()

# CatBoost only accepts integer or string values in categorical columns.
# previous_year_rating had missing values, so it is float even after fillna(9999);
# any float categorical column is therefore stored as strings.
for col in cat_cols:
    if pd.api.types.is_float_dtype(X_train[col]):
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

# Positional indices of the categorical columns inside X_train / X_test.
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(X_train, Y_train, cat_features=cat_idx)

In [None]:
# First five rows of the categorical feature columns only.
X_train.iloc[:5, cat_idx]

In [None]:
# First rows of the training feature matrix.
X_train.head()

In [None]:
# Positional indices of the categorical columns within X_train.
cat_idx

In [None]:
# def pre_process(df,cat_cols):
#     one_hot_encoded_training_predictors = pd.get_dummies(df[cat_cols])
#     df.drop(cat_cols,inplace=True,axis=1)
#     _ = pd.concat([df,one_hot_encoded_training_predictors],1)
#     new_tr, new_tst = _[_['train']=='train'],_[_['train']=='test']
#     new_tr.drop('train',inplace=True,axis=1)
#     new_tst.drop('train',inplace=True,axis=1)
#     return new_tr, new_tst

In [None]:
# train_OHE,test_OHE = pre_process(merged,cat_cols)

In [None]:
def kfold_lcatboost(train_df,test_df, target,num_folds, stratified = False, debug= False,modelname="catboost",
                    threshold=0.3, cat_features=None, out_dir='F:/AV/WNS'):
    """Train a CatBoost classifier with K-fold cross-validation.

    For each fold a fresh CatBoostClassifier is fitted on the training part and
    used to predict (a) the held-out part, giving out-of-fold probabilities, and
    (b) the full test frame; the test probabilities are averaged across folds.
    Per-fold and overall F1 scores are printed using `threshold` as the cut-off.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Feature frames. Columns named 'employee_id' or 'index' are ignored.
    target : array-like
        Labels aligned positionally with the rows of `train_df`.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool
        Use StratifiedKFold instead of KFold.
    debug : bool
        When True, nothing is written to disk.
    modelname : str
        Prefix of the output file names and of the OOF column name.
    threshold : float
        Probability cut-off used only for the printed/recorded F1 scores.
    cat_features : list of int, optional
        Positions of the categorical columns among the feature columns.
        Defaults to the notebook-level `cat_idx`.
    out_dir : str
        Directory that receives the 'submission' and 'oof' sub-folders.

    Returns
    -------
    numpy.ndarray
        Out-of-fold predicted probabilities, one per row of `train_df`.

    Side effects
    ------------
    Unless `debug` is set: overwrites `submission['is_promoted']` on the
    notebook-level `submission` frame with the averaged test probabilities
    (not thresholded labels) and writes the submission and OOF CSV files.
    Folds are shuffled with the notebook-level `seed`.
    """
    # Local imports: the notebook only brings these names in indirectly
    # (star imports, or an import in a later cell).
    import gc
    import os
    from sklearn.metrics import f1_score

    # Positional indexing below needs an array; a Series would be indexed by label.
    target = np.asarray(target)
    if cat_features is None:
        cat_features = cat_idx

    print("Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Create the output folders up front so a long training run cannot be lost
    # to a missing directory at write time.
    submission_dir = out_dir + '/submission/'
    oof_dir = out_dir + '/oof/'
    if not debug:
        os.makedirs(submission_dir, exist_ok=True)
        os.makedirs(oof_dir, exist_ok=True)

    # Cross validation model
    splitter = StratifiedKFold if stratified else KFold
    folds = splitter(n_splits= num_folds, shuffle=True, random_state=seed)

    # Create arrays to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []
    feats = [f for f in train_df.columns if f not in ['employee_id','index']]
    train_feats = train_df[feats]
    test_feats = test_df[feats]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_feats, target)):
        train_x, train_y = train_feats.iloc[train_idx], target[train_idx]
        valid_x, valid_y = train_feats.iloc[valid_idx], target[valid_idx]

        # CatBoost hyper-parameters (an earlier comment mislabelled these as LightGBM)
        clf = CatBoostClassifier(
            #nthread=4,
            iterations=5000,
            bagging_temperature = 1.5,
            learning_rate=0.01,
            l2_leaf_reg = 1.25,
            depth=12,
            loss_function='Logloss',
            eval_metric='F1',
            silent=False)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            verbose= 100, early_stopping_rounds= 200,cat_features=cat_features,use_best_model=True)

        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_preds += clf.predict_proba(test_feats)[:, 1] / folds.n_splits

        # Collected per fold and concatenated once after the loop.
        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        fold_score = f1_score(valid_y, (oof_preds[valid_idx]>threshold).astype(int))
        print('Fold %2d F-score : %.6f' % (n_fold + 1, fold_score))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Only consumed by the (currently disabled) importance plot below.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    score = f1_score(target, (oof_preds>threshold).astype(int))
    print('Full f1 score %.6f' % score)

    # Write submission file and out-of-fold predictions
    if not debug:
        stamp = datetime.now().strftime('%Y%m%d%H%M%S')
        Fname = submission_dir+str(modelname)+'_'+str(stamp)+'.csv'
        submission['is_promoted'] = sub_preds
        submission[['employee_id', 'is_promoted']].to_csv(Fname, index= False)
        oof = pd.DataFrame(oof_preds)
        oof.columns = [modelname+'_'+str(round(score,4))]
        OOF_Fname = oof_dir+str(modelname)+'_'+str(stamp)+'.csv'
        oof.to_csv(OOF_Fname,index=False)
    #display_importances(feature_importance_df)
    return oof_preds

In [None]:
# gc is imported here, after the cell that defines kfold_lcatboost; that works
# because the function only looks the name up when it is called.
import gc
# Run 10-fold stratified CV and write the submission/OOF files (debug=False).
# NOTE(review): the model name says "temp_1.25", but kfold_lcatboost sets
# bagging_temperature = 1.5 — confirm which value is intended.
oof = kfold_lcatboost(X_train,X_test, Y,num_folds=10, stratified = True, debug= False,modelname="catboost_10fld_depth12_L2_1.25_temp_1.25")