In [1]:
## imports

import numpy as np
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
pd.set_option('display.max_columns', 100)
from lightgbm import LGBMClassifier
from datetime import datetime
from catboost import CatBoostClassifier, Pool, cv

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
seed = 45
#seed =145
% matplotlib inline
plt.style.use('fivethirtyeight')

In [3]:
### read data with some categorty name changes to raw files
PATH='F:/AV/WNS'
train_csv = 'train_catboost.csv'
test_csv = 'test_catboost.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

### read train, test and submission files
train = pd.read_csv(f'{PATH}/{train_csv}')
test = pd.read_csv(f'{PATH}/{test_csv}')
submission = pd.read_csv(f'{PATH}/{submit_csv}')

print("Shape of {}:{} {}:{} {}:{}".format('train',train.shape,'test',test.shape,'submission',submission.shape))

Shape of train:(54808, 14) test:(23490, 13) submission:(23490, 2)


In [4]:
### inspect data

train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [5]:
### class balance

train['is_promoted'].value_counts()

## so approx 10% of past employees have been promoted

0    50140
1     4668
Name: is_promoted, dtype: int64

In [6]:
### lets check if there is any repeat in employees

len(train['employee_id'].unique()) == train.shape[0]

### so all IDs are unique

True

In [7]:
## null values

null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

## so 2 columns have null values

education               2409
previous_year_rating    4124
dtype: int64

In [8]:
### inspect null value columns
a = train[(train.education.isnull())]
_ = train[(train.education.isnull() | train.previous_year_rating.isnull())]

print(a.shape,_.shape)

(2409, 14) (6148, 14)


In [9]:
set(a.index).intersection(set(_.index)) == set(a.index)

### so everywhere where education is not present prev year rating is also not present, but vice-versa is not true

True

In [10]:
_.head(10)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
10,29934,Technology,region_23,,m,sourcing,1,30,,1,0,0,77,0
21,33332,Operations,region_15,,m,sourcing,1,41,4.0,11,0,0,57,0
23,71177,Procurement,region_5,Bachelor's,m,other,1,27,,1,0,0,70,0
29,74759,Sales_Marketing,region_4,Bachelor's,m,sourcing,1,26,,1,0,0,44,0
32,35465,Sales_Marketing,region_7,,f,sourcing,1,24,1.0,2,0,0,48,0
43,17423,Sales_Marketing,region_2,,m,other,3,24,2.0,2,0,0,48,0
56,45709,Sales_Marketing,region_31,Bachelor's,f,other,1,29,,1,0,0,49,0
58,26599,Sales_Marketing,region_16,Bachelor's,m,other,2,27,,1,1,0,47,0
62,9150,Analytics,region_22,Bachelor's,f,other,1,28,,1,1,0,80,0
66,77981,Finance,region_22,Bachelor's,m,other,1,27,,1,1,1,58,1


In [11]:
### check avg % of people promoted with NA in previous ye rating vs without NA

print(np.mean(_['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.07091737150292778 0.08675738086604706


In [12]:
### check avg % of people promoted with NA in education vs without NA

print(np.mean(a['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.0506434205064342 0.08675738086604706


In [13]:
## For education we will use unknown for all missing values and 9999 for prev year training

train['education'] = train.education.fillna('unknown')
train['previous_year_rating'] = train.previous_year_rating.fillna(9999)

test['education'] = test.education.fillna('unknown')
test['previous_year_rating'] = test.previous_year_rating.fillna(9999)

In [14]:
### merge dataframes for ease of processing
Y = train['is_promoted'].values
train.drop('is_promoted',inplace=True,axis=1)
train['train'] = 'train'
test['train'] = 'test'
merged = pd.concat([train,test])
merged.shape

(78298, 14)

In [15]:
cat_cols = [i for i in merged.columns if merged[i].dtypes == 'object']+['KPIs_met >80%','awards_won?','previous_year_rating']
cat_cols

['department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'train',
 'KPIs_met >80%',
 'awards_won?',
 'previous_year_rating']

In [16]:
cat_cols.remove('train')
print(cat_cols)
#cat_idx = [merged.columns.get_loc(c) for c in merged.columns if c in cat_cols]

['department', 'region', 'education', 'gender', 'recruitment_channel', 'KPIs_met >80%', 'awards_won?', 'previous_year_rating']


In [17]:
merged.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,train
0,65438,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49,train
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,train
2,7513,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,train
3,2542,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,train
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,train


In [18]:
tr_cols = [i for i in merged.columns]
tr_cols.remove('employee_id')
tr  = merged[merged['train']=='train']
te = merged[merged['train']=='test']
tr.drop('train',axis=1,inplace=True)
te.drop('train',axis=1,inplace=True)

print(tr_cols)

### ###
tr_cols.remove('train')
X_train = tr[tr_cols]
Y_train = Y
X_test = te[tr_cols]

#cat_idx = [X_train.columns.get_loc(c) for c in X_train.columns if c in cat_cols]
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(X_train, Y_train, cat_features=cat_idx)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


['department', 'region', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'train']


In [19]:
X_train.iloc[:,cat_idx].head()

Unnamed: 0,department,region,education,gender,recruitment_channel,KPIs_met >80%,awards_won?,previous_year_rating
0,Sales_Marketing,region_7,Masters_above,f,sourcing,1,0,5.0
1,Operations,region_22,Bachelor's,m,other,0,0,5.0
2,Sales_Marketing,region_19,Bachelor's,m,sourcing,0,0,3.0
3,Sales_Marketing,region_23,Bachelor's,m,other,0,0,1.0
4,Technology,region_26,Bachelor's,m,other,0,0,3.0


In [20]:
X_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60
2,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50
3,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73


In [21]:
cat_idx

[0, 1, 2, 3, 4, 9, 10, 7]

In [22]:
# def pre_process(df,cat_cols):
#     one_hot_encoded_training_predictors = pd.get_dummies(df[cat_cols])
#     df.drop(cat_cols,inplace=True,axis=1)
#     _ = pd.concat([df,one_hot_encoded_training_predictors],1)
#     new_tr, new_tst = _[_['train']=='train'],_[_['train']=='test']
#     new_tr.drop('train',inplace=True,axis=1)
#     new_tst.drop('train',inplace=True,axis=1)
#     return new_tr, new_tst

In [23]:
# train_OHE,test_OHE = pre_process(merged,cat_cols)

In [24]:
def kfold_lcatboost(train_df,test_df, target,num_folds, stratified = False, debug= False,modelname="catboost"):
    # Divide in training/validation and test data
    train_df = train_df
    test_df = test_df
    print("Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['employee_id','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], target)):
        train_x, train_y = train_df[feats].iloc[train_idx], target[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], target[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = CatBoostClassifier(
            #nthread=4,
            iterations=5000,
            learning_rate=0.01,
            l2_leaf_reg = 1,
            depth=10,
            loss_function='Logloss',
            eval_metric='F1',
            silent=False)
        #print(train_x.head())
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            verbose= 100, early_stopping_rounds= 200,cat_features=cat_idx,use_best_model=True)

        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F-score : %.6f' % (n_fold + 1, sklearn.metrics.f1_score(valid_y, (oof_preds[valid_idx]>0.3).astype(int))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full f1 score %.6f' % sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int)))
    
    # Write submission file and plot feature importance
    if not debug:
        _ = datetime.now().strftime('%Y%m%d%H%M%S')
        Fname = 'F:/AV/WNS/submission/'+str(modelname)+'_'+str(_)+'.csv'
        submission['is_promoted'] = sub_preds
        submission[['employee_id', 'is_promoted']].to_csv(Fname, index= False)
        oof = pd.DataFrame(oof_preds)
        score = sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int))
        oof.columns = [modelname+'_'+str(round(score,4))]
        OOF_Fname = 'F:/AV/WNS/oof/'+str(modelname)+'_'+str(_)+'.csv'
        oof.to_csv(OOF_Fname,index=False)
    #display_importances(feature_importance_df)
    return 

In [25]:
import gc
oof = kfold_lcatboost(X_train,X_test, Y,num_folds=10, stratified = True, debug= False,modelname="catboost_10fld")

Train shape: (54808, 12), test shape: (23490, 12)
0:	learn: 0.1105866	test: 0.1105866	best: 0.1105866 (0)	test1: 0.1204819	total: 93.2ms	remaining: 7m 45s
100:	learn: 0.2617477	test: 0.2617477	best: 0.2843237 (5)	test1: 0.2647059	total: 8.48s	remaining: 6m 51s
200:	learn: 0.4092795	test: 0.4104787	best: 0.4113769 (198)	test1: 0.4000000	total: 20.9s	remaining: 8m 18s
300:	learn: 0.4631924	test: 0.4643963	best: 0.4643963 (299)	test1: 0.4625407	total: 37.6s	remaining: 9m 46s
400:	learn: 0.4806341	test: 0.4799135	best: 0.4799135 (400)	test1: 0.4871795	total: 59s	remaining: 11m 16s
500:	learn: 0.4875336	test: 0.4879885	best: 0.4881720 (486)	test1: 0.4871795	total: 1m 25s	remaining: 12m 49s
600:	learn: 0.4952755	test: 0.4939329	best: 0.4939329 (595)	test1: 0.4936306	total: 1m 49s	remaining: 13m 21s
700:	learn: 0.5001775	test: 0.4981333	best: 0.4982219 (688)	test1: 0.5007924	total: 2m 7s	remaining: 13m
800:	learn: 0.5028349	test: 0.4998225	best: 0.5000887 (795)	test1: 0.5055292	total: 2m 35s	

2800:	learn: 0.5833193	test: 0.5334026	best: 0.5334026 (2763)	test1: 0.5359877	total: 27m 25s	remaining: 21m 31s
2900:	learn: 0.5855539	test: 0.5328720	best: 0.5336563 (2816)	test1: 0.5359877	total: 28m 38s	remaining: 20m 43s
3000:	learn: 0.5898079	test: 0.5337254	best: 0.5339789 (2992)	test1: 0.5351682	total: 29m 51s	remaining: 19m 53s
3100:	learn: 0.5919253	test: 0.5348315	best: 0.5348315 (3063)	test1: 0.5351682	total: 31m 3s	remaining: 19m 1s
3200:	learn: 0.5956739	test: 0.5364421	best: 0.5367876 (3195)	test1: 0.5351682	total: 32m 16s	remaining: 18m 8s
3300:	learn: 0.5982395	test: 0.5371330	best: 0.5375453 (3292)	test1: 0.5351682	total: 33m 28s	remaining: 17m 13s
3400:	learn: 0.6013592	test: 0.5372928	best: 0.5376381 (3345)	test1: 0.5351682	total: 34m 40s	remaining: 16m 18s
3500:	learn: 0.6036737	test: 0.5381429	best: 0.5381429 (3493)	test1: 0.5351682	total: 35m 53s	remaining: 15m 21s
3600:	learn: 0.6065709	test: 0.5389924	best: 0.5389924 (3580)	test1: 0.5351682	total: 37m 4s	remain

400:	learn: 0.4819407	test: 0.4812219	best: 0.4812219 (400)	test1: 0.4477612	total: 2m 16s	remaining: 26m 5s
500:	learn: 0.4929552	test: 0.4925985	best: 0.4925985 (498)	test1: 0.4630542	total: 2m 57s	remaining: 26m 30s
600:	learn: 0.4997336	test: 0.4982232	best: 0.4984900 (596)	test1: 0.4680851	total: 3m 44s	remaining: 27m 22s
700:	learn: 0.5046894	test: 0.5028329	best: 0.5029219 (687)	test1: 0.4723127	total: 4m 29s	remaining: 27m 32s
800:	learn: 0.5082112	test: 0.5051255	best: 0.5051255 (800)	test1: 0.4747967	total: 5m 13s	remaining: 27m 21s
900:	learn: 0.5110132	test: 0.5064453	best: 0.5064453 (862)	test1: 0.4747967	total: 6m 5s	remaining: 27m 41s
1000:	learn: 0.5154712	test: 0.5088152	best: 0.5089050 (994)	test1: 0.4747967	total: 6m 57s	remaining: 27m 46s
1100:	learn: 0.5195443	test: 0.5109977	best: 0.5115215 (1099)	test1: 0.4772727	total: 7m 57s	remaining: 28m 10s
1200:	learn: 0.5254888	test: 0.5143961	best: 0.5143961 (1197)	test1: 0.4789644	total: 9m 6s	remaining: 28m 50s
1300:	le

300:	learn: 0.4625478	test: 0.4611456	best: 0.4614262 (296)	test1: 0.5015974	total: 1m 43s	remaining: 26m 55s
400:	learn: 0.4817544	test: 0.4813815	best: 0.4816547 (397)	test1: 0.5228346	total: 2m 27s	remaining: 28m 14s
500:	learn: 0.4901680	test: 0.4897156	best: 0.4902557 (491)	test1: 0.5273011	total: 3m 10s	remaining: 28m 27s
600:	learn: 0.4948343	test: 0.4944742	best: 0.4944742 (588)	test1: 0.5273011	total: 3m 50s	remaining: 28m 3s
700:	learn: 0.4992898	test: 0.4984889	best: 0.4984889 (679)	test1: 0.5318818	total: 4m 29s	remaining: 27m 31s
800:	learn: 0.5030110	test: 0.5013305	best: 0.5013305 (772)	test1: 0.5318818	total: 5m 8s	remaining: 26m 55s
900:	learn: 0.5047754	test: 0.5028349	best: 0.5028349 (892)	test1: 0.5318818	total: 5m 45s	remaining: 26m 13s
1000:	learn: 0.5073257	test: 0.5050398	best: 0.5050398 (996)	test1: 0.5318818	total: 6m 34s	remaining: 26m 17s
1100:	learn: 0.5111855	test: 0.5063604	best: 0.5063604 (1100)	test1: 0.5341615	total: 7m 30s	remaining: 26m 36s
1200:	lea

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.538647343
bestIteration = 4051

Shrink model to first 4052 iterations.
Fold  7 F-score : 0.560669
0:	learn: 0.1979277	test: 0.1975465	best: 0.1975465 (0)	test1: 0.1481481	total: 81.5ms	remaining: 6m 47s
100:	learn: 0.2714988	test: 0.2718526	best: 0.2781970 (93)	test1: 0.2239089	total: 8.65s	remaining: 6m 59s
200:	learn: 0.4192581	test: 0.4183693	best: 0.4186657 (199)	test1: 0.3485915	total: 20.5s	remaining: 8m 8s
300:	learn: 0.4712706	test: 0.4704389	best: 0.4704389 (299)	test1: 0.4121622	total: 35.3s	remaining: 9m 11s
400:	learn: 0.4892627	test: 0.4878223	best: 0.4878223 (387)	test1: 0.4411277	total: 50.1s	remaining: 9m 34s
500:	learn: 0.4942057	test: 0.4922363	best: 0.4925054 (474)	test1: 0.4411277	total: 1m 4s	remaining: 9m 35s
600:	learn: 0.4998224	test: 0.4983117	best: 0.4983117 (598)	test1: 0.4462810	total: 1m 17s	remaining: 9m 23s
700:	learn: 0.5052222	test: 0.5034544	best: 0.5034544 (692)	test1: 0.4557377	tota

2100:	learn: 0.5666780	test: 0.5317639	best: 0.5317639 (2096)	test1: 0.4832000	total: 7m 27s	remaining: 10m 17s
2200:	learn: 0.5711870	test: 0.5329414	best: 0.5331950 (2197)	test1: 0.4832000	total: 7m 55s	remaining: 10m 4s
2300:	learn: 0.5758597	test: 0.5340477	best: 0.5340477 (2275)	test1: 0.4832000	total: 8m 22s	remaining: 9m 49s
2400:	learn: 0.5787349	test: 0.5348074	best: 0.5348998 (2370)	test1: 0.4832000	total: 8m 49s	remaining: 9m 33s
2500:	learn: 0.5835010	test: 0.5372671	best: 0.5372671 (2500)	test1: 0.4832000	total: 9m 16s	remaining: 9m 16s
2600:	learn: 0.5862069	test: 0.5385941	best: 0.5385941 (2596)	test1: 0.4832000	total: 9m 43s	remaining: 8m 58s
2700:	learn: 0.5883336	test: 0.5393491	best: 0.5396006 (2672)	test1: 0.4856230	total: 10m 10s	remaining: 8m 39s
2800:	learn: 0.5921974	test: 0.5398520	best: 0.5401033 (2769)	test1: 0.4856230	total: 10m 38s	remaining: 8m 20s
2900:	learn: 0.5960419	test: 0.5406707	best: 0.5410146 (2877)	test1: 0.4848485	total: 11m 4s	remaining: 8m
30