In [1]:
## imports

import numpy as np
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
pd.set_option('display.max_columns', 100)
from lightgbm import LGBMClassifier
from datetime import datetime
from catboost import CatBoostClassifier, Pool, cv

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
seed = 45
#seed =145
% matplotlib inline
plt.style.use('fivethirtyeight')

In [3]:
### read data with some categorty name changes to raw files
PATH='F:/AV/WNS'
train_csv = 'train_catboost.csv'
test_csv = 'test_catboost.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

### read train, test and submission files
train = pd.read_csv(f'{PATH}/{train_csv}')
test = pd.read_csv(f'{PATH}/{test_csv}')
submission = pd.read_csv(f'{PATH}/{submit_csv}')

print("Shape of {}:{} {}:{} {}:{}".format('train',train.shape,'test',test.shape,'submission',submission.shape))

Shape of train:(54808, 14) test:(23490, 13) submission:(23490, 2)


In [4]:
### inspect data

train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [5]:
### class balance

train['is_promoted'].value_counts()

## so approx 10% of past employees have been promoted

0    50140
1     4668
Name: is_promoted, dtype: int64

In [6]:
### lets check if there is any repeat in employees

len(train['employee_id'].unique()) == train.shape[0]

### so all IDs are unique

True

In [7]:
## null values

null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

## so 2 columns have null values

education               2409
previous_year_rating    4124
dtype: int64

In [8]:
### inspect null value columns
a = train[(train.education.isnull())]
_ = train[(train.education.isnull() | train.previous_year_rating.isnull())]

print(a.shape,_.shape)

(2409, 14) (6148, 14)


In [9]:
set(a.index).intersection(set(_.index)) == set(a.index)

### so everywhere where education is not present prev year rating is also not present, but vice-versa is not true

True

In [10]:
_.head(10)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
10,29934,Technology,region_23,,m,sourcing,1,30,,1,0,0,77,0
21,33332,Operations,region_15,,m,sourcing,1,41,4.0,11,0,0,57,0
23,71177,Procurement,region_5,Bachelor's,m,other,1,27,,1,0,0,70,0
29,74759,Sales_Marketing,region_4,Bachelor's,m,sourcing,1,26,,1,0,0,44,0
32,35465,Sales_Marketing,region_7,,f,sourcing,1,24,1.0,2,0,0,48,0
43,17423,Sales_Marketing,region_2,,m,other,3,24,2.0,2,0,0,48,0
56,45709,Sales_Marketing,region_31,Bachelor's,f,other,1,29,,1,0,0,49,0
58,26599,Sales_Marketing,region_16,Bachelor's,m,other,2,27,,1,1,0,47,0
62,9150,Analytics,region_22,Bachelor's,f,other,1,28,,1,1,0,80,0
66,77981,Finance,region_22,Bachelor's,m,other,1,27,,1,1,1,58,1


In [11]:
### check avg % of people promoted with NA in previous ye rating vs without NA

print(np.mean(_['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.07091737150292778 0.08675738086604706


In [12]:
### check avg % of people promoted with NA in education vs without NA

print(np.mean(a['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.0506434205064342 0.08675738086604706


In [13]:
## For education we will use unknown for all missing values and 9999 for prev year training

train['education'] = train.education.fillna('unknown')
train['previous_year_rating'] = train.previous_year_rating.fillna(9999)

test['education'] = test.education.fillna('unknown')
test['previous_year_rating'] = test.previous_year_rating.fillna(9999)

In [14]:
### merge dataframes for ease of processing
Y = train['is_promoted'].values
train.drop('is_promoted',inplace=True,axis=1)
train['train'] = 'train'
test['train'] = 'test'
merged = pd.concat([train,test])
merged.shape

(78298, 14)

In [15]:
cat_cols = [i for i in merged.columns if merged[i].dtypes == 'object']+['KPIs_met >80%','awards_won?','previous_year_rating']
cat_cols

['department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'train',
 'KPIs_met >80%',
 'awards_won?',
 'previous_year_rating']

In [16]:
cat_cols.remove('train')
print(cat_cols)
#cat_idx = [merged.columns.get_loc(c) for c in merged.columns if c in cat_cols]

['department', 'region', 'education', 'gender', 'recruitment_channel', 'KPIs_met >80%', 'awards_won?', 'previous_year_rating']


In [17]:
merged.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,train
0,65438,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49,train
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,train
2,7513,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,train
3,2542,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,train
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,train


In [18]:
tr_cols = [i for i in merged.columns]
tr_cols.remove('employee_id')
tr  = merged[merged['train']=='train']
te = merged[merged['train']=='test']
tr.drop('train',axis=1,inplace=True)
te.drop('train',axis=1,inplace=True)

print(tr_cols)

### ###
tr_cols.remove('train')
X_train = tr[tr_cols]
Y_train = Y
X_test = te[tr_cols]

#cat_idx = [X_train.columns.get_loc(c) for c in X_train.columns if c in cat_cols]
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(X_train, Y_train, cat_features=cat_idx)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


['department', 'region', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'train']


In [19]:
X_train.iloc[:,cat_idx].head()

Unnamed: 0,department,region,education,gender,recruitment_channel,KPIs_met >80%,awards_won?,previous_year_rating
0,Sales_Marketing,region_7,Masters_above,f,sourcing,1,0,5.0
1,Operations,region_22,Bachelor's,m,other,0,0,5.0
2,Sales_Marketing,region_19,Bachelor's,m,sourcing,0,0,3.0
3,Sales_Marketing,region_23,Bachelor's,m,other,0,0,1.0
4,Technology,region_26,Bachelor's,m,other,0,0,3.0


In [20]:
X_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales_Marketing,region_7,Masters_above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60
2,Sales_Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50
3,Sales_Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73


In [21]:
cat_idx

[0, 1, 2, 3, 4, 9, 10, 7]

In [22]:
# def pre_process(df,cat_cols):
#     one_hot_encoded_training_predictors = pd.get_dummies(df[cat_cols])
#     df.drop(cat_cols,inplace=True,axis=1)
#     _ = pd.concat([df,one_hot_encoded_training_predictors],1)
#     new_tr, new_tst = _[_['train']=='train'],_[_['train']=='test']
#     new_tr.drop('train',inplace=True,axis=1)
#     new_tst.drop('train',inplace=True,axis=1)
#     return new_tr, new_tst

In [23]:
# train_OHE,test_OHE = pre_process(merged,cat_cols)

In [24]:
def kfold_catboost(train_df,test_df, target,num_folds, stratified = False, debug= False,modelname="catboost"):
    # Divide in training/validation and test data
    train_df = train_df
    test_df = test_df
    print("Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['employee_id','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], target)):
        train_x, train_y = train_df[feats].iloc[train_idx], target[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], target[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = CatBoostClassifier(
            #nthread=4,
            #class_weights = [1,2],
            bagging_temperature = 1.25,
            iterations=5000,
            learning_rate=0.01,
            l2_leaf_reg = 1,
            depth=10,
            loss_function='Logloss',
            eval_metric='F1',
            silent=False)
        #print(train_x.head())
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            verbose= 100, early_stopping_rounds= 200,cat_features=cat_idx,use_best_model=True)

        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F-score : %.6f' % (n_fold + 1, sklearn.metrics.f1_score(valid_y, (oof_preds[valid_idx]>0.3).astype(int))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full f1 score %.6f' % sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int)))
    
    # Write submission file and plot feature importance
    if not debug:
        _ = datetime.now().strftime('%Y%m%d%H%M%S')
        Fname = 'F:/AV/WNS/submission/'+str(modelname)+'_'+str(_)+'.csv'
        submission['is_promoted'] = sub_preds
        submission[['employee_id', 'is_promoted']].to_csv(Fname, index= False)
        oof = pd.DataFrame(oof_preds)
        score = sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int))
        oof.columns = [modelname+'_'+str(round(score,4))]
        OOF_Fname = 'F:/AV/WNS/oof/'+str(modelname)+'_'+str(_)+'.csv'
        oof.to_csv(OOF_Fname,index=False)
    #display_importances(feature_importance_df)
    return 

In [25]:
import gc
oof = kfold_catboost(X_train,X_test, Y,num_folds=5, stratified = True, debug= False,modelname="catboost_5fld_bagtemp")

Train shape: (54808, 12), test shape: (23490, 12)
0:	learn: 0.2019967	test: 0.2019967	best: 0.2019967 (0)	test1: 0.2454874	total: 124ms	remaining: 10m 21s
100:	learn: 0.2866941	test: 0.2870857	best: 0.2875429 (98)	test1: 0.3024302	total: 9.64s	remaining: 7m 47s
200:	learn: 0.4019629	test: 0.4026445	best: 0.4026445 (200)	test1: 0.4091293	total: 22.4s	remaining: 8m 55s
300:	learn: 0.4546205	test: 0.4571664	best: 0.4571664 (299)	test1: 0.4566285	total: 41s	remaining: 10m 40s
400:	learn: 0.4802271	test: 0.4814590	best: 0.4814590 (400)	test1: 0.4883907	total: 1m 2s	remaining: 11m 51s
500:	learn: 0.4927595	test: 0.4923572	best: 0.4923572 (500)	test1: 0.5031646	total: 1m 21s	remaining: 12m 14s
600:	learn: 0.4965905	test: 0.4947791	best: 0.4950813 (586)	test1: 0.5102362	total: 1m 40s	remaining: 12m 18s
700:	learn: 0.4995997	test: 0.4973948	best: 0.4973948 (691)	test1: 0.5141066	total: 2m 1s	remaining: 12m 25s
800:	learn: 0.5045872	test: 0.5006999	best: 0.5006999 (796)	test1: 0.5183450	total: 2

2900:	learn: 0.6128793	test: 0.5410946	best: 0.5411992 (2892)	test1: 0.4916468	total: 11m 45s	remaining: 8m 30s
3000:	learn: 0.6192407	test: 0.5420452	best: 0.5426087 (2962)	test1: 0.4912560	total: 12m 12s	remaining: 8m 7s
3100:	learn: 0.6221486	test: 0.5432432	best: 0.5436293 (3044)	test1: 0.4924543	total: 12m 39s	remaining: 7m 44s
3200:	learn: 0.6256881	test: 0.5438393	best: 0.5441914 (3128)	test1: 0.4912560	total: 13m 6s	remaining: 7m 21s
3300:	learn: 0.6292094	test: 0.5444015	best: 0.5444015 (3232)	test1: 0.4912560	total: 13m 33s	remaining: 6m 58s
3400:	learn: 0.6322133	test: 0.5454195	best: 0.5457000 (3384)	test1: 0.4908658	total: 13m 59s	remaining: 6m 34s
3500:	learn: 0.6349553	test: 0.5465408	best: 0.5465408 (3492)	test1: 0.4920635	total: 14m 27s	remaining: 6m 11s
3600:	learn: 0.6379342	test: 0.5471371	best: 0.5471371 (3590)	test1: 0.4920635	total: 14m 54s	remaining: 5m 47s
3700:	learn: 0.6417558	test: 0.5481510	best: 0.5481510 (3694)	test1: 0.4916733	total: 15m 21s	remaining: 5

1500:	learn: 0.5497777	test: 0.5277778	best: 0.5279625 (1490)	test1: 0.4811548	total: 5m 27s	remaining: 12m 43s
1600:	learn: 0.5533475	test: 0.5292163	best: 0.5295037 (1563)	test1: 0.4827586	total: 5m 55s	remaining: 12m 34s
1700:	learn: 0.5597388	test: 0.5321101	best: 0.5323966 (1696)	test1: 0.4827586	total: 6m 23s	remaining: 12m 23s
1800:	learn: 0.5648095	test: 0.5333074	best: 0.5335152 (1774)	test1: 0.4835869	total: 6m 51s	remaining: 12m 10s
1900:	learn: 0.5696299	test: 0.5346304	best: 0.5347344 (1899)	test1: 0.4848000	total: 7m 19s	remaining: 11m 55s
2000:	learn: 0.5735490	test: 0.5356934	best: 0.5358740 (1985)	test1: 0.4860112	total: 7m 47s	remaining: 11m 40s
2100:	learn: 0.5780983	test: 0.5364715	best: 0.5364715 (2090)	test1: 0.4864000	total: 8m 14s	remaining: 11m 22s
2200:	learn: 0.5810606	test: 0.5371451	best: 0.5374295 (2198)	test1: 0.4860112	total: 8m 43s	remaining: 11m 5s
2300:	learn: 0.5842782	test: 0.5386708	best: 0.5386708 (2292)	test1: 0.4860112	total: 9m 11s	remaining: 1

3700:	learn: 0.6313288	test: 0.5444702	best: 0.5445755 (3609)	test1: 0.5117739	total: 15m 52s	remaining: 5m 34s
3800:	learn: 0.6335766	test: 0.5451382	best: 0.5451382 (3756)	test1: 0.5117739	total: 16m 21s	remaining: 5m 9s
3900:	learn: 0.6346890	test: 0.5453140	best: 0.5454194 (3841)	test1: 0.5117739	total: 16m 50s	remaining: 4m 44s
4000:	learn: 0.6364299	test: 0.5453140	best: 0.5457005 (3911)	test1: 0.5117739	total: 17m 18s	remaining: 4m 19s
4100:	learn: 0.6387929	test: 0.5466126	best: 0.5466126 (4076)	test1: 0.5117739	total: 17m 48s	remaining: 3m 54s
4200:	learn: 0.6406392	test: 0.5473847	best: 0.5474903 (4174)	test1: 0.5117739	total: 18m 17s	remaining: 3m 28s
4300:	learn: 0.6421186	test: 0.5474903	best: 0.5474903 (4174)	test1: 0.5121760	total: 18m 45s	remaining: 3m 2s
4400:	learn: 0.6433490	test: 0.5476650	best: 0.5476650 (4334)	test1: 0.5121760	total: 19m 15s	remaining: 2m 37s
4500:	learn: 0.6453131	test: 0.5476650	best: 0.5476650 (4334)	test1: 0.5121760	total: 19m 43s	remaining: 2