In [1]:
## imports

import numpy as np
import pandas as pd

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)
pd.set_option('display.max_columns', 100)
from lightgbm import LGBMClassifier
from datetime import datetime

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
seed = 45
#seed =145
% matplotlib inline
plt.style.use('fivethirtyeight')

In [3]:
### read data
PATH='F:/AV/WNS'
train_csv = 'train_LZdllcl.csv'
test_csv = 'test_2umaH9m.csv'
submit_csv = 'sample_submission_M0L0uXE.csv'

### read train, test and submission files
train = pd.read_csv(f'{PATH}/{train_csv}')
test = pd.read_csv(f'{PATH}/{test_csv}')
submission = pd.read_csv(f'{PATH}/{submit_csv}')

print("Shape of {}:{} {}:{} {}:{}".format('train',train.shape,'test',test.shape,'submission',submission.shape))

Shape of train:(54808, 14) test:(23490, 13) submission:(23490, 2)


In [4]:
### inspect data

train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [5]:
### class balance

train['is_promoted'].value_counts()

## so approx 10% of past employees have been promoted

0    50140
1     4668
Name: is_promoted, dtype: int64

In [6]:
### lets check if there is any repeat in employees

len(train['employee_id'].unique()) == train.shape[0]

### so all IDs are unique

True

In [7]:
## null values

null_columns=train.columns[train.isnull().any()]
train[null_columns].isnull().sum()

## so 2 columns have null values

education               2409
previous_year_rating    4124
dtype: int64

In [8]:
### inspect null value columns
a = train[(train.education.isnull())]
_ = train[(train.education.isnull() | train.previous_year_rating.isnull())]

print(a.shape,_.shape)

(2409, 14) (6148, 14)


In [9]:
set(a.index).intersection(set(_.index)) == set(a.index)

### so everywhere where education is not present prev year rating is also not present, but vice-versa is not true

True

In [10]:
_.head(10)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
10,29934,Technology,region_23,,m,sourcing,1,30,,1,0,0,77,0
21,33332,Operations,region_15,,m,sourcing,1,41,4.0,11,0,0,57,0
23,71177,Procurement,region_5,Bachelor's,m,other,1,27,,1,0,0,70,0
29,74759,Sales & Marketing,region_4,Bachelor's,m,sourcing,1,26,,1,0,0,44,0
32,35465,Sales & Marketing,region_7,,f,sourcing,1,24,1.0,2,0,0,48,0
43,17423,Sales & Marketing,region_2,,m,other,3,24,2.0,2,0,0,48,0
56,45709,Sales & Marketing,region_31,Bachelor's,f,other,1,29,,1,0,0,49,0
58,26599,Sales & Marketing,region_16,Bachelor's,m,other,2,27,,1,1,0,47,0
62,9150,Analytics,region_22,Bachelor's,f,other,1,28,,1,1,0,80,0
66,77981,Finance,region_22,Bachelor's,m,other,1,27,,1,1,1,58,1


In [11]:
### check avg % of people promoted with NA in previous ye rating vs without NA

print(np.mean(_['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.07091737150292778 0.08675738086604706


In [12]:
### check avg % of people promoted with NA in education vs without NA

print(np.mean(a['is_promoted']),np.mean(train[~train.education.isnull()]['is_promoted']))

0.0506434205064342 0.08675738086604706


In [13]:
## For education we will use unknown for all missing values and 9999 for prev year training

train['education'] = train.education.fillna('unknown')
train['previous_year_rating'] = train.previous_year_rating.fillna(9999)

test['education'] = test.education.fillna('unknown')
test['previous_year_rating'] = test.previous_year_rating.fillna(9999)

In [14]:
### merge dataframes for ease of processing
Y = train['is_promoted'].values
train.drop('is_promoted',inplace=True,axis=1)
train['train'] = 'train'
test['train'] = 'test'
merged = pd.concat([train,test])
merged.shape

(78298, 14)

In [15]:
cat_cols = [i for i in merged.columns if merged[i].dtypes == 'object']+['KPIs_met >80%','awards_won?']
cat_cols

['department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'train',
 'KPIs_met >80%',
 'awards_won?']

In [16]:
cat_cols.remove('train')
print(cat_cols)

['department', 'region', 'education', 'gender', 'recruitment_channel', 'KPIs_met >80%', 'awards_won?']


In [17]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [18]:
new_tr, new_tst = merged[merged['train']=='train'],merged[merged['train']=='test']
new_tr['is_promoted'] = Y

# trn, sub = target_encode(new_tr["department"], 
#                          new_tst["department"], 
#                          target=new_tr.is_promoted, 
#                          min_samples_leaf=100,
#                          smoothing=10,
#                          noise_level=0.001)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
for i in ['department','region']:
    new_tr[i], new_tst[i] = target_encode(new_tr[i], 
                         new_tst[i], 
                         target=new_tr.is_promoted, 
                         min_samples_leaf=75,
                         smoothing=10,
                         noise_level=0.001)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [20]:
Y = new_tr['is_promoted'].values
new_tr.drop('is_promoted',axis=1,inplace=True)
merged = pd.concat([new_tr,new_tst])
merged.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(78298, 14)

In [21]:
cat_cols = [i for i in merged.columns if merged[i].dtypes == 'object']+['KPIs_met >80%','awards_won?']
cat_cols.remove('train')
cat_cols

['education', 'gender', 'recruitment_channel', 'KPIs_met >80%', 'awards_won?']

In [22]:
def pre_process(df,cat_cols):
    one_hot_encoded_training_predictors = pd.get_dummies(df[cat_cols])
    df.drop(cat_cols,inplace=True,axis=1)
    _ = pd.concat([df,one_hot_encoded_training_predictors],1)
    new_tr, new_tst = _[_['train']=='train'],_[_['train']=='test']
    new_tr.drop('train',inplace=True,axis=1)
    new_tst.drop('train',inplace=True,axis=1)
    return new_tr, new_tst

In [23]:
train_OHE,test_OHE = pre_process(merged,cat_cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [24]:
train_OHE.head()

Unnamed: 0,employee_id,department,region,no_of_trainings,age,previous_year_rating,length_of_service,avg_training_score,KPIs_met >80%,awards_won?,education_Bachelor's,education_Below Secondary,education_Master's & above,education_unknown,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,65438,0.072006,0.106547,1,35,5.0,8,49,1,0,0,0,1,0,1,0,0,0,1
1,65141,0.09002,0.114154,1,30,5.0,4,60,0,0,1,0,0,0,0,1,1,0,0
2,7513,0.072152,0.060548,1,34,3.0,7,50,0,0,1,0,0,0,0,1,0,0,1
3,2542,0.072122,0.116371,2,39,1.0,10,50,0,0,1,0,0,0,0,1,1,0,0
4,48945,0.10766,0.063315,1,45,3.0,2,73,0,0,1,0,0,0,0,1,1,0,0


In [25]:
def kfold_lightgbm(train_df,test_df, target,num_folds, stratified = False, debug= False,modelname="lightgbm_0"):
    # Divide in training/validation and test data
    train_df = train_df
    test_df = test_df
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=seed)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['employee_id','index','train']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], target)):
        train_x, train_y = train_df[feats].iloc[train_idx], target[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], target[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=5000,
            learning_rate=0.005,
#            num_leaves=34,
#            colsample_bytree=0.9,
#            subsample=0.8715623,
            max_depth=10,
            reg_alpha=.35,
            reg_lambda=1.75,
#            min_split_gain=0.0222415,
#            min_child_weight=50,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F-score : %.6f' % (n_fold + 1, sklearn.metrics.f1_score(valid_y, (oof_preds[valid_idx]>0.3).astype(int))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full f1 score %.6f' % sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int)))
    
    # Write submission file and plot feature importance
    if not debug:
        _ = datetime.now().strftime('%Y%m%d%H%M%S')
        Fname = 'F:/AV/WNS/submission/'+str(modelname)+'_'+str(_)+'.csv'
        submission['is_promoted'] = sub_preds
        submission[['employee_id', 'is_promoted']].to_csv(Fname, index= False)
        oof = pd.DataFrame(oof_preds)
        score = sklearn.metrics.f1_score(target, (oof_preds>0.3).astype(int))
        oof.columns = [modelname+'_'+str(round(score,4))]
        OOF_Fname = 'F:/AV/WNS/oof/'+str(modelname)+'_'+str(_)+'.csv'
        oof.to_csv(OOF_Fname,index=False)
    #display_importances(feature_importance_df)
    return 

In [26]:
import gc
oof = kfold_lightgbm(train_OHE,test_OHE, Y,num_folds=10, stratified = True, debug= True,modelname="lightgbm_10fld_targetenc")

Starting LightGBM. Train shape: (54808, 19), test shape: (23490, 19)
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.897872	valid_1's auc: 0.893472
[200]	training's auc: 0.905838	valid_1's auc: 0.900283
[300]	training's auc: 0.910182	valid_1's auc: 0.905675
[400]	training's auc: 0.91341	valid_1's auc: 0.907438
[500]	training's auc: 0.91568	valid_1's auc: 0.908457
[600]	training's auc: 0.917546	valid_1's auc: 0.909389
[700]	training's auc: 0.919598	valid_1's auc: 0.910017
[800]	training's auc: 0.921379	valid_1's auc: 0.910478
[900]	training's auc: 0.923504	valid_1's auc: 0.910717
[1000]	training's auc: 0.925142	valid_1's auc: 0.911111
[1100]	training's auc: 0.927141	valid_1's auc: 0.911277
[1200]	training's auc: 0.929027	valid_1's auc: 0.911237
[1300]	training's auc: 0.930731	valid_1's auc: 0.91122
Early stopping, best iteration is:
[1149]	training's auc: 0.928086	valid_1's auc: 0.911293
Fold  1 F-score : 0.531429
Training until validation scores d

[600]	training's auc: 0.917422	valid_1's auc: 0.903389
[700]	training's auc: 0.919349	valid_1's auc: 0.904946
[800]	training's auc: 0.921076	valid_1's auc: 0.90557
[900]	training's auc: 0.922841	valid_1's auc: 0.905869
[1000]	training's auc: 0.925115	valid_1's auc: 0.906023
[1100]	training's auc: 0.927109	valid_1's auc: 0.906123
[1200]	training's auc: 0.928872	valid_1's auc: 0.906182
[1300]	training's auc: 0.930755	valid_1's auc: 0.90624
[1400]	training's auc: 0.932421	valid_1's auc: 0.906515
[1500]	training's auc: 0.93405	valid_1's auc: 0.906957
[1600]	training's auc: 0.935671	valid_1's auc: 0.907219
[1700]	training's auc: 0.937218	valid_1's auc: 0.907373
[1800]	training's auc: 0.93865	valid_1's auc: 0.907475
[1900]	training's auc: 0.940068	valid_1's auc: 0.907388
[2000]	training's auc: 0.941447	valid_1's auc: 0.907601
[2100]	training's auc: 0.942756	valid_1's auc: 0.907861
[2200]	training's auc: 0.943942	valid_1's auc: 0.907811
[2300]	training's auc: 0.945146	valid_1's auc: 0.907662
