## Importing Python Modules

In [40]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union

In [41]:
# Apply seaborn's 'whitegrid' style to the plots drawn in this notebook.
sns.set_style('whitegrid')
# sns.set(context="paper", font="monospace")

## Data Loading 

In [42]:
def load_training_dataset(path='data/train_LZdllcl.csv'):
    """Read the training CSV into a DataFrame indexed by ``employee_id``.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the training file used by this
        notebook, so existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``employee_id`` column used as the index.
    """
    return pd.read_csv(path, index_col='employee_id')
# Load the training data and replace the two column names that contain
# spaces / punctuation with identifier-friendly ones.
df = load_training_dataset().rename(
    columns={'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'}
)
print('Train Data Size :', df.shape)
df.head()

Train Data Size : (54808, 13)


Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,is_kpi_met,has_won_awards,avg_training_score,is_promoted
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [43]:
# Load the competition test set with the same index column and the same
# column renames as the training frame.
test = pd.read_csv('data/test_2umaH9m.csv', index_col='employee_id').rename(
    columns={'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'}
)
print('Test Data Size :', test.shape)
test.head()

Test Data Size : (23490, 12)


Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,is_kpi_met,has_won_awards,avg_training_score
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [44]:
# Count missing values per column in the test set.
test.isnull().sum()

department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
is_kpi_met                 0
has_won_awards             0
avg_training_score         0
dtype: int64

## Data Pre-processing 

In [45]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
department              54808 non-null object
region                  54808 non-null object
education               52399 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    50684 non-null float64
length_of_service       54808 non-null int64
is_kpi_met              54808 non-null int64
has_won_awards          54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: float64(1), int64(7), object(5)
memory usage: 5.9+ MB


In [46]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_trainings,54808.0,1.253011,0.609264,1.0,1.0,1.0,1.0,10.0
age,54808.0,34.803915,7.660169,20.0,29.0,33.0,39.0,60.0
previous_year_rating,50684.0,3.329256,1.259993,1.0,3.0,3.0,4.0,5.0
length_of_service,54808.0,5.865512,4.265094,1.0,3.0,5.0,7.0,37.0
is_kpi_met,54808.0,0.351974,0.47759,0.0,0.0,0.0,1.0,1.0
has_won_awards,54808.0,0.023172,0.15045,0.0,0.0,0.0,0.0,1.0
avg_training_score,54808.0,63.38675,13.371559,39.0,51.0,60.0,76.0,99.0
is_promoted,54808.0,0.08517,0.279137,0.0,0.0,0.0,0.0,1.0


In [47]:
# Count missing values per column in the training set.
df.isnull().sum()

department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
is_kpi_met                 0
has_won_awards             0
avg_training_score         0
is_promoted                0
dtype: int64

In [48]:
# Promoted employees whose education level is missing: how many are there,
# and what do their numeric features look like?
mask = df['education'].isnull() & df['is_promoted'].eq(1)
print('Is there anyone who is promoted whose education is null?? : ', len(df.loc[mask]), 'folks!')
df.loc[mask].describe().T

Is there anyone who is promoted whose education is null?? :  122 folks!


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_trainings,122.0,1.114754,0.533145,1.0,1.0,1.0,1.0,6.0
age,122.0,29.172131,6.729352,20.0,24.0,28.0,32.0,60.0
previous_year_rating,97.0,4.061856,1.028899,1.0,3.0,4.0,5.0,5.0
length_of_service,122.0,3.95082,3.177536,1.0,2.0,3.0,5.0,16.0
is_kpi_met,122.0,0.704918,0.45796,0.0,0.0,1.0,1.0,1.0
has_won_awards,122.0,0.163934,0.371743,0.0,0.0,0.0,0.0,1.0
avg_training_score,122.0,68.008197,17.208731,45.0,50.0,64.0,85.0,98.0
is_promoted,122.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [49]:
# Distinct education levels present in the training data (NaN included).
list(df.education.unique())

["Master's & above", "Bachelor's", nan, 'Below Secondary']

In [50]:
# Promoted employees whose previous-year rating is missing: how many are there?
mask = df['previous_year_rating'].isnull() & df['is_promoted'].eq(1)
print('Is there anyone who is promoted whose previous_year_rating is null?? : ', len(df.loc[mask]), 'folks!')

Is there anyone who is promoted whose previous_year_rating is null?? :  339 folks!


In [51]:
# Employees with more than one year of service but no previous-year rating.
mask = df['previous_year_rating'].isnull() & df['length_of_service'].gt(1)
print('Is there anyone whose length_of_service>1  with previous_year_rating as null?? : ', len(df.loc[mask]), 'folks!')
# Observation:
#   previous_year_rating is NaN only when length_of_service == 1.
# Feature-engineering idea: length_of_service * previous_year_rating, with NaN set to 0.
#   Caveat: only the latest rating is available, so this could mislead --
#   e.g. a poor latest rating combined with long experience.
# Next: check whether the same pattern holds for the test dataset.

Is there anyone whose length_of_service>1  with previous_year_rating as null?? :  0 folks!


In [52]:
# Same check on the test dataset: employees with more than one year of
# service but no previous-year rating.
mask = test['previous_year_rating'].isnull() & test['length_of_service'].gt(1)
print('Is there anyone whose length_of_service>1  with previous_year_rating as null in TEST DATASET?? : ', len(test.loc[mask]), 'folks!')
# None found, so setting the missing rating to zero is an option.

Is there anyone whose length_of_service>1  with previous_year_rating as null in TEST DATASET?? :  0 folks!


In [53]:
# Shrink the training frame's memory footprint by downcasting dtypes.
# df.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would also print "None").
df.info()

# int8 is wide enough for the integer columns: the largest value reported by
# describe() above is 99 (avg_training_score). Ratings run from 1.0 to 5.0,
# and the string columns become pandas categoricals.
df = df.astype({
    'is_promoted': np.int8,
    'avg_training_score': np.int8,
    'has_won_awards': np.int8,
    'is_kpi_met': np.int8,
    'length_of_service': np.int8,
    'age': np.int8,
    'no_of_trainings': np.int8,
    'previous_year_rating': np.float16,
    'department': 'category',
    'region': 'category',
    'education': 'category',
    'gender': 'category',
    'recruitment_channel': 'category',
})

print('\nAfter processing:\n')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
department              54808 non-null object
region                  54808 non-null object
education               52399 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    50684 non-null float64
length_of_service       54808 non-null int64
is_kpi_met              54808 non-null int64
has_won_awards          54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: float64(1), int64(7), object(5)
memory usage: 5.9+ MB
None

After processing:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
department              54808 non-null category
region                  54808 non-null category

## Data Modeling

In [63]:
class RatingsImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``previous_year_rating`` values with a constant.

    Parameters
    ----------
    fill_value : number, default 3
        Value written wherever ``previous_year_rating`` is missing. The
        default keeps the value this notebook has always used (3 is also the
        median rating reported by ``df.describe()`` for the training data).
    """

    def __init__(self, fill_value=3):
        # Stored under the same name as the argument so that
        # BaseEstimator.get_params / set_params can discover it.
        self.fill_value = fill_value

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ratings replaced.

        ``X`` must be a pandas DataFrame; it is not modified in place.
        """
        assert isinstance(X, pd.DataFrame)
        return X.fillna({'previous_year_rating': self.fill_value})

class EducationCleanser(BaseEstimator, TransformerMixin):
    """Fill missing ``education`` values with a constant label.

    Parameters
    ----------
    fill_value : str, default "Bachelor's"
        Label written wherever ``education`` is missing. The default keeps
        the label this notebook has always used.
        NOTE(review): when ``education`` has a categorical dtype, pandas
        only accepts a fill value that is already one of its categories.
    """

    def __init__(self, fill_value="Bachelor's"):
        # Stored under the same name as the argument so that
        # BaseEstimator.get_params / set_params can discover it.
        self.fill_value = fill_value

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing education levels replaced.

        ``X`` must be a pandas DataFrame; it is not modified in place.
        """
        assert isinstance(X, pd.DataFrame)
        return X.fillna({'education': self.fill_value})

# fillna_pipe.fit_transform(df).isnull().sum() # For Testing purpose only

In [64]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested DataFrame columns.

    Parameters
    ----------
    columns : list of str, str or None, default None
        Column label(s) to select. ``None`` means "no restriction": the
        input frame is passed through unchanged.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        KeyError
            If any requested column is absent from ``X``; the message lists
            the missing column names.
        """
        assert isinstance(X, pd.DataFrame)
        if self.columns is None:
            # The default used to fail with a confusing TypeError raised from
            # inside the error handler (set(None)); pass the frame through.
            return X
        try:
            return X[self.columns]
        except KeyError:
            # A lone string is a single column label, not an iterable of
            # one-character labels.
            if isinstance(self.columns, str):
                requested = [self.columns]
            else:
                requested = list(self.columns)
            cols_error = [col for col in requested if col not in X.columns]
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error) from None

# class ModifiedLabelEncoder(LabelEncoder):
#     def fit_transform(self, y, *args, **kwargs):
#         print(type(y))
#         print(y.head(5))
#         return super().fit_transform(y).reshape(-1, 1)

#     def transform(self, y, *args, **kwargs):
#         return super().transform(y).reshape(-1, 1)

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    The encoders are learned in ``fit`` and reused by ``transform``, so a
    frame transformed later (for example the test set) receives the same
    integer codes as the frame the encoder was fitted on.

    Parameters
    ----------
    columns : list of str or None, default None
        Names of the columns to encode. ``None`` encodes every column of the
        frame passed to ``fit``.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def _fit_encoders(self, X):
        """Return a ``{column name: fitted LabelEncoder}`` mapping for ``X``."""
        names = list(X.columns) if self.columns is None else self.columns
        return {name: LabelEncoder().fit(X[name]) for name in names}

    def fit(self,X,y=None):
        """Learn the label-to-integer mapping of each selected column."""
        self.encoders_ = self._fit_encoders(X)
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using the
        LabelEncoder fitted for each of them.
        If no columns were specified, transforms all columns seen during fit.

        Returns a copy; X itself is left untouched. A label that was not
        present during fit makes LabelEncoder.transform raise ValueError.
        '''
        encoders = getattr(self, 'encoders_', None)
        if encoders is None:
            # Backward compatibility: transform() used to work on an unfitted
            # instance by learning the mapping from X itself. Nothing is
            # stored, so a later fit() still decides the mapping.
            encoders = self._fit_encoders(X)
        output = X.copy()
        for name, encoder in encoders.items():
            output[name] = encoder.transform(output[name])
        return output

    def fit_transform(self,X,y=None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X,y).transform(X)

categorical_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel'] # list of categorical column names    
# Every remaining column except the target is treated as numeric.
# NOTE(review): 'iregion' is not among the columns shown in this notebook --
# presumably left over from an earlier experiment; confirm before removing it.
numeric_cols = [ col for col in df.columns.difference(categorical_cols) if col not in ['is_promoted', 'iregion']]

# Step 1: fill missing ratings and education levels.
fillna_pipe = make_pipeline(RatingsImputer(), EducationCleanser())
# categorical_cols_pipe = make_pipeline(ColumnSelector(columns=categorical_cols), ModifiedLabelEncoder(), OneHotEncoder(sparse=False))
# Step 2a: categorical columns become integer codes.
# NOTE(review): the codes are neither scaled nor one-hot encoded, so they enter
# the distance-based KNN model as ordinal magnitudes alongside the [0, 1]
# numeric features -- confirm that this is intended.
categorical_cols_pipe = make_pipeline(ColumnSelector(columns=categorical_cols), MultiColumnLabelEncoder())
# Step 2b: numeric columns are rescaled to the [0, 1] range.
numeric_cols_pipe = make_pipeline(ColumnSelector(columns=numeric_cols),MinMaxScaler())

# categorical_cols_pipe.fit_transform(df.head(5))
# numeric_cols_pipe.fit_transform(df.head(5))

In [65]:
# [ col for col in df.columns.difference(categorical_cols) if col not in ['is_promoted', 'iregion']]
# Place the numeric and categorical branches side by side, after imputing
# the missing values.
fu = make_union(numeric_cols_pipe, categorical_cols_pipe)
pipe = make_pipeline(fillna_pipe, fu)
# NOTE(review): the pipeline is fitted on the full training frame before the
# train/validation split below, so the scaler statistics include the
# validation rows.
train_vec = pipe.fit_transform(df)
print(train_vec.shape)
print(train_vec[:3])

(54808, 12)
[[ 0.375       0.16666667  0.          1.          0.19444444  0.
   1.          7.         31.          2.          0.          2.        ]
 [ 0.25        0.35        0.          0.          0.08333333  0.
   1.          4.         14.          0.          1.          0.        ]
 [ 0.35        0.18333333  0.          0.          0.16666667  0.
   0.5         7.         10.          0.          1.          2.        ]]


In [66]:
# Split train data-set: 75% for fitting, the rest held out for evaluation.
from sklearn.model_selection import train_test_split
# NOTE(review): the target is imbalanced (is_promoted has a mean of about
# 0.085 in describe() above) and this split is not stratified. Consider
# passing `stratify=`, but confirm first because it changes the reported scores.
x_train, x_test, y_train, y_test = train_test_split(train_vec, df.is_promoted, train_size = 0.75, random_state = 42)

In [67]:
# Run the competition test set through the already-fitted pipeline and
# preview the first five feature rows.
test_vec = pipe.transform(test)
test_vec[:5]

array([[ 0.1       ,  0.63333333,  0.        ,  1.        ,  0.        ,
         0.        ,  0.5       ,  8.        , 18.        ,  0.        ,
         1.        ,  2.        ],
       [ 0.275     ,  0.2       ,  0.        ,  0.        ,  0.11111111,
         0.        ,  0.5       ,  2.        , 28.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.275     ,  0.13333333,  0.        ,  0.        ,  0.08333333,
         0.        ,  0.        ,  7.        ,  4.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.275     ,  0.43333333,  0.        ,  0.        ,  0.22222222,
         0.22222222,  0.25      ,  5.        , 11.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.25      ,  0.36666667,  0.        ,  0.        ,  0.16666667,
         0.        ,  0.75      ,  1.        , 21.        ,  0.        ,
         1.        ,  2.        ]])

## KNN Classifier

In [68]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate

# 2-nearest-neighbour classifier with distance-weighted votes, using all CPU cores.
knn_clf = KNeighborsClassifier(2, weights='distance', n_jobs=-1)
# 3-fold cross-validated F1 on the training split.
scores = cross_val_score(knn_clf, x_train, y_train, cv=3, scoring='f1')
# NOTE(review): the scores above are F1, while the original note here said the
# public leaderboard scores against recall. Confirm which metric applies and
# align `scoring` with it.
scores

array([0.24964269, 0.24883504, 0.24503005])

In [69]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit on the training split and predict the held-out split.
knn_clf.fit(x_train, y_train)
preds = knn_clf.predict(x_test)

# Fraction of held-out rows predicted correctly.
print('Accuracy = ', np.mean(preds == y_test))

# Per-class precision, recall and F1.
creport = classification_report(y_test, preds, target_names=['Not Promoted', 'Promoted'])
print(creport)

# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, preds)

Accuracy =  0.8884834330754634
              precision    recall  f1-score   support

Not Promoted       0.93      0.95      0.94     12571
    Promoted       0.29      0.25      0.27      1131

 avg / total       0.88      0.89      0.88     13702



array([[11892,   679],
       [  849,   282]], dtype=int64)

In [70]:
def conv2df(preds, employee_ids=None):
    """Build a submission frame pairing employee ids with predictions.

    Parameters
    ----------
    preds : array-like
        Predicted ``is_promoted`` labels, one per employee.
    employee_ids : array-like, optional
        Ids matching ``preds`` row for row. Defaults to the index of the
        notebook-level ``test`` frame, which is what this function has
        always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``employee_id`` and ``is_promoted`` (stored as int8).
    """
    if employee_ids is None:
        # Fall back to the globally loaded test set, as before.
        employee_ids = test.index.values
    # The local is deliberately not called `df`, to avoid shadowing the
    # notebook's training frame.
    submission = pd.DataFrame(data={
        'employee_id': employee_ids,
        'is_promoted': preds
    })
    submission['is_promoted'] = submission['is_promoted'].astype(np.int8)
    return submission

In [71]:
# Predict on the competition test features and write the submission CSV.
# NOTE(review): presumably 'data/output/' must already exist, since to_csv is
# not expected to create missing directories -- confirm.
conv2df(knn_clf.predict(test_vec)).to_csv('data/output/KNNClassifier.csv', index=False)

## Possible areas of improvement
* Is there a better way to fill missing values for the 'education' column?