In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union

In [None]:
# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_style(style='whitegrid')

## Data Loading 

In [None]:
def load_training_dataset(path='data/train_LZdllcl.csv'):
    """Read the training CSV into a DataFrame indexed by ``employee_id``.

    Parameters
    ----------
    path : str or path-like, default 'data/train_LZdllcl.csv'
        Location of the CSV file. The default is the location that used to be
        hard-coded, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``employee_id`` column used as the index.
    """
    return pd.read_csv(path, index_col='employee_id')
# Load the training data and replace the two column names that contain
# spaces / punctuation with identifier-friendly ones.
train_column_names = {'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'}
df = load_training_dataset().rename(columns=train_column_names)
print('Train Data Size :', df.shape)
df.head()

In [None]:
# Load the test data with the same index column and the same column renames
# that are applied to the training frame.
test_column_names = {'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'}
test = (
    pd.read_csv('data/test_2umaH9m.csv', index_col='employee_id')
    .rename(columns=test_column_names)
)
print('Test Data Size :', test.shape)
test.head()

In [None]:
# Number of missing values in each column of the test set.
test.isna().sum()

## Data Pre-processing 

In [None]:
# Number of missing values in each column of the training set.
df.isna().sum()

In [None]:
# Show the dtypes as loaded. DataFrame.info() prints its report itself and
# returns None, so it is called directly; wrapping it in print() only added a
# stray "None" line to the output.
df.info()

# Down-cast the integer-valued columns to int8 to save memory.
# NOTE(review): assumes every value in these columns fits in int8 (-128..127) — confirm.
for col in ['is_promoted', 'avg_training_score', 'has_won_awards', 'is_kpi_met',
            'length_of_service', 'age', 'no_of_trainings']:
    df[col] = df[col].astype(np.int8)

# Kept as a float dtype; this column is imputed with fillna further down,
# so it presumably contains missing values.
df['previous_year_rating'] = df['previous_year_rating'].astype(np.float16)

# Text columns become pandas categoricals.
for col in ['department', 'region', 'education', 'gender', 'recruitment_channel']:
    df[col] = df[col].astype('category')

print('\nAfter processing:\n',)
df.info()

In [None]:
# Show the dtypes as loaded. DataFrame.info() prints its report itself and
# returns None, so it is called directly; wrapping it in print() only added a
# stray "None" line to the output.
test.info()

# Same down-casts as for the training frame, minus 'is_promoted', which is
# not cast for the test frame.
# NOTE(review): assumes every value in these columns fits in int8 (-128..127) — confirm.
for col in ['avg_training_score', 'has_won_awards', 'is_kpi_met',
            'length_of_service', 'age', 'no_of_trainings']:
    test[col] = test[col].astype(np.int8)

# Kept as a float dtype; this column is imputed with fillna by the
# preprocessing pipeline, so it presumably contains missing values.
test['previous_year_rating'] = test['previous_year_rating'].astype(np.float16)

# Text columns become pandas categoricals.
for col in ['department', 'region', 'education', 'gender', 'recruitment_channel']:
    test[col] = test[col].astype('category')

print('\nAfter processing:\n',)
test.info()

In [None]:
# Build a class-balanced sample: every promoted employee plus an equally
# sized random draw of non-promoted employees.
df_promoted = df[df.is_promoted==1]
# A fixed seed makes the draw, and therefore the correlation table below,
# identical on every Restart & Run All.
df_others = df[df.is_promoted==0].sample(df_promoted.shape[0], random_state=42)
df_sample = pd.concat([df_promoted, df_others])
print('Promoted : ', df_promoted.shape[0], ', Others :', df_others.shape[0], ', Combined Sample :', df_sample.shape[0])
# Correlate numeric columns only: pandas >= 2.0 raises when corr() meets the
# string-valued categorical columns, while older versions silently dropped them.
df_sample.select_dtypes(include=[np.number]).corr()

In [None]:
# Age distribution of promoted employees: box plot plus summary statistics.
promoted_age = df_promoted['age']
sns.boxplot(data=df_promoted, x='age')
promoted_age.describe()

In [None]:
# Flag employees whose age lies in the closed interval [29, 38].
# NOTE(review): the bounds look like they were read off the promoted-age
# summary in the previous cell — confirm.
# Series.between is inclusive on both ends by default, matching the original
# `x>=29 and x<=38` test, and is vectorised instead of a per-row lambda.
df['is_middle_age'] = df['age'].between(29, 38).astype(np.int8)
# Correlate numeric columns only: pandas >= 2.0 raises when corr() meets the
# string-valued categorical columns.
df.select_dtypes(include=[np.number]).corr()

In [None]:
# Missing education levels are imputed as "Bachelor's".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df.education`: the in-place call operates on a
# temporary Series, which pandas warns about and which leaves `df` untouched
# once Copy-on-Write is enabled.
df['education'] = df['education'].fillna("Bachelor's")
print(df.education.unique())

def numeric_edu(str):
    """Map an education label to an ordinal code.

    Labels starting with 'Master' map to 2, labels starting with 'Bachelor'
    map to 1, and every other label maps to 0.

    Note: the parameter name shadows the built-in ``str``; it is kept so the
    function's signature stays unchanged.
    """
    # Checked in order; the first matching prefix wins.
    for prefix, code in (('Master', 2), ('Bachelor', 1)):
        if str.startswith(prefix):
            return code
    return 0

# Ordinal-encode the education level (see numeric_edu) and store it as int8.
df['i_education'] = df.education.apply(numeric_edu).astype(np.int8)
print(df['i_education'].unique())
# Correlate numeric columns only: pandas >= 2.0 raises when corr() meets the
# string-valued categorical columns.
df.select_dtypes(include=[np.number]).corr()

## Rank Features By Importance

In [None]:
# List the numeric columns whose standard deviation is below 0.01, i.e.
# columns that are (almost) constant. They are only reported here; nothing
# is dropped from df.
numeric_frame = df.select_dtypes(include=[np.number])
nullList = [col for col in numeric_frame if df[col].std() < 0.01]
nullList

In [None]:
def numerify(ser):
    """Label-encode ``ser`` and return the encoded values.

    A fresh ``LabelEncoder`` is fitted on ``ser`` on every call; the fitted
    encoder itself is discarded, so the mapping cannot be reused afterwards.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(ser)

# Work on a copy so the label-encoded columns do not overwrite df.
df2 = df.copy()

# Missing previous-year ratings are replaced with 3.
df2['previous_year_rating'] = df2['previous_year_rating'].fillna(3)

# Replace every categorical column with its label-encoded version.
for cat_col in ('department', 'region', 'education', 'gender', 'recruitment_channel'):
    df2[cat_col] = numerify(df2[cat_col])
df2.head().T

In [None]:
## Split train data-set
from sklearn.model_selection import train_test_split
# Every column except the target is a feature. Note that Index.difference
# returns the names in sorted order, not in df2's original column order.
feature_cols = df2.columns.difference(['is_promoted'])
x_train, x_test, y_train, y_test = train_test_split(df2[feature_cols],
                                                    df2['is_promoted'],
                                                    train_size = 0.75,
                                                    random_state = 42)

## Find Feature Importances
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(x_train, y_train)

# Label each importance with the column the forest was actually fitted on.
# Using df2.columns here paired the scores with the wrong names: it still
# contains 'is_promoted' and is in the original (unsorted) column order.
names = x_train.columns
score_map = sorted(
    zip((round(score, 4) for score in rf.feature_importances_), names),
    reverse=True,
)
print("Features sorted by their score:")
for a,b in score_map:
    print(a,b)

In [None]:
# Preview the first five rows of the label-encoded frame.
df2.head(5)

## Data Modeling

In [None]:
class RatingsImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``previous_year_rating`` values with a constant.

    Parameters
    ----------
    fill_value : scalar, default 3
        Value written into missing ``previous_year_rating`` entries. The
        default reproduces the previously hard-coded behaviour; exposing it
        as a constructor parameter lets it be changed via ``set_params``.
    """

    def __init__(self, fill_value=3):
        self.fill_value = fill_value

    def fit (self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with missing ratings filled.

        ``X`` must be a pandas DataFrame; only the ``previous_year_rating``
        column is filled and ``X`` itself is not modified.
        """
        assert isinstance(X, pd.DataFrame)
        return X.fillna({'previous_year_rating': self.fill_value})

class EducationCleanser(BaseEstimator, TransformerMixin):
    """Fill missing ``education`` values with a constant label.

    Parameters
    ----------
    fill_value : str, default "Bachelor's"
        Label written into missing ``education`` entries. The default
        reproduces the previously hard-coded behaviour; exposing it as a
        constructor parameter lets it be changed via ``set_params``.
    """

    def __init__(self, fill_value="Bachelor's"):
        self.fill_value = fill_value

    def fit (self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with missing education labels filled.

        ``X`` must be a pandas DataFrame; only the ``education`` column is
        filled and ``X`` itself is not modified.
        """
        assert isinstance(X, pd.DataFrame)
        return X.fillna({'education': self.fill_value})
    
# fillna_pipe.fit_transform(df).isnull().sum() # For Testing purpose only

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of column labels or None, default None
        Columns to keep, in the order given. ``None`` keeps every column.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit (self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns``.

        Raises
        ------
        KeyError
            If any requested column is absent from ``X``; the message lists
            the missing columns in the order they were requested.
        """
        assert isinstance(X, pd.DataFrame)
        # With the default columns=None the lookup below used to fail and the
        # error handler then crashed with a TypeError while building its
        # message; treat None as "select everything" instead.
        if self.columns is None:
            return X
        try:
            return X[self.columns]
        except KeyError:
            cols_error = [col for col in self.columns if col not in X.columns]
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

In [None]:
# Feature lists. select_columns is numeric-first, categorical-second, which is
# the same order in which make_union below concatenates the two branches, so
# position i in the transformed array corresponds to select_columns[i].
select_numeric_cols = ['avg_training_score', 'has_won_awards', 'is_kpi_met', 'previous_year_rating']
select_categorical_cols = ['department', 'region','education', 'gender', 'recruitment_channel']
select_columns = select_numeric_cols + select_categorical_cols

# Stage 1: impute missing ratings, then missing education labels.
fillna_pipe = make_pipeline(RatingsImputer(), EducationCleanser())
# Stage 2a: categorical columns -> ordinal integer codes.
select_categorical_cols_pipe = make_pipeline(ColumnSelector(columns=select_categorical_cols), OrdinalEncoder())
# Stage 2b: numeric columns -> robust scaling.
select_numeric_cols_pipe = make_pipeline(ColumnSelector(columns=select_numeric_cols),RobustScaler())

# Run both branches on the imputed frame and concatenate their outputs.
select_fu = make_union(select_numeric_cols_pipe, select_categorical_cols_pipe)
select_pipe = make_pipeline(fillna_pipe, select_fu)

# Fit the whole preprocessing pipeline on the training frame only; the test
# frame is transformed with this fitted pipeline in a later cell.
train_vec = select_pipe.fit_transform(df[select_columns])
print(train_vec.shape)
print(train_vec[:3])

In [None]:
# Split the transformed training matrix: 75% for fitting, the rest held out
# for evaluation. The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
split_options = {'train_size': 0.75, 'random_state': 42}
x_train, x_test, y_train, y_test = train_test_split(
    train_vec, df['is_promoted'], **split_options
)

In [None]:
## Find Feature Importances
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=100)
rf.fit(x_train, y_train)

# Pair each importance (rounded to 4 decimals) with its feature name and
# rank from most to least important.
names = select_columns
ranked = sorted(
    ((round(importance, 4), feature)
     for importance, feature in zip(rf.feature_importances_, names)),
    reverse=True,
)
print("Features sorted by their score:")
for importance, feature in ranked:
    print(importance, feature)

In [None]:
# Run the test frame through the preprocessing pipeline with transform()
# only, so nothing is refitted on test data.
test_selected = test[select_columns]
test_vec = select_pipe.transform(test_selected)
test_vec[:5]

## LightGBM for Classification

In [None]:
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, cross_validate

# Positive class (1) is weighted 5x relative to class 0.
# `verbose=1` replaces the former `silent=False`: the `silent` argument was
# deprecated and then removed from LightGBM's scikit-learn wrapper, where it
# is now reported as an unknown parameter. `verbose` is the supported way to
# keep LightGBM's info-level messages.
lgb_clf = LGBMClassifier(random_state=42, verbose=1, class_weight={0:1, 1:5}, learning_rate=.05, n_estimators=45)

# 3-fold cross-validated F1 on the training split.
scores = cross_val_score(lgb_clf, x_train, y_train, cv=3, scoring='f1')
scores

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit on the training split and predict the held-out split.
lgb_clf.fit(x_train, y_train)
preds = lgb_clf.predict(x_test)

# Fraction of held-out rows whose prediction equals the true label.
accuracy = (preds == y_test).mean()
print('Accuracy = ', accuracy)

class_labels = ['Not Promoted', 'Promoted']
creport = classification_report(y_test, preds, target_names=class_labels)
print(creport)

confusion_matrix(y_test, preds)

In [None]:
def conv2df(preds, employee_ids=None):
    """Build a two-column submission frame from predictions.

    Parameters
    ----------
    preds : array-like
        Predicted labels, one per employee; stored as int8.
    employee_ids : array-like, optional
        Employee ids aligned with ``preds``. When omitted, the index values
        of the notebook-level ``test`` frame are used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``employee_id`` and ``is_promoted`` (int8), in that order.
    """
    if employee_ids is None:
        # Backward-compatible default: ids come from the global test frame.
        employee_ids = test.index.values
    # Named `submission` so the notebook-level `df` is not shadowed.
    submission = pd.DataFrame(data={
        'employee_id': employee_ids,
        'is_promoted': preds
    })
    submission['is_promoted'] = submission['is_promoted'].astype(np.int8)
    return submission

In [None]:
from pathlib import Path

# Predict on the preprocessed test set and write the submission file.
# The output directory is created first so a fresh checkout (without
# data/output/) does not fail when to_csv opens the file.
submission_path = Path('data/output/final_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
conv2df(lgb_clf.predict(test_vec)).to_csv(submission_path, index=False)