In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union

from sklearn.impute import SimpleImputer

In [None]:
# Use seaborn's white-grid theme for every plot in this notebook.
sns.set_style(style='whitegrid')

## Data Loading 

In [None]:
def load_training_dataset():
    """Read the training CSV into a DataFrame indexed by ``employee_id``.

    Returns:
        pandas.DataFrame: contents of ``data/train_LZdllcl.csv``.
    """
    training_frame = pd.read_csv('data/train_LZdllcl.csv', index_col='employee_id')
    return training_frame
# Load the training data and give the two awkwardly named columns
# identifier-friendly names before anything else touches them.
column_aliases = {'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'}
df = load_training_dataset().rename(columns=column_aliases)
print('Train Data Size :', df.shape)
df.head()

In [None]:
# Load the test data with the same index column and the same column renames
# as the training frame so both frames share one schema.
test = (
    pd.read_csv('data/test_2umaH9m.csv', index_col='employee_id')
      .rename(columns={'KPIs_met >80%': 'is_kpi_met', 'awards_won?': 'has_won_awards'})
)
print('Test Data Size :', test.shape)
test.head()

In [None]:
# Missing-value count per column of the test frame.
test.isna().sum()

## Data Pre-processing 

In [None]:
# Missing-value count per column of the training frame.
df.isna().sum()

In [None]:
# Show the schema as loaded. DataFrame.info() writes straight to stdout and
# returns None, so it is called bare; wrapping it in print() only adds a
# stray "None" line to the output.
df.info()

# Target dtype for every column, applied in one pass instead of one
# copy-pasted statement per column.
# NOTE(review): the int8 casts assume every value fits in -128..127 and that
# those columns contain no missing values - confirm against the data.
train_dtypes = {
    'is_promoted': np.int8,
    'avg_training_score': np.int8,
    'has_won_awards': np.int8,
    'is_kpi_met': np.int8,
    'length_of_service': np.int8,
    'age': np.int8,
    'no_of_trainings': np.int8,
    # Kept as a float because this column is imputed later in the notebook.
    'previous_year_rating': np.float16,
    'department': 'category',
    'region': 'category',
    'education': 'category',
    'gender': 'category',
    'recruitment_channel': 'category',
}
for column_name, target_dtype in train_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

print('\nAfter processing:\n',)
df.info()

In [None]:
# Show the schema as loaded. DataFrame.info() writes straight to stdout and
# returns None, so it is called bare; wrapping it in print() only adds a
# stray "None" line to the output.
test.info()

# Same casts as applied to the training frame, minus `is_promoted`
# (presumably absent from the test file because it is the value to predict).
# NOTE(review): the int8 casts assume every value fits in -128..127 and that
# those columns contain no missing values - confirm against the data.
test_dtypes = {
    'avg_training_score': np.int8,
    'has_won_awards': np.int8,
    'is_kpi_met': np.int8,
    'length_of_service': np.int8,
    'age': np.int8,
    'no_of_trainings': np.int8,
    # Kept as a float because this column is imputed later in the notebook.
    'previous_year_rating': np.float16,
    'department': 'category',
    'region': 'category',
    'education': 'category',
    'gender': 'category',
    'recruitment_channel': 'category',
}
for column_name, target_dtype in test_dtypes.items():
    test[column_name] = test[column_name].astype(target_dtype)

print('\nAfter processing:\n',)
test.info()

In [None]:
# Build a class-balanced sample: every promoted row plus an equally sized
# random draw of non-promoted rows.
df_promoted = df[df.is_promoted == 1]
# Seeded so the draw (and the correlation table below) is identical after
# Restart & Run All; 42 matches the seed used elsewhere in this notebook.
df_others = df[df.is_promoted == 0].sample(n=df_promoted.shape[0], random_state=42)
df_sample = pd.concat([df_promoted, df_others])
print('Promoted : ', df_promoted.shape[0], ', Others :', df_others.shape[0], ', Combined Sample :', df_sample.shape[0])
# Correlate numeric columns only: the frame also holds string-valued
# `category` columns, which DataFrame.corr() rejects on pandas >= 2.0
# (older versions silently dropped them).
df_sample.select_dtypes(include=[np.number]).corr()

In [None]:
# Age distribution of promoted employees: box plot plus summary statistics.
sns.boxplot(data=df_promoted, x='age')
df_promoted['age'].describe()

In [None]:
# Flag employees aged 29..38 inclusive (1 = inside the band, 0 = outside).
# Series.between is inclusive on both ends by default, so this matches the
# former row-wise `x >= 29 and x <= 38` check without a Python-level lambda.
df['is_middle_age'] = df['age'].between(29, 38).astype(np.int8)

# Correlate numeric columns only: the frame also holds string-valued
# `category` columns, which DataFrame.corr() rejects on pandas >= 2.0
# (older versions silently dropped them).
df.select_dtypes(include=[np.number]).corr()

In [None]:
# Impute missing education with "Bachelor's". The result is assigned back to
# the column explicitly: `df.education.fillna(..., inplace=True)` acts on the
# Series returned by attribute access, which pandas 2.x warns about and which
# does not write back to `df` under Copy-on-Write.
df['education'] = df['education'].fillna("Bachelor's")
print(df.education.unique())

def numeric_edu(str):
    """Map an education label to an ordinal code.

    Args:
        str: education label; must support ``startswith``. (The parameter
            name shadows the ``str`` builtin inside this function; it is kept
            so existing callers are unaffected.)

    Returns:
        int: 2 if the label starts with ``'Master'``, 1 if it starts with
        ``'Bachelor'``, otherwise 0.
    """
    # Prefixes are checked in the same order as the original if/elif chain.
    for prefix, code in (('Master', 2), ('Bachelor', 1)):
        if str.startswith(prefix):
            return code
    return 0

# Ordinal education code produced by numeric_edu, stored compactly as int8.
df['i_education'] = df['education'].apply(numeric_edu).astype(np.int8)
print(df['i_education'].unique())

# Correlate numeric columns only: the frame also holds string-valued
# `category` columns, which DataFrame.corr() rejects on pandas >= 2.0
# (older versions silently dropped them).
df.select_dtypes(include=[np.number]).corr()

## Rank Features By Importance

In [None]:
# List numeric attributes whose standard deviation is below 0.01.
# The columns are only reported here; nothing is dropped from `df`.
nullList = [
    column_name
    for column_name in df.select_dtypes(include=[np.number])
    if df[column_name].std() < 0.01
]
nullList

In [None]:
def numerify(ser):
    """Encode the values of ``ser`` as integer labels.

    A fresh ``LabelEncoder`` is fitted on every call, so the encoder itself is
    not kept and codes from separate calls are unrelated.

    Args:
        ser: the values to encode (called with DataFrame columns below).

    Returns:
        The array produced by ``LabelEncoder.fit_transform``.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(ser)

# Work on a copy so the label encoding below never touches `df`.
df2 = df.copy()

# Missing previous-year ratings are filled with the constant 3.
df2['previous_year_rating'] = df2['previous_year_rating'].fillna(3)

# Replace each categorical column with its integer label encoding.
for categorical_col in ('department', 'region', 'education', 'gender', 'recruitment_channel'):
    df2[categorical_col] = numerify(df2[categorical_col])

df2.head().T

In [None]:
## Split train data-set
from sklearn.model_selection import train_test_split

# Every column except the target. Index.difference returns the names sorted,
# so this is NOT the original column order of df2.
feature_cols = df2.columns.difference(['is_promoted'])
x_train, x_test, y_train, y_test = train_test_split(df2[feature_cols],
                                                    df2['is_promoted'],
                                                    train_size=0.75,
                                                    random_state=42)

## Find Feature Importances
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(x_train, y_train)

# Label importances with the columns the forest was actually trained on.
# `df2.columns` would be wrong twice over: it still contains `is_promoted`
# and it is in a different order than the sorted training columns, so every
# score would be printed next to the wrong feature name.
names = x_train.columns
assert len(names) == len(rf.feature_importances_)
scores = [round(importance, 4) for importance in rf.feature_importances_]
score_map = list(zip(scores, names))
print("Features sorted by their score:")
for a, b in sorted(score_map, reverse=True):
    print(a, b)

In [None]:
# Horizontal bar chart of the random-forest importances, smallest at the bottom.
# The labels must be the columns the forest was trained on, i.e. df2 without
# the target, in the sorted order produced by Index.difference. Using
# `df2.columns` directly would attach bars to the wrong feature names.
features = df2.columns.difference(['is_promoted'])
importances = rf.feature_importances_
assert len(features) == len(importances), "rf was not fitted on df2's feature columns"
indices = np.argsort(importances)

plt.figure(1)
plt.title('Feature Importances', fontsize=50)
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices], fontsize=50)
plt.xlabel('Relative Importance', fontsize=50)
# Very large canvas to match the 50pt fonts used above.
fig = plt.gcf()
fig.set_size_inches(60, 30, forward=True)

## Data Modeling

In [None]:
select_numeric_cols = ['avg_training_score', 'has_won_awards', 'is_kpi_met', 'previous_year_rating']
select_categorical_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']
select_columns = select_numeric_cols + select_categorical_cols

# Columns that need no imputation step. Built with ordered list filters, not
# set arithmetic: set iteration order for strings changes between kernel
# sessions, which would silently reorder the columns of the output matrix.
plain_numeric_cols = [c for c in select_numeric_cols if c != 'previous_year_rating']
plain_categorical_cols = [c for c in select_categorical_cols if c != 'education']

# Ref.: https://jorisvandenbossche.github.io/blog/2018/05/28/scikit-learn-columntransformer/
# Warning: Don't repeat columns to avoid column duplication in resultant vector
# Each entry is (transformer, columns) - the order make_column_transformer has
# expected since scikit-learn 0.20.1; the reversed (columns, transformer) form
# is no longer accepted by current releases.
# Output column order follows the transformer order below:
#   previous_year_rating, plain_numeric_cols, education, plain_categorical_cols
preprocess = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='constant', fill_value=3), RobustScaler()),
     ['previous_year_rating']),
    (RobustScaler(), plain_numeric_cols),
    (make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder()),
     ['education']),
    (OrdinalEncoder(), plain_categorical_cols),
)

train_vec = preprocess.fit_transform(df[select_columns])
print(train_vec.shape)
print(train_vec[:3])

In [None]:
# Hold out 25% of the transformed training matrix for evaluation (seeded split).
from sklearn.model_selection import train_test_split

target = df['is_promoted']
x_train, x_test, y_train, y_test = train_test_split(
    train_vec, target, train_size=0.75, random_state=42
)

In [None]:
## Find Feature Importances
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(x_train, y_train)

# The ColumnTransformer emits its columns grouped by transformer, which is not
# the order of `select_columns`. Read the real order back from the fitted
# transformer so each importance is printed next to the right feature.
# Every transformer used here yields exactly one output column per input column.
names = [
    column_name
    for step_name, _, step_columns in preprocess.transformers_
    if step_name != 'remainder'
    for column_name in step_columns
]
assert len(names) == len(rf.feature_importances_)
scores = [round(importance, 4) for importance in rf.feature_importances_]
score_map = list(zip(scores, names))
print("Features sorted by their score:")
for a, b in sorted(score_map, reverse=True):
    print(a, b)

In [None]:
# Run the test frame through the already-fitted transformer (transform only,
# no refit) and peek at the first five rows.
test_features = test[select_columns]
test_vec = preprocess.transform(test_features)
test_vec[:5]

## LightGBM for Classification

In [None]:
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, cross_validate

# Hyper-parameters for the LightGBM model; the positive class (1) is weighted
# five times the negative class (0).
# NOTE(review): `silent` appears to have been dropped from the scikit-learn
# wrapper in LightGBM 4.0 in favour of `verbose` - confirm against the
# installed version.
lgb_settings = dict(
    random_state=42,
    silent=False,
    class_weight={0: 1, 1: 5},
    learning_rate=.05,
    n_estimators=45,
)
lgb_clf = LGBMClassifier(**lgb_settings)

# 3-fold cross-validated F1 on the training split.
scores = cross_val_score(lgb_clf, x_train, y_train, scoring='f1', cv=3)
scores

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit on the training split, then score the held-out split.
lgb_clf.fit(x_train, y_train)
preds = lgb_clf.predict(x_test)

holdout_accuracy = np.mean(preds == y_test)
print('Accuracy = ', holdout_accuracy)

class_names = ['Not Promoted', 'Promoted']
creport = classification_report(y_test, preds, target_names=class_names)
print(creport)

confusion_matrix(y_test, preds)

In [None]:
def conv2df(preds):
    """Pair predictions with the test-set employee ids.

    Reads the notebook-level ``test`` frame for the ids, so ``preds`` must
    hold one value per row of ``test``.

    Args:
        preds: predicted ``is_promoted`` values.

    Returns:
        pandas.DataFrame: columns ``employee_id`` and ``is_promoted``, the
        latter cast to int8.
    """
    submission = pd.DataFrame(data={
        'employee_id': test.index.values,
        'is_promoted': preds,
    })
    return submission.astype({'is_promoted': np.int8})

In [None]:
import os

# Write the LightGBM predictions for the test set as the submission file.
# The output directory is created first; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
submission_path = 'data/output/final_submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
conv2df(lgb_clf.predict(test_vec)).to_csv(submission_path, index=False)

## Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import VotingClassifier

# Display labels for the candidate classifiers; the same constants key the
# hyper-parameter grids and the result tables further down.

# Single models
LOGISTIC_REGRESSION = 'LogisticRegression'
DECISION_TREE = 'DecisionTree'
KNN = 'KNN'
MLP = 'NeuralNetwork'

# Ensembles / boosting
RANDOM_FOREST = 'RandomForest'
GRADIENT_BOOSTING = 'GradientBoosting'
ADA_BOOST = 'AdaBoost'
LGBM = 'LGBM'

In [None]:
from sklearn.metrics import roc_auc_score
import warnings
# Silences ALL warnings for the rest of the kernel session, not only this cell.
warnings.filterwarnings("ignore")

# Candidate models with default hyper-parameters; `learners` and `labels` are
# parallel lists and must stay in the same order.
learners = [
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    MLPClassifier(random_state=42),
    LGBMClassifier(random_state=42)
]

labels = [
    LOGISTIC_REGRESSION,
    DECISION_TREE,
    KNN,
    RANDOM_FOREST,
    GRADIENT_BOOSTING,
    ADA_BOOST,
    MLP,
    LGBM
]

# Collect one record per learner and build the frame once at the end. Growing
# an empty DataFrame row by row with .loc is slow and leaves the score column
# as `object` dtype instead of float.
auc_records = []
for learner, label in zip(learners, labels):
    learner.fit(x_train, y_train)
    # Probability of the positive class on the held-out split.
    y_preds = learner.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, y_preds)
    auc_records.append({'Learner': label, 'scoreAUC': score})
    print('Classifier : <',label,'> Score AUC: ', score)

lFrame = pd.DataFrame(auc_records, columns=['Learner', 'scoreAUC'])
lFrame.sort_values('scoreAUC', ascending=False)

In [None]:
# Hyper-parameter grids for the grid search, keyed by classifier label.
# Each inner mapping is {estimator parameter name: candidate values}.
params = {
    ADA_BOOST: dict(
        n_estimators=range(10, 100, 10),
        learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
    ),

    GRADIENT_BOOSTING: dict(
        n_estimators=[25, 50, 100],
        learning_rate=[0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1],
        max_depth=[1, 2, 3, 5, 7, 9, 10],
    ),

    RANDOM_FOREST: dict(
        max_depth=[3, 5, 7, 9, 11],
        min_samples_leaf=[1, 3, 5],
        criterion=["gini", "entropy"],
        n_estimators=[25, 50, 100],
    ),

    MLP: dict(
        activation=['identity', 'logistic', 'tanh', 'relu'],
        solver=['lbfgs', 'sgd', 'adam'],
        learning_rate=['constant', 'invscaling', 'adaptive'],
        early_stopping=[True, False],
    ),

    KNN: dict(n_neighbors=[2, 3, 4, 5]),

    LOGISTIC_REGRESSION: dict(C=range(1, 10, 1)),

    DECISION_TREE: dict(
        criterion=["gini", "entropy"],
        min_samples_split=[2, 3, 5],
        max_depth=[2, 3, 5, 7, 9],
        min_samples_leaf=[1, 3, 5],
        max_leaf_nodes=[3, 5, 7, 9, 11],
    ),

    LGBM: dict(
        class_weight=[{0: 1, 1: 1}, {0: 1, 1: 5}],
        learning_rate=[0.025, 0.05, 0.1, 0.25, 0.5],
        n_estimators=[25, 45, 75, 100],
    ),
}

In [None]:
# Ref.: https://github.com/Hichambd/Kaggle-Santander-Customer-Satisfaction/blob/master/Santander%20Customer%20Satisfaction.ipynb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Estimators to tune. DECISION_TREE, RANDOM_FOREST and GRADIENT_BOOSTING have
# grids in `params` but are deliberately left out of this search.
classifiers = {
    LOGISTIC_REGRESSION : LogisticRegression(random_state=42),
    KNN : KNeighborsClassifier(),
    ADA_BOOST : AdaBoostClassifier(random_state=42),
    MLP : MLPClassifier(random_state=42),
    LGBM : LGBMClassifier(random_state=42)
}

clf_list = []
# Named `n_classifiers`, not `max`: a notebook-level `max` would shadow the
# builtin for every later cell in the kernel.
n_classifiers = len(classifiers)
# Iterate over a snapshot because `classifiers` is updated inside the loop.
for i, (name, clf) in enumerate(list(classifiers.items()), start=1):
    # Direct lookup so a missing grid fails with a clear KeyError on the label.
    param_grid = params[name]
    print("name={}, clf={}, params={}".format(name, clf, param_grid))
    # Select hyper-parameters by F1, the metric this cell reports. Without
    # `scoring` the search optimises the estimator's default score (accuracy),
    # so the "best" model was not the best by the reported metric.
    gscv = GridSearchCV(clf, param_grid=param_grid, scoring='f1')
    gscv.fit(x_train, y_train)
    # Keep the tuned estimator in place of the untuned one.
    classifiers[name] = gscv.best_estimator_
    y_preds = gscv.predict(x_test)
    f1score = f1_score(y_test, y_preds)
    print("{}'s classifier's Best f1score is ".format(name), f1score)
    clf_list.append({
        'Classifier': name,
        'F1Score': f1score,               # F1 on the held-out split
        'BestScore': gscv.best_score_,    # mean cross-validated score of the winner
        'BestParams': gscv.best_params_,
        'BestEstimator': gscv.best_estimator_
    })
    print("Completed {}/{}.".format(i, n_classifiers))

clfDataFrame = pd.DataFrame(clf_list)
clfDataFrame.sort_values('F1Score', ascending=False)

In [None]:
# Print each label next to the estimator currently stored under it.
for label in classifiers:
    print(label, classifiers[label])