In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import itertools
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.utils import resample

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import optimizers

import imblearn
from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
def plot_loss_accuracy(history):
    """Plot every metric recorded during training, titled with the final loss and accuracy.

    Parameters
    ----------
    history : object returned by ``model.fit``
        Must expose ``history`` (dict of per-epoch metric lists containing
        ``'loss'`` and either ``'acc'`` or ``'accuracy'``) and ``epoch``.
    """
    historydf = pd.DataFrame(history.history, index=history.epoch)
    # DataFrame.plot opens its own figure when no axes are passed, so the size
    # is given here; a preceding plt.figure() only left an empty canvas behind.
    ax = historydf.plot(figsize=(8, 6), ylim=(0, max(1, historydf.values.max())))
    loss = history.history['loss'][-1]
    # The accuracy metric is logged as 'acc' by older Keras releases and as
    # 'accuracy' by newer ones; accept whichever is present.
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'
    acc = history.history[acc_key][-1]
    ax.set_title('Loss: %.3f, Accuracy: %.3f' % (loss, acc))

In [None]:
def plot_bin_loss_accuracy(history):
    """Plot the training curves of a model compiled with the ``binary_accuracy`` metric.

    Parameters
    ----------
    history : object returned by ``model.fit``
        Must expose ``history`` (dict of per-epoch metric lists containing
        ``'loss'`` and ``'binary_accuracy'``) and ``epoch``.
    """
    historydf = pd.DataFrame(history.history, index=history.epoch)
    # DataFrame.plot opens its own figure when no axes are passed, so the size
    # is given here; a preceding plt.figure() only left an empty canvas behind.
    ax = historydf.plot(figsize=(8, 6), ylim=(0, max(1, historydf.values.max())))
    loss = history.history['loss'][-1]
    acc = history.history['binary_accuracy'][-1]
    ax.set_title('Loss: %.3f, Accuracy: %.3f' % (loss, acc))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts, indexed [true label, predicted label].
    classes   -- tick labels, one per class, in matrix order.
    normalize -- when True each row is divided by its own sum before
                 printing and plotting.
    title     -- title placed above the plot.
    cmap      -- matplotlib colour map used for the cells.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_colour = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Folder holding the competition CSVs. Set the HR_ANALYTICS_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's local path.
train_dir = os.environ.get(
    'HR_ANALYTICS_DIR',
    'C:/Users/ak19919/Downloads/ml_root/analytics vidya/hr_analytics')
data = pd.read_csv('{}/train.csv'.format(train_dir))
data.head(10)

In [None]:
# Use employee_id as the row index so it is not treated as a feature later.
# NOTE: not idempotent - re-running this cell fails once the column has
# already been moved into the index.
data = data.set_index('employee_id')
data.head()

In [None]:
# Normalise column names: trim, lower-case, spaces -> underscores, drop parentheses.
# regex=False forces literal matching: '(' and ')' are regex metacharacters and
# the default of `regex` differs between pandas versions.
data.columns = (
    data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
data.nunique()

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# Count of missing values per column.
data.isnull().sum()

In [None]:
# Impute missing values with each column's most frequent value, then add
# the derived joining_age column.

for column in ['education', 'previous_year_rating']:
    # Assign the result back: calling fillna(inplace=True) on a column
    # selection is chained assignment, which pandas deprecates and which does
    # not modify `data` when copy-on-write is enabled.
    data[column] = data[column].fillna(data[column].mode()[0])

# Age at which the employee joined = current age minus years of service.
data['joining_age'] = data['age'] - data['length_of_service']

In [None]:
# Column dtypes and non-null counts after imputation.
data.info()

In [None]:
# Summary statistics, transposed so each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Keep only the float64 / int64 columns for the numeric exploration below.
# Columns of any other dtype are excluded by this filter.
df_num = data.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
# One 50-bin histogram per numeric column.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

In [None]:
# Correlation of every numeric column with the target `is_promoted`;
# the colour scale is pinned to the full [-1, 1] range.
sns.heatmap(df_num.corr()[['is_promoted']], annot=True, vmin=-1, vmax=1)

In [None]:
# Full pairwise correlation matrix of the numeric columns on a 10x10 inch canvas.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    df_num.corr(),
    annot=True,
    square=True,
    vmin=-1,
    vmax=1,
    ax=ax,
)

In [None]:
# Age is a numeric variable. Instead of scaling it like the other numeric
# features, it is discretised with quantile-based (adaptive) binning.
# Quantiles are cut-points that partition the distribution of a numeric field
# into contiguous intervals; the cut-points below (0, 25, 50, 75, 100 %)
# define four quartile bins.

quantile_list = [0, .25, .5, .75, 1.]
age_quantiles = data['age'].quantile(quantile_list)
age_quantiles

In [None]:
# Age distribution with a red vertical line at every quantile cut-point.
fig, ax = plt.subplots()
data['age'].hist(ax=ax, bins=30, color='#A9C5D3',
                 edgecolor='black', grid=False)
for quantile in age_quantiles:
    qvl = ax.axvline(quantile, color='r')
# A single legend entry (the last line drawn) stands for all quantile lines.
ax.legend([qvl], ['Quantiles'], fontsize=10)
ax.set_title('Age Histogram with Quantiles', fontsize=12)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

In [None]:
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
# Replace the raw age with its labelled quartile bucket.
data = (
    data
    .assign(age_range=pd.qcut(data['age'], q=quantile_list, labels=quantile_labels))
    .drop(columns='age')
)
data.head()

In [None]:
# Quantile cut-points of joining age, using the same fractions as for age.
Join_age_quantiles = data['joining_age'].quantile(quantile_list)

# Joining-age distribution with a red vertical line at every cut-point.
fig, ax = plt.subplots()
data['joining_age'].hist(ax=ax, bins=30, color='#A9C5D3',
                         edgecolor='black', grid=False)
for quantile in Join_age_quantiles:
    qvl = ax.axvline(quantile, color='r')
ax.legend([qvl], ['Quantiles'], fontsize=10)
ax.set_title('Joining Age Histogram with Quantiles', fontsize=12)
ax.set_xlabel('Joining Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Replace the raw joining age with its labelled quartile bucket.
data = (
    data
    .assign(join_age_range=pd.qcut(data['joining_age'], q=quantile_list, labels=quantile_labels))
    .drop(columns='joining_age')
)
data.head()

In [None]:
#LOS = np.array(data['length_of_service'])
## LOS_clean = income[~np.isnan(LOS)]
#l, opt_lambda = stats.boxcox(LOS)
#print('Optimal lambda value for Length of service:', opt_lambda)

## data['rating_BC_0'] = stats.boxcox((1+data['previous_year_rating']), lmbda = 0)
#data['LOS_BC_opt'] = stats.boxcox(data['length_of_service'], lmbda = opt_lambda)
#data = data.drop('length_of_service', axis=1)
#data.head(5)

In [None]:
# Replace each categorical column with its one-hot indicator columns,
# appended after the remaining (non-categorical) columns.

cat_features = [
    'gender',
    'education',
    'recruitment_channel',
    'region',
    'department',
    'age_range',
    'join_age_range',
]
df_cat = pd.get_dummies(data[cat_features])
data = pd.concat([data.drop(columns=cat_features), df_cat], axis=1)

In [None]:
# Standardise the numeric features with StandardScaler (zero mean, unit
# variance per column - not a (0, 1) range). The fitted scaler `ss` is kept
# so it can be reused on other data later.

ss = StandardScaler()
scale_features = ['no_of_trainings', 'previous_year_rating', 'length_of_service', 'avg_training_score']
data[scale_features] = ss.fit_transform(data[scale_features])
#data.head()

In [None]:
# Build the feature matrix X and target vector y. No train/test split
# happens in this cell; the train/test indices are produced by the
# cross-validation loop further down.

#data = data.drop('recruitment_channel', axis=1)
# Cast to float32: the frame mixes scaled floats, integers and one-hot
# indicator columns, and on recent pandas (boolean dummies) a plain `.values`
# yields an object array that Keras cannot convert to a tensor.
X = data.drop('is_promoted', axis=1).values.astype('float32')
y = data['is_promoted'].values

# Oversampling the data, define oversampling strategy
#oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
#X_over, y_over = oversample.fit_resample(X, y)

In [None]:
def create_model(input_dim=None, learning_rate=0.002):
    """Build and compile the feed-forward binary classifier.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted it is read from the
        notebook-level ``X_train`` (``X_train.shape[1]``), so a bare
        ``create_model()`` call keeps working.
    learning_rate : float, optional
        Adamax step size; defaults to 0.002, the value used originally.

    Returns
    -------
    Sequential
        Compiled model: four 64-unit tanh layers, one 16-unit tanh layer and a
        single sigmoid output, trained with binary cross-entropy and tracking
        accuracy.

    Notes
    -----
    Results recorded for the optimizers tried so far:
      * Adamax (0.002, beta_1=0.9, beta_2=0.999): accuracy 0.944, loss 0.153,
        F1 0.5222337125129266
      * Adam (0.01): accuracy 0.945, loss 0.146, F1 0.517427589592538
      * Adadelta (1.0, rho=0.95): accuracy not recorded, loss 0.155,
        F1 0.506652474720596
    Only Adamax is instantiated; the unused Adam/Adadelta objects were dropped.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    # `learning_rate` is the current keyword; the old `lr` alias is deprecated
    # and no longer accepted by recent Keras releases.
    optimizer = optimizers.Adamax(learning_rate = learning_rate, beta_1 = 0.9, beta_2 = 0.999)

    ANN_model = Sequential()
    ANN_model.add(Dense(64, input_shape = (input_dim,), activation = 'tanh'))
    ANN_model.add(Dense(64, activation = 'tanh'))
    ANN_model.add(Dense(64, activation = 'tanh'))
    ANN_model.add(Dense(64, activation = 'tanh'))
    ANN_model.add(Dense(16, activation = 'tanh'))
    # Sigmoid on the last layer because this is binary classification.
    ANN_model.add(Dense(1, activation = 'sigmoid'))
    ANN_model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return ANN_model

In [None]:
n_split = 8
fold_scores = []

# Stratified, shuffled and seeded folds: every fold keeps the same share of
# promoted employees, and the split is reproducible across runs.
cv = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=42)
for train_index, test_index in cv.split(X, y):
    # These names are left at notebook scope on purpose: create_model() reads
    # X_train, and later cells reuse the last fold's split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = create_model()
    model.fit(X_train, y_train, epochs = 20)
    fold_score = model.evaluate(X_test, y_test)
    fold_scores.append(fold_score)
    print('Model evaluation', fold_score)

# evaluate() returns [loss, accuracy] for this model; average them over the folds.
print('Mean [loss, accuracy] across folds', np.mean(fold_scores, axis=0))

In [None]:


# Train a fresh model on the last fold's training split. The original
# referenced an undefined `ANmodel`; the model is now created here and bound
# to `ANN_model`, the name the later evaluation and prediction cells use.
ANN_model = create_model()
ANN_history = ANN_model.fit(X_train, y_train, verbose = 0, epochs = 30)
plot_loss_accuracy(ANN_history)

In [None]:
# Plot Confusion matrix
# The sigmoid outputs are rounded to 0/1 labels before being compared with
# y_test. `Y_Pred` is reused by the metric cells below.

Y_Pred = ANN_model.predict(X_test)
Cnf_matrix = confusion_matrix(y_test, Y_Pred.round())
# Affects how NumPy prints arrays for the rest of the session.
np.set_printoptions(precision = 2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(Cnf_matrix, classes=['Not Promoted','Promoted'],
                      title='Confusion matrix, without normalization')

In [None]:
# Unpack the 2x2 confusion matrix (flattened row by row) into its four counts.

tn, fp, fn, tp = confusion_matrix(y_test, Y_Pred.round()).ravel()
for label, count in (("True Negatives: ", tn),
                     ("False Positives: ", fp),
                     ("False Negatives: ", fn),
                     ("True Positives: ", tp)):
    print(label, count)

In [None]:
# Each metric is computed by hand from the confusion-matrix counts and then
# cross-checked against the scikit-learn implementation.

#Accuracy
Accuracy = (tn+tp)*100/(tp+tn+fp+fn)
print("Accuracy: {:0.2f}%".format(Accuracy))
# Default normalize=True returns the fraction of correct predictions;
# normalize=False returned the raw count, which is not an accuracy score.
print("Accuracy Score: {}".format(accuracy_score(y_test, Y_Pred.round())))

#Precision
Precision = tp/(tp+fp)
print("Precision: {:0.2f}".format(Precision))
print("Precision Score: {}".format(precision_score(y_test, Y_Pred.round(), pos_label = 1, average = 'binary')))

#Recall
Recall = tp/(tp+fn)
print("Recall: {:0.2f}".format(Recall))
print("Recall Score: {}".format(recall_score(y_test, Y_Pred.round(), pos_label = 1, average = 'binary')))

#F1 Score
f1 = (2*Precision*Recall)/(Precision + Recall)
print("F1 Score {:0.2f}".format(f1))
print("F1 Score: {}".format(f1_score(y_test, Y_Pred.round(), pos_label = 1, average = 'binary')))

#Specificity
Specificity = tn/(tn+fp)
print("Specificity: {:0.2f}".format(Specificity))

In [None]:
# Folder holding the competition CSVs. Set the HR_ANALYTICS_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's local path.
test_dir = os.environ.get(
    'HR_ANALYTICS_DIR',
    'C:/Users/ak19919/Downloads/ml_root/analytics vidya/hr_analytics')
test_data = pd.read_csv('{}/test.csv'.format(test_dir))

In [None]:
# Impute missing values with each column's most frequent value
# (computed on the test set itself).

for column in ['education', 'previous_year_rating']:
    # Assign the result back: calling fillna(inplace=True) on a column
    # selection is chained assignment, which pandas deprecates and which does
    # not modify `test_data` when copy-on-write is enabled.
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

In [None]:
## data['LOS_BC_0'] = stats.boxcox((1+data['length_of_service']), lmbda = 0)
#test_data['LOS_BC_opt'] = stats.boxcox(test_data['length_of_service'], lmbda = opt_lambda)
#test_data = test_data.drop('length_of_service', axis=1)
#test_data.head(5)

In [None]:
# Derive joining age and bucket both age columns into quartiles.
# The original read `test_data_1['age']`, a name that is never defined.
# NOTE: pd.qcut derives the bin edges from the test set's own quantiles here.
test_data['joining_age'] = test_data['age'] - test_data['length_of_service']
test_data['join_age_range'] = pd.qcut(test_data['joining_age'], q=quantile_list, labels=quantile_labels)
test_data = test_data.drop('joining_age', axis=1)

test_data['age_range'] = pd.qcut(test_data['age'], q = quantile_list, labels = quantile_labels)
test_data = test_data.drop('age', axis = 1)
test_data.head()

In [None]:
# Replace the categorical columns of the test set with one-hot indicators.
# NOTE(review): the resulting columns depend on the categories present in the
# test set - confirm they line up with the training columns.
df_cat_T = pd.get_dummies(test_data[cat_features])
test_data = pd.concat([test_data.drop(columns=cat_features), df_cat_T], axis=1)

In [None]:
# Standardise the test features with the scaler already fitted on the
# training data. Use transform(), not fit_transform(): refitting on the test
# set would scale it with different means/variances than the model was
# trained on.
test_data[scale_features] = ss.transform(test_data[scale_features])
test_data.head()

In [None]:
# Score the test set and pair each probability with its employee id.
test_no_id = test_data.drop('employee_id', axis = 1)
# Feed a float32 array rather than the DataFrame: the frame mixes floats,
# integers and one-hot indicator columns, which recent pandas exposes as an
# object array that Keras cannot convert to a tensor.
test_predictions = ANN_model.predict(test_no_id.values.astype('float32'))
employee_ID = test_data['employee_id']
submission_df_1 = pd.DataFrame({
                  "employee_id": employee_ID,
                  # predict() returns one column per output unit; flatten to 1-D.
                  "is_promoted": test_predictions.ravel()})

In [None]:
# Round the predicted probabilities to 0/1 and write the submission file
# without the row index.
submission_df_1['is_promoted'] = submission_df_1['is_promoted'].round()
submission_df_1.to_csv('submission.csv', index=False)

In [None]:
# List the metric names recorded in the training history.
print(ANN_history.history.keys())