Building an artificial neural network (ANN) model to predict the probability of employee attrition.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns

### Load the data and perform EDA.

Dataset repository: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset


In [None]:
# Location of the raw HR attrition CSV on GitHub (adjacent literals are joined
# by the parser, so the resulting string is unchanged).
url = (
    "https://raw.githubusercontent.com/Dharshana03/Employee_attrition"
    "/main/Data/HR-Employee-Attrition.csv"
)
# Download and parse the file into a DataFrame; requires network access.
attrition_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Show the first five rows with columns laid out vertically, which is easier
# to read for a wide table.
attrition_data.head().T

In [None]:
# Print column names, non-null counts and dtypes for a first look at the schema.
attrition_data.info()

In [None]:
# (rows, columns) of the table as loaded.
attrition_data.shape

### Evaluating missing values using a heatmap

In [None]:
# Draw the boolean "is missing" mask as a heatmap: any missing cell shows up as
# a contrasting mark, so a uniform plot means no missing values.
missing_fig, missing_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(attrition_data.isna(), cbar=False, ax=missing_ax)
plt.show()


### Assessing target class distribution

In [None]:
# One-hot encode the non-numeric columns. dtype=int forces 0/1 integer
# indicator columns on every pandas version; pandas >= 2.0 would otherwise
# return booleans, leaving a frame that mixes bool and integer columns.
attrition_data = pd.get_dummies(attrition_data, dtype=int)
# The mean of the 0/1 attrition indicator is the fraction of positive cases.
print("mean: ", attrition_data.Attrition_Yes.mean())
attrition_data["Attrition_Yes"].hist()

This is an imbalanced dataset (the mean printed above is the fraction of employees with attrition).

### Assessing information value of individual features (correlation analysis and pairplot)

In [None]:
# Pairwise correlations between all columns, rendered as a colour-coded matrix.
correlation_matrix = attrition_data.corr()
sns.heatmap(data=correlation_matrix, cmap="Spectral")

In [None]:
# Correlation of every column with the attrition indicator, strongest positive first.
attrition_correlations = attrition_data.corr().loc[:, 'Attrition_Yes']
attrition_correlations.sort_values(ascending=False)

In [None]:
# First group of features to compare pairwise, coloured by the attrition indicator.
first_pairplot_columns = [
    'Attrition_Yes',
    "OverTime_Yes",
    "MaritalStatus_Single",
    "JobRole_Sales Representative",
    "TotalWorkingYears",
    "JobLevel",
]
sns.pairplot(attrition_data[first_pairplot_columns], hue="Attrition_Yes")

In [None]:
# Second group of features to compare pairwise, coloured by the attrition indicator.
second_pairplot_columns = [
    "Attrition_Yes",
    "YearsInCurrentRole",
    "MonthlyIncome",
    "Age",
    "YearsWithCurrManager",
    "StockOptionLevel",
    "YearsAtCompany",
    "JobInvolvement",
]
sns.pairplot(attrition_data[second_pairplot_columns], hue="Attrition_Yes")

### Pre-processing the dataset

In [None]:
# Columns reported as having zero variance (and therefore no usable
# correlation with the target): they carry no information for the model.
constant_columns = ['EmployeeCount', 'StandardHours', 'Over18_Y']
attrition_data = attrition_data.drop(columns=constant_columns)

# Columns that are redundant or not needed to predict the target:
# 'Attrition_No' duplicates the information in 'Attrition_Yes', and
# 'EmployeeNumber' is presumably just a row identifier.
redundant_columns = ['Attrition_No', 'EmployeeNumber']
attrition_data = attrition_data.drop(columns=redundant_columns)


In [None]:

# Rating/level columns to treat as categories rather than as numeric magnitudes.
rating_columns = [
    'WorkLifeBalance',
    'JobSatisfaction',
    'JobLevel',
    'JobInvolvement',
    'RelationshipSatisfaction',
    'EnvironmentSatisfaction',
    'PerformanceRating',
]
# Cast to 'category' so the get_dummies call below expands them into indicator
# columns. The former copy=False argument was dropped: the result is assigned
# back into the frame anyway, and the keyword is deprecated in recent pandas.
attrition_data[rating_columns] = attrition_data[rating_columns].astype('category')

# Give the target column its final name; reassign instead of mutating in place.
attrition_data = attrition_data.rename(columns={"Attrition_Yes": "Attrition"})

# One-hot encode the categorical columns; dtype=int yields 0/1 integers
# regardless of the pandas version's default indicator dtype.
attrition_data_cleaned = pd.get_dummies(attrition_data, dtype=int)


In [None]:
# (rows, columns) after one-hot encoding; columns = features + the target.
attrition_data_cleaned.shape

### Splitting the data into training/test datasets (70/30)

In [None]:
# Feature matrix: every column except the target.
# Cast explicitly to float64: when the frame mixes boolean indicator columns
# (the pandas >= 2.0 get_dummies default) with integer columns, .values returns
# an object-dtype array, which is slow and fragile for downstream numeric code.
X = attrition_data_cleaned.drop("Attrition", axis=1).values.astype("float64")
# Target vector as plain 0/1 integers (also normalises a boolean indicator).
y = attrition_data_cleaned["Attrition"].values.astype("int64")

In [None]:
# 70/30 hold-out split with a fixed seed for repeatability.
# stratify=y keeps the attrition rate the same in both splits; the target is
# imbalanced, so a purely random split can over- or under-represent the
# minority class in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
    stratify=y,
)


### Building a sequential neural network with the following parameters: 3 hidden dense layers (70, 35, and 18 nodes respectively), activation function = 'relu', dropout = 0.5 for each layer

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply that
# same scaling to the test split so no test statistics leak into preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout

In [None]:
# (training samples, features); the second value is the network's input width.
X_train.shape

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation, dropout masks and batch shuffling are repeatable on
# Restart & Run All (same seed value as the train/test split).
tf.keras.utils.set_random_seed(1)

# Three hidden ReLU layers (70 -> 35 -> 18 units), each followed by 50% dropout
# for regularisation, and a single sigmoid unit that outputs P(attrition).
model = Sequential([
    Dense(units=70, activation='relu'),
    Dropout(0.5),
    Dense(units=35, activation='relu'),
    Dropout(0.5),
    Dense(units=18, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy is the matching loss for a sigmoid output on 0/1 labels.
model.compile(loss='binary_crossentropy', optimizer='adam')

### Using an early stopping callback to prevent overfitting

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop training once the validation loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from the last epoch, i.e.
# after `patience` further epochs that did not improve.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train for up to 200 epochs; early stopping decides the actual length.
# Validation uses a 20% slice held out from the *training* data rather than the
# test set: choosing the stopping epoch on the test set would leak it into model
# selection and make the test metrics reported below optimistically biased.
# (Keras takes the validation slice from the end of the arrays, before shuffling.)
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=200,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop],
)

8. Plot training and validation losses versus epochs.
9. Print out model confusion matrix.
10. Print out model classification report.
11. Print out model ROC AUC.

4 pts.

### Plotting the training and validation losses versus epochs.

In [None]:
# One row per epoch, one column per quantity recorded during fit
# (training loss and validation loss).
model_loss = pd.DataFrame(model.history.history)
# Label the axes so the figure is readable on its own.
model_loss.plot(
    xlabel="Epoch",
    ylabel="Loss",
    title="Training vs. validation loss",
)

### Confusion matrix

In [None]:
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score

In [None]:
# Predicted attrition probabilities for the test set, thresholded at 0.5 to get 0/1 labels.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype("int32")

In [None]:
# Rows are the true classes, columns the predicted classes.
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)

### Classification Report

In [None]:
# Per-class precision, recall, F1 and support on the test set.
test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)

### ROC AUC

In [None]:
# ROC AUC is computed from the raw predicted probabilities, not the thresholded labels.
test_scores = model.predict(X_test)
print('ROC AUC: ', roc_auc_score(y_test, test_scores))