# HR Employee Attrition Analysis and Modelling

Importing libraries

In [None]:
import warnings
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
from sklearn import svm
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')
plt.style.use('ggplot')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Reading the dataset

In [None]:
data_cap = pd.read_csv("HR_Employee_Attrition_Data.csv")

In [None]:
data_cap.head()

In [None]:
#checking the shape of the data
data_cap.shape

In [None]:
#checking for dtypes
data_cap.dtypes

## Exploratory data analysis

In [None]:
data_cap.hist(bins=50, figsize=(20,15),grid=False)
plt.show()

In [None]:
# Visualise the pairwise correlations between the numeric columns.
# Non-numeric columns (e.g. Attrition, Gender) are excluded explicitly because
# pandas >= 2.0 raises on them instead of silently dropping them.
corr_matrix = data_cap.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 ax=ax
                )

In [None]:
#check if there are any null values
data_cap.isnull().sum()

There is no null value

In [None]:
data_cap.shape

# Now let’s visualize some categorical data

Let’s explore all the categorical values and visualize them
Now, we will use the value_counts function so that we can get the unique values from every categorical type of data.

Gender

In [None]:
gender_dict = data_cap["Gender"].value_counts()
gender_dict

Understanding the balancing of the Gender column visually

In [None]:
data_cap['Gender'].value_counts().plot(kind='bar',color=['salmon','lightblue'],title="Count of different gender")

Now let’s look at how attrition (leaving or staying with the company) differs by gender.

In [None]:
# Grouped bar chart: one group per Gender value, one bar per Attrition value.
pd.crosstab(data_cap['Gender'],data_cap['Attrition']).plot(kind="bar",figsize=(10,6))
plt.title("Attrition vs Gender")
# The x axis carries the crosstab index (Gender); Attrition is shown by the legend.
plt.xlabel("Gender")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

In [None]:
gender_dict = data_cap["Attrition"].value_counts()
gender_dict

The chart shows that male employees account for the larger share of the counts. For both male and female employees, however, more people are staying with the company (Attrition = No) than leaving.

BusinessTravel

In [None]:
BusinessTravel_dict = data_cap["BusinessTravel"].value_counts()
BusinessTravel_dict

BusinessTravel_dict

In [None]:
# Bar chart of how many employees fall into each BusinessTravel category.
data_cap['BusinessTravel'].value_counts().plot(kind='bar',color=['salmon','lightblue'],title="Count of different BusinessTravel")

In [None]:
# Grouped bar chart: one group per BusinessTravel category, one bar per Attrition value.
pd.crosstab(data_cap['BusinessTravel'],data_cap['Attrition']).plot(kind="bar",figsize=(10,6))
plt.title("Attrition vs BusinessTravel")
# The x axis carries the crosstab index (BusinessTravel); Attrition is shown by the legend.
plt.xlabel("BusinessTravel")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

The chart shows that the Travel_Rarely group accounts for most of the counts. In every category (Travel_Rarely, Travel_Frequently and Non-Travel), more employees are staying with the company than leaving.

Department

In [None]:
Department_dict = data_cap["Department"].value_counts()
Department_dict

In [None]:
data_cap['Department'].value_counts().plot(kind='bar',color=['salmon','lightblue'],title="Count of different Department")

In [None]:
# Grouped bar chart: one group per Department, one bar per Attrition value.
pd.crosstab(data_cap['Department'],data_cap['Attrition']).plot(kind="bar",figsize=(10,6))
plt.title("Attrition vs Department")
# The x axis carries the crosstab index (Department); Attrition is shown by the legend.
plt.xlabel("Department")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

Here, in the chart, we can see that the maximum number of employees are in the Research & Development  section and a high number of employees in the same section are Staying (NO) in the company.

In [None]:
JobRole_dict = data_cap["JobRole"].value_counts()
JobRole_dict

In [None]:
data_cap['JobRole'].value_counts().plot(kind='bar',color=['salmon','lightblue'],title="Count of different JobRole")

In [None]:
# Grouped bar chart: one group per JobRole, one bar per Attrition value.
pd.crosstab(data_cap['JobRole'],data_cap['Attrition']).plot(kind="bar",figsize=(20,6))
plt.title("Attrition vs JobRole")
# The x axis carries the crosstab index (JobRole); Attrition is shown by the legend.
plt.xlabel("JobRole")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

Here, in the chart, we can see that the maximum number of employees are in the sales executive jobrole and a high number of employees in the same section are Staying (NO) in the company.

MaritalStatus

In [None]:
MaritalStatus_dict = data_cap["MaritalStatus"].value_counts()
MaritalStatus_dict

In [None]:
# Bar chart of how many employees fall into each MaritalStatus category.
data_cap['MaritalStatus'].value_counts().plot(kind='bar',color=['salmon','lightblue'],title="Count of different MaritalStatus")

In [None]:
# Grouped bar chart: one group per MaritalStatus, one bar per Attrition value.
pd.crosstab(data_cap['MaritalStatus'],data_cap['Attrition']).plot(kind="bar",figsize=(10,6))
plt.title("Attrition vs MaritalStatus")
# The x axis carries the crosstab index (MaritalStatus); Attrition is shown by the legend.
plt.xlabel("MaritalStatus")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

The chart shows that the largest group of employees is Married, and most employees in that group are staying with the company (Attrition = No).

OverTime

In [None]:
OverTime_dict = data_cap["OverTime"].value_counts()
OverTime_dict

In [None]:
# Grouped bar chart: one group per OverTime value, one bar per Attrition value.
pd.crosstab(data_cap['OverTime'],data_cap['Attrition']).plot(kind="bar",figsize=(10,6))
plt.title("Attrition vs OverTime")
# The x axis carries the crosstab index (OverTime); Attrition is shown by the legend.
plt.xlabel("OverTime")
# Bars count both employees who stayed and employees who left.
plt.ylabel("Number of employees")
plt.legend(["NO","Yes"], title="Attrition")
plt.xticks(rotation=0);

Here, in the chart, we can see that the maximum number of employees are in the No overtime and a high number of employees in the same section are Staying (NO) in the company.

# Now let’s visualize some continuous data

Age

In [None]:
# Its Age vs Attrition
sns.jointplot(x='Attrition',y='Age',data=data_cap)

The plot above suggests that older employees tend to stay with the company (Attrition = No), while comparatively younger employees are more likely to leave.

MonthlyIncome

In [None]:

# Its MonthlyIncomevs Attrition
sns.jointplot(x='Attrition',y='MonthlyIncome',data=data_cap)

In the above graph, we can see that the ones who are having more income  are staying (NO) back in the company rather than the ones who have comparatively less income.

JobLevel

In [None]:
# Its  JobLevelvs Attrition
sns.jointplot(x='Attrition',y='JobLevel',data=data_cap)

In the above graph, we can see that the ones who are having more JobLevel are staying (NO) back in the company rather than the ones who have comparatively less JobLevel.

TotalWorkingYears 

In [None]:
# Its  TotalWorkingYears vs Attrition
sns.jointplot(x='Attrition',y='TotalWorkingYears',data=data_cap)

In the above graph, we can see that the ones who are having more TotalWorkingYears are staying (NO) back in the company rather than the ones who have comparatively less TotalWorkingYears.

PercentSalaryHike 

In [None]:
# Its  PercentSalaryHike vs Attrition
sns.jointplot(x='Attrition',y='PercentSalaryHike',data=data_cap)

In [None]:
# Correlation heatmap restricted to numeric columns: pandas >= 2.0 raises if
# string columns such as Attrition are passed to corr().
plt.figure(figsize=(10,10))
sns.heatmap(data_cap.select_dtypes(include="number").corr(), annot=True, cmap="YlGnBu");

In [None]:
#plot the Age distribution
plt.figure(figsize = (16,6))
sns.distplot(data_cap['Age'])
plt.show()

In [None]:
# Plot count of Attrition
plt.figure(figsize = (16,6))
# The variable must be passed as x=: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x = 'Attrition', data = data_cap)
plt.show()

In [None]:
#Calculate the attrition percentage
print("Attrition % in the org is::",(data_cap['Attrition'].value_counts()['Yes']/data_cap['Attrition'].count())*100)


In [None]:
#Check to see if there is any relation b/w Age and Attrition
plt.figure(figsize = (16,6))
sns.swarmplot(y = 'Age', x = 'Attrition', data = data_cap, hue = 'Attrition')
plt.show()
""""From the below plot we can see that there is NO linear relation between 'Age' and 'Attrition', 
but bulk of the attrition is happening between the age groups of '25-32'"""

In [None]:
#Plotting the count by business travel
plt.figure(figsize = (16,6))
print(data_cap['BusinessTravel'].value_counts())
sns.countplot(x= 'BusinessTravel', data = data_cap)
plt.show()

In [None]:
#creating a new variable 'cat_BusinessTravel' . Converting into a categorical variable
#Travel_Rarely = 1
#Travel_Frequently = 2
#Non_Travel = 3
data_cap.loc[data_cap['BusinessTravel'] == 'Travel_Rarely','cat_BusinessTravel'] = 1
data_cap.loc[data_cap['BusinessTravel'] == 'Travel_Frequently','cat_BusinessTravel'] = 2
data_cap.loc[data_cap['BusinessTravel'] == 'Non-Travel','cat_BusinessTravel'] = 3
data_cap.head(20)

In [None]:
#Plotting Attrition for Age and business travel
#Travel_Rarely = 1
#Travel_Frequently = 2
#Non_Travel = 3
plt.figure(figsize = (16,6))
sns.swarmplot(x= 'Attrition', y='Age',  data = data_cap, hue = 'cat_BusinessTravel')
plt.show()

#Insight 1
"""From the below plot we can see that attrition is higher for employees in age bracket of [25-32].
Also, in this age group employees are travelling more"""

In [None]:
#Plotting counts for department
print(data_cap['Department'].value_counts())
plt.figure(figsize = (16,6))
# The variable must be passed as x=: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x = 'Department', data = data_cap)
plt.show()

In [None]:
#converting 'Department' data to ordinal variable
#Sales = 1
#Research and Development = 2
#Human Resorces = 3
data_cap.loc[data_cap['Department'] == 'Sales', 'ord_Department'] = 1
data_cap.loc[data_cap['Department'] == 'Research & Development', 'ord_Department'] = 2
data_cap.loc[data_cap['Department'] == 'Human Resources', 'ord_Department'] = 3


In [None]:
#Getting the count of Business travel based on Department
#HR
departmentgrp = data_cap.groupby(by = 'Department')

# Name the columns explicitly ('index' = travel category, 'BusinessTravel' = count).
# The defaults produced by value_counts().reset_index() changed in pandas 2.0,
# and later cells look the columns up by these exact names.
df1 = (
    departmentgrp.get_group('Human Resources')['BusinessTravel']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='BusinessTravel')
)
df1['Department'] = 'HR'
df1

In [None]:
r = data_cap.groupby('Attrition')['Attrition'].count()
plt.pie(r, explode=[0.05, 0.1], labels=['No', 'Yes'], radius=1.5, autopct='%1.1f%%',  shadow=True);

In [None]:
#R&D
# Columns are named explicitly ('index' = travel category, 'BusinessTravel' = count)
# so the result does not depend on the pandas version's value_counts() naming.
df2 = (
    departmentgrp.get_group('Research & Development')['BusinessTravel']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='BusinessTravel')
)
df2['Department'] = 'R&D'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

In [None]:
#Sales
# Columns are named explicitly ('index' = travel category, 'BusinessTravel' = count)
# so the result does not depend on the pandas version's value_counts() naming.
df2 = (
    departmentgrp.get_group('Sales')['BusinessTravel']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='BusinessTravel')
)
df2['Department'] = 'Sales'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df4 = pd.concat([df3, df2], ignore_index=True)
df4

In [None]:
#Calculating % of frequent travel employees in Sales and R&D
# In df4 the 'index' column holds the travel category and 'BusinessTravel' the head count.
frequent_rows = df4['index'] == 'Travel_Frequently'
sales_rows = df4['Department'] == 'Sales'
rnd_rows = df4['Department'] == 'R&D'

# Multiply by 100 so the printed value really is the percentage the label promises.
print("% of frequent travel in Sales is::",
      df4.loc[sales_rows & frequent_rows, 'BusinessTravel'].sum()
      / df4.loc[sales_rows, 'BusinessTravel'].sum() * 100)
print("% of frequent travel in R&D is ::",
      df4.loc[rnd_rows & frequent_rows, 'BusinessTravel'].sum()
      / df4.loc[rnd_rows, 'BusinessTravel'].sum() * 100)


In [None]:
#plotting department data against travel
plt.figure(figsize = (16,6))
sns.barplot(x= 'Department', y = 'BusinessTravel' , data = df4, hue = 'index')
plt.show()

In [None]:
##Calculating attrition % for each department
#Sales
attrbysales = departmentgrp.get_group('Sales')['Attrition'].value_counts()
#HR
attrbyHR = departmentgrp.get_group('Human Resources')['Attrition'].value_counts()
#RnD
attrbyRnD = departmentgrp.get_group('Research & Development')['Attrition'].value_counts()
attrbydept = pd.DataFrame(columns = ['Department', 'Attrition'])
#attrbydept.reset_index(inplace = True)
attrbydept.loc[len(attrbydept)] = ["Sales", attrbysales['Yes']/(attrbysales.sum())*100]
attrbydept.loc[len(attrbydept)] = ["HR",(attrbyHR['Yes']/attrbyHR.sum())*100]
attrbydept.loc[len(attrbydept)] = ["R&D",(attrbyRnD['Yes']/attrbyRnD.sum())*100]
attrbydept

#Insight 2
"""Attrition is highest in Sales team closely followed by HR team"""

In [None]:
#Plotting Attrition% for each department
plt.figure(figsize = (16,6))
sns.barplot(x= 'Department', y = 'Attrition' , data = attrbydept, hue = 'Department')
plt.show()

In [None]:
##Calculating attrition vs EnvironmentSatisfaction level
envsatisfcationgrp = data_cap.groupby(['EnvironmentSatisfaction','Attrition'])
envsatisfcationgrp.groups.keys()



In [None]:
count = envsatisfcationgrp.get_group((1, 'Yes'))['EnvironmentSatisfaction'].count()
count
AttrperEnvSat = pd.DataFrame(columns = ['EnvironmentSatisfaction', 'Attrition', 'Count'])
AttrperEnvSat.loc[len(AttrperEnvSat)] = ['1', 'Yes', count]
AttrperEnvSat

In [None]:
count = envsatisfcationgrp.get_group((2, 'Yes'))['EnvironmentSatisfaction'].count()
AttrperEnvSat.loc[len(AttrperEnvSat)] = ['2', 'Yes', count]
AttrperEnvSat

In [None]:
count = envsatisfcationgrp.get_group((3, 'Yes'))['EnvironmentSatisfaction'].count()
AttrperEnvSat.loc[len(AttrperEnvSat)] = ['3', 'Yes', count]
AttrperEnvSat

In [None]:
count = envsatisfcationgrp.get_group((4, 'Yes'))['EnvironmentSatisfaction'].count()
AttrperEnvSat.loc[len(AttrperEnvSat)] = ['4', 'Yes', count]
AttrperEnvSat

In [None]:
#Plotting Attrition vs Environmentsatisfaction
plt.figure(figsize = (16,6))
sns.barplot(x= 'EnvironmentSatisfaction', y = 'Count' , data = AttrperEnvSat, hue = 'EnvironmentSatisfaction')
plt.show()

#Insight3
"""Attrition is highest for group that has EnvironmentSatisfaction of '1', but for level '3' and '4' its high as well."""

In [None]:
#Attrition by gender
AttrGender = data_cap.groupby(['Attrition', 'Gender'])
AttrGender.groups.keys()

In [None]:
AttrperGend = pd.DataFrame(columns = ['Gender', 'Attrition', 'Count'])
AttrperGend

In [None]:
count = AttrGender.get_group(('Yes', 'Female'))['Attrition'].count()
AttrperGend.loc[len(AttrperGend)] = ['Female', 'Yes', count]
AttrperGend

In [None]:
count = AttrGender.get_group(('Yes', 'Male'))['Attrition'].count()
AttrperGend.loc[len(AttrperGend)] = ['Male', 'Yes', count]
AttrperGend

In [None]:
data_cap['Gender'].value_counts()
#plt.figure(figsize = (16,6))
#sns.barplot(x= 'Gender', y = 'Count' , data = AttrperGend, hue = 'Gender')

In [None]:
#Plotting Attrition vs Gender
plt.figure(figsize = (16,6))
sns.barplot(x= 'Gender', y = 'Count' , data = AttrperGend, hue = 'Gender')
plt.show()

#Insight 4
"""Out of total people who have left, Male's have higher attrition numbers"""

In [None]:
#Attrition by job level

In [None]:
attrbyJoblvl = data_cap.groupby(['JobLevel', 'Attrition'])
attrbyJoblvl.groups.keys()

In [None]:
count = attrbyJoblvl.get_group((1, 'Yes'))['Attrition'].count() 
count

In [None]:
attrvsJoblvl = pd.DataFrame(columns = ['JobLevel', 'Attrition', 'count', 'Attr%'])
attrvsJoblvl

In [None]:
attrvsJoblvl.loc[len(attrvsJoblvl)] = ['1', 'Yes', count, (count/data_cap['JobLevel'].value_counts()[1])*100]
attrvsJoblvl

In [None]:
count = attrbyJoblvl.get_group((2, 'Yes'))['Attrition'].count() 
attrvsJoblvl.loc[len(attrvsJoblvl)] = ['2', 'Yes', count, (count/data_cap['JobLevel'].value_counts()[2])*100]
attrvsJoblvl

In [None]:
count = attrbyJoblvl.get_group((3, 'Yes'))['Attrition'].count() 
attrvsJoblvl.loc[len(attrvsJoblvl)] = ['3', 'Yes', count, (count/data_cap['JobLevel'].value_counts()[3])*100]
attrvsJoblvl

In [None]:
count = attrbyJoblvl.get_group((4, 'Yes'))['Attrition'].count() 
attrvsJoblvl.loc[len(attrvsJoblvl)] = ['4', 'Yes', count, (count/data_cap['JobLevel'].value_counts()[4])*100]
attrvsJoblvl

In [None]:
count = attrbyJoblvl.get_group((5, 'Yes'))['Attrition'].count() 
attrvsJoblvl.loc[len(attrvsJoblvl)] = ['5', 'Yes', count, (count/data_cap['JobLevel'].value_counts()[5])*100]
attrvsJoblvl

In [None]:
#Plotting Attrition vs Job Level
plt.figure(figsize = (16,6))
sns.barplot(x= 'Attrition', y = 'Attr%' , data = attrvsJoblvl, hue = 'JobLevel')
plt.show()

#Insight 5
"""Attrition % is highest in Job level 1, followed by job level 3"""

In [None]:
#Plotting Attrition vs monthly income
plt.figure(figsize = (16,6))
sns.swarmplot(x= 'Attrition', y = 'MonthlyIncome' , data = data_cap, hue= 'MaritalStatus')
plt.show()

#Insight 6
"""Attrition is highest amount employees who are earning between (2500-5000)"""


In [None]:
#Plotting Joblevel vs monthly income
plt.figure(figsize = (16,6))
sns.swarmplot(x= 'JobLevel', y = 'MonthlyIncome' , data = data_cap, hue= 'Department')
plt.show()

In [None]:
#Plotting Age vs monthly income
plt.figure(figsize = (16,6))
sns.regplot(x= 'Age', y = 'MonthlyIncome' , data = data_cap)
plt.show()

#Insight 7
"""There is a linear relation between Age and Monthly income"""

In [None]:
data_cap.head()

In [None]:
#Analyzing salary hike
data_cap['PercentSalaryHike'].describe()

In [None]:
#Plotting histogram of Hike%
plt.figure(figsize = (16,6))
sns.distplot(data_cap['PercentSalaryHike'], kde = True)
plt.show()

#Insight 8
"""The salary hike histogram is skewed to the right""" 

In [None]:
#Plotting box plot for salary hike
plt.figure(figsize = (16,6))
# The variable must be passed as x=: seaborn >= 0.12 no longer accepts it positionally.
sns.boxplot(x = 'PercentSalaryHike', data = data_cap)
plt.show()

#Insight 8
"""The median salary hike is 14% and the mean is 15%. The highest salary hike given is 25%""" 

In [None]:
#Plotting hike% for each department
plt.figure(figsize = (16,6))
sns.boxplot(y = 'PercentSalaryHike', x = 'Department', data = data_cap)
plt.show()

In [None]:
#Plotting hike% for each job level
plt.figure(figsize = (16,6))
sns.boxplot(y = 'PercentSalaryHike', x = 'JobLevel', data = data_cap)
plt.show()

In [None]:
#Plotting hike% for each job satisfaction level
plt.figure(figsize = (16,6))
sns.boxplot(y = 'PercentSalaryHike', x = 'JobSatisfaction', data = data_cap)
plt.show()

In [None]:
grp2 = data_cap.groupby(['JobSatisfaction', 'Attrition'])
grp2.groups.keys()

In [None]:
count = grp2.get_group((1, 'Yes'))['Attrition'].count()
count

In [None]:
attrbyJobsat = pd.DataFrame(columns = ['Jobsat','Attrition','Count', 'Attr%'])
attrbyJobsat.loc[len(attrbyJobsat)] = ['1', 'Yes', count, count/data_cap['JobSatisfaction'].value_counts()[1]*100]
attrbyJobsat

In [None]:
count = grp2.get_group((2, 'Yes'))['Attrition'].count()
attrbyJobsat.loc[len(attrbyJobsat)] = ['2', 'Yes', count, count/data_cap['JobSatisfaction'].value_counts()[2]*100]
attrbyJobsat

In [None]:
count = grp2.get_group((3, 'Yes'))['Attrition'].count()
attrbyJobsat.loc[len(attrbyJobsat)] = ['3', 'Yes', count, count/data_cap['JobSatisfaction'].value_counts()[3]*100]
attrbyJobsat

In [None]:
count = grp2.get_group((4, 'Yes'))['Attrition'].count()
attrbyJobsat.loc[len(attrbyJobsat)] = ['4', 'Yes', count, count/data_cap['JobSatisfaction'].value_counts()[4]*100]
attrbyJobsat

In [None]:
#Plotting Attr% for Job satisfaction level
plt.figure(figsize = (16,6))
sns.barplot(y = 'Attr%', x = 'Jobsat', data = attrbyJobsat)
plt.show()

#Insight 9
"""employees with job satisfaction level of 1 have the highest attrition % of 22.8%""" 

In [None]:
#Plotting hike% for No of years worked
plt.figure(figsize = (16,6))
sns.boxplot(y = 'PercentSalaryHike', x = 'YearsAtCompany', data = data_cap)
plt.show()

In [None]:
#Plotting 'Performance rating' and 'hike%'
plt.figure(figsize = (16,6))
sns.boxplot(y = 'PercentSalaryHike', x = 'PerformanceRating', data = data_cap)
plt.show()

#Insight 10
"""employees with rating of 4 have got hikes between (20-25), whereas employees with rating of 3 have got between (11-19)""" 

In [None]:
grp1 = data_cap.groupby(['RelationshipSatisfaction', 'Attrition'])
grp1.groups.keys()


In [None]:
count = grp1.get_group((1, 'Yes'))['Attrition'].count()
count

In [None]:
attrbyrelsat = pd.DataFrame(columns = ['Relsat', 'Attrition', 'Count', 'Attr%'])
attrbyrelsat

In [None]:
attrbyrelsat.loc[len(attrbyrelsat)] = ['1', 'Yes', count, count/data_cap['RelationshipSatisfaction'].value_counts()[1]*100]
attrbyrelsat

In [None]:
count = grp1.get_group((2, 'Yes'))['Attrition'].count()
attrbyrelsat.loc[len(attrbyrelsat)] = ['2', 'Yes', count,count/data_cap['RelationshipSatisfaction'].value_counts()[2]*100]
attrbyrelsat

In [None]:
count = grp1.get_group((3, 'Yes'))['Attrition'].count()
attrbyrelsat.loc[len(attrbyrelsat)] = ['3', 'Yes', count, count/data_cap['RelationshipSatisfaction'].value_counts()[3]*100]
attrbyrelsat

In [None]:
count = grp1.get_group((4, 'Yes'))['Attrition'].count()
attrbyrelsat.loc[len(attrbyrelsat)] = ['4', 'Yes', count, count/data_cap['RelationshipSatisfaction'].value_counts()[4]*100]
attrbyrelsat

In [None]:
plt.figure(figsize = (16,6))
sns.barplot(y = 'Attr%', x = 'Relsat', data = attrbyrelsat)
plt.show()

#Insight 11
"""employees with relation satisfaction of 1 have highest attrition numbers at 20.65%.""" 

# Data preprocessing

In [None]:
#Checking for unwanted columns
print("EmployeeCount unique values::", data_cap['EmployeeCount'].unique())
print("Over18 unique values::",data_cap['Over18'].unique())
print("StandardHours unique values::",data_cap['StandardHours'].unique())
#There is only one value for above column hence dropping it as they will not affect our model


In [None]:
# Drop the single-valued columns checked above and the EmployeeNumber column,
# plus the helper encodings created during EDA (cat_BusinessTravel, ord_Department):
# they duplicate BusinessTravel / Department and would otherwise enter the model twice.
# Reassigning with errors='ignore' keeps the cell safe to re-run; an in-place drop
# raises KeyError the second time because the columns are already gone.
data_cap = data_cap.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber',
             'cat_BusinessTravel', 'ord_Department'],
    errors='ignore',
)

In [None]:
data_cap.columns.tolist()

In [None]:
data_cap.shape

### Converting labels to nominal data

In [None]:
from sklearn import preprocessing

def labelencoder(df, columns=None):
    """Return a copy of ``df`` with the given columns replaced by integer label codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Columns to encode. Defaults to the eight categorical columns of the
        attrition dataset that this function originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column holds the output of
        ``LabelEncoder.fit_transform`` for that column.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    """
    if columns is None:
        columns = ["Attrition", "BusinessTravel", "Department", "EducationField",
                   "Gender", "JobRole", "MaritalStatus", "OverTime"]

    temp_df = df.copy()
    for column in columns:
        # A fresh encoder per column: each column has its own set of labels,
        # so nothing is shared between the fits.
        temp_df[column] = preprocessing.LabelEncoder().fit_transform(temp_df[column])
    return temp_df

encoded_cap_data = labelencoder(data_cap)




## Modelling using Random Forest

In [None]:
#Converting 'Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole'
#'MaritalStatus', 'overtime'

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc,confusion_matrix
import numpy as np

In [None]:
data_cap.head()

In [None]:
#Segregating the Independent and the dependant variable
y = encoded_cap_data["Attrition"].values
X = encoded_cap_data.drop(["Attrition"],axis =1)

In [None]:
#Splitting into training and testing
# random_state makes the split, and therefore every score reported below, reproducible.
# stratify=y keeps the share of the minority "left" class equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
X_train.shape

RANDOM FOREST

In [None]:
#Build the random forest model with 50 trees (n_estimators = 50); random_state fixes the seed
model_HR_RF = RandomForestClassifier(n_estimators = 50, random_state = 0)
model_HR_RF.fit(X_train, y_train)
# Score the fitted forest on the training partition ...
model_HR_score_train = model_HR_RF.score(X_train, y_train)
print("Training score: ",model_HR_score_train)
# ... and on the held-out test partition
model_HR_score_test = model_HR_RF.score(X_test, y_test)
print("Testing score: ",model_HR_score_test)


In [None]:
#caluclating the probabilities for the AUC curve
#Decision treemodel_dt_2
y_pred_prob = model_HR_RF.predict_proba(X_test)[:, 1]

In [None]:
#Predicting the Attrition for X_test
y_pred_rf = model_HR_RF.predict(X_test)

### Performance metrics

In [None]:
# Model Accuracy:
print ("Model Accuracy is::", (accuracy_score(y_test,y_pred_rf))*100)
# Recall
print("Model recall is::", (recall_score(y_test, y_pred_rf))*100)

RF_Accuracy = accuracy_score(y_test,y_pred_rf)*100

In [None]:
#Area under the curve
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_prob)
roc_auc_dt = auc(fpr_dt, tpr_dt)

In [None]:
from sklearn.metrics import classification_report
model_report = classification_report(y_test, y_pred_rf)
print(model_report)

In [None]:
plt.figure(1)
lw = 2
plt.plot(fpr_dt, tpr_dt, color='green',
         lw=lw, label='Random Forest (AUC = %0.2f)' % roc_auc_dt)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Area Under Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

confusion_mat = confusion_matrix(y_test,y_pred_rf)

# confusion_matrix rows are true classes and columns are predicted classes, so the
# diagonal ([0][0], [1][1]) holds correct predictions and the off-diagonal the errors.
correctly_classified_RF = (confusion_mat[0][0] + confusion_mat[1][1]) / len(y_test) * 100
incorrectly_classified_RF = (confusion_mat[0][1] + confusion_mat[1][0]) / len(y_test) * 100

# ROC AUC is a ranking metric: score it on the predicted probability of class 1,
# not on the thresholded 0/1 predictions.
roc_RF = roc_auc_score(y_test, model_HR_RF.predict_proba(X_test)[:, 1]) * 100


In [None]:
import itertools
def plot_confusion_matrix(model, model_name, normalize=False):
    """Draw the confusion matrix of a set of predictions on the current figure.

    Parameters
    ----------
    model : array-like
        Predicted labels; despite the name, this is what gets compared against
        the notebook-level ``y_test``.
    model_name : str
        Text appended to the plot title.
    normalize : bool, default False
        When True, each row is divided by its row total and rounded to 3 decimals.
    """
    class_names = ["Stayed", "Left"]
    matrix = confusion_matrix(y_test, model, labels=[0, 1])
    if normalize:
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        matrix = np.around(matrix.astype('float') / row_totals, decimals=3)

    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix {model_name}")
    plt.colorbar()
    positions = np.arange(len(class_names))
    plt.xticks(positions, class_names, rotation=45)
    plt.yticks(positions, class_names)

    # Cells above half of the largest value get white text so it stays readable.
    cutoff = matrix.max() / 2.
    n_rows, n_cols = matrix.shape
    for row, col in itertools.product(range(n_rows), range(n_cols)):
        value = matrix[row, col]
        text_colour = "white" if value > cutoff else "black"
        plt.text(col, row, value,
                 horizontalalignment="center",
                 color=text_colour)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plt.figure(figsize=(6,6))
plot_confusion_matrix(y_pred_rf, ' (Random Forest)',normalize=False)
plt.show()

In [None]:
importances = model_HR_RF.feature_importances_
# Positions of the features sorted from most to least important.
indices = np.argsort(importances)[::-1]
# Column names in the same sorted order, so every bar and printed row gets its own label.
feature_names = X_train.columns[indices]

# Print the feature ranking
print("Feature ranking:")

for rank, (name, importance) in enumerate(zip(feature_names, importances[indices]), start=1):
    print("%d. %s (%f)" % (rank, name, importance))

fig, ax = plt.subplots(figsize=(50,25))
plt.title("Feature ranking", fontsize = 11)
plt.bar(range(X_train.shape[1]), importances[indices],
    color="b",
    align="center")
# The bars are in sorted order, so the tick labels must be sorted the same way.
plt.xticks(range(X_train.shape[1]), feature_names, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.ylabel("importance", fontsize = 11)
plt.xlabel("feature", fontsize = 20)
plt.show()

## Modelling using Decision Tree Classifier

In [None]:
#importing DecisionTreeClassifier from ScikitLearn and GridSearchCV for Auto hyperparameter tuning

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

In [None]:
#Fitting Decsion Tree classifier to Train and test


model_HR_DT = DecisionTreeClassifier(random_state=67,max_depth=6)
model_HR_DT.fit(X_train, y_train)

model_dt_score_train = model_HR_DT.score(X_train, y_train)
print("Training score: ",model_dt_score_train)
model_dt_score_test = model_HR_DT.score(X_test, y_test)
print("Testing score: ",model_dt_score_test)


In [None]:
#Predicting the Attrition for X_test
y_pred_dt = model_HR_DT.predict(X_test)

In [None]:
# Model Accuracy:
print ("Model Accuracy is::", (accuracy_score(y_test,y_pred_dt))*100)
# Recall
print("Model recall is::", (recall_score(y_test, y_pred_dt))*100)

DT_Accuracy = accuracy_score(y_test,y_pred_dt)*100

In [None]:
plt.figure(figsize=(6,6))
plot_confusion_matrix(y_pred_dt, ' (Decision Tree)',normalize=False)
plt.show()

### Performing GridSearchCV for Decision Tree

In [None]:
#Custom GridSearch Parameters

decisionT_params = {
                'criterion':['gini','entropy'],
                # An integer min_samples_split must be at least 2; scikit-learn rejects 1,
                # so every candidate using it would fail to fit.
                'min_samples_split': range(2,10),
                'max_depth':range(1,10),
                'min_samples_leaf':range(1,5)
                    }

grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), decisionT_params, verbose=1, cv=5)

#Fitting all the GridSearch parameters combinations to X_train and y_train
grid_search_cv.fit(X_train, y_train)

In [None]:
grid_search_cv.best_estimator_

In [None]:
#Refitting the Decision Tree with the hyperparameters selected by GridSearchCV
# Take the parameters from the search result instead of hard-coding them, so the model
# always matches what the grid search above actually selected on this split.
model_HR_DT = DecisionTreeClassifier(random_state=42, **grid_search_cv.best_params_)
model_HR_DT.fit(X_train, y_train)

model_dt_score_train = model_HR_DT.score(X_train, y_train)
print("Training score: ",model_dt_score_train)
model_dt_score_test = model_HR_DT.score(X_test, y_test)
print("Testing score: ",model_dt_score_test)


#Predicting the Attrition for X_test
y_pred_dt = model_HR_DT.predict(X_test)

In [None]:
plt.figure(figsize=(6,6))
plot_confusion_matrix(y_pred_dt, ' (Decsion Tree after GridSearch)',normalize=False)
plt.show()

In [None]:
DT_Accuracy = accuracy_score(y_test,y_pred_dt)*100


confusion_mat_DT = confusion_matrix(y_test,y_pred_dt)
print(confusion_mat_DT)

# confusion_matrix rows are true classes and columns are predicted classes, so the
# diagonal ([0][0], [1][1]) holds correct predictions and the off-diagonal the errors.
correctly_classified_DT = (confusion_mat_DT[0][0] + confusion_mat_DT[1][1]) / len(y_test) * 100
incorrectly_classified_DT = (confusion_mat_DT[0][1] + confusion_mat_DT[1][0]) / len(y_test) * 100

# ROC AUC is a ranking metric: score it on the predicted probability of class 1,
# not on the thresholded 0/1 predictions.
roc_DT = roc_auc_score(y_test, model_HR_DT.predict_proba(X_test)[:, 1]) * 100


In [None]:
model_report = classification_report(y_test, y_pred_dt)
print(model_report)

In [None]:
# Build the ROC curve from the predicted probability of class 1. Hard 0/1 predictions
# give a single operating point, so the curve and its AUC would be degenerate.
fpr_dt, tpr_dt, _ = roc_curve(y_test, model_HR_DT.predict_proba(X_test)[:, 1])
roc_auc_dt = auc(fpr_dt, tpr_dt)
plt.figure(1)
lw = 2
plt.plot(fpr_dt, tpr_dt,
         lw=lw, label='Decision Tree(AUC = %0.2f)' % roc_auc_dt)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Area Under Curve')
plt.legend(loc="lower right")
plt.show()

# Prediction using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

logistic_model = LogisticRegression(solver='liblinear',random_state=0).fit(X_train,y_train)

y_pred_lr = logistic_model.predict(X_test)

# accuracy_score returns a fraction in [0, 1]; scale by 100 to match the "%" in the message.
print("Train Accuracy : {:.2f} %".format(accuracy_score(y_train, logistic_model.predict(X_train)) * 100))
print("Test Accuracy : {:.2f} %".format(accuracy_score(y_test, y_pred_lr) * 100))

In [None]:
model_report = classification_report(y_test, y_pred_lr)
print(model_report)

In [None]:
# Model Accuracy:
print ("Model Accuracy is::", (accuracy_score(y_test,y_pred_lr))*100)
# Recall
print("Model recall is::", (recall_score(y_test, y_pred_lr))*100)

lr_Accuracy = accuracy_score(y_test,y_pred_lr)*100

In [None]:
LR_Accuracy = accuracy_score(y_test,y_pred_lr)*100
print(LR_Accuracy)

confusion_mat = confusion_matrix(y_test,y_pred_lr)
print(confusion_mat)

# confusion_matrix rows are true classes and columns are predicted classes, so the
# diagonal ([0][0], [1][1]) holds correct predictions and the off-diagonal the errors.
correctly_classified_LR = (confusion_mat[0][0] + confusion_mat[1][1]) / len(y_test) * 100
incorrectly_classified_LR = (confusion_mat[0][1] + confusion_mat[1][0]) / len(y_test) * 100

# ROC AUC is a ranking metric: score it on the predicted probability of class 1,
# not on the thresholded 0/1 predictions.
roc_LR = roc_auc_score(y_test, logistic_model.predict_proba(X_test)[:, 1]) * 100


In [None]:

cm = confusion_matrix(y_test,logistic_model.predict(X_test))
classes = ["0","1"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=classes)

fig, ax = plt.subplots(figsize=(6,6))
plt.title("Confusion Matrix (Logistic Regression)")
disp = disp.plot(ax=ax,cmap='Reds')
plt.show()

In [None]:
# Build the ROC curve from the predicted probability of class 1. Hard 0/1 predictions
# give a single operating point, so the curve and its AUC would be degenerate.
fpr_dt, tpr_dt, _ = roc_curve(y_test, logistic_model.predict_proba(X_test)[:, 1])
roc_auc_dt = auc(fpr_dt, tpr_dt)
plt.figure(1)
lw = 2
# The model plotted here is the logistic regression fitted above.
plt.plot(fpr_dt, tpr_dt,
         lw=lw, label='Logistic Regression (AUC = %0.2f)' % roc_auc_dt)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Area Under Curve')
plt.legend(loc="lower right")
plt.show()

# Classification using Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Gaussian Naive Bayes baseline with default settings: fit on the training
# split (fit returns the estimator itself), then label the test split.
model_nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

In [None]:
# Confusion-matrix heatmap for the Gaussian Naive Bayes baseline.
# Tick labels cover every class seen in either the truth or the predictions,
# so the axes stay correct even when one class is never predicted.
names = np.union1d(y_test, y_pred_nb)

# Predictions are passed first on purpose: rows are then the predicted class
# and columns the true class, matching the axis labels set below.
mat = confusion_matrix(y_pred_nb, y_test, labels=names)

fig, ax = plt.subplots()
sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=names, yticklabels=names, cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix (Naive Bayes)')
ax.set_xlabel('Truth')
ax.set_ylabel('Predicted')
plt.show()

### Naive bayes after hyperparameter tuning

In [None]:
# Search grid: 100 var_smoothing values, log-spaced from 1e0 down to 1e-9.
param_grid_nb = {'var_smoothing': np.logspace(0, -9, num=100)}
params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}

nb_classifier = GaussianNB()

# 4-fold cross-validated grid search on the training split, ranked by accuracy.
gs_NB = GridSearchCV(nb_classifier, params_NB, scoring='accuracy', cv=4, verbose=1)
gs_NB.fit(X_train, y_train)

# Last expression of the cell: show the winning hyper-parameters.
gs_NB.best_params_

In [None]:
# Label the test split with the grid-search object; this overwrites the
# predictions of the untuned Naive Bayes model stored under the same name.
y_pred_nb = gs_NB.predict(X_test)

In [None]:
# Train / test scores of the tuned Naive Bayes grid search (gs_NB).
model_nb_score_train = gs_NB.score(X_train, y_train)
print("Training score: ",model_nb_score_train)
model_nb_score_test = gs_NB.score(X_test, y_test)
print("Testing score: ",model_nb_score_test)


In [None]:
# Test-set accuracy of the tuned Naive Bayes model, as a percentage.
NB_Accuracy = accuracy_score(y_test, y_pred_nb) * 100
print(NB_Accuracy)

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_nb)
print(confusion_mat)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_NB = np.trace(confusion_mat)
correctly_classified_NB = correct_count_NB / len(y_test) * 100
incorrectly_classified_NB = (confusion_mat.sum() - correct_count_NB) / len(y_test) * 100

# Score ROC AUC on the positive-class probability, not on hard labels.
roc_NB = roc_auc_score(y_test, gs_NB.predict_proba(X_test)[:, 1]) * 100

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes test predictions.
model_report = classification_report(y_true=y_test, y_pred=y_pred_nb)
print(model_report)

In [None]:
# Headline test-set metrics for Naive Bayes, expressed as percentages.
nb_Accuracy = accuracy_score(y_test, y_pred_nb) * 100
nb_recall_pct = recall_score(y_test, y_pred_nb) * 100

print("Model Accuracy is::", nb_Accuracy)
print("Model recall is::", nb_recall_pct)

In [None]:
# ROC curve for the tuned Naive Bayes model, built from positive-class
# probabilities so the curve sweeps over all score thresholds.
y_score_nb = gs_NB.predict_proba(X_test)[:, 1]
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_score_nb)
roc_auc_nb = auc(fpr_nb, tpr_nb)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr_nb, tpr_nb,
        lw=lw, label='Naive Bayes(AUC = %0.2f)' % roc_auc_nb)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Area Under Curve')
ax.legend(loc="lower right")
plt.show()


# Modelling using Support Vector Machines

In [None]:
from sklearn import svm

# Support-vector classifier with a linear kernel; all other settings are
# left at their library defaults.
clf = svm.SVC(kernel='linear')

In [None]:
# Fit the SVM on the training split.
clf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the SVM for the test split.
y_pred_svm = clf.predict(X_test)

In [None]:
# scikit-learn metrics module (used by the precision / recall cells below).
from sklearn import metrics

# Model accuracy: how often is the classifier correct? (percentage)
SVM_Accuracy = accuracy_score(y_test, y_pred_svm) * 100
print("Accuracy:", SVM_Accuracy)

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_svm)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_SVM = np.trace(confusion_mat)
correctly_classified_SVM = correct_count_SVM / len(y_test) * 100
incorrectly_classified_SVM = (confusion_mat.sum() - correct_count_SVM) / len(y_test) * 100

# SVC is fitted without probability estimates, so rank the test samples by
# their signed distance to the separating hyperplane instead of hard labels.
roc_SVM = roc_auc_score(y_test, clf.decision_function(X_test)) * 100


In [None]:
# Precision: of the samples predicted positive, the fraction that truly are.
svm_precision = metrics.precision_score(y_test, y_pred_svm)
print("Precision:", svm_precision)

# Recall: of the truly positive samples, the fraction the model found.
svm_recall = metrics.recall_score(y_test, y_pred_svm)
print("Recall:", svm_recall)

In [None]:
# Headline test-set metrics for the SVM, expressed as percentages.
svm_Accuracy = accuracy_score(y_test, y_pred_svm) * 100
svm_recall_pct = recall_score(y_test, y_pred_svm) * 100

print("Model Accuracy is::", svm_Accuracy)
print("Model recall is::", svm_recall_pct)

In [None]:
# ROC curve for the linear SVM. The signed distance to the decision boundary
# is used as the ranking score; hard 0/1 predictions would reduce the curve
# to a single operating point.
y_score_svm = clf.decision_function(X_test)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr_svm, tpr_svm,
        lw=lw, label='Support Vector Machines(AUC = %0.2f)' % roc_auc_svm)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Area Under Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Confusion matrix of the SVM test predictions, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred_svm)
classes = ["0", "1"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=classes)

fig, ax = plt.subplots(figsize=(6, 6))
# The title names the model actually shown (it used to say "Logistic Regression").
ax.set_title("Confusion Matrix (Support Vector Machines)")
disp = disp.plot(ax=ax, cmap='Reds')
plt.show()

# Modelling using IBK - K-Nearest Neighbour

In [None]:
# Import necessary modules for KNearest Neighbour
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier that votes over the 7 closest training samples.

knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Hard class predictions of the KNN classifier for the test split.
y_pred_knn = knn.predict(X_test)

In [None]:
# scikit-learn metrics module (used by the precision / recall cells below).
from sklearn import metrics

# Model accuracy: how often is the classifier correct? (percentage)
KNN_Accuracy = accuracy_score(y_test, y_pred_knn) * 100
print("Accuracy:", KNN_Accuracy)

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_knn)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_KNN = np.trace(confusion_mat)
correctly_classified_KNN = correct_count_KNN / len(y_test) * 100
incorrectly_classified_KNN = (confusion_mat.sum() - correct_count_KNN) / len(y_test) * 100

# Score ROC AUC on the neighbour-vote fraction for the positive class rather
# than on hard labels.
roc_KNN = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1]) * 100

In [None]:
# Precision: of the samples predicted positive, the fraction that truly are.
knn_precision = metrics.precision_score(y_test, y_pred_knn)
print("Precision:", knn_precision)

# Recall: of the truly positive samples, the fraction the model found.
knn_recall = metrics.recall_score(y_test, y_pred_knn)
print("Recall:", knn_recall)

In [None]:
# Headline test-set metrics for KNN, expressed as percentages.
knn_Accuracy = accuracy_score(y_test, y_pred_knn) * 100
knn_recall_pct = recall_score(y_test, y_pred_knn) * 100

print("Model Accuracy is::", knn_Accuracy)
print("Model recall is::", knn_recall_pct)

### Checking on the best neighbours for IBK - KNN

In [None]:
# Sweep k = 1..8 and record train / test accuracy for each setting.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

for i, k in enumerate(neighbors):
    # Refit from scratch for every k; `knn` ends up holding the last model (k=8).
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

# Accuracy as a function of the number of neighbours.
plt.plot(neighbors, test_accuracy, label='Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label='Training dataset Accuracy')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Refit KNN with a single neighbour (k=1).
# NOTE(review): k is presumably taken from the neighbour sweep above; if it was
# chosen from the test-accuracy curve, the test metrics below are optimistic.
# Confirm, or select k with cross-validation on the training split.
knn = KNeighborsClassifier(n_neighbors=1)

# Train on the training split and label the test split.
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Model accuracy: how often is the classifier correct? (percentage)
KNN_Accuracy = accuracy_score(y_test, y_pred_knn) * 100
print("Accuracy:", KNN_Accuracy)

# Precision: of the samples predicted positive, the fraction that truly are.
print("Precision:", metrics.precision_score(y_test, y_pred_knn))

# Recall: of the truly positive samples, the fraction the model found.
print("Recall:", metrics.recall_score(y_test, y_pred_knn))

# F1: harmonic mean of precision and recall.
print("F1 Score:", metrics.f1_score(y_test, y_pred_knn))

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_knn)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_KNN = np.trace(confusion_mat)
correctly_classified_KNN = correct_count_KNN / len(y_test) * 100
incorrectly_classified_KNN = (confusion_mat.sum() - correct_count_KNN) / len(y_test) * 100

# Score ROC AUC on the positive-class vote fraction rather than hard labels.
roc_KNN = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1]) * 100

In [None]:
# ROC curve for the current KNN model, built from the positive-class vote
# fraction so the curve can sweep over score thresholds.
y_score_knn = knn.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_score_knn)
roc_auc_knn = auc(fpr_knn, tpr_knn)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr_knn, tpr_knn,
        lw=lw, label='KNN(AUC = %0.2f)' % roc_auc_knn)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Area Under Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Confusion matrix of the KNN test predictions, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred_knn)
classes = ["0", "1"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=classes)

fig, ax = plt.subplots(figsize=(6, 6))
# The title names the model actually shown (it used to say "Logistic Regression").
ax.set_title("Confusion Matrix (KNN)")
disp = disp.plot(ax=ax, cmap='Reds')
plt.show()

# Modelling using AdaBoost Classifier

In [None]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost ensemble of 50 weak learners with a learning rate of 1.
# random_state is fixed so the reported metrics are reproducible from run to
# run, in line with the seeded logistic-regression model.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=0)

# Train on the training split; fit returns the fitted estimator itself.
model = abc.fit(X_train, y_train)

# Hard class predictions for the test split.
y_pred_ada = model.predict(X_test)

In [None]:
# Share of test samples AdaBoost labels correctly, as a percentage.
ADA_Accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_ada) * 100
print("Accuracy:", ADA_Accuracy)

In [None]:


# Precision: of the samples predicted positive, the fraction that truly are.
print("Precision:", metrics.precision_score(y_test, y_pred_ada))

# Recall: of the truly positive samples, the fraction the model found.
print("Recall:", metrics.recall_score(y_test, y_pred_ada))

# F1: harmonic mean of precision and recall.
print("F1 Score:", metrics.f1_score(y_test, y_pred_ada))

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_ada)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_ADA = np.trace(confusion_mat)
correctly_classified_ADA = correct_count_ADA / len(y_test) * 100
incorrectly_classified_ADA = (confusion_mat.sum() - correct_count_ADA) / len(y_test) * 100

# Score ROC AUC on the positive-class probability rather than hard labels.
roc_ADA = roc_auc_score(y_test, abc.predict_proba(X_test)[:, 1]) * 100

In [None]:
# Headline test-set metrics for AdaBoost, expressed as percentages.
ada_Accuracy = accuracy_score(y_test, y_pred_ada) * 100
ada_recall_pct = recall_score(y_test, y_pred_ada) * 100

print("Model Accuracy is::", ada_Accuracy)
print("Model recall is::", ada_recall_pct)

In [None]:
# ROC curve for AdaBoost, built from positive-class probabilities so the
# curve sweeps over all score thresholds.
y_score_ada = abc.predict_proba(X_test)[:, 1]
fpr_ada, tpr_ada, _ = roc_curve(y_test, y_score_ada)
roc_auc_ada = auc(fpr_ada, tpr_ada)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr_ada, tpr_ada,
        lw=lw, label='Adaboost(AUC = %0.2f)' % roc_auc_ada)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Area Under Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Confusion matrix of the AdaBoost test predictions, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred_ada)
classes = ["0", "1"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=classes)

fig, ax = plt.subplots(figsize=(6, 6))
# The title names the model actually shown (it used to say "Logistic Regression").
ax.set_title("Confusion Matrix (AdaBoost)")
disp = disp.plot(ax=ax, cmap='Reds')
plt.show()

## Plotting Correctly classified and incorrectly classified instances

# Modelling using Neural Network (Deep & Shallow)

In [None]:
# ann and dl libraraies
from keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from tensorflow.keras.utils import to_categorical


from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import LeakyReLU

import random as rn

In [None]:
# Builder for the "shallow" neural network
def create_model_shallow(optimizer,loss):
    """Build and compile the smaller binary classifier.

    Layer stack: Dense(200) -> Dropout(0.2) -> Dense(50) -> Dropout(0.2)
    -> Dense(1), every Dense layer using a sigmoid activation. The single
    sigmoid output unit yields one score per sample for the binary target.

    Args:
        optimizer: optimizer name or instance forwarded to ``model.compile``.
        loss: loss name or callable forwarded to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, tracking accuracy as its metric.
    """
    layer_stack = [
        Dense(200, activation='sigmoid'),
        Dropout(0.2),
        Dense(50, activation='sigmoid'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    return network


In [None]:
# Shallow NN trained with the Adam optimizer and binary cross-entropy for
# 100 epochs, in mini-batches of 100 samples.
shallow_model_adam = create_model_shallow(optimizer='adam', loss='binary_crossentropy')

# NOTE(review): the test split doubles as validation data here, so the
# "validation" curves below are really test curves. Fine for monitoring, but
# do not tune on them.
train = shallow_model_adam.fit(X_train, y_train, epochs=100, batch_size=100, verbose=0,
                               validation_data=(X_test, y_test))

# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy, reported as a percentage.
train_acc = shallow_model_adam.evaluate(X_train, y_train, verbose=0)
test_acc = shallow_model_adam.evaluate(X_test, y_test, verbose=0)

print("Training score: ", round(train_acc[1]*100, 2))
print("Testing score: ", round(test_acc[1]*100, 2))

# Loss and accuracy live on different scales and mean different things, so
# each gets its own panel instead of sharing a single "loss values" axis.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(14, 5))

ax_loss.plot(train.history['loss'], label='Training loss')
ax_loss.plot(train.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Training and Validation Loss - Shallow NN')
ax_loss.set_xlabel('epochs')
ax_loss.set_ylabel('loss values')
ax_loss.legend(loc='upper right')

ax_acc.plot(train.history['accuracy'], label='Training Accuracy')
ax_acc.plot(train.history['val_accuracy'], label='Testing Accuracy')
ax_acc.set_title('Training and Testing Accuracy - Shallow NN')
ax_acc.set_xlabel('epochs')
ax_acc.set_ylabel('accuracy')
ax_acc.legend(loc='lower right')

plt.show()


In [None]:
# Raw network outputs of the shallow model for the test split.
y_pred_snn = shallow_model_adam.predict(X_test)
# Boolean class labels: True where the score is at least 0.7. Despite the
# "prob" in its name, this variable holds labels, not probabilities.
y_pred_prob_snn = (y_pred_snn >= 0.7)





In [None]:
# Raw network outputs of the shallow model for the test split.
y_pred_snn = shallow_model_adam.predict(X_test)
# Boolean class labels: positive only when the score is at least 0.7.
y_pred_prob_snn = (y_pred_snn >= 0.7)

# Model accuracy: how often is the classifier correct? (percentage)
NN_Accuracy = accuracy_score(y_test, y_pred_prob_snn) * 100
print("Accuracy:", NN_Accuracy)

# Rows are the true classes, columns the predicted classes.
confusion_mat_NN = confusion_matrix(y_test, y_pred_prob_snn)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_NN = np.trace(confusion_mat_NN)
correctly_classified_NN = correct_count_NN / len(y_test) * 100
incorrectly_classified_NN = (confusion_mat_NN.sum() - correct_count_NN) / len(y_test) * 100

# ROC AUC is scored on the raw network outputs, not the thresholded labels.
roc_NN = roc_auc_score(y_test, y_pred_snn) * 100



In [None]:
# Builder for the "deep" neural network
def create_model_deep(optimizer,loss):
    """Build and compile the deeper binary classifier.

    Layer stack: Dense(200) -> Dense(50) -> Dense(50) -> Dense(1), every
    layer using a sigmoid activation and no dropout. The single sigmoid
    output unit yields one score per sample for the binary target.

    Args:
        optimizer: optimizer name or instance forwarded to ``model.compile``.
        loss: loss name or callable forwarded to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, tracking accuracy as its metric.
    """
    layer_stack = [
        Dense(200, activation='sigmoid'),
        Dense(50, activation='sigmoid'),
        Dense(50, activation='sigmoid'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    return network

In [None]:
# Deep NN trained with the Adam optimizer and binary cross-entropy for
# 100 epochs, in mini-batches of 100 samples.
deep_model_adam = create_model_deep(optimizer='adam', loss='binary_crossentropy')

# NOTE(review): the test split doubles as validation data here, so the
# "validation" curves below are really test curves. Fine for monitoring, but
# do not tune on them.
train = deep_model_adam.fit(X_train, y_train, epochs=100, batch_size=100, verbose=0,
                            validation_data=(X_test, y_test))

# Accuracy of the deep network: evaluate() returns the loss followed by the
# compiled metrics; index 1 is the accuracy, reported as a percentage.
train_acc = deep_model_adam.evaluate(X_train, y_train, verbose=0)
test_acc = deep_model_adam.evaluate(X_test, y_test, verbose=0)

print("Training score: ", round(train_acc[1]*100, 2))
print("Testing score: ", round(test_acc[1]*100, 2))

# Loss and accuracy live on different scales and mean different things, so
# each gets its own panel instead of sharing a single "loss values" axis.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(14, 5))

ax_loss.plot(train.history['loss'], label='Training loss')
ax_loss.plot(train.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Training and Validation Loss - DEEP NN')
ax_loss.set_xlabel('epochs')
ax_loss.set_ylabel('loss values')
ax_loss.legend(loc='upper right')

ax_acc.plot(train.history['accuracy'], label='Training Accuracy')
ax_acc.plot(train.history['val_accuracy'], label='Testing Accuracy')
ax_acc.set_title('Training and Testing Accuracy - DEEP NN')
ax_acc.set_xlabel('epochs')
ax_acc.set_ylabel('accuracy')
ax_acc.legend(loc='lower right')

plt.show()


In [None]:
### Neural Network with GridSearchCV

In [None]:
# Grid search with cross-validation over the deep network's training settings.
# The grid spans 3 epoch counts x 3 batch sizes x 3 optimizers x 2 losses
# = 54 combinations, each evaluated with cv=5 folds below (270 fits), so this
# cell is expensive to run.
param_grid = {
   
    'epochs': [50,100,150],
    'batch_size':[32,100,128],
    'optimizer':['RMSprop', 'Adam','SGD'],
    'loss' : ['mean_squared_error','binary_crossentropy']
}

# Wrap the Keras builder so scikit-learn can use it as an estimator;
# 'optimizer' and 'loss' in the grid match create_model_deep's parameters.
model_cv = KerasClassifier(build_fn=create_model_deep, verbose=0)


# NOTE(review): no `scoring` is given, so the estimator's own score method
# ranks the candidates; n_jobs=-1 asks for parallel fits. TODO confirm that
# parallel workers behave with the Keras backend in use.
grid = GridSearchCV(estimator=model_cv,  
                    n_jobs=-1, 
                    verbose=1,
                    cv=5,
                    param_grid=param_grid)

grid_cv_model = grid.fit(X_train, y_train,) # Fit the search on the training split only


means = grid_cv_model.cv_results_['mean_test_score'] # Mean cross-validated score per candidate
stds = grid_cv_model.cv_results_['std_test_score'] # Standard deviation of that score across folds
params = grid_cv_model.cv_results_['params'] # Parameter combination of each candidate
# Print one line per candidate: mean score, (std) and the parameters used
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Print the best cross-validated score and the parameters that achieved it
print("Best: %f using %s" % (grid_cv_model.best_score_, grid_cv_model.best_params_))

In [None]:
# Tuned model: take the estimator that the grid search stored as its best
# candidate (best_estimator_).
cv_model = grid_cv_model.best_estimator_

In [None]:
# Predictions of the tuned model for the test split.

y_pred_nn = cv_model.predict(X_test)

In [None]:
import sklearn.metrics as metrics

# Support-weighted F1 of the tuned network on the test split.
f1_weighted_nn = metrics.f1_score(y_test, y_pred_nn, average='weighted')
print("f1_weighted:", f1_weighted_nn)

# Plain accuracy (fraction, not percentage) on the same predictions.
accuracy_nn = metrics.accuracy_score(y_test, y_pred_nn)
print("accuracy:", accuracy_nn)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the tuned network's test predictions.
model_report = classification_report(y_true=y_test, y_pred=y_pred_nn)
print(model_report)

In [None]:
# y_pred_nn comes from KerasClassifier.predict, which already returns hard
# class labels. Comparing 0/1 labels with 0.7 therefore just converts them to
# booleans; it is NOT a 0.7 probability cut-off.
# NOTE(review): if a 0.7 cut-off is intended, threshold
# cv_model.predict_proba(X_test)[:, 1] instead. Confirm with the author.
y_pred_prob_nn = (y_pred_nn >= 0.7)

# Model accuracy: how often is the classifier correct? (percentage)
NN_Accuracy = accuracy_score(y_test, y_pred_prob_nn) * 100
print("Accuracy:", NN_Accuracy)

# Rows are the true classes, columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred_prob_nn)

# Correct predictions lie on the diagonal (TN + TP); the rest (FP + FN) are
# misclassifications. The earlier row sums measured class shares instead.
correct_count_NN = np.trace(confusion_mat)
correctly_classified_NN = correct_count_NN / len(y_test) * 100
incorrectly_classified_NN = (confusion_mat.sum() - correct_count_NN) / len(y_test) * 100

# Score ROC AUC on the positive-class probability rather than hard labels.
roc_NN = roc_auc_score(y_test, cv_model.predict_proba(X_test)[:, 1]) * 100


In [None]:
#print(grid_cv_model.cv_results_)

## Plotting Correctly classified and incorrectly classified instances

In [None]:
# Share of correctly vs. incorrectly classified test instances per model.
# Labels and values use dedicated names: the earlier version defined
# y / X1 / X2 but plotted X / y1 / y2, which do not exist at this point on a
# fresh kernel.
model_names = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
               'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
correct_pct = [RF_Accuracy, DT_Accuracy, NB_Accuracy, LR_Accuracy,
               KNN_Accuracy, ADA_Accuracy, SVM_Accuracy, NN_Accuracy]
# Whatever is not classified correctly is, by definition, misclassified.
incorrect_pct = [100 - pct for pct in correct_pct]

fig, ax = plt.subplots(figsize=(15, 7))
positions = np.arange(len(model_names))
bar_width = 0.4  # two bars per model, side by side without overlapping

ax.bar(positions - bar_width / 2, correct_pct, bar_width,
       label='Correctly Classified', color='blue')
ax.bar(positions + bar_width / 2, incorrect_pct, bar_width,
       label='Incorrectly Classified', color='yellow')

ax.set_xticks(positions)
ax.set_xticklabels(model_names)
ax.set_ylabel('Share of test instances (%)')
ax.set_xlabel("Classification Model Used")
ax.set_title("Correctly vs. Incorrectly Classified Instances by Model")
ax.legend()
plt.show()
    
    

In [None]:
# Horizontal bar chart of test accuracy (%) per model.
# NOTE(review): `X` and `y1` are rebound to plot data here and later cells read
# them back, so the names are kept. Check they do not shadow data variables
# that earlier cells need on a re-run.
X = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
     'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
y1 = [RF_Accuracy, DT_Accuracy, NB_Accuracy, LR_Accuracy,
      KNN_Accuracy, ADA_Accuracy, SVM_Accuracy, NN_Accuracy]

fig, ax = plt.subplots(figsize=(15, 7))
X_axis = np.arange(len(X))

# A single series needs no offset: centre each bar on its tick. The label
# gives the legend something to show (it was empty before).
ax.barh(X_axis, y1, 0.5, color='grey', label='Model Accuracy')

ax.set_yticks(X_axis)
ax.set_yticklabels(X)
ax.set_xlabel("Accuracy")
ax.set_ylabel("Classification Model Used")
ax.set_title("Model Accuracy Comparison")
ax.legend()
plt.show()
    
    

In [None]:
# Horizontal grouped bars: correctly vs. incorrectly classified share per model.
# NOTE(review): `X`, `y1` and `y2` are read by later cells, so the names are
# kept. Check they do not shadow data variables that earlier cells need.
X = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
     'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
y1 = [RF_Accuracy, DT_Accuracy, NB_Accuracy, LR_Accuracy,
      KNN_Accuracy, ADA_Accuracy, SVM_Accuracy, NN_Accuracy]
# Whatever is not classified correctly is, by definition, misclassified.
y2 = [100 - pct for pct in y1]

fig, ax = plt.subplots(figsize=(15, 7))
X_axis = np.arange(len(X))
bar_height = 0.4  # bars at +/-0.2 no longer overlap (they did at 0.5)

ax.barh(X_axis - bar_height / 2, y1, bar_height, label='Correctly Classified')
ax.barh(X_axis + bar_height / 2, y2, bar_height, label='Incorrectly Classified')

ax.set_yticks(X_axis)
ax.set_yticklabels(X)
ax.set_xlabel('Share of test instances (%)')
ax.set_ylabel("Classification Model Used")
ax.set_title("Correctly vs. Incorrectly Classified Instances by Model")
ax.legend()
plt.show()
    
    

In [None]:
# Plotting comparison of ROC score of models used

In [None]:

# ROC scores (in %) of all models, drawn as one line across the model names.
X = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
     'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
y = [roc_RF, roc_DT, roc_NB, roc_LR, roc_KNN, roc_ADA, roc_SVM, roc_NN]

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(X, y)
ax.set_xlabel("Classification Models used")
ax.set_ylabel("ROC Score")
ax.set_title("ROC Score Comparison")
plt.show()

In [None]:
# Share of correctly vs. incorrectly classified test instances per model.
# Labels and values use dedicated names: the earlier version defined
# y / X1 / X2 but plotted X / y1 / y2, so it only worked when those happened
# to be left over from other cells.
model_names = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
               'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
correct_pct = [RF_Accuracy, DT_Accuracy, NB_Accuracy, LR_Accuracy,
               KNN_Accuracy, ADA_Accuracy, SVM_Accuracy, NN_Accuracy]
# Whatever is not classified correctly is, by definition, misclassified.
incorrect_pct = [100 - pct for pct in correct_pct]

fig, ax = plt.subplots(figsize=(15, 7))
positions = np.arange(len(model_names))
bar_width = 0.4  # two bars per model, side by side without overlapping

ax.bar(positions - bar_width / 2, correct_pct, bar_width,
       label='Correctly Classified', color='blue')
ax.bar(positions + bar_width / 2, incorrect_pct, bar_width,
       label='Incorrectly Classified', color='yellow')

ax.set_xticks(positions)
ax.set_xticklabels(model_names)
ax.set_ylabel('Share of test instances (%)')
ax.set_xlabel("Classification Model Used")
ax.set_title("Correctly vs. Incorrectly Classified Instances by Model")
ax.legend()
plt.show()
    



In [None]:
# Vertical bar chart of test accuracy (%) per model.
# The cell defines exactly what it plots: the earlier version defined y / X1
# but plotted X / y1, relying on values left over from other cells.
model_names = ['Random Forest', 'Decision Tree', 'Naive Bayes', 'Logistic Regression',
               'KNN', 'AdaBoost Classifier', 'SVM', 'Neural Network']
accuracy_pct = [RF_Accuracy, DT_Accuracy, NB_Accuracy, LR_Accuracy,
                KNN_Accuracy, ADA_Accuracy, SVM_Accuracy, NN_Accuracy]

fig, ax = plt.subplots(figsize=(15, 7))
positions = np.arange(len(model_names))

# A single series needs no offset: centre each bar on its tick.
ax.bar(positions, accuracy_pct, 0.5, color='green', label='Model Accuracy')

ax.set_xticks(positions)
ax.set_xticklabels(model_names)
ax.set_ylabel("Accuracy")
ax.set_xlabel("Classification Model Used")
ax.set_title("Model Accuracy Comparison")
ax.legend()
plt.show()
    
    

In [None]:
# Pairwise relationship plots for the columns of the raw dataset.
# NOTE(review): the grid grows quadratically with the number of columns, so
# this can be very slow on a wide frame; consider passing a column subset.
sns.pairplot(data_cap)

In [None]:
# Display the encoded dataset as the cell's output.
encoded_cap_data

In [None]:
# Pairwise relationship plots for the columns of the encoded dataset.
# NOTE(review): as with the raw-data pairplot, this can be very slow on a
# wide frame; consider passing a column subset.
sns.pairplot(encoded_cap_data)