In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Upload the data file through the Colab file picker so later cells can read it.
# `files.upload()` opens an interactive upload dialog; its return value is kept
# in `uploaded` but is not used by any other cell visible in this notebook.
# NOTE(review): this cell only works inside Google Colab.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the attrition data set: one worksheet per employee group.
#   attr  -> sheet "Existing employees"
#   attr2 -> sheet "Employees who have left"
# `pd.read_excel` does not accept an `encoding` keyword in current pandas
# (passing it raises TypeError), so the argument is omitted.
attr = pd.read_excel("attrition.xlsx", sheet_name="Existing employees")
attr2 = pd.read_excel("attrition.xlsx", sheet_name="Employees who have left")


In [None]:
# First rows of the existing-employees frame (default: 5 rows)
attr.head()

In [None]:
# First rows of the employees-who-left frame
attr2.head()

In [None]:
# Last rows of the existing-employees frame
attr.tail()

In [None]:
# Last rows of the employees-who-left frame
attr2.tail()

In [None]:
# Column dtypes: existing employees
attr.dtypes

In [None]:
# Column dtypes: employees who have left
attr2.dtypes

In [None]:
# Index range, per-column non-null counts, dtypes and memory usage: existing employees
attr.info()


In [None]:
# Index range, per-column non-null counts, dtypes and memory usage: employees who have left
attr2.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns: existing employees
attr.describe()

In [None]:
# Summary statistics of the numeric columns: employees who have left
attr2.describe()

In [None]:
# Remove the 'Emp ID' column, which this analysis treats as not useful
# for the model (existing employees), then preview the result.
attr = attr.drop(columns=['Emp ID'])
attr.head()

In [None]:
# Remove the 'Emp ID' column, which this analysis treats as not useful
# for the model (employees who have left), then preview the result.
attr2 = attr2.drop(columns=['Emp ID'])
attr2.head()

In [None]:
# Exploratory data analysis (existing employees)
# (rows, columns) of the frame at this point in the notebook
attr.shape

In [None]:
# Exploratory data analysis (employees who have left)
# (rows, columns) of the frame at this point in the notebook
attr2.shape

In [None]:
# Collect the rows of the existing-employees frame that repeat an earlier row
# (`duplicated()` marks every occurrence after the first).
duplicate_rows_of_attr = attr[attr.duplicated()]
# Print the number of duplicated rows; `.shape` would print a (rows, columns)
# tuple, which does not match the "total number" wording of the message.
print('Total number of duplicated row of the attrition data set (existing employees) is: ', len(duplicate_rows_of_attr))

In [None]:
# Collect the rows of the employees-who-left frame that repeat an earlier row
# (`duplicated()` marks every occurrence after the first).
duplicate_rows_of_attr2 = attr2[attr2.duplicated()]
# Print the number of duplicated rows (not the (rows, columns) shape tuple) and
# name the frame in the message so the two duplicate reports can be told apart.
print('Total number of duplicated row of the attrition data set (employees who have left) is: ', len(duplicate_rows_of_attr2))

In [None]:
# Per-column count of non-null values before removing duplicated rows (existing employees).
# With no missing values, each entry equals the number of rows.
attr.count()

In [None]:
# Per-column count of non-null values before removing duplicated rows (employees who have left)
attr2.count()

In [None]:
# Keep only the first occurrence of every row (existing employees);
# the original index labels of the surviving rows are retained.
attr = attr[~attr.duplicated()]
# Preview the de-duplicated frame
attr.head()

In [None]:
# Keep only the first occurrence of every row (employees who have left);
# the original index labels of the surviving rows are retained.
attr2 = attr2[~attr2.duplicated()]
# Preview the de-duplicated frame
attr2.head()

In [None]:
# Per-column non-null counts after dropping duplicates (existing employees)
attr.count()

In [None]:
# Per-column non-null counts after dropping duplicates (employees who have left)
attr2.count()


In [None]:
# Number of missing values per column (existing employees)
print(attr.isna().sum())
# The author's run reported no nulls; confirm every printed count is 0.

In [None]:
# Number of missing values per column (employees who have left)
print(attr2.isna().sum())
# The author's run reported no nulls; confirm every printed count is 0.

In [None]:
# Visual outlier check (existing employees): one horizontal box plot per column.
# Outliers can affect the accuracy of the model if they are not dealt with.
sns.boxplot(x= attr['satisfaction_level'])
plt.show()

In [None]:
# Box plot of number_project
sns.boxplot(x= attr['number_project'])
plt.show()

In [None]:
# Box plot of promotion_last_5years
sns.boxplot(x= attr['promotion_last_5years'])
plt.show()

In [None]:
# Box plot of average_montly_hours (column name spelled as in the data)
sns.boxplot(x= attr['average_montly_hours'])
plt.show()

In [None]:
# Box plot of time_spend_company
sns.boxplot(x= attr['time_spend_company'])
plt.show()

In [None]:
# Box plot of Work_accident
sns.boxplot(x= attr['Work_accident'])
plt.show()

In [None]:
# Box plot of last_evaluation
sns.boxplot(x= attr['last_evaluation'])
plt.show()

In [None]:
# Dealing with outliers using IQR (existing employees)
# Quantiles are only defined for numeric data, and recent pandas raises when
# `quantile` is called on a frame that still holds text columns (this frame has
# 'dept' and 'salary'), so the quartiles are computed on the numeric columns only.
attr_numeric = attr.select_dtypes(include='number')
Q1 = attr_numeric.quantile(0.25)
Q3 = attr_numeric.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Still dealing with outliers (existing employees)
# Apply the 1.5 * IQR fences only to columns whose IQR is greater than zero.
# When a column's IQR is 0 both fences collapse onto the quartile value, so
# every row holding any other value would be discarded as an "outlier"
# (presumably the case for sparse 0/1 flags such as Work_accident -- confirm
# against the IQR printed above). Restricting to these columns also keeps text
# columns out of the comparison.
iqr_cols = IQR[IQR > 0].index
lower_fence = Q1[iqr_cols] - 1.5 * IQR[iqr_cols]
upper_fence = Q3[iqr_cols] + 1.5 * IQR[iqr_cols]
outlier_rows = (
    attr[iqr_cols].lt(lower_fence, axis=1) | attr[iqr_cols].gt(upper_fence, axis=1)
).any(axis=1)
attr = attr[~outlier_rows]
attr.shape

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the
# seven listed columns of the existing-employees frame
sns.pairplot(attr[[ 'last_evaluation', 'number_project',  'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'satisfaction_level' ]])
plt.show()

In [None]:
# Two pie charts side by side, each showing how often every distinct
# satisfaction_level value occurs within a subset of rows:
#   left  (ax[0]): rows where promotion_last_5years == 1
#   right (ax[1]): rows where time_spend_company == 0
# NOTE(review): both titles are set to empty strings, so the panels are unlabelled.
# NOTE(review): confirm that time_spend_company == 0 actually selects rows;
# if it selects none, the right-hand pie has nothing to draw.
f,ax=plt.subplots(1,2,figsize=(16,7))
attr.loc[attr['promotion_last_5years']==1, 'satisfaction_level'].value_counts().plot.pie(explode=None ,autopct='%1.1f%%',ax=ax[0],shadow=True)
attr.loc[attr['time_spend_company']==0, 'satisfaction_level'].value_counts().plot.pie(explode=None ,autopct='%1.1f%%',ax=ax[1],shadow=True)
ax[0].set_title('')
ax[1].set_title('')
plt.show()

In [None]:
# Histogram of satisfaction_level (existing employees); the trailing ';' hides the Axes repr
attr['satisfaction_level'].hist();

In [None]:
# Histogram of average_montly_hours
attr['average_montly_hours'].hist()

In [None]:
# Histogram of last_evaluation
attr['last_evaluation'].hist()

In [None]:
# Histogram of time_spend_company
attr['time_spend_company'].hist();

In [None]:
# Histogram of promotion_last_5years
attr['promotion_last_5years'].hist();

In [None]:
# Histogram of Work_accident
attr['Work_accident'].hist();

In [None]:
# Box plots of satisfaction_level, one box per distinct last_evaluation value
attr.boxplot(column = 'satisfaction_level', by = 'last_evaluation');

In [None]:
# Box plots of satisfaction_level, one box per distinct average_montly_hours value
attr.boxplot(column = 'satisfaction_level', by = 'average_montly_hours');

In [None]:
# Box plots of satisfaction_level grouped by Work_accident
attr.boxplot(column = 'satisfaction_level', by = 'Work_accident');

In [None]:
# Box plots of satisfaction_level grouped by promotion_last_5years
attr.boxplot(column = 'satisfaction_level', by = 'promotion_last_5years');

In [None]:
# Box plots of satisfaction_level grouped by number_project
attr.boxplot(column = 'satisfaction_level', by = 'number_project');

In [None]:
# Box plots of satisfaction_level grouped by time_spend_company
attr.boxplot(column = 'satisfaction_level', by = 'time_spend_company');

In [None]:
# Finding the relations between the variables (existing employees).
plt.figure(figsize=(20,10))
# Correlation is only defined for numeric columns; recent pandas raises if text
# columns (this frame has 'dept' and 'salary') are passed to `corr`, so select
# the numeric columns explicitly.
corr = attr.select_dtypes(include='number').corr()
sns.heatmap(corr,cmap="BrBG",annot=True);
corr

Exploratory Data Analysis on employees who have left


In [None]:
# Finding the relations between the variables (employees who have left).
plt.figure(figsize=(20,10))
# Correlation is only defined for numeric columns, so select them explicitly
# (recent pandas raises when text columns are passed to `corr`).
corr2 = attr2.select_dtypes(include='number').corr()
# Plot corr2 -- the matrix computed for this frame -- rather than `corr`,
# which belongs to the existing-employees frame.
sns.heatmap(corr2,cmap="BrBG",annot=True);
corr2

In [None]:
# Box plots of satisfaction_level grouped by time_spend_company (employees who have left)
attr2.boxplot(column = 'satisfaction_level', by = 'time_spend_company');

In [None]:
# Visual outlier check (employees who have left): one vertical box plot per column.
# Outliers can affect the accuracy of the model if they are not dealt with.
sns.boxplot(y= attr2['satisfaction_level'])
plt.show()

In [None]:
# Box plot of last_evaluation
sns.boxplot(y= attr2['last_evaluation'])
plt.show()

In [None]:
# Box plot of promotion_last_5years
sns.boxplot(y= attr2['promotion_last_5years'])
plt.show()

In [None]:
# Box plot of time_spend_company
sns.boxplot(y= attr2['time_spend_company'])
plt.show()

In [None]:
# Box plot of average_montly_hours (column name spelled as in the data)
sns.boxplot(y= attr2['average_montly_hours'])
plt.show()

In [None]:
# Box plot of Work_accident
sns.boxplot(y= attr2['Work_accident'])
plt.show()

In [None]:
# Summary statistics of the numeric columns, to read alongside the box plots
attr2.describe()

In [None]:
# Box plot of number_project
sns.boxplot(y= attr2['number_project'])
plt.show()

In [None]:
# Dealing with outliers using IQR (employees who have left)
# The quartiles must come from attr2 itself: fences derived from the
# existing-employees frame (`attr`) describe a different population.
# Quantiles are computed on the numeric columns only, because recent pandas
# raises when `quantile` is called on a frame that still holds text columns.
attr2_numeric = attr2.select_dtypes(include='number')
Q2 = attr2_numeric.quantile(0.25)
Q4 = attr2_numeric.quantile(0.75)
IQRs = Q4 - Q2
print(IQRs)

In [None]:
# Still dealing with outliers (employees who have left)
# Apply the 1.5 * IQR fences only to columns whose IQR is greater than zero.
# When a column's IQR is 0 both fences collapse onto the quartile value, so
# every row holding any other value would be discarded as an "outlier"
# (presumably the case for sparse 0/1 flag columns -- confirm against the
# IQRs printed above). Restricting to these columns also keeps text columns
# out of the comparison.
iqr_cols2 = IQRs[IQRs > 0].index
lower_fence2 = Q2[iqr_cols2] - 1.5 * IQRs[iqr_cols2]
upper_fence2 = Q4[iqr_cols2] + 1.5 * IQRs[iqr_cols2]
outlier_rows2 = (
    attr2[iqr_cols2].lt(lower_fence2, axis=1) | attr2[iqr_cols2].gt(upper_fence2, axis=1)
).any(axis=1)
attr2 = attr2[~outlier_rows2]
attr2.shape

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the
# seven listed columns of the employees-who-left frame
sns.pairplot(attr2[[ 'last_evaluation', 'number_project',  'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'satisfaction_level' ]])
plt.show()

In [None]:
# Let's merge our data together so that we can build our classification algorithm on it.
# Each frame is tagged with a `left` label before stacking (0 = existing employee,
# 1 = employee who has left); without it the merged frame keeps no record of
# which sheet a row came from, so there would be nothing to classify.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
# NOTE(review): assumes neither sheet already has a column named 'left' -- confirm.
classification_data = pd.concat(
    [attr.assign(left=0), attr2.assign(left=1)],
    ignore_index=True,
)
# Features: every numeric column except the label. Text columns ('dept',
# 'salary') are left out because the estimators fitted later in the notebook
# need numeric input.
feature_frame = classification_data.drop(columns='left').select_dtypes(include='number')
X = feature_frame.values
# Target: the attrition label added above.
y = classification_data['left'].values

In [None]:
# Preview the first rows of the merged frame
classification_data.head()

In [None]:
# View the merged frame without the text columns. The result is displayed only;
# it is not assigned back to `classification_data`.
# 'Emp ID' was already dropped from both source frames earlier in the notebook,
# so a strict drop raises KeyError; errors='ignore' skips labels that are absent.
classification_data.drop(['dept', 'salary', 'Emp ID'], axis=1, errors='ignore')

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF

# Random forest with 10 entropy-criterion trees; `fit` returns the estimator
# itself, so construction and training are chained.
classifier = RF(criterion='entropy', n_estimators=10, random_state=0).fit(XTrain, yTrain)
yPred = classifier.predict(XTest)

# Score the held-out predictions.
# NOTE(review): MSE, MAE and R^2 are regression metrics; accuracy is the only
# classification metric reported here.
accuracy = accuracy_score(yTest, yPred)
msquared = mean_squared_error(yTest, yPred)
m_abs_error = mean_absolute_error(yTest, yPred)
r = r2_score(yTest, yPred)

print("Random Forest Classifier :")
for label, value in (
    ("Accuracy = ", accuracy),
    ("Mean Squared Error:", msquared),
    ("R score:", r),
    ("Mean Absolute Error:", m_abs_error),
):
    print(label, value)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; `fit` returns the estimator
# itself, so construction and training are chained.
classifier = SVC(kernel='linear', random_state=0).fit(XTrain, yTrain)
yPred = classifier.predict(XTest)

# Score the held-out predictions.
# NOTE(review): MSE, MAE and R^2 are regression metrics; accuracy is the only
# classification metric reported here.
accuracy = accuracy_score(yTest, yPred)
msquared = mean_squared_error(yTest, yPred)
m_abs_error = mean_absolute_error(yTest, yPred)
r = r2_score(yTest, yPred)

print("Support Vector Machine :")
for label, value in (
    ("Accuracy = ", accuracy),
    ("Mean Squared Error:", msquared),
    ("R score:", r),
    ("Mean Absolute Error:", m_abs_error),
):
    print(label, value)