In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, wilcoxon, friedmanchisquare, mannwhitneyu, chi2_contingency
import seaborn as sns

In [2]:
na_vals = ["NA", "Na", "NaN", "Missing"]
df = pd.read_csv("general_data.csv", na_values = na_vals, index_col = "EmployeeID")

In [3]:
df.drop_duplicates(inplace=True)
df.dropna(how="any", inplace=True)
df.drop(columns = ["Over18", "EmployeeCount", "StandardHours"], inplace=True)

In [4]:
df.head(15)

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0
2,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4
3,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,Married,193280,1.0,15,3,5.0,2,5,0,3
4,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,Married,83210,3.0,11,3,13.0,5,8,7,5
5,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,Single,23420,4.0,12,2,9.0,2,6,0,4
6,46,No,Travel_Rarely,Research & Development,8,3,Life Sciences,Female,4,Research Director,Married,40710,3.0,13,0,28.0,5,7,7,7
7,28,Yes,Travel_Rarely,Research & Development,11,2,Medical,Male,2,Sales Executive,Single,58130,2.0,20,1,5.0,2,0,0,0
8,29,No,Travel_Rarely,Research & Development,18,3,Life Sciences,Male,2,Sales Executive,Married,31430,2.0,22,3,10.0,2,0,0,0
9,31,No,Travel_Rarely,Research & Development,1,3,Life Sciences,Male,3,Laboratory Technician,Married,20440,0.0,21,0,10.0,2,9,7,8
10,25,No,Non-Travel,Research & Development,7,4,Medical,Female,4,Laboratory Technician,Divorced,134640,1.0,13,1,6.0,2,6,1,5


In [5]:
df_left = df[df["Attrition"] == "Yes"]
df_stay = df[df["Attrition"] == "No"]
df_stay.reset_index(drop=True, inplace=True)
df_left.reset_index(drop=True, inplace=True)

In [6]:
def hypo_chooser(p):
    print(f"p = {p}\nHence ",end = "")
    if p >= 0.05:
        print("Ho is considered since p >= 0.05")
    else:
        print("H1 is considered since p < 0.05")

<h2>Hypothesis 1:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in the age of the employees who left and who didn't leave<br>
H1:There is a significant difference in the age of the employees who left and who didn't leave<br></p>
<p style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's age<br>
Hence we will be using Mann-Whitney Test statistic</p>

In [7]:
stats, p = mannwhitneyu(df_stay.Age, df_left.Age)
hypo_chooser(p)
print("Therefore there's a ")

p = 2.652171037670851e-11
Hence H1 is considered since p < 0.05
Therefore there's a 


<h2>Hypothesis 2:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between BusinessTravel and Attribution<br>
H1:There is dependency between BusinessTravel and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [8]:
chitable = pd.crosstab(df.Attrition, df.BusinessTravel)
chitable

BusinessTravel,Non-Travel,Travel_Frequently,Travel_Rarely
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,138,208,887
Yes,12,69,156


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [9]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 5.608614476449931e-06
Hence H1 is considered since p < 0.05


<h2>Hypothesis 3:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between Department and Attribution<br>
H1:There is dependency between Department and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [10]:
chitable = pd.crosstab(df.Attrition, df.Department)
chitable

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,44,810,379
Yes,19,151,67


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [11]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 0.007841076431711646
Hence H1 is considered since p < 0.05


<h2>Hypothesis 4:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in the distance from home of the employees who left and who didn't leave<br>
H1:There is no significant difference in the distance from home of the employees who left and who didn't leave<br></p>
<p style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's distance from home<br>
Hence we will be using Mann-Whitney Test statistic</p>

In [12]:
stats, p = mannwhitneyu(df_left.DistanceFromHome, df_stay.DistanceFromHome)
hypo_chooser(p)

p = 0.4786049358224514
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 5:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between Education and Attribution<br>
H1:There is dependency between Education and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [13]:
chitable = pd.crosstab(df.Attrition,df.Education)
chitable

Education,1,2,3,4,5
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,144,229,483,336,41
Yes,26,53,89,62,7


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [14]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 0.7577477640783867
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 6:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between EducationField and Attribution<br>
H1:There is dependency between EducationField and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [15]:
chitable = pd.crosstab(df.Attrition,df.EducationField)
chitable

EducationField,Human Resources,Life Sciences,Marketing,Medical,Other,Technical Degree
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,16,505,134,389,72,117
Yes,11,101,25,75,10,15


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [16]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 0.00878935071037593
Hence H1 is considered since p < 0.05


<h2>Hypothesis 7:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between JobRole and Attrition<br>
H1:There is dependency between JobRole and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [17]:
chitable = pd.crosstab(df.Attrition,df.JobRole)
chitable

JobRole,Healthcare Representative,Human Resources,Laboratory Technician,Manager,Manufacturing Director,Research Director,Research Scientist,Sales Executive,Sales Representative
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,112,45,217,88,129,61,239,271,71
Yes,19,7,42,14,16,19,53,55,12


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [18]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 0.3979911431958358
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 8:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between MaritalStatus and Attrition<br>
H1:There is dependency between MaritalStatus and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [19]:
chi_table = pd.crosstab(df.Attrition, df.MaritalStatus)
chi_table

MaritalStatus,Divorced,Married,Single
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,294,589,350
Yes,33,84,120


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [20]:
stats, p, dof, expected = chi2_contingency(chitable)
hypo_chooser(p)

p = 0.3979911431958358
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 9:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in the monthly income of employees who left and who didn't leave<br>
H1:There is significant difference in the monthly income of employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's monthly income<br>
Hence we will be using Mann-Whitney Test statistic</div>

In [21]:
stats, p = mannwhitneyu(df_left.MonthlyIncome, df_stay.MonthlyIncome)
hypo_chooser(p)

p = 0.17621332994327893
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 10:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in total working years of employees who left and who didn't leave<br>
H1:There is significant difference in total working years of employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's total working years<br>
Hence we will be using Mann-Whitney Test statistic</div>

In [22]:
stats, p = mannwhitneyu(df_left.TotalWorkingYears, df_stay.TotalWorkingYears)
hypo_chooser(p)

p = 1.199784682399476e-14
Hence H1 is considered since p < 0.05


<h2>Hypothesis 11:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in number of years at company of employees who left and who didn't leave<br>
H1:There is significant difference in number of years at company of employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's number of years spent at this company<br>
Hence we will be using Mann-Whitney Test</div>

In [23]:
stats, p = mannwhitneyu(df_left.YearsAtCompany, df_stay.YearsAtCompany)
hypo_chooser(p)

p = 1.458095684978208e-13
Hence H1 is considered since p < 0.05


<h2>Hypothesis 12:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in number of years with current manager of employees who left and who didn't leave<br>
H1:There is significant difference in number of years with current boss of employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's number of years with the current manager<br>
Hence we will be using Mann-Whitney Test</div>

In [24]:
stats, p = mannwhitneyu(df_left.YearsWithCurrManager, df_stay.YearsWithCurrManager)
hypo_chooser(p)

p = 9.033771291572203e-12
Hence H1 is considered since p < 0.05


<h2>Hypothesis 13:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between Gender and Attrition<br>
H1:There is dependency between Gender and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [25]:
chi_table = pd.crosstab(df.Attrition, df.Gender)
</div>chi_table

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,498,735
Yes,90,147


<div style="font-size: 19px; color:#333">Finding value of p

In [26]:
stats, p, dof, expected = chi2_contingency(chi_table)
hypo_chooser(p)

p = 0.5335882358793274
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 14:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no dependency between JobLevel and Attrition<br>
H1:There is dependency between JobLevel and Attribution<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">Since we're checking the dependency of categorical variables<br>
Therefore we will be using chi square test statistic<br>
Making A Chi Table</div>

In [27]:
chi_table = pd.crosstab(df.Attrition, df.JobLevel)
</div>chi_table

JobLevel,1,2,3,4,5
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,459,439,186,89,60
Yes,84,95,32,17,9


<div style="font-size: 19px; color:#333">Finding value of p</div>

In [28]:
stats, p, dof, expected = chi2_contingency(chi_table)
hypo_chooser(p)

p = 0.7192602477461423
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 15:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in number of companies worked by employees who left and who didn't leave<br>
H1:There is no significant difference in number of companies worked by employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's number of companies they worked in<br>
Hence we will be using Mann-Whitney Test<div>

In [29]:
stats, p = mannwhitneyu(df_left.NumCompaniesWorked, df_stay.NumCompaniesWorked)
hypo_chooser(p)

p = 0.12118252544058267
Hence Ho is considered since p >= 0.05


<h2>Hypothesis 16:</h2>
<p style = "font-size: 19px; margin-bottom: 7px">Ho:There is no significant difference in percent salary hike of employees who left and who didn't leave<br>
H1:There is significant difference in percent salary hike of employees who left and who didn't leave<br></p>
<div style="margin-top: 10px;font-size: 19px;color:#333">There are two two independent variables that are the people who left the company and the one's who didn't. We're comparing both these variable's percent salary hike<br>
Hence we will be using Mann-Whitney Test<div>

In [30]:
stats, p = mannwhitneyu(df_left.PercentSalaryHike, df_stay.PercentSalaryHike)
hypo_chooser(p)

p = 0.11471348352261845
Hence Ho is considered since p >= 0.05


In [31]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')