# Assignment 11

For questions refer [README](https://github.com/DSAghicha/LU-AI-ML/blob/main/Day_11/README.md)

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from warnings import filterwarnings

filterwarnings('ignore')


---


In [2]:
data_house = pd.read_csv("./data.csv")
data_house.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [7]:
rows, columns = data_house.shape

print(f"The given dataset has {rows} rows & {columns} columns.")

The given dataset has 4410 rows & 24 columns.



---


## Checking & Eliminating null values

In [11]:
data_house.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

This shows that columns **NumCompaniesWorked** & **TotalWorkingYears** has null values.

∴ Replacing these values with mean of that column.


In [18]:
mean_ncw = data_house['NumCompaniesWorked'].mean().round(2)
mean_twy = data_house['TotalWorkingYears'].mean().round(2)

data_house['NumCompaniesWorked'] = data_house['NumCompaniesWorked'].fillna(mean_ncw)
data_house['TotalWorkingYears'] = data_house['TotalWorkingYears'].fillna(mean_twy)


In [19]:
data_house.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64


---


## Checking & Eliminating Duplicates

In [22]:
data_house = data_house.drop_duplicates()
data_house.shape

(4410, 24)

∵ The shape remains same, ∄ any duplicates


---


## Information of dataset

In [23]:
data_house.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [24]:
data_house.describe()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
count,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0
mean,36.92381,9.192517,2.912925,1.0,2205.5,2.063946,65029.312925,2.69481,15.209524,8.0,0.793878,11.279937,2.79932,7.008163,2.187755,4.123129
std,9.133301,8.105026,1.023933,0.0,1273.201673,1.106689,47068.888559,2.493497,3.659108,0.0,0.851883,7.774275,1.288978,6.125135,3.221699,3.567327
min,18.0,1.0,1.0,1.0,1.0,1.0,10090.0,0.0,11.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,2.0,2.0,1.0,1103.25,1.0,29110.0,1.0,12.0,8.0,0.0,6.0,2.0,3.0,0.0,2.0
50%,36.0,7.0,3.0,1.0,2205.5,2.0,49190.0,2.0,14.0,8.0,1.0,10.0,3.0,5.0,1.0,3.0
75%,43.0,14.0,4.0,1.0,3307.75,3.0,83800.0,4.0,18.0,8.0,1.0,15.0,3.0,9.0,3.0,7.0
max,60.0,29.0,5.0,1.0,4410.0,5.0,199990.0,9.0,25.0,8.0,3.0,40.0,6.0,40.0,15.0,17.0



---


## Preparing attrition

Replacing **Yes** with **1** & **No** with **0** for correlation analysis.

In [25]:
data_house.Attrition = data_house.Attrition.replace("Yes", 1)
data_house.Attrition = data_house.Attrition.replace("No", 0)

data_house.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,1,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,0,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,0,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,0,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4



---


## Correlation Analysis

H<sub>0</sub> : Null Hypothesis. There is **no** correlation.

H<sub>a</sub> : Alternative Hypothesis. There is **significant** correlation.


In [31]:
def type_correlation(p) -> str:
    if p < 0.05:
        return "positively correlated."
    elif p > 0.05:
        return "negatively correlated."
    else:
        return "stable."

### 1. Finding Correlation between Attrition & Age


In [32]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.Age)

print(f"The p value is {p_val}.\n∴ Age & Attrition are {type_correlation(p_val)}")

The p value is 1.996801615886744e-26.
∴ Age & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 2. Finding Correlation between Attrition & DistanceFromHome

In [35]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.DistanceFromHome)

print(f"The p value is {p_val}.\n∴ DistanceFromHome & Attrition are {type_correlation(p_val)}")

The p value is 0.5182860428050771.
∴ DistanceFromHome & Attrition are negatively correlated.


∴ H<sub>a</sub> is **rejected** & H<sub>0</sub> is **accepted**.

### 3. Finding Correlation between Attrition & Education

In [36]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.Education)

print(f"The p value is {p_val}.\n∴ Education & Attrition are {type_correlation(p_val)}")

The p value is 0.3157293177117888.
∴ Education & Attrition are negatively correlated.


∴ H<sub>a</sub> is **rejected** & H<sub>0</sub> is **accepted**.

### 4. Finding Correlation between Attrition & EmployeeID

In [39]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.EmployeeID)

print(f"The p value is {p_val}.\n∴ EmployeeID & Attrition are {type_correlation(p_val)}")

The p value is 0.7535487401886252.
∴ EmployeeID & Attrition are negatively correlated.


∴ H<sub>a</sub> is **rejected** & H<sub>0</sub> is **accepted**.

### 5. Finding Correlation between Attrition & JobLevel

In [40]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.JobLevel)

print(f"The p value is {p_val}.\n∴ JobLevel & Attrition are {type_correlation(p_val)}")

The p value is 0.49451717271828405.
∴ JobLevel & Attrition are negatively correlated.


∴ H<sub>a</sub> is **rejected** & H<sub>0</sub> is **accepted**.

### 6. Finding Correlation between Attrition & MonthlyIncome

In [41]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.MonthlyIncome)

print(f"The p value is {p_val}.\n∴ MonthlyIncome & Attrition are {type_correlation(p_val)}")

The p value is 0.03842748490600132.
∴ MonthlyIncome & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 7. Finding Correlation between Attrition & NumCompaniesWorked

In [42]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.NumCompaniesWorked)

print(f"The p value is {p_val}.\n∴ NumCompaniesWorked & Attrition are {type_correlation(p_val)}")

The p value is 0.005033438384337837.
∴ NumCompaniesWorked & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 8. Finding Correlation between Attrition & PercentSalaryHike

In [43]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.PercentSalaryHike)

print(f"The p value is {p_val}.\n∴ PercentSalaryHike & Attrition are {type_correlation(p_val)}")

The p value is 0.030743386433355353.
∴ PercentSalaryHike & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 9. Finding Correlation between Attrition & StockOptionLevel

In [44]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.StockOptionLevel)

print(f"The p value is {p_val}.\n∴ StockOptionLevel & Attrition are {type_correlation(p_val)}")

The p value is 0.6498072937477134.
∴ StockOptionLevel & Attrition are negatively correlated.


∴ H<sub>a</sub> is **rejected** & H<sub>0</sub> is **accepted**.

### 10. Finding Correlation between Attrition & TotalWorkingYears

In [45]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.TotalWorkingYears)

print(f"The p value is {p_val}.\n∴ TotalWorkingYears & Attrition are {type_correlation(p_val)}")

The p value is 5.4731597517922494e-30.
∴ TotalWorkingYears & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 11. Finding Correlation between Attrition & TrainingTimesLastYear

In [46]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.TrainingTimesLastYear)

print(f"The p value is {p_val}.\n∴ TrainingTimesLastYear & Attrition are {type_correlation(p_val)}")

The p value is 0.00102470619153603.
∴ TrainingTimesLastYear & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 12. Finding Correlation between Attrition & YearsAtCompany

In [47]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.YearsAtCompany)

print(f"The p value is {p_val}.\n∴ YearsAtCompany & Attrition are {type_correlation(p_val)}")

The p value is 3.1638831224870065e-19.
∴ YearsAtCompany & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 13. Finding Correlation between Attrition & YearsSinceLastPromotion

In [48]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.YearsSinceLastPromotion)

print(f"The p value is {p_val}.\n∴ YearsSinceLastPromotion & Attrition are {type_correlation(p_val)}")

The p value is 0.028330336189402967.
∴ YearsSinceLastPromotion & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.

### 14. Finding Correlation between Attrition & YearsWithCurrManager

In [49]:
r_val, p_val = pearsonr(data_house.Attrition, data_house.YearsWithCurrManager)

print(f"The p value is {p_val}.\n∴ YearsWithCurrManager & Attrition are {type_correlation(p_val)}")

The p value is 1.7339322652900218e-25.
∴ YearsWithCurrManager & Attrition are positively correlated.


∴ H<sub>0</sub> is **rejected** & H<sub>a</sub> is **accepted**.


---


In [38]:
data_house.corr()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.159205,0.006963,-0.035706,,0.008649,-0.002884,-0.044314,0.298531,-0.033137,,-0.031753,0.680419,-0.027308,0.311309,0.216513,0.202089
Attrition,-0.159205,1.0,-0.00973,-0.015111,,-0.004729,-0.01029,-0.031176,0.04223,0.032533,,-0.006839,-0.170111,-0.049431,-0.134392,-0.033019,-0.156199
DistanceFromHome,0.006963,-0.00973,1.0,-0.008638,,-0.001097,-0.037329,-0.021607,-0.013816,0.038125,,0.011169,0.009361,-0.009001,0.031684,0.00229,0.021584
Education,-0.035706,-0.015111,-0.008638,1.0,,-0.00968,0.045746,0.00641,-0.016219,-0.040531,,0.001261,-0.010712,0.010472,0.00608,0.02249,0.005358
EmployeeCount,,,,,,,,,,,,,,,,,
EmployeeID,0.008649,-0.004729,-0.001097,-0.00968,,1.0,-0.003303,0.007338,-0.001111,-0.004456,,-0.014254,-0.001061,-0.010191,0.004086,0.000256,0.008579
JobLevel,-0.002884,-0.01029,-0.037329,0.045746,,-0.003303,1.0,0.047316,-0.009846,0.010973,,0.000993,-0.036934,-0.0325,-0.064219,-0.060811,-0.055251
MonthlyIncome,-0.044314,-0.031176,-0.021607,0.00641,,0.007338,0.047316,1.0,-0.020709,0.004325,,0.02693,-0.033758,0.050112,0.000995,0.065219,0.024304
NumCompaniesWorked,0.298531,0.04223,-0.013816,-0.016219,,-0.001111,-0.009846,-0.020709,1.0,0.030827,,0.017131,0.238054,-0.032239,-0.117818,-0.036616,-0.109478
PercentSalaryHike,-0.033137,0.032533,0.038125,-0.040531,,-0.004456,0.010973,0.004325,0.030827,1.0,,0.012548,-0.018717,-0.037392,-0.029707,-0.029542,-0.040864
