In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
from scipy import stats

# Chi$^2$ ($\chi^2$) Test for Independence

aka Pearson's Chi$^2$ test. Pronounced as 'Ki' as in kite.

- Lets us test the hypothesis that one group is independent of another
- $H_0$ is always that there is independence between the groups
- Equivalently, $H_0$ is that there is no dependence; $H_a$ is that the groups are dependent


The null hypothesis assumes that the observed frequencies for a categorical variable match the frequencies we would expect if the variables were independent.

## The Quick Way To Run a Chi$^2$ Test

In [None]:
# Load the "tips" sample dataset that ships with pydataset
df = data("tips")

In [None]:
# Preview the first five rows
df.head(n=5)

- $H_0$: Smoker status and time of day are independent
- $H_a$: Smoker status and time of day are dependent

In [None]:
# Contingency table of observed counts: one row per `time`, one column per `smoker`
observed = pd.crosstab(index=df["time"], columns=df["smoker"])
observed

In [None]:
# Significance level: reject the null only when p falls below this threshold
alpha = 0.01

In [None]:
# chi2_contingency returns 4 different values:
# the chi^2 statistic, the p-value, the degrees of freedom, and the table of
# expected frequencies under independence.
# NOTE: by default (correction=True) SciPy applies Yates' continuity correction
# when the table has 1 degree of freedom, which is the case for a 2x2 table.
chi2, p, degf, expected = stats.chi2_contingency(observed)

# Report the result and the decision so the quick example ends with a visible conclusion
print(f"chi^2 = {chi2:.4f}, p = {p:.4f}, degrees of freedom = {degf}")
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")

## Attrition Data

In [None]:
# Read the attrition CSV straight from its gist URL.
# Note: this rebinds `df`, replacing the tips dataframe loaded earlier.
attrition_csv_url = (
    "https://gist.githubusercontent.com/ryanorsinger/"
    "6ba2dd985c9aa92f5598fc0f7c359f6a/raw/"
    "b20a508cee46e6ac69eb1e228b167d6f42d665d8/attrition.csv"
)
df = pd.read_csv(attrition_csv_url)

In [None]:
# Preview the first three rows
df.head(n=3)

In [None]:
# check shape of the dataframe: (number of rows, number of columns)
df.shape

In [None]:
# Check for which columns are discrete


In [None]:
# Question we want to answer:

# 1. Is Attrition independent from Business Travel amount?
# 2. Is Attrition independent from Department?
# 3. Is Attrition independent from WorkLife balance?

Form hypothesis:

$H_0$: Attrition and Travel Frequency are independent (not dependent)

$H_a$: Attrition and Travel Frequency are dependent

In [None]:
# cross tab Attrition vs Business Travel
# Rebuild `observed` here: otherwise the chi-squared test below would silently
# reuse the contingency table from the tips example.
# NOTE(review): assumes the CSV has `Attrition` and `BusinessTravel` columns
# (IBM HR attrition schema) — confirm with df.columns.
observed = pd.crosstab(df.Attrition, df.BusinessTravel)
observed

In [None]:
# observed

In [None]:
# Significance level used for every attrition test that follows
alpha = 0.01

In [None]:
# Run the chi-squared test of independence on the contingency table and unpack
# its four results: statistic, p-value, degrees of freedom, expected frequencies
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

In [None]:
# Display the test statistic, p-value, and degrees of freedom together
(chi2, p, degf)

In [None]:
# Expected frequencies returned by chi2_contingency
expected

In [None]:
null_hypothesis = "Attrition and Business Travel are independent"

# Guard-clause form: handle the "fail to reject" outcome first
if not p < alpha:
    print("We fail to reject the null hypothesis")
else:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)

# Always show the p-value the decision was based on
print(p)

In [None]:
#Normalized crosstab


In [None]:
# make a heatmap


## Let's Test for Independence of Attrition and Department
- $H_0$: There is no relationship between them, Attrition and Department are independent
- $H_a$: There is a relationship

In [None]:
# crosstab for observed values between Attrition and Depts
# Rebuild `observed` so the test below does not reuse the previous section's table.
# NOTE(review): assumes an `Attrition` column exists — confirm with df.columns.
observed = pd.crosstab(df.Attrition, df.Department)
observed


In [None]:
# Chi-squared test for independence; `p` is the value the next cell checks
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

In [None]:
# Compare the p-value against alpha and report the decision
print("We reject the null" if p < alpha else "We fail to reject the null")



In [None]:
# Is attrition and being in sales related?

In [None]:
# Boolean flag column: True where the employee's Department is "Sales"
df["in_sales"] = df["Department"].eq("Sales")

In [None]:
# Preview two rows to confirm the new column was added
df.head(n=2)

In [None]:
# crosstab between Attrition and in_sales column
# Rebuild `observed` so the test below does not reuse the Department table.
# NOTE(review): assumes an `Attrition` column exists — confirm with df.columns.
observed = pd.crosstab(df.Attrition, df.in_sales)
observed


#### $H_0$: Attrition and Being in Sales or Not are independent
#### $H_a$: There is a relationship

In [None]:
# Chi-squared test of independence on the current contingency table
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

# Report the decision at the chosen significance level
print("We reject the null" if p < alpha else "We fail to reject the null")

# Trailing expression so the notebook displays the p-value
p

## Let's test for independence between WorkLifeBalance and Attrition
- $H_0$: WorkLifeBalance and Attrition are independent, no relationship
- $H_a$: They are dependent - there is a relationship

WorkLifeBalance   
1 'Bad'  
2 'Good'  
3 'Better'  
4 'Best'  

In [None]:
#look at value counts

In [None]:
# Crosstab for Attrition and WorklifeBalance
# Rebuild `observed` so the test below does not reuse the in_sales table.
# NOTE(review): assumes `Attrition` and `WorkLifeBalance` columns exist —
# confirm with df.columns.
observed = pd.crosstab(df.Attrition, df.WorkLifeBalance)
observed

In [None]:
# Chi-squared test of independence on the current contingency table
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

# Report the decision at the chosen significance level
print("We reject the null hypothesis" if p < alpha else "We fail to reject the null")

In [None]:
# Display the p-value from the chi-squared test run in the previous cell
p

In [None]:
# Now, let's control for Department

In [None]:
# df.Department.value_counts()

In [2]:
# make new dataframes for each dept


In [None]:
# Run the chi squared test for independence on only RND
# Build the contingency table from Research & Development rows only, so the
# test in the next cell is specific to this department.
# NOTE(review): assumes the Department label is exactly "Research & Development"
# and that `Attrition` / `WorkLifeBalance` columns exist — confirm with
# df.Department.value_counts() and df.columns.
rnd = df[df.Department == "Research & Development"]
observed = pd.crosstab(rnd.Attrition, rnd.WorkLifeBalance)
observed


In [None]:
# Chi-squared test of independence on the current contingency table
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

# Guard-clause form: handle the "fail to reject" outcome first
if not p < alpha:
    print("We fail to reject the null")
else:
    print("We reject the null hypothesis")

# Trailing expression so the notebook displays the p-value
p

In [None]:
# how about for sales?
# Build the contingency table from Sales rows only, so the test in the next
# cell does not reuse the previous department's table.
# NOTE(review): assumes `Attrition` / `WorkLifeBalance` columns exist —
# confirm with df.columns.
sales = df[df.Department == "Sales"]
observed = pd.crosstab(sales.Attrition, sales.WorkLifeBalance)
observed


In [None]:
# Chi-squared test of independence on the current contingency table
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

# Report the decision at the chosen significance level
print("We reject the null hypothesis" if p < alpha else "We fail to reject the null")

# Trailing expression so the notebook displays the p-value
p

In [None]:
# How about for HR?
# Build the contingency table from Human Resources rows only, so the test in
# the next cell does not reuse the previous department's table.
# NOTE(review): assumes the Department label is exactly "Human Resources" and
# that `Attrition` / `WorkLifeBalance` columns exist — confirm with
# df.Department.value_counts() and df.columns.
hr = df[df.Department == "Human Resources"]
observed = pd.crosstab(hr.Attrition, hr.WorkLifeBalance)
observed


In [None]:
# Chi-squared test of independence on the current contingency table
chi2, p, degf, expected = stats.chi2_contingency(observed=observed)

# Guard-clause form: handle the "fail to reject" outcome first
if not p < alpha:
    print("We fail to reject the null")
else:
    print("We reject the null hypothesis")

# Trailing expression so the notebook displays the p-value
p

## Findings So Far:
$H_0$ is that there is no relationship: WorkLifeBalance and Attrition are independent.

- Research and Development, we reject the null hypothesis
- Sales, we fail to reject the null. This could be due to the smaller sample size.
- HR, we fail to reject the null. This could be due to the smaller sample size.