In [1]:
import pandas as pd
from scipy import stats
from pydataset import data
import itertools

## Exercise 1

In [2]:
index = ["Uses Macbook", "Doesn't Macbook"]
columns = ["Codeup Student", "Not Codeup Student"]

observed = pd.DataFrame([[49, 20], [1, 30]], index=index, columns=columns)
observed

Unnamed: 0,Codeup Student,Not Codeup Student
Uses Macbook,49,20
Doesn't Macbook,1,30


In [3]:
α = 0.05

In [4]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
p

1.4116760526193828e-09

In [5]:
if p < α:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null")

Reject the null hypothesis


## Exercise 2 w/ automation
Choose another 2 categorical variables from the mpg dataset and perform a chi_2 contingency table test with them. Be sure to state your null and alternative hypotheses.

In [6]:
df = data("mpg")
# Rename "class" since it's a reserved word
df = df.rename(columns={"class": "vehicle_type"})

# Specify automatic as a boolean
df["is_automatic"] = df.trans.str.startswith("a")
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,vehicle_type,is_automatic
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,True
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,False
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,False
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,True
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,True


In [7]:

categorical_columns = ["manufacturer", "model", "drv", "fl", "vehicle_type", "is_automatic"]

In [8]:
combinations = list(itertools.combinations(categorical_columns, 2))
combinations

[('manufacturer', 'model'),
 ('manufacturer', 'drv'),
 ('manufacturer', 'fl'),
 ('manufacturer', 'vehicle_type'),
 ('manufacturer', 'is_automatic'),
 ('model', 'drv'),
 ('model', 'fl'),
 ('model', 'vehicle_type'),
 ('model', 'is_automatic'),
 ('drv', 'fl'),
 ('drv', 'vehicle_type'),
 ('drv', 'is_automatic'),
 ('fl', 'vehicle_type'),
 ('fl', 'is_automatic'),
 ('vehicle_type', 'is_automatic')]

In [9]:
alpha = 0.5
outcomes = []
for combination in combinations:
    outcome = {}
    outcome["column1"] = combination[0]
    outcome["column2"] = combination[1]
    observed = pd.crosstab(df[combination[0]], df[combination[1]])
    chi2, p, degf, expected = stats.chi2_contingency(observed)
    outcome["p"] = p
    if p < alpha:
        outcome["reject_null"] = True
    else:
        outcome["reject_null"] = False
    
    outcomes.append(outcome)

    
pd.DataFrame(outcomes)

Unnamed: 0,column1,column2,p,reject_null
0,manufacturer,model,0.0,True
1,manufacturer,drv,5.128094e-34,True
2,manufacturer,fl,1.062455e-13,True
3,manufacturer,vehicle_type,5.267718e-54,True
4,manufacturer,is_automatic,0.009534443,True
5,model,drv,1.4739529999999999e-58,True
6,model,fl,1.85469e-07,True
7,model,vehicle_type,9.146376e-164,True
8,model,is_automatic,0.05171909,True
9,drv,fl,0.584558,False


## Exercise 3
- Is an employee's gender independent of whether an employee works in sales or marketing? (only look at current employees)
- Is an employee's gender independent of whether or not they are or have been a manager?


In [10]:
from env import get_db_url

In [11]:
sql = """
SELECT gender, dept_name
FROM employees
JOIN dept_emp using(emp_no)
JOIN departments using(dept_no)
WHERE to_date > curdate()
"""

url = get_db_url("employees")

df = pd.read_sql(sql, url)
df.head()

Unnamed: 0,gender,dept_name
0,M,Customer Service
1,F,Customer Service
2,M,Customer Service
3,F,Customer Service
4,F,Customer Service


In [12]:
observed = pd.crosstab(df.gender, df.dept_name)
observed

dept_name,Customer Service,Development,Finance,Human Resources,Marketing,Production,Quality Management,Research,Sales
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
F,7007,24533,5014,5147,5864,21393,5872,6181,14999
M,10562,36853,7423,7751,8978,31911,8674,9260,22702


In [13]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
if p < α:
    print("Reject the null")
else:
    print("Fail to reject the null")

Fail to reject the null


## Is gender independent of department?
- Null hypothesis = there is no relationship, no dependence, between gender and department
- We have insufficient evidence to reject the null hypothesis. The evidence supports the claim that gender and department are independent.

## Exercise 3 Part 2
Is an employee's gender independent of whether or not they are or have been a manager?

In [14]:
sql = """
SELECT gender, title
FROM employees
JOIN titles using(emp_no)
"""

In [15]:
df = pd.read_sql(sql, url)
df.head()

Unnamed: 0,gender,title
0,M,Senior Engineer
1,F,Staff
2,M,Senior Engineer
3,M,Engineer
4,M,Senior Engineer


In [16]:
df["is_manager"] = df.title == "Manager"
df.head()

Unnamed: 0,gender,title,is_manager
0,M,Senior Engineer,False
1,F,Staff,False
2,M,Senior Engineer,False
3,M,Engineer,False
4,M,Senior Engineer,False


In [17]:
observed = pd.crosstab(df.gender, df.is_manager)
observed

is_manager,False,True
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,177211,13
M,266073,11


In [18]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
p

0.22600394509880642

## We don't have sufficient evidence to reject the null hypothesis


We move forward with the understanding that gender and management at this company are independent