# Import

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd
from IPython.display import display
import math

# Class

In [43]:
class HypTest:
    """Proportion estimate, confidence interval and right-tailed z-test.

    The estimated proportion is the fraction of *all* rows in the loaded data
    whose ``workclass`` column equals ``'Private'`` and whose ``variable``
    column equals ``value``.

    Besides printing, the methods store their results on the instance:
        n, p_hat     -- sample size and estimated proportion (``proportion``)
        margin, ci   -- margin of error and (low, high) interval (``CI``)
        z, p_value   -- test statistic and right-tail p-value (``hypothesis_test``)
    """

    def __init__(self, data_path='cleaned_census_income.csv', variable='income', value='>50K'):
        """Read the CSV at ``data_path`` and remember the column/value to count."""
        self.data = pd.read_csv(data_path)
        self.variable = variable
        self.value = value

    def _compute_proportion(self):
        """Set ``self.n`` and ``self.p_hat`` from the data without printing."""
        n = len(self.data)
        if n == 0:
            raise ValueError('cannot estimate a proportion from an empty dataset')
        matches = (self.data['workclass'] == 'Private') & (self.data[self.variable] == self.value)
        self.n = n
        # int() keeps p_hat a plain Python float, as in the len()-based original.
        self.p_hat = int(matches.sum()) / n

    def _ensure_proportion(self):
        """Compute the proportion on demand so call order does not matter."""
        if not hasattr(self, 'p_hat') or not hasattr(self, 'n'):
            self._compute_proportion()

    def proportion(self):
        """Compute, store and print the estimated proportion and the sample size."""
        self._compute_proportion()
        print(f"Estimated proportion of (workclass=='Private') & ({self.variable}=='{self.value}') is {self.p_hat}")
        print(self.n)

    def CI(self, z_crit=2):
        """Print and store the confidence interval ``p_hat +/- z_crit * SE``.

        Args:
            z_crit: critical value multiplying the standard error. The default
                of 2 is the usual rounded stand-in for 1.96 (about 95%).
        """
        self._ensure_proportion()
        m = z_crit * math.sqrt(self.p_hat * (1 - self.p_hat) / self.n)  # Margin Error
        self.margin = m
        self.ci = (self.p_hat - m, self.p_hat + m)
        print('Margin Error is: ', m)
        print(f'Confidence Interval is ({self.p_hat-m}, {self.p_hat+m})')

    def hypothesis_test(self, p_0, alpha=0.05):
        """Right-tailed one-proportion z-test of ``H_0: p = p_0`` vs ``H_a: p > p_0``.

        Args:
            p_0: hypothesised proportion, strictly between 0 and 1.
            alpha: significance level used for the reject / fail-to-reject message.

        Raises:
            ValueError: if ``p_0`` is not strictly between 0 and 1 (the standard
                error under H_0 would be zero or undefined).
        """
        if not 0 < p_0 < 1:
            raise ValueError(f'p_0 must lie strictly between 0 and 1, got {p_0}')
        self._ensure_proportion()

        H_0 = f'H_0: p = {p_0}'
        H_a = f'H_a: p > {p_0}'
        print(H_0)
        print(H_a)

        z = (self.p_hat - p_0) / math.sqrt(p_0 * (1 - p_0) / self.n)
        # Survival function instead of 1 - cdf: the subtraction cancels to
        # exactly 0.0 once cdf(z) rounds to 1.0, losing small p-values.
        P_z_right = float(st.norm.sf(z))
        self.z = z
        self.p_value = P_z_right

        print(f'z score is {z}')

        if P_z_right < alpha:
            print(f'p-value is {P_z_right}, We have {1 - alpha:g} confidence to reject {H_0}')
        else:
            print(f"p-value is {P_z_right}, We don't have enough evidence to reject {H_0}")

# 3) Proportion with workclass == 'Private' and income == '>50K' (H_0: p = 0.25)

In [44]:
# Section 3: rows with workclass == 'Private' and income == '>50K'.
# Order matters: proportion() sets the values that CI() and
# hypothesis_test() read.
test_1 = HypTest(
    value='>50K',
    variable='income',
)
test_1.proportion()                # point estimate and sample size
test_1.CI()                        # margin of error and interval
test_1.hypothesis_test(p_0=0.25)   # H_0: p = 0.25 vs H_a: p > 0.25

Estimated proportion of (workclass=='Private') & (income=='>50K') is 0.16166036734964526
30162
Margin Error is:  0.00423947137768775
Confidence Interval is (0.15742089597195752, 0.165899838727333)
H_0: p = 0.25
H_a: p > 0.25
z score is -35.43113141080841
p-value is 1.0, We don't have enough evidence to reject H_0: p = 0.25


# 4) Proportion with workclass == 'Private' and education == 'Bachelors' (H_0: p = 0.05)

In [45]:
# Section 4: rows with workclass == 'Private' and education == 'Bachelors'.
# Order matters: proportion() sets the values that CI() and
# hypothesis_test() read.
test_2 = HypTest(
    value='Bachelors',
    variable='education',
)
test_2.proportion()                # point estimate and sample size
test_2.CI()                        # margin of error and interval
test_2.hypothesis_test(p_0=0.05)   # H_0: p = 0.05 vs H_a: p > 0.05

Estimated proportion of (workclass=='Private') & (education=='Bachelors') is 0.11454810688946357
30162
Margin Error is:  0.0036675527697740714
Confidence Interval is (0.1108805541196895, 0.11821565965923764)
H_0: p = 0.05
H_a: p > 0.05
z score is 51.43595083168901
p-value is 0.0, We have 0.95 confidence to reject H_0: p = 0.05
