In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.formula.api import ols
import statsmodels.api as sm

%matplotlib inline

In [None]:
# Load the weather dataset from the local Excel workbook.
weather = pd.read_excel(io="data/weather.xlsx")

In [None]:
# Order the rows by outlook (in place) so rows of the same outlook are contiguous.
weather.sort_values(by='outlook', inplace=True)

In [None]:
# Numeric humidity readings, assigned positionally to the rows in their current order.
weather['humidity_num'] = np.array([
    86, 65, 90, 75, 96, 80, 70,
    80, 91, 85, 90, 95, 70, 70,
])
weather

In [None]:
# Mean humidity within each outlook group.
weather['humidity_num'].groupby(weather['outlook']).mean()

In [None]:
# Sample standard deviation of humidity within each outlook group.
weather['humidity_num'].groupby(weather['outlook']).std()

In [None]:
# Distribution of humidity per outlook group; trailing ';' hides the Axes repr.
weather.boxplot(column='humidity_num', by='outlook');

In [None]:
# One-way model: humidity explained by the categorical outlook.
mod = ols(formula='humidity_num ~ outlook', data=weather).fit()

In [None]:
# ANOVA table for the humidity ~ outlook model.
# The keyword is `typ` (not `type`): anova_lm reads `typ` from its kwargs and
# silently ignores unknown keywords, so `type=2` fell back to the default Type I table.
sm.stats.anova_lm(mod, typ=2)

In [None]:
# Load the first exam-score dataset and display it.
exam = pd.read_csv(filepath_or_buffer="data/exams_dummy.csv")
exam

In [None]:
# Box plot of average score by gender, drawn in the first slot of a 2x2 grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(2, 2, 1)
sns.boxplot(data=exam[['gender', 'Avg_score']], x='gender', y='Avg_score', ax=fig1);

In [None]:
# One-way model: average score explained by gender (first exam dataset).
model = ols(formula='Avg_score ~ gender', data=exam).fit()

In [None]:
# ANOVA table for the fitted model; typ=1 is the library default, spelled out here.
sm.stats.anova_lm(model, typ=1)

In [None]:
# Load the second exam-score dataset and display it.
exam1 = pd.read_csv(filepath_or_buffer="data/exams_dummy1.csv")
exam1

In [None]:
# Box plot of average score by gender for exam1, in the first slot of a 2x2 grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(2, 2, 1)
sns.boxplot(data=exam1[['gender', 'Avg_score']], x='gender', y='Avg_score', ax=fig1);

In [None]:
# One-way model: average score explained by gender (second exam dataset).
model = ols(formula='Avg_score ~ gender', data=exam1).fit()

In [None]:
# ANOVA table for the fitted model; typ=1 is the library default, spelled out here.
sm.stats.anova_lm(model, typ=1)

In [None]:
# Load the third exam-score dataset and preview the first rows.
exam2 = pd.read_csv(filepath_or_buffer="data/exams_dummy2.csv")
exam2.head(n=5)

In [None]:
# Box plot of average score by gender for exam2, in the first slot of a 2x2 grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(2, 2, 1)
sns.boxplot(data=exam2[['gender', 'Avg_score']], x='gender', y='Avg_score', ax=fig1);

In [None]:
# Fit average score on gender for the third dataset and show its ANOVA table
# (typ=1 is the library default, spelled out here).
model = ols(formula='Avg_score ~ gender', data=exam2).fit()
sm.stats.anova_lm(model, typ=1)

In [None]:
# Group sizes per gender in the third dataset.
exam2['gender'].value_counts()

### Chi-Square Test

In [None]:
# Column names applied to the header-less adult data file, in file order.
cols = [
    'age',
    'workclass',
    'fnlwg',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [None]:
# Download the adult data file; it has no header row, so names come from `cols`.
adult = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=cols,
    header=None,
)

In [None]:
# Preview the first five rows.
adult.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the adult frame.
adult.info()

In [None]:
def process_hours(df):
    """Bucket ``hours-per-week`` into labelled ten-hour ranges.

    Adds a categorical ``hours_per_week_categories`` column to ``df`` in
    place and returns the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``hours-per-week`` column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the new column added.
    """
    cut_points = [0, 9, 19, 29, 39, 49, 1000]
    label_names = ["0-9", "10-19", "20-29", "30-39", "40-49", "50+"]
    # Bins are right-closed: (0, 9], (9, 19], ... include_lowest=True closes the
    # first bin on the left so that 0 hours lands in "0-9" instead of becoming NaN.
    df["hours_per_week_categories"] = pd.cut(
        df["hours-per-week"],
        bins=cut_points,
        labels=label_names,
        include_lowest=True,
    )
    return df

In [None]:
# Add the hours bucket column, then keep only the two columns used below.
data = process_hours(adult)
workhour_by_sex = data.loc[:, ['sex', 'hours_per_week_categories']]
workhour_by_sex.head(n=5)

In [None]:
# Number of rows per sex.
workhour_by_sex.sex.value_counts()

In [None]:
# Number of rows per hours-per-week bucket.
workhour_by_sex.hours_per_week_categories.value_counts()

![](img/hypothesis.png)

In [None]:
# Observed counts of sex x hours bucket; margins=True appends the 'All' row and column totals.
contingency_table = pd.crosstab(
    index=workhour_by_sex['sex'],
    columns=workhour_by_sex['hours_per_week_categories'],
    margins=True,
)
contingency_table

In [None]:
# Frequency values per hours bucket for each sex.
# pd.crosstab sorts the row labels, so row 0 is Female and row 1 is Male
# (the original names were bound to the opposite rows; the legend order hid it).
femalecount = contingency_table.iloc[0, 0:6].values
malecount = contingency_table.iloc[1, 0:6].values

# Stacked bar chart: Female bars at the bottom (red), Male bars stacked on top.
fig = plt.figure(figsize=(10, 5))
sns.set(font_scale=1.8)
categories = ["0-9","10-19","20-29","30-39","40-49","50+"]
p1 = plt.bar(categories, femalecount, 0.55, color='#d62728')
p2 = plt.bar(categories, malecount, 0.55, bottom=femalecount)
plt.legend((p2[0], p1[0]), ('Male', 'Female'))
plt.xlabel('Hours per Week Worked')
plt.ylabel('Count')
plt.show()


![](img/chisquare.png)

In [None]:
# Observed counts as one flat vector: first data row's six buckets, then the second row's.
f_obs = np.concatenate([
    contingency_table.iloc[0, 0:6].to_numpy(),
    contingency_table.iloc[1, 0:6].to_numpy(),
])
f_obs

![](img/expected.png)

In [None]:
# Row totals: column 6 (the 'All' margin) for the two data rows.
row_sums = contingency_table.iloc[:2, 6].to_numpy()
row_sums

In [None]:
# Column totals: row 2 (the 'All' margin) across the six bucket columns.
col_sums = contingency_table.iloc[2, :6].to_numpy()
col_sums

In [None]:
# Grand total of observations (bottom-right margin cell).
total = contingency_table.loc['All', 'All']

# Expected count per cell = column total * row total / grand total,
# listed row by row to match the flat layout of f_obs.
f_expected = [
    col_total * row_sums[row] / total
    for row in range(2)
    for col_total in col_sums
]
f_expected

In [None]:
# Hand-computed spot check: (total * total) / grand total.
# NOTE(review): presumably one row total times one column total from the table above — confirm.
10771 * 6462 / 32561

In [None]:
# Pearson chi-squared statistic: sum over cells of (observed - expected)^2 / expected.
chi_squared_statistic = np.sum((f_obs - f_expected) ** 2 / f_expected)
print('Chi-squared Statistic: {}'.format(chi_squared_statistic))

![](img/df.png)

#### Chi-Square using Scipy

In [None]:
# Observed counts as a 2 x 6 array (one row per sex), the shape chi2_contingency expects.
f_obs = np.vstack((
    contingency_table.iloc[0, 0:6].to_numpy(),
    contingency_table.iloc[1, 0:6].to_numpy(),
))
f_obs

In [None]:
import scipy.stats as stats

# First three results of the test: chi-squared statistic, p-value, degrees of freedom.
stats.chi2_contingency(f_obs)[:3]