[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)
[scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)

$$\begin{array}{cccc}
   &\mbox{goodness of fit}&\mbox{independence}&\mbox{homogeneity}\\
H_0&\mbox{dist}(X)=\mbox{benchmark dist}&X,\ Y\ \mbox{independent}&\mbox{dist}(X|Y=i)=\mbox{dist}(X|Y=j)\\
H_a&\mbox{dist}(X)\not=\mbox{benchmark dist}&X,\ Y\ \mbox{not independent}&\mbox{dist}(X|Y=i)\not=\mbox{dist}(X|Y=j)\\
\mbox{Data}&x\sim X&(x,y)\sim (X,Y)&x_i\sim X|Y=i, x_j\sim X|Y=j\\
\mbox{scipy}&\mbox{scipy.stats.chisquare}&\mbox{scipy.stats.chi2_contingency}&\mbox{scipy.stats.chi2_contingency}
\end{array}$$

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Observed counts: rows are labelled A, B, C and columns are labelled
# '21-40', '41-60', '61-80'.
observed = np.array([
    [25, 30, 25],
    [30, 71, 19],
    [35, 49, 16],
])
df = pd.DataFrame(
    observed,
    index=['A', 'B', 'C'],
    columns=['21-40', '41-60', '61-80'],
)

# Chi-square test on the observed table. The result unpacks into the test
# statistic, its p-value, the degrees of freedom and the table of expected
# counts; only the first two are printed here.
chi2, p_value, DF, expected = stats.chi2_contingency(df)
print(chi2, p_value, sep='\n')

13.315833333333334
0.009831372848336144


<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.39.25 PM.png" width="100%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

In [2]:
# Work on a copy so the observed table `df` is left untouched.
column_totals = df.sum(axis=0)
contingency_table = df.copy()

# Append the per-column totals as an extra row first, so that the row sums
# taken next also cover that row and place the grand total in the corner.
contingency_table.loc['Column_Total', :] = column_totals
contingency_table['Row_Total'] = contingency_table.sum(axis=1)
print(contingency_table)

              21-40  41-60  61-80  Row_Total
A              25.0   30.0   25.0       80.0
B              30.0   71.0   19.0      120.0
C              35.0   49.0   16.0      100.0
Column_Total   90.0  150.0   60.0      300.0


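Under $H_0$ (independence) the joint probability factorizes, and each marginal probability is estimated from the table totals:

$$
P(X=i,Y=j)=P(X=i)P(Y=j)\approx\frac{\mbox{Row_Total[i]}}{\mbox{Sample_Size}}\frac{\mbox{Column_Total[j]}}{\mbox{Sample_Size}}
$$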
$$$$
$$
\mbox{Expected}(X=i,Y=j)=\frac{\mbox{Row_Total[i]}*\mbox{Column_Total[j]}}{\mbox{Sample_Size}}
$$
$$$$

<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.39.44 PM.png" width="60%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.40.06 PM.png" width="100%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

In [3]:
# Expected counts under H0 (independence):
#     expected[i, j] = row_total[i] * column_total[j] / sample_size
# Computed in one shot as an outer product of the marginal totals of the
# observed table `df`. Building a fresh float frame avoids writing floats
# cell by cell into a copy of the integer-typed `df`, which recent pandas
# versions deprecate when an expected count is not a whole number.
row_totals = df.sum(axis=1)
column_totals = df.sum(axis=0)
sample_size = row_totals.sum()

df_expected = pd.DataFrame(
    np.outer(row_totals, column_totals) / sample_size,
    index=df.index,
    columns=df.columns,
)
print(df_expected)

   21-40  41-60  61-80
A   24.0   40.0   16.0
B   36.0   60.0   24.0
C   30.0   50.0   20.0


<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.40.32 PM.png" width="60%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

In [4]:
# Pearson chi-square statistic: the sum over every cell of
#     (observed - expected)^2 / expected
# The reduction is written with explicit axes (per column, then across the
# column sums) rather than np.sum(DataFrame), which dispatches to
# DataFrame.sum(axis=None) -- a call whose meaning recent pandas versions
# are changing. The summation order matches the nested np.sum form.
cell_contributions = (df - df_expected) ** 2 / df_expected
chi2 = cell_contributions.sum(axis=0).sum()
print(chi2)

13.315833333333334


<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.40.45 PM.png" width="60%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

In [5]:
# Degrees of freedom for an r x c table: (r - 1) * (c - 1).
n_rows, n_cols = df.shape
DF = (n_rows - 1) * (n_cols - 1)
print(DF)

4


<div align="center"><img src="img/Screen Shot 2019-08-16 at 12.41.07 PM.png" width="30%" height="30%"></div>

[Understandable_Statistics](https://www.youtube.com/watch?v=RvGb5L_A16w&list=PL4BztsgicSEeC4Oic6s5vW4LE-0YZTuo-&index=18&t=0s)

In [6]:
import scipy.stats as stats

# Upper-tail probability of the chi-square distribution with DF degrees of
# freedom at the observed statistic. The survival function sf(x) is the
# same quantity as 1 - cdf(x) but is evaluated directly, so it does not
# lose precision to cancellation when the p-value is small.
p_value = stats.chi2.sf(chi2, DF)
print(p_value)

0.009831372848336128
