# Chi2 Test

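The chi-square test of independence is applied when you have two categorical variables measured on a single population. It is used to determine whether there is a statistically significant association between the two variables. The null hypothesis (H0) is that the two variables are independent; the alternative hypothesis (H1) is that they are associated.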

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Load seaborn's bundled "tips" example data; the categorical columns
# `sex` and `smoker` are the two variables tested for association below.
dataset = sns.load_dataset(name='tips')
dataset

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [2]:
# Contingency table of observed counts: one row per `sex` level,
# one column per `smoker` level.
table = pd.crosstab(index=dataset["sex"], columns=dataset["smoker"])
table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [5]:
# Observed frequencies as a plain 2-D NumPy array (row/column labels dropped)
# so they can be combined element-wise with the expected frequencies later.
Observed_val = table.to_numpy()
Observed_val

array([[60, 97],
       [33, 54]], dtype=int64)

In [6]:
# SciPy's chi-square test of independence. The result is laid out as
# (statistic, p-value, degrees of freedom, expected frequencies), so
# val[3] is the expected-count array under H0.
# NOTE: with one degree of freedom (a 2x2 table) SciPy applies Yates'
# continuity correction by default (`correction=True`), so the statistic
# reported here is not the plain Pearson sum of (O - E)^2 / E.
val = stats.chi2_contingency(observed=table)
val

(0.008763290531773594,
 0.925417020494423,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [8]:
# Degrees of freedom for a test of independence on an r x c contingency
# table: (r - 1) * (c - 1). The dimensions are read from the table itself
# instead of slicing a fixed 2x2 window, so this stays correct when either
# variable has more than two levels.
no_of_rows, no_of_columns = table.shape
# NOTE: despite its name, `ddof` holds the test's degrees of freedom.
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom:-",ddof)

# Significance level used by the decision rules further down.
alpha = 0.05

Degree of Freedom:- 1


In [14]:
# NOTE(review): this import would conventionally sit with the other imports
# in the first cell; it is kept here because later cells rely on `chi2`.
from scipy.stats import chi2

# Pearson contribution (O - E)^2 / E for every cell of the table, where
# val[3] is the expected-frequency array returned by chi2_contingency.
# Summing over axis 0 collapses the rows, leaving one partial sum per column.
chisquare = ((Observed_val - val[3]) ** 2. / val[3]).sum(axis=0)
chisquare

array([0.00119737, 0.00073745])

In [15]:
# Total chi-square statistic: add up every per-column partial sum.
# Summing the whole array (rather than indexing entries 0 and 1) keeps the
# result correct when the contingency table has more than two columns.
chi_square_statistic = np.sum(chisquare)
chi_square_statistic

0.001934818536627623

In [16]:
# Critical value: the (1 - alpha) quantile of the chi-square distribution
# with `ddof` degrees of freedom. A statistic at or above this value falls
# in the rejection region.
critical_val = chi2.ppf(1 - alpha, df=ddof)
critical_val

3.841458820694124

In [17]:
# p-value: right-tail probability of a statistic at least this large under
# H0. The survival function sf(x) is mathematically 1 - cdf(x) but is
# evaluated directly, so it does not lose precision to cancellation when
# the p-value is very small.
p_value = chi2.sf(chi_square_statistic, df=ddof)
p_value

0.964915107315732

In [18]:
# Decision rule 1 - critical-value approach: reject H0 when the statistic
# reaches the critical value.
print(
    "Reject H0,There is a relationship between 2 categorical variables"
    if chi_square_statistic >= critical_val
    else "Retain H0,There is no relationship between 2 categorical variables"
)

# Decision rule 2 - p-value approach: reject H0 when the p-value does not
# exceed the significance level. Both rules are expected to agree.
print(
    "Reject H0,There is a relationship between 2 categorical variables"
    if p_value <= alpha
    else "Retain H0,There is no relationship between 2 categorical variables"
)

Retain H0,There is no relationship between 2 categorical variables
Retain H0,There is no relationship between 2 categorical variables
