# Contraste de independencia y homogeneidad

$$
\left.
\begin{array}{ll}
H_0: & \mathrm{La\ raza\ de\ los\ habitantes\ es\ independiente\ de\ su\ tendencia\ política} \\
H_1: & \mathrm{La\ raza\ de\ los\ habitantes\ NO\ es\ independiente\ de\ su\ tendencia\ política}
\end{array}
\right\}
$$

In [72]:
# External imports
import numpy as np
import pandas as pd
import scipy.stats as stats

In [73]:
# Fix the state of NumPy's legacy global generator so every draw below is reproducible
np.random.seed(seed=2020)

In [74]:
# Simulate two categorical variables, drawn independently of each other.
# Race is drawn before politics so the random stream is consumed in the same order.
race_probs = {"asiatico": 0.05, "negro": 0.10, "hispano": 0.25, "blanco": 0.55, "otro": 0.05}
politics_probs = {"democrata": 0.4, "republicano": 0.35, "independiente": 0.25}

voter_race = np.random.choice(list(race_probs), size=2000, p=list(race_probs.values()))
voter_politics = np.random.choice(list(politics_probs), size=2000, p=list(politics_probs.values()))

# One row per simulated voter
voters = pd.DataFrame(dict(raza=voter_race, politica=voter_politics))

In [75]:
# Contingency table of race by political leaning; margins=True appends the "All" totals
voters_tab = pd.crosstab(index=voters["raza"], columns=voters["politica"], margins=True)
voters_tab

politica,democrata,independiente,republicano,All
raza,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
asiatico,50,22,27,99
blanco,428,277,383,1088
hispano,207,115,182,504
negro,92,61,66,219
otro,37,17,36,90
All,814,492,694,2000


In [76]:
# Observed counts: the contingency table without its "All" margins.
# The margins are the last row and the last column, so they are stripped by
# position from the end rather than by assuming a 5 x 3 table.
o_ij = voters_tab.iloc[:-1, :-1]
o_ij

politica,democrata,independiente,republicano
raza,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asiatico,50,22,27
blanco,428,277,383
hispano,207,115,182
negro,92,61,66
otro,37,17,36


El estadístico de contraste es:

$$\chi_0^2 = \sum_{i=1}^I\sum_{j=1}^J\frac{(o_{ij} - e_{ij})^2}{e_{ij}}$$

In [77]:
# Expected counts under H0 (independence):
#   e_ij = (row total) * (column total) / (grand total)
# Totals and labels are read from the margins of the contingency table, so the
# sample size and the category names are not repeated by hand.
row_totals = voters_tab["All"].iloc[:-1]      # per-race totals, without the grand total
col_totals = voters_tab.loc["All"].iloc[:-1]  # per-party totals, without the grand total
grand_total = voters_tab.loc["All", "All"]

e_ij = pd.DataFrame(
    np.outer(row_totals, col_totals / grand_total),
    index=row_totals.index,
    columns=col_totals.index,
)
e_ij

Unnamed: 0,democrata,independiente,republicano
asiatico,40.293,24.354,34.353
blanco,442.816,267.648,377.536
hispano,205.128,123.984,174.888
negro,89.133,53.874,75.993
otro,36.63,22.14,31.23


In [78]:
# Pearson test statistic: per-cell (observed - expected)^2 / expected, added over the table
deviations = o_ij - e_ij
cell_terms = deviations.pow(2).div(e_ij)
chi0 = cell_terms.sum().sum()  # column sums first, then their total
chi0

10.273214989515228

In [79]:
# Critical value: the (1 - alpha) quantile of the chi-square distribution
# with (rows - 1) * (columns - 1) degrees of freedom for the 5 x 3 table.
alpha = 0.05
dof = (5 - 1) * (3 - 1)
crit = stats.chi2.ppf(1 - alpha, dof)
crit

15.50731305586545

In [80]:
# True means the statistic is below the critical value, so H0 is not rejected at level alpha
chi0 < crit

True

In [81]:
# p-value: upper-tail probability of the chi-square distribution at chi0.
# The survival function avoids the cancellation error of 1 - cdf when the
# p-value is very small, and the degrees of freedom are derived from the
# observed table instead of being hard-coded.
dof = (o_ij.shape[0] - 1) * (o_ij.shape[1] - 1)
p_val = stats.chi2.sf(chi0, df=dof)
p_val

0.24637107148949744

In [82]:
p_val > alpha

True

In [83]:
# Cross-check with SciPy's built-in test of independence on the observed counts.
# The displayed result holds the statistic, the p-value, the degrees of freedom
# and the expected frequencies, which should match the manual values above.
stats.chi2_contingency(o_ij)

(10.273214989515228,
 0.24637107148949744,
 8,
 array([[ 40.293,  24.354,  34.353],
        [442.816, 267.648, 377.536],
        [205.128, 123.984, 174.888],
        [ 89.133,  53.874,  75.993],
        [ 36.63 ,  22.14 ,  31.23 ]]))