# Demo to check for associations between categoricals



Source: https://towardsdatascience.com/contingency-tables-chi-squared-and-cramers-v-ada4f93ec3fd

Data: the Open University Learning Analytics dataset. The dataset consists of 7 csv files of student demographic, assessment, and registration data; and course, course assessment, and learning environment data in tabular form.

We will be looking at two of the categorical features, final_result and highest_education, from the studentInfo.csv file.

In [20]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [4]:
# Load the student demographic and final-result records (studentInfo.csv, one of the
# dataset's 7 csv files); the path is relative to the notebook's working directory.
df = pd.read_csv('studentInfo.csv')

In [5]:
df.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [9]:
# Cross-tabulate the two categorical features into a contingency table of counts:
# rows are highest_education levels, columns are final_result outcomes.
tabs = pd.crosstab(df.highest_education, df.final_result)
# Wrap the counts in a statsmodels Table object; the cells below read its
# table_orig, fittedvalues and resid_pearson attributes.
table = sm.stats.Table(tabs)

In [10]:
# a contingency table of the original (observed) data
table.table_orig

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,1496,2707,5812,4030
HE Qualification,697,790,1960,1283
Lower Than A Level,727,3426,4385,4620
No Formal quals,16,95,87,149
Post Graduate Qualification,88,34,117,74


In [11]:
# a contingency table of the expected counts under the best-fitting independence model for the data
table.fittedvalues

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,1303.104348,3038.853128,5326.611389,4376.431136
HE Qualification,438.852514,1023.408707,1793.867702,1473.871077
Lower Than A Level,1220.807904,2846.93695,4990.213788,4100.041359
No Formal quals,32.194888,75.078821,131.600865,108.125426
Post Graduate Qualification,29.040346,67.722394,118.706256,97.531004


In [14]:
# a table of residuals which will reveal any associations present in the data
# Positive values indicate more observations than expected if the features were independent, and negative values indicate fewer.
table.resid_pearson

final_result,Distinction,Fail,Pass,Withdrawn
highest_education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Level or Equivalent,5.343586,-6.019925,6.65065,-5.23669
HE Qualification,12.322777,-7.296129,3.922463,-4.971763
Lower Than A Level,-14.133,10.852678,-8.567404,8.120349
No Formal quals,-2.854201,2.299092,-3.887889,3.930876
Post Graduate Qualification,10.940925,-4.097814,-0.156606,-2.382699


Hypothesis test

In [17]:
# For testing we will use Pearson’s chi-squared test

# Ho = highest_education and final_result are independent.
# Ha = highest_education and final_result are not independent.
# The significance level alpha = 0.05.

In [18]:
def chi_sq_test(cross_tabs):
    """
    Run a chi-squared test of independence and print a three-line report:
    the chi-squared statistic, the p-value, and the degrees of freedom.

    The test is delegated to scipy's ``stats.chi2_contingency`` with its
    default arguments. Nothing is returned; the results are only printed.

    Args:
        cross_tabs: A crosstab dataframe of observed counts.
    """
    # The fourth element of the result (the expected-frequency table) is not reported.
    statistic, p_value, deg_freedom = stats.chi2_contingency(cross_tabs)[:3]
    print(
        f'chi-squared = {statistic}',
        f'p value= {p_value}',
        f'degrees of freedom = {deg_freedom}',
        sep='\n',
    )

In [22]:
chi_sq_test(table.table_orig)

chi-squared = 1024.6961991440007
p value= 9.18211300726649e-212
degrees of freedom = 12


In [24]:
# p < 0.05 (our significance level alpha), so we reject the null hypothesis and conclude that highest_education and final_result are not independent

Effect Size

In [25]:
# Effect size is a measure of the strength of the association between the two features
def cramers_v(cross_tabs):
    """
    Prints Cramer's V, its degrees of freedom, and the effect size thresholds.

    Cramer's V is computed as ``sqrt(chi2 / (n * dof))`` where ``chi2`` is the
    uncorrected Pearson chi-squared statistic of the table, ``n`` is the total
    number of observations and ``dof = min(rows, columns) - 1``.
    Nothing is returned; the results are only printed.

    Args:
        cross_tabs: A crosstab dataframe (any 2-D array-like of observed
            counts is accepted).
    """

    # effect size thresholds, one row per Cramer's V degrees of freedom (1 to 5)
    data = np.array([[1, .1, .3, .5],
       [2, .07, .21, .35],
       [3, .06, .17, .29],
       [4, .05,.15,.25],
       [5, .04, .13, .22]])
    sizes = pd.DataFrame(data, columns=['Degrees of Freedom', 'Small Effect', 'Medium Effect', 'Large Effect'])

    # work on a plain array so dataframes, arrays and nested lists behave alike
    observed = np.asarray(cross_tabs)
    # getting the chi sq. stat
    # correction=False: scipy applies Yates' continuity correction by default to
    # 2x2 tables, which shrinks chi2 and would make a perfectly associated 2x2
    # table report V < 1. Larger tables are unaffected by this flag.
    chi2 = stats.chi2_contingency(observed, correction=False)[0]
    # calculating the total number of observations
    n = observed.sum()
    # getting the degrees of freedom (of Cramer's V, not of the chi-squared test)
    dof = min(observed.shape) - 1
    # calculating cramer's v
    v = np.sqrt(chi2 / (n * dof))
    # printing results
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
    print(f'\nEffect Size Thresholds\n{sizes}\n')

In [26]:
cramers_v(table.table_orig)

V = 0.10237048644403951
Cramer's V Degrees of Freedom = 3

Effect Size Thresholds
   Degrees of Freedom  Small Effect  Medium Effect  Large Effect
0                 1.0          0.10           0.30          0.50
1                 2.0          0.07           0.21          0.35
2                 3.0          0.06           0.17          0.29
3                 4.0          0.05           0.15          0.25
4                 5.0          0.04           0.13          0.22



In [27]:
# Adjusted for degrees of freedom, the Cramer’s V result indicates that highest_education has a small, statistically significant association with final_result.