CramerV is measure of association between two categorical variables
https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
> This notebook is for benchmarking several different implementation
- current running on i5-8400 CPU, 


In [4]:
import numpy as np
import pandas as pd


generate data

In [5]:
# generate a random numpy matrix with two categorical variables

# Define the categories
animal = ['cat', 'dog', 'mouse']
color = ['red', 'blue', 'green']
size = 100

# Generate pd.Dataframe with two categorical variables
df = pd.DataFrame({
    'animal': np.random.choice(animal, size),
    'color': np.random.choice(color, size)
})
df.head(2)

Unnamed: 0,animal,color
0,dog,green
1,mouse,red


In [6]:
df['animal'] = df['animal'].astype('category').cat.codes
df['color'] = df['color'].astype('category').cat.codes

method 1, consise method with scipy and pandas

In [7]:
from scipy.stats.contingency import association



In [8]:
def cramer1(a, b):
    xtab = pd.crosstab(a, b)
    return association(xtab, method='cramer')
cramer1(df['animal'], df['color'])

0.14698867043068176

method 2, implemented like algorithm described in wiki

In [9]:
from scipy.stats import chi2_contingency

In [10]:
def cramer2(a, b ):
    xtab = pd.crosstab(a, b)
    chi2 = chi2_contingency(xtab)[0]
    return np.sqrt((chi2 / xtab.values.sum()) / min(xtab.shape[0] - 1, xtab.shape[1] - 1))
cramer2(df['animal'], df['color'])

0.14698867043068176

method 3, improve the xtab with numpy

In [8]:
# modifed based on 
# https://gist.github.com/alexland/d6d64d3f634895b9dc8e

def numpy_crosstab(a,b):
    uniq_vals_a, idx_a = np.unique(a, return_inverse=True)
    uniq_vals_b, idx_b = np.unique(b, return_inverse=True)
    shape_xt = (uniq_vals_a.size, uniq_vals_b.size)
    xt = np.zeros(shape_xt, dtype='uint')
    np.add.at(xt, (idx_a, idx_b), 1)
    return xt
    
def cramer3(a, b ):
    xtab = numpy_crosstab(a, b)
    chi2 = chi2_contingency(xtab)[0]
    return np.sqrt((chi2 / xtab.sum()) / min(xtab.shape[0] - 1, xtab.shape[1] - 1))
cramer3(df['animal'], df['color'])

0.12348076334738846

method 4, futher improve xtab with numba JIT

In [42]:
import numba

In [46]:
def custom_np_unqiue_with_inverse(ar):
    '''
    the simplifed version of np.unique with return_inverse 
    # https://github.com/numpy/numpy/blob/a60de40f14580078dcfd9d0faf33ba3ec768fc8a/numpy/lib/_arraysetops_impl.py#L336
    becuase we only need the return_inverse
    And we know we will only have 1-d array without any nan and it will be integer.
    '''
    ar = np.ascontiguousarray(ar)

    perm = ar.argsort(kind = 'quicksort')
    aux = ar[perm]
    # get unique
    mask = np.empty(aux.shape, dtype=np.bool_)
    mask[:1] = True
    mask[1:] = aux[1:] != aux[:-1]
    ret = (aux[mask],)
    # get return_inverse
    imask = np.cumsum(mask) - 1
    inv_idx = np.empty(mask.shape, dtype=np.intp)
    inv_idx[perm] = imask
    ret += (inv_idx,)
    return ret
@numba.jit(nopython=True)
def numba_crosstab(a,b):
    uniq_vals_a, idx_a = custom_np_unqiue_with_inverse(a)
    uniq_vals_b, idx_b = custom_np_unqiue_with_inverse(b)
    shape_xt = (uniq_vals_a.size, uniq_vals_b.size)
    xt = np.zeros(shape_xt, dtype='uint')
    np.add.at(xt, (idx_a, idx_b), 1)
    return xt


In [47]:
def cramer4(a, b ):
    xtab = numba_crosstab(a, b)
    chi2 = chi2_contingency(xtab)[0]
    return np.sqrt((chi2 / xtab.sum()) / min(xtab.shape[0] - 1, xtab.shape[1] - 1))
cramer4(df['animal'].values, df['color'].values)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1mUntyped global name 'custom_np_unqiue_with_inverse':[0m [1m[1mCannot determine Numba type of <class 'function'>[0m
[1m
File "../../../../../tmp/ipykernel_14621/2638988101.py", line 26:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m

benchmarking

In [8]:
%%timeit
cramer1(df['animal'], df['color'])

6.52 ms ± 835 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%%timeit
cramer2(df['animal'], df['color'])

5.38 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
cramer3(df['animal'], df['color'])

361 µs ± 7.42 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
