In [9]:
# Imports
from scipy.stats import chi2_contingency
from scipy.stats import chi2

from itertools import combinations

import pandas as pd

# Remove the limit on displayed columns so wide DataFrames print in full in the notebook
pd.set_option('display.max_columns', None)


def is_dependent(df, attr1, attr2, significance=0.05):
    """Report whether two attributes of a dataframe are dependent (Chi^2 test).

    The observation table for ``attr1`` x ``attr2`` is fed to
    ``chi2_contingency``; the resulting statistic is compared against the
    Chi^2 critical value at probability ``1 - significance`` for the test's
    degrees of freedom.

    Returns a truthy value when the statistic exceeds the critical value,
    i.e. when the attributes are considered dependent.
    """
    table = create_observation_table(df, attr1, attr2)

    # Only the statistic and the degrees of freedom are used; the p-value
    # and the expected-frequency table are discarded.
    statistic, _, degrees_of_freedom, _ = chi2_contingency(table)

    threshold = chi2.ppf(1 - significance, degrees_of_freedom)
    return statistic > threshold


def create_observation_table(df, attr1, attr2):
    """Build the observation (contingency) table for two attributes.

    Args:
        df: Dataframe holding both attributes as columns.
        attr1: Column whose distinct values become the table's rows.
        attr2: Column whose distinct values become the table's columns.

    Returns:
        A DataFrame of integer counts: cell ``(r, c)`` is the number of rows
        of ``df`` where ``attr1 == r`` and ``attr2 == c``. Row and column
        labels are sorted, and combinations that never occur are 0.
        Rows of ``df`` with a missing value in either attribute are not
        counted and contribute no row/column to the table.
    """
    # crosstab counts every (attr1, attr2) pair in one pass and zero-fills
    # absent combinations. The previous cell-by-cell fill relied on chained
    # assignment (``observation[col].loc[row] = val``), which does not write
    # through to the table when pandas copy-on-write is active.
    observation = pd.crosstab(df[attr1], df[attr2])

    # crosstab labels the axes with the attribute names; clear them so the
    # table keeps the unnamed index/columns callers have seen so far.
    observation.index.name = None
    observation.columns.name = None

    return observation


if __name__ == '__main__':
    # Load the data set to analyse
    DATA_SOURCE = r'../data/credit_output.csv'
    df = pd.read_csv(DATA_SOURCE)

    # Attributes treated as nominal for the pairwise dependence test
    nominal_attributes = [
        'personal_status',
        'own_telephone',
        'job',
        'other_parties',
        'works_outside_US',
        'purpose',
        'foreign_worker',
        'location',
        'property_magnitude',
        'housing',
        'employment',
        'state',
        'other_payment_plans',
        'class',
        'credit_history',
        # The remaining three are numerical, but their small range of
        # values lets them act as categorical attributes.
        'installment_commitment',
        'residence_since',
        'existing_credits',
    ]

    # Per attribute: how many other attributes it was found dependent with
    corr_count = dict.fromkeys(nominal_attributes, 0)

    # Test every unordered pair of attributes and report the dependent ones
    for first, second in combinations(nominal_attributes, 2):
        is_corr = is_dependent(df, first, second)
        if not is_corr:
            continue

        corr_count[first] += 1
        corr_count[second] += 1

        pair_label = f'{first} & {second}'
        print(f'{pair_label:<50}: {is_corr}')

    # Summary, from the least to the most frequently dependent attribute
    print('\n\n')
    for name in sorted(corr_count, key=corr_count.get):
        print(f'{name:<25} : {corr_count[name]}')
    

personal_status & own_telephone                   : True
personal_status & property_magnitude              : True
personal_status & housing                         : True
personal_status & employment                      : True
personal_status & credit_history                  : True
personal_status & installment_commitment          : True
personal_status & residence_since                 : True
own_telephone & job                               : True
own_telephone & property_magnitude                : True
own_telephone & housing                           : True
own_telephone & employment                        : True
job & property_magnitude                          : True
job & housing                                     : True
job & employment                                  : True
job & existing_credits                            : True
other_parties & purpose                           : True
other_parties & property_magnitude                : True
works_outside_US & foreign_work

In [10]:
# Display the observation table for the works_outside_US / foreign_worker pair
create_observation_table(df, 'works_outside_US', 'foreign_worker')

Unnamed: 0,NO,YES
NO,459,0
YES,0,531
