In [1]:
import ast
import pandas as pd

def parse_list_string(string):
        return ast.literal_eval(string)

independence_tests = pd.read_csv('data/independence_tests.csv')
independence_tests['con'] = independence_tests['con'].apply(parse_list_string)
display(independence_tests.head())

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
0,nearc4,educ,[],[0.11 0.18],0.144,2.204911e-15
1,nearc4,black,[],[-0.11 -0.04],-0.075199,3.707318e-05
2,nearc4,smsa,[],[0.32 0.38],0.352974,7.985801e-89
3,nearc4,south,[],[-0.26 -0.19],-0.222357,5.888085e-35
4,nearc4,married,[],[-0.02 0.05],0.016058,0.3790447


In [2]:
display(independence_tests.loc[independence_tests['p-value']>0.05])

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
4,nearc4,married,[],[-0.02 0.05],0.016058,0.379045
15,nearc4,black,[south],[-0.03 0.04],0.000803,0.964916
18,nearc4,black,[lwage],[-0.06 0.01],-0.027663,0.129696
31,nearc4,married,[educ],[-0.03 0.04],0.005915,0.745987
32,nearc4,married,[black],[-0.01 0.06],0.024453,0.180436
...,...,...,...,...,...,...
1763,exper,lwage,"[south, married, smsa]",[-0.01 0.06],0.021073,0.248559
1765,exper,lwage,"[married, nearc4, black]",[-0.01 0.06],0.021878,0.230936
1766,exper,lwage,"[married, nearc4, smsa]",[-0.04 0.03],-0.001745,0.923885
1775,exper,lwage,"[south, nearc4, married, black]",[-0. 0.07],0.032042,0.079353


In [16]:
CORRELATION_LEVEL = 0.15 # minium correlation to be considered significant
CORRELATION_DIFFERENCE = 0.05 # minimum difference between correlations to consider influence
VARIABLES_LIST = list(set(independence_tests['var1']).union(set(independence_tests['var2'])))

### Marginally dependent/Confounders
Two variables are marginally dependent if their correlation is significant, but their dependence given another variable is not significant.

In [4]:
def marginally_dependent(df):
    single_pairs_significant = df.loc[(df['con'].apply(lambda x: len(x) == 0)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]
    conditioning =  df.loc[(df['con'].apply(lambda x: len(x) != 0)) & (df['p-value']<0.05)]

    print(f"DEPENDENT: if values have a correlation bigger than 0.2 (or smaller then -0.2) AND have a p-value<0.05")
    print(f"MARGINALLY INDEPENDENT: if correlation between variables is influenced less than 0.05 by conditioning on other variable(s).\n")


    for index, row in single_pairs_significant.iterrows():
        # select matching rows
        subdf = conditioning.loc[(conditioning['var1']==row['var1']) & (conditioning['var2']==row['var2'])]
        # only select rows where conditioning has influence the corrrelation by at least 0.05
        subdf = subdf.loc[abs(subdf['correlation']) - abs(row['correlation']) > CORRELATION_DIFFERENCE]

        if len(subdf)==0:
            print(f"MARGINALLY DEPENDENT: {row['var1']} and {row['var2']} are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.")

        else:
            print(f"DEPENDENT WITH CONFOUNDER:  {row['var1']} and {row['var2']} are NOT marginally dependent. Their correlation is influenced by {subdf.iloc[0]['con']}.")



marginally_dependent(independence_tests)

DEPENDENT: if values have a correlation bigger than 0.2 (or smaller then -0.2) AND have a p-value<0.05
MARGINALLY INDEPENDENT: if correlation between variables is influenced less than 0.05 by conditioning on other variable(s).

MARGINALLY DEPENDENT: nearc4 and smsa are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: nearc4 and south are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: nearc4 and lwage are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: educ and black are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: educ and smsa are marginally dependent. They are not infl

### Chain
There is chain from X -> Y -> Z, if X&Y, X&Z, Z&Y are significant AND X,Z | Y is NOT significant

Assumption 1:  X&Y, X&Z, Z&Y are dependent

In [5]:
display(independence_tests.loc[(independence_tests['con'].apply(lambda x: len(x) != 0)) & (independence_tests['p-value']>0.05)])

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
15,nearc4,black,[south],[-0.03 0.04],0.000803,0.964916
18,nearc4,black,[lwage],[-0.06 0.01],-0.027663,0.129696
31,nearc4,married,[educ],[-0.03 0.04],0.005915,0.745987
32,nearc4,married,[black],[-0.01 0.06],0.024453,0.180436
33,nearc4,married,[smsa],[-0.05 0.02],-0.014164,0.437883
...,...,...,...,...,...,...
1763,exper,lwage,"[south, married, smsa]",[-0.01 0.06],0.021073,0.248559
1765,exper,lwage,"[married, nearc4, black]",[-0.01 0.06],0.021878,0.230936
1766,exper,lwage,"[married, nearc4, smsa]",[-0.04 0.03],-0.001745,0.923885
1775,exper,lwage,"[south, nearc4, married, black]",[-0. 0.07],0.032042,0.079353


In [6]:
from itertools import combinations

def chain(df):
    single_pairs_significant = df.loc[(df['con'].apply(lambda x: len(x) == 0)) & (df['p-value']<0.05) & (abs(df['correlation'])>0.2)]
    three_pairs = df.loc[df['con'].apply(lambda x: len(x) == 1) & (df['p-value']>0.05)]

    for index, row in three_pairs.iterrows():
        variables = set([row['var1'], row['var2'], row['con'][0]])
        pair_combinations = list(combinations(variables, 2))

        all_pairs_significant = 0
        for pair in pair_combinations:
            subdf = single_pairs_significant.loc[(single_pairs_significant['var1']==pair[0]) & (single_pairs_significant['var2']==pair[1])]
            if len(subdf) == 0:
                subdf = single_pairs_significant.loc[(single_pairs_significant['var1']==pair[1]) & (single_pairs_significant['var2']==pair[0])]

            if len(subdf) ==1:
                all_pairs_significant+=1

            elif len(subdf) > 1:
                print("ERROR")
        print(pair_combinations)
        print(all_pairs_significant)

                        


chain(independence_tests)

[('south', 'nearc4'), ('south', 'black'), ('nearc4', 'black')]
2
[('nearc4', 'lwage'), ('nearc4', 'black'), ('lwage', 'black')]
1
[('educ', 'nearc4'), ('educ', 'married'), ('nearc4', 'married')]
0
[('nearc4', 'married'), ('nearc4', 'black'), ('married', 'black')]
0
[('smsa', 'nearc4'), ('smsa', 'married'), ('nearc4', 'married')]
1
[('south', 'nearc4'), ('south', 'married'), ('nearc4', 'married')]
1
[('exper', 'nearc4'), ('exper', 'married'), ('nearc4', 'married')]
1
[('smsa', 'exper'), ('smsa', 'nearc4'), ('exper', 'nearc4')]
1
[('smsa', 'nearc4'), ('smsa', 'black'), ('nearc4', 'black')]
1
[('smsa', 'educ'), ('smsa', 'black'), ('educ', 'black')]
1
[('smsa', 'south'), ('smsa', 'black'), ('south', 'black')]
1
[('smsa', 'exper'), ('smsa', 'black'), ('exper', 'black')]
0
[('smsa', 'exper'), ('smsa', 'educ'), ('exper', 'educ')]
1
[('south', 'nearc4'), ('south', 'married'), ('nearc4', 'married')]
1
[('south', 'educ'), ('south', 'married'), ('educ', 'married')]
1
[('smsa', 'south'), ('smsa', 

In [27]:
from itertools import combinations

def chain(df):
    single_pairs_significant = df.loc[(df['con'].apply(lambda x: len(x) == 0)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]
    single_pairs = list(zip(single_pairs_significant['var1'], single_pairs_significant['var2']))
    single_pairs = [set(p) for p in single_pairs]

    triples = list(combinations(VARIABLES_LIST, 3))

    # ASSUMPTION 1: check if all pair combinations are dependent on each other
    for triple_combination in triples:
        pairs = list(combinations(triple_combination, 2))
        pairs = [set(p) for p in pairs]
        
        count = 0
        for pair in pairs:
            if set(pair) in single_pairs:
                count += 1
        
        if count == 3:
            print(f"POTENTIALLY CHAIN: {triple_combination}. These variables all have a dependence on each other.")

        
        
chain(independence_tests)

POTENTIALLY CHAIN: ('lwage', 'black', 'educ'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'black', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'educ', 'smsa'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'educ', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'nearc4', 'smsa'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'nearc4', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'smsa', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('black', 'educ', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('educ', 'smsa', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('nearc4', 'smsa', 'south'). These variables all have a dependence on each othe

### Collider
Z is a collider IF:
- X,Y are indepent when not conditioned on Z
- X,Y are dependent when conditioned on Z
- X,Z AND Z,Y are dependent.

In [39]:
def collider(df):
    conditioned_sig = df.loc[(df['con'].apply(lambda x: len(x) == 1)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]
    # conditioned_pairs = list(zip(conditioned_sig['var1'], conditioned_sig['var2']))
    # conditioned_pairs = [set(p) for p in conditioned_pairs]
    # print(conditioned_pairs)

    independent = df.loc[(df['con'].apply(lambda x: len(x) ==0)) & ((abs(df['correlation'])<CORRELATION_LEVEL)  |  (df['p-value']>0.05))]
    independent_pairs = list(zip(independent['var1'], independent['var2']))
    independent_pairs = [set(p) for p in independent_pairs]
    
    for index, row in conditioned_sig.iterrows():
        combo = set([row['var1'], row['var2']])

        # pair is depent when conditioned on, but independent when not conditioned on
        if combo in independent_pairs:
            if len(row['con']) ==1:
                condition_variable = row['con'][0]

                # check if conditioned variable has relation with both X and Y
                if set([condition_variable,row['var1']]) and set([condition_variable,row['var2']]) not in independent_pairs:
        
                    print(f"COLLIDER: {row['con']} for VARIABLES {row['var1']} and {row['var2']} --- {row['var1']} -> {row['con'][0]} <- {row['var2']}")

            else:
                print("ERROR: too much variables in conditions")

collider(independence_tests)

COLLIDER: ['married'] for VARIABLES black and exper --- black -> married <- exper
COLLIDER: ['educ'] for VARIABLES exper and lwage --- exper -> educ <- lwage
