In [4]:
import ast
import pandas as pd

def parse_list_string(string):
        return ast.literal_eval(string)

independence_tests = pd.read_csv('data/independence_tests.csv')
independence_tests['con'] = independence_tests['con'].apply(parse_list_string)
display(independence_tests.head())

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
0,nearc4,educ,[],[0.11 0.18],0.144,2.204911e-15
1,nearc4,black,[],[-0.11 -0.04],-0.075199,3.707318e-05
2,nearc4,smsa,[],[0.32 0.38],0.352974,7.985801e-89
3,nearc4,south,[],[-0.26 -0.19],-0.222357,5.888085e-35
4,nearc4,married,[],[-0.02 0.05],0.016058,0.3790447


In [5]:
display(independence_tests.loc[independence_tests['p-value']>0.05])

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
4,nearc4,married,[],[-0.02 0.05],0.016058,0.379045
15,nearc4,black,[south],[-0.03 0.04],0.000803,0.964916
18,nearc4,black,[lwage],[-0.06 0.01],-0.027663,0.129696
31,nearc4,married,[educ],[-0.03 0.04],0.005915,0.745987
32,nearc4,married,[black],[-0.01 0.06],0.024453,0.180436
...,...,...,...,...,...,...
1763,exper,lwage,"[south, married, smsa]",[-0.01 0.06],0.021073,0.248559
1765,exper,lwage,"[married, nearc4, black]",[-0.01 0.06],0.021878,0.230936
1766,exper,lwage,"[married, nearc4, smsa]",[-0.04 0.03],-0.001745,0.923885
1775,exper,lwage,"[south, nearc4, married, black]",[-0. 0.07],0.032042,0.079353


In [6]:
CORRELATION_LEVEL = 0.15 # minium correlation to be considered significant
CORRELATION_DIFFERENCE = 0.05 # minimum difference between correlations to consider influence
VARIABLES_LIST = list(set(independence_tests['var1']).union(set(independence_tests['var2'])))

### Insignificant correlations - based on p-value

In [7]:
display(independence_tests.loc[(independence_tests['p-value']>0.05) & (independence_tests['con'].apply(lambda x: len(x) == 0)) ])

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
4,nearc4,married,[],[-0.02 0.05],0.016058,0.379045
1408,south,married,[],[-0.04 0.03],-0.003022,0.868516
1728,exper,lwage,[],[-0.02 0.05],0.011985,0.511497


### Significant - but weak correlations

In [8]:
display(independence_tests.loc[(independence_tests['p-value']<0.05) & (independence_tests['con'].apply(lambda x: len(x) == 0)) & (abs(independence_tests['correlation'])<CORRELATION_LEVEL) ])

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
0,nearc4,educ,[],[0.11 0.18],0.144,2.204911e-15
1,nearc4,black,[],[-0.11 -0.04],-0.075199,3.707318e-05
5,nearc4,exper,[],[-0.1 -0.03],-0.0632,0.0005295486
451,educ,married,[],[0.04 0.11],0.07097,9.925693e-05
832,black,smsa,[],[-0.07 -0. ],-0.037615,0.03928614
834,black,married,[],[0.07 0.14],0.108788,2.275164e-09
835,black,exper,[],[0.1 0.17],0.138084,2.95564e-14
1153,smsa,married,[],[0.05 0.12],0.082909,5.382099e-06
1154,smsa,exper,[],[-0.17 -0.1 ],-0.137055,4.591651e-14
1409,south,exper,[],[0.08 0.15],0.111263,9.74071e-10


### Marginally dependent/Confounders
Two variables are marginally dependent if their correlation is significant, but their dependence given another variable is not significant.

In [9]:
def marginally_dependent(df):
    single_pairs_significant = df.loc[(df['con'].apply(lambda x: len(x) == 0)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]
    conditioning =  df.loc[(df['con'].apply(lambda x: len(x) != 0)) & (df['p-value']<0.05)]

    print(f"DEPENDENT: if values have a correlation bigger than 0.2 (or smaller then -0.2) AND have a p-value<0.05")
    print(f"MARGINALLY INDEPENDENT: if correlation between variables is influenced less than 0.05 by conditioning on other variable(s).\n")


    for index, row in single_pairs_significant.iterrows():
        # select matching rows
        subdf = conditioning.loc[(conditioning['var1']==row['var1']) & (conditioning['var2']==row['var2'])]
        # only select rows where conditioning has influence the corrrelation by at least 0.05
        subdf = subdf.loc[abs(subdf['correlation']) - abs(row['correlation']) > CORRELATION_DIFFERENCE]

        # print(row['correlation'])
        # display(subdf)

        if len(subdf)==0:
            print(f"MARGINALLY DEPENDENT: {row['var1']} and {row['var2']} are marginally dependent. They are not influenced (with more than {CORRELATION_DIFFERENCE} difference in correlation) by conditioning on another variable.")

        else:
            print(f"DEPENDENT WITH CONFOUNDER:  {row['var1']} and {row['var2']} are NOT marginally dependent. Their correlation is influenced by {subdf.iloc[0]['con']}.")



marginally_dependent(independence_tests)

DEPENDENT: if values have a correlation bigger than 0.2 (or smaller then -0.2) AND have a p-value<0.05
MARGINALLY INDEPENDENT: if correlation between variables is influenced less than 0.05 by conditioning on other variable(s).

MARGINALLY DEPENDENT: nearc4 and smsa are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: nearc4 and south are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: nearc4 and lwage are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: educ and black are marginally dependent. They are not influenced (with more than 0.05 difference in correlation) by conditioning on another variable.
MARGINALLY DEPENDENT: educ and smsa are marginally dependent. They are not infl

In [26]:
def check_confounder(df, x,y,z):
    correlation = df.loc[(df['var1']==x) & (df['var2']==y) & (df['con'].apply(lambda x: len(x) == 0))]
    if len(correlation) == 0:
        correlation = df.loc[(df['var1']==y) & (df['var2']==x) & (df['con'].apply(lambda x: len(x) == 0))]

    display(correlation)

    conditioned = df.loc[(df['var1']==x) & (df['var2']==y) & (df['con'].apply(lambda x: x == [z]))]
    if len(conditioned) == 0:
        conditioned = df.loc[(df['var1']==y) & (df['var2']==x) & (df['con'].apply(lambda x: x == [z]))]

    display(conditioned)

    if correlation.iloc[0]['p-value']<0.05 and conditioned.iloc[0]['p-value']>0.05:
        print(f"{x} and {y} are confounded by {z}")

    elif abs(conditioned.iloc[0]['correlation']) - abs(correlation.iloc[0]['correlation']) > CORRELATION_DIFFERENCE:
        print(abs(conditioned.iloc[0]['correlation']) - abs(correlation.iloc[0]['correlation']) )
        print(f"Stronger correlation when conditioned on {z}")

    else:
        print("Conditioning does not make a stronger correlation")

    

check_confounder(independence_tests, 'exper', 'educ', 'lwage')

Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
452,educ,exper,[],[-0.67 -0.63],-0.653467,0.0


Unnamed: 0,var1,var2,con,CI95%,correlation,p-value
483,educ,exper,[lwage],[-0.71 -0.67],-0.692426,0.0


Conditioning does not make a stronger correlation


### Chain
There is chain from X -> Y -> Z, if X&Y, X&Z, Z&Y are significant AND X,Z | Y is NOT significant

Assumption 1:  X&Y, X&Z, Z&Y are dependent

In [65]:
from itertools import combinations

def chain(df):
    single_pairs_significant = df.loc[(df['con'].apply(lambda x: len(x) == 0)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]
    single_pairs = list(zip(single_pairs_significant['var1'], single_pairs_significant['var2']))
    single_pairs = [set(p) for p in single_pairs]

    triples = list(combinations(VARIABLES_LIST, 3))

    # ASSUMPTION 1: check if all pair combinations are dependent on each other
    for triple_combination in triples:
        pairs = list(combinations(triple_combination, 2))
        pairs = [set(p) for p in pairs]
        
        count = 0
        for pair in pairs:
            if set(pair) in single_pairs:
                count += 1
        
        if count == 3:
            print(f"POTENTIALLY CHAIN: {triple_combination}. These variables all have a dependence on each other.")

        
        
chain(independence_tests)

POTENTIALLY CHAIN: ('lwage', 'black', 'educ'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'black', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'educ', 'smsa'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'educ', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'nearc4', 'smsa'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'nearc4', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('lwage', 'smsa', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('black', 'educ', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('educ', 'smsa', 'south'). These variables all have a dependence on each other.
POTENTIALLY CHAIN: ('nearc4', 'smsa', 'south'). These variables all have a dependence on each othe

### Collider
Z is a collider IF:
- X,Y are indepent when not conditioned on Z
- X,Y are dependent when conditioned on Z
- X,Z AND Z,Y are dependent.

In [42]:
def collider(df):
    conditioned_sig = df.loc[(df['con'].apply(lambda x: len(x) == 1)) & (df['p-value']<0.05) & (abs(df['correlation'])>CORRELATION_LEVEL)]

    independent = df.loc[(df['con'].apply(lambda x: len(x) ==0)) & ((abs(df['correlation'])<CORRELATION_LEVEL)  |  (df['p-value']>0.05))]
    independent_pairs = list(zip(independent['var1'], independent['var2']))
    independent_pairs = [set(p) for p in independent_pairs]
    
    for index, row in conditioned_sig.iterrows():
        combo = set([row['var1'], row['var2']])

        # pair is depent when conditioned on, but independent when not conditioned on
        if combo in independent_pairs:
            if len(row['con']) ==1:
                condition_variable = row['con'][0]

                # check if conditioned variable has relation with both X and Y
                if set([condition_variable,row['var1']]) and set([condition_variable,row['var2']]) not in independent_pairs:
        
                    print(f"COLLIDER: {row['con']} for VARIABLES {row['var1']} and {row['var2']} --- {row['var1']} -> {row['con'][0]} <- {row['var2']}")

            else:
                print("ERROR: too much variables in conditions")

    print("I did not take descendants into account.")

collider(independence_tests)

COLLIDER: ['married'] for VARIABLES black and exper --- black -> married <- exper
COLLIDER: ['educ'] for VARIABLES exper and lwage --- exper -> educ <- lwage
I did not take descendants into account.
