In [1]:
from graph import load_graph
import pandas as pd
import networkx as nx
from dowhy.causal_identifier import backdoor
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


### Functions

In [2]:
def get_all_possible_paths(graph):
    # Get every path between educ and lwage. Do this being converting the graph to undirected.
    H = graph.to_undirected()

    # the all_simple_paths function finds all possible undirected paths between 'X' and 'Y' in graph H
    all_possible_paths = list(nx.all_simple_paths(H, 'educ', 'lwage'))

    # print("These are all posible paths between educ and lwage.")
    # for path in all_possible_paths:
    #     print(path)

    return all_possible_paths

def get_backdoorpaths(all_possible_paths, graph):
    # Get if path in all_possible_path is a backdoor path.
    bd = backdoor.Backdoor(graph, 'educ', 'lwage')
    backdoor_paths = [path for path in all_possible_paths if bd.is_backdoor(path)]

    # print("These are all backdoor paths.")
    # for path in backdoor_paths:
    #     print(path)

    return backdoor_paths


def get_adjustment_variables(backdoor_paths, graph):
    # Create a dataframe with the paths as rows and for each path the colliders and non-colliders
    adjustment_variables = pd.DataFrame(columns=['path', 'colliders', 'non_colliders'])

    for path in backdoor_paths:
        colliders = np.array([])
        non_colliders = []
        path_len = len(path)
        for node0, node1, node2 in zip(path[0:path_len-2], path[1:path_len-1], path[2:]):
            # if there is an arrow pointing into node1 from both sides on the path, it is a collider
            if graph.has_edge(node0, node1) and graph.has_edge(node2, node1):
                colliders = np.append(colliders, list(nx.descendants(graph,node1)) + [node1]) # so we add it (and all its descendants) to the list

        # we flatten the list of list
        colliders = colliders.flatten()
        non_colliders = [x for x in path[1:-1] if all(x_i != x for x_i in colliders)]

        adjustment_variables.loc[len(adjustment_variables.index)] = [path, colliders, non_colliders] 

    return adjustment_variables  


In [3]:
def backdoor_criterion(graph):
    possible_paths = get_all_possible_paths(graph)
    backdoor_paths = get_backdoorpaths(possible_paths, graph)
    adjustment_variables = get_adjustment_variables(backdoor_paths, graph)
    return adjustment_variables

DoWhy functions

In [4]:
import dowhy
from dowhy import CausalModel
from IPython.display import Image, display

# a utility function to parse the .gml file to string
def gml_to_string(file):
    gml_str = ''
    with open(file, 'r') as file:
        for line in file:
            gml_str += line.rstrip()
    return gml_str


def dowhy_backdoor(file):

    # creating arbitrary data
    df = pd.read_stata("data/close_college.dta")

    gml_graph = gml_to_string(file)
    # With GML string
    model=CausalModel(
        data = df,
        treatment='educ',
        outcome='lwage',
        graph=gml_graph
    )

    identified_estimand = model.identify_effect()
    print(identified_estimand)


### Backdoor criterion - Graph version = 0

In [5]:
from graph import load_graph
import pandas as pd
import networkx as nx

G = load_graph(version=0)
# nx.draw(G, with_labels=True, node_size=1000, font_size=8)


educ -> smsa results in all variables being a descendant of educ

And since descendats cannot be in adjustment set, the backdoor criterion cannot be satisfied. Because there are open backdoor paths between educ an lwage qwithout any variable in the adjustment set Z.

In [6]:
adjustment_variables = backdoor_criterion(G)
display(adjustment_variables)
print(f"Descendants of educ are: {nx.descendants(G,'educ') | {'educ'}}")
print("These variables cannot be included in the adjustment set.")

Unnamed: 0,path,colliders,non_colliders
0,"[educ, nearc4, smsa, south, black, lwage]",[],"[nearc4, smsa, south, black]"
1,"[educ, nearc4, smsa, black, lwage]",[],"[nearc4, smsa, black]"
2,"[educ, nearc4, smsa, lwage]",[],"[nearc4, smsa]"
3,"[educ, black, lwage]",[],[black]
4,"[educ, black, smsa, lwage]",[],"[black, smsa]"
5,"[educ, black, south, smsa, lwage]",[],"[black, south, smsa]"


Descendants of educ are: {'lwage', 'married', 'black', 'educ', 'exper', 'nearc4', 'south', 'smsa'}
These variables cannot be included in the adjustment set.


**Adjustment set**

Not possible, backdoor criterion is not satisfied.

### Backdoor criterion - Graph version = 1

In [7]:
from graph import load_graph
import pandas as pd
import networkx as nx
import numpy as np

G = load_graph(version=1)
# nx.draw(G, with_labels=True, node_size=1000, font_size=8)


In [8]:
adjustment_variables = backdoor_criterion(G)
display(adjustment_variables)
print(f"Descendants of educ are: {nx.descendants(G,'educ') | {'educ'}}")
print("These variables cannot be included in the adjustment set.")

Unnamed: 0,path,colliders,non_colliders
0,"[educ, nearc4, smsa, south, black, lwage]",[],"[nearc4, smsa, south, black]"
1,"[educ, nearc4, smsa, black, lwage]",[],"[nearc4, smsa, black]"
2,"[educ, nearc4, smsa, lwage]",[],"[nearc4, smsa]"
3,"[educ, black, lwage]",[],[black]
4,"[educ, black, smsa, lwage]",[],"[black, smsa]"
5,"[educ, black, south, smsa, lwage]",[],"[black, south, smsa]"


Descendants of educ are: {'lwage', 'educ', 'exper'}
These variables cannot be included in the adjustment set.


**Adjustment sets**

- Minimum: black, nearc4 OR black, smsa

Other posibilities
- black, nearc4
- black, nearc4, smsa
- black, nearc4, south
- black, nearc4, smsa, south


**DoWhy**

In [9]:
import dowhy
from dowhy import CausalModel
from IPython.display import Image, display

# a utility function to parse the .gml file to string
def gml_to_string(file):
    gml_str = ''
    with open(file, 'r') as file:
        for line in file:
            gml_str += line.rstrip()
    return gml_str


def dowhy_backdoor(file):

    # creating arbitrary data
    df = pd.read_stata("data/close_college.dta")

    gml_graph = gml_to_string(file)
    # With GML string
    model=CausalModel(
        data = df,
        treatment='educ',
        outcome='lwage',
        graph=gml_graph
    )

    identified_estimand = model.identify_effect()
    print(identified_estimand)

dowhy_backdoor("graph_files/graph_version1.gml")

Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
   d                          
───────(E[lwage|black,nearc4])
d[educ]                       
Estimand assumption 1, Unconfoundedness: If U→{educ} and U→lwage then P(lwage|educ,black,nearc4,U) = P(lwage|educ,black,nearc4)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



#### version 2

In [10]:
from graph import load_graph
import pandas as pd
import networkx as nx
import numpy as np

G = load_graph(version=2)
# nx.draw(G, with_labels=True, node_size=1000, font_size=8)

adjustment_variables = backdoor_criterion(G)
display(adjustment_variables)
print(f"Descendants of educ are: {nx.descendants(G,'educ') | {'educ'}}")
print("These variables cannot be included in the adjustment set.")
dowhy_backdoor("graph_files/graph_version2.gml")


Unnamed: 0,path,colliders,non_colliders
0,"[educ, nearc4, smsa, black, lwage]",[],"[nearc4, smsa, black]"
1,"[educ, nearc4, smsa, lwage]",[],"[nearc4, smsa]"
2,"[educ, black, lwage]",[],[black]
3,"[educ, black, smsa, lwage]",[],"[black, smsa]"
4,"[educ, smsa, black, lwage]",[],"[smsa, black]"
5,"[educ, smsa, lwage]",[],[smsa]


Descendants of educ are: {'lwage', 'educ', 'exper'}
These variables cannot be included in the adjustment set.
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
   d                        
───────(E[lwage|smsa,black])
d[educ]                     
Estimand assumption 1, Unconfoundedness: If U→{educ} and U→lwage then P(lwage|educ,smsa,black,U) = P(lwage|educ,smsa,black)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!



## VERSION 3

In [11]:
from graph import load_graph
import pandas as pd
import networkx as nx
import numpy as np

G = load_graph(version=3)
# nx.draw(G, with_labels=True, node_size=1000, font_size=8)

adjustment_variables = backdoor_criterion(G)
display(adjustment_variables)
print(f"Descendants of educ are: {nx.descendants(G,'educ') | {'educ'}}")
print("These variables cannot be included in the adjustment set.")
dowhy_backdoor("graph_files/graph_version2.gml")


Unnamed: 0,path,colliders,non_colliders
0,"[educ, nearc4, smsa, black, lwage]",[],"[nearc4, smsa, black]"
1,"[educ, nearc4, smsa, black, exper, lwage]",[],"[nearc4, smsa, black, exper]"
2,"[educ, nearc4, smsa, lwage]",[],"[nearc4, smsa]"
3,"[educ, black, lwage]",[],[black]
4,"[educ, black, exper, lwage]",[],"[black, exper]"
5,"[educ, black, smsa, lwage]",[],"[black, smsa]"
6,"[educ, smsa, black, lwage]",[],"[smsa, black]"
7,"[educ, smsa, black, exper, lwage]",[],"[smsa, black, exper]"
8,"[educ, smsa, lwage]",[],[smsa]


Descendants of educ are: {'lwage', 'educ', 'exper'}
These variables cannot be included in the adjustment set.
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
   d                        
───────(E[lwage|smsa,black])
d[educ]                     
Estimand assumption 1, Unconfoundedness: If U→{educ} and U→lwage then P(lwage|educ,smsa,black,U) = P(lwage|educ,smsa,black)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!

