In [3]:
import warnings
from causalnex.structure import StructureModel

# Suppress every Python warning for the rest of the session so cell output stays uncluttered.
warnings.filterwarnings(action="ignore")

# Start from an empty structure model; the known edges are added to it in a later cell.
sm = StructureModel()

### Structure from domain knowledge

##### Specify the relationships between features. Assume the following causal relationships are known (G1 is the grade in semester 1):

    health -> absences

    health -> G1


In [2]:
# Register the known causal links in the structure model, one (cause, effect) pair per edge.
known_edges = [
    ('health', 'absences'),
    ('health', 'G1'),
]
sm.add_edges_from(known_edges)

### Visualising the Structure

In [20]:
# It is often more intuitive to visualise the structure. CausalNex provides a plotting module that lets us do this.
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# Draw the hand-built structure using the WEAK node/edge style presets,
# then write the interactive plot to an HTML file.
weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **weak_style)
viz.show("01_simple_plot.html")

01_simple_plot.html


### Learning the Structure

As the number of variables grows, or when domain knowledge does not exist, it can be tedious to define a structure manually. We can use CausalNex to learn the structure model from data. The structure learning algorithm we are going to use here is the NOTEARS algorithm.

When learning structure, we can use the entire dataset. Since structure should be considered as a joint effort between machine learning and domain experts, it is not always necessary to use a train / test split.

But before we begin, we have to pre-process the data so that the NOTEARS algorithm can be used.


### Preparing the Data for Structure Learning

In [12]:
import pandas as pd

import os

# Location of the semicolon-delimited student dataset. The default is the
# original author's machine-specific path; set the STUDENT_DATA_CSV environment
# variable to point at your own copy instead of editing this cell.
DATA_PATH = os.environ.get(
    "STUDENT_DATA_CSV",
    "/home/b/Documents/week8/Logistic-Optimization/data/student-por_cnt.csv",
)

data = pd.read_csv(DATA_PATH, delimiter=';')
data.head(5)  # preview the first five rows

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [13]:
# The features are a mix of numeric and non-numeric columns. Before modelling,
# drop the columns we do not want in the model: sensitive attributes such as
# sex, plus the other columns listed below.
drop_col = [
    'school',
    'sex',
    'age',
    'Mjob',
    'Fjob',
    'reason',
    'guardian',
]
data = data.drop(labels=drop_col, axis="columns")
data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,U,GT3,A,4,4,2,2,0,yes,no,...,4,3,4,1,1,3,4,0,11,11
1,U,GT3,T,1,1,1,2,0,no,yes,...,5,3,3,1,1,3,2,9,11,11
2,U,LE3,T,1,1,1,2,0,yes,no,...,4,3,2,2,3,3,6,12,13,12
3,U,GT3,T,4,2,1,3,0,no,yes,...,3,2,2,1,1,5,0,14,14,14
4,U,GT3,T,3,3,1,2,0,no,yes,...,4,3,2,1,2,5,0,11,13,13


In [14]:
# Next, we make the data numeric, since this is what the NOTEARS algorithm expects. We do this by label-encoding the non-numeric variables.
import numpy as np
# Work on a copy so the cleaned `data` frame is left untouched by the encoding step.
struct_data = data.copy()

# Collect the names of all columns whose dtype is not numeric; these need encoding.
non_numeric_frame = struct_data.select_dtypes(exclude=[np.number])
non_numeric_columns = non_numeric_frame.columns.tolist()
print(non_numeric_columns)

['address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [15]:
from sklearn.preprocessing import LabelEncoder
# Fit a separate encoder for each non-numeric column and keep it in
# `label_encoders`, so the integer codes can later be mapped back to the
# original labels (e.g. label_encoders[col].inverse_transform(codes)).
# Reusing one encoder for every column would retain only the last column's
# mapping. `le` still ends up bound to the encoder of the last column.
label_encoders = {}
for col in non_numeric_columns:
    le = LabelEncoder()
    struct_data[col] = le.fit_transform(struct_data[col])
    label_encoders[col] = le
struct_data.head(5)

Unnamed: 0,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,failures,schoolsup,famsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,0,4,4,2,2,0,1,0,...,4,3,4,1,1,3,4,0,11,11
1,1,0,1,1,1,1,2,0,0,1,...,5,3,3,1,1,3,2,9,11,11
2,1,1,1,1,1,1,2,0,1,0,...,4,3,2,2,3,3,6,12,13,12
3,1,0,1,4,2,1,3,0,0,1,...,3,2,2,1,1,5,0,14,14,14
4,1,0,1,3,3,1,2,0,0,1,...,4,3,2,1,2,5,0,11,13,13


In [16]:
from causalnex.structure.notears import from_pandas
# Learn a structure model from the label-encoded frame with the NOTEARS
# `from_pandas` function. This rebinds `sm`, replacing the hand-built model.
# No w_threshold is passed here, so no edge thresholding is requested.
sm = from_pandas(struct_data)

In [22]:
# The NOTEARS algorithm is applied by the from_pandas call in the cell above to learn the structure.

In [21]:
# Visualise the learned StructureModel with the plotting function.
node_style, edge_style = NODE_STYLE.WEAK, EDGE_STYLE.WEAK
viz = plot_structure(
    sm,
    all_node_attributes=node_style,
    all_edge_attributes=edge_style,
)
viz.toggle_physics(False)  # switch the layout physics off before saving
viz.show("01_fully_connected.html")

01_fully_connected.html


In [23]:
from causalnex.structure.notears import from_pandas
# NOTE(review): this repeats the earlier `sm = from_pandas(struct_data)` call
# with the same input; it re-learns the structure and rebinds `sm`.
sm = from_pandas(struct_data)

In [None]:
#then visualise the learned StructureModel using the plot function.

In [25]:
# Plot the learned structure with the WEAK style presets and save it as HTML.
style_kwargs = dict(
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz = plot_structure(sm, **style_kwargs)

# Switch the layout physics off, then write the plot to disk.
viz.toggle_physics(False)
viz.show("01_fully_connected.html")

01_fully_connected.html


In [27]:
# The graph is fully connected because no thresholding has been applied to the weaker edges yet.
# Thresholding can be applied either through the w_threshold parameter of from_pandas,
# or afterwards by calling the structure model's remove_edges_below_threshold method.
edge_weight_cutoff = 0.8
sm.remove_edges_below_threshold(edge_weight_cutoff)

weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **weak_style)
viz.show("01_thresholded.html")

01_thresholded.html


#### In the learned structure, some relationships appear intuitively correct:

**Pstatus affects famrel** - if parents live apart, the quality of family relationship may be poor as a result.

**internet affects absences** - the presence of internet at home may cause students to skip class.

**studytime affects G1** - longer studytime should have a positive impact on a student’s result.

However, there are some relationships that are certainly incorrect:

**higher affects Medu (Mother’s education)** - this relationship does not make sense, as a student’s wish to pursue higher education cannot affect their mother’s education. It could be the other way round.


#### To avoid these erroneous relationships, re-run structure learning with some added constraints:

In [28]:
# Re-learn the structure with the higher -> Medu edge passed as a tabu edge,
# applying the edge-weight threshold in the same call.
forbidden_edges = [("higher", "Medu")]
sm = from_pandas(struct_data, w_threshold=0.8, tabu_edges=forbidden_edges)

# Plot the constrained structure and save it as HTML.
node_style = NODE_STYLE.WEAK
edge_style = EDGE_STYLE.WEAK
viz = plot_structure(sm, all_node_attributes=node_style, all_edge_attributes=edge_style)
viz.show("01_edge_added.html")

01_edge_added.html
