In [None]:
!wget https://raw.githubusercontent.com/rpryzant/causal-bert-pytorch/master/CausalBert.py
!wget https://raw.githubusercontent.com/rpryzant/causal-bert-pytorch/master/testdata.csv
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import networkx as nx

import matplotlib.pyplot as plt
# Apply matplotlib's built-in 'fivethirtyeight' style sheet to every figure below.
plt.style.use('fivethirtyeight')

In [None]:
# Hex RGB palette for the plots; the graph drawings use the first entry as node colour.
COLORS = ['#00B0F0', '#FF0000', '#B0F000']

In [None]:
# Seed NumPy's global (legacy) generator so the simulated data below is repeatable.
SEED = 18
np.random.seed(SEED)

# Causality vs NLP (PyCon DE & PyData Berlin 2023)

## DGP 1

In [None]:
# Causal graph for DGP 1: W points into both X and Y (a common cause), and X points into Y.
# Passing the edge list to the constructor adds the same edges as add_edges_from would.
dag_1 = nx.DiGraph([('X', 'Y'), ('W', 'X'), ('W', 'Y')])

# Draw the graph with labelled nodes in the first palette colour.
nx.draw(
    dag_1,
    node_color=COLORS[0],
    node_size=900,
    with_labels=True,
    font_color='white',
)

In [None]:
SAMPLE_SIZE = 1000

# DGP 1: w feeds into both x and y; x additionally feeds into y.
# The draw order (w, then x's noise, then y's noise) is kept so the random stream
# is consumed exactly as before.
w = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
noise_x = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
x = w + 0.5*noise_x
noise_y = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
y = 5*x + 2*w + 0.5*noise_y

# Regressor matrix of shape (SAMPLE_SIZE, 2): column 0 is x, column 1 is w.
feats = np.column_stack((x, w))

In [None]:
# statsmodels' OLS fits no intercept unless the design matrix carries a constant
# column, so prepend one (prepend=True is add_constant's default).
feats = sm.add_constant(feats, prepend=True)

# Regress y on [const, x, w].
model = sm.OLS(endog=y, exog=feats)
fitted_model = model.fit()

# Report the fit with readable coefficient names.
ols_summary = fitted_model.summary(xname=['const', 'x', 'w'])
print(ols_summary)

## DGP 2

In [None]:
# Causal graph for DGP 2: X points into Y, and both X and Y point into W (a common effect).
# Passing the edge list to the constructor adds the same edges as add_edges_from would.
dag_2 = nx.DiGraph([('X', 'Y'), ('X', 'W'), ('Y', 'W')])

# Draw the graph with labelled nodes in the first palette colour.
nx.draw(
    dag_2,
    node_color=COLORS[0],
    node_size=900,
    with_labels=True,
    font_color='white',
)

In [None]:
# DGP 2: x is drawn independently, y depends on x, and w is built from both x and y.
# The draw order (x, then y's noise, then w's noise) is kept so the random stream
# is consumed exactly as before.
x = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
noise_y = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
y = 5*x + 0.5*noise_y
noise_w = np.random.normal(loc=0, scale=1, size=SAMPLE_SIZE)
w = 5*x + 2*y + 0.2*noise_w

# Regressor matrix of shape (SAMPLE_SIZE, 2): column 0 is x, column 1 is w.
feats = np.column_stack((x, w))

In [None]:
# statsmodels' OLS fits no intercept unless the design matrix carries a constant
# column, so prepend one (prepend=True is add_constant's default).
feats = sm.add_constant(feats, prepend=True)

# Regress y on [const, x, w].
model = sm.OLS(endog=y, exog=feats)
fitted_model = model.fit()

# Report the fit with readable coefficient names.
ols_summary = fitted_model.summary(xname=['const', 'x', 'w'])
print(ols_summary)

## CausalBert

In [None]:
from CausalBert import CausalBertWrapper

In [None]:
# Read the sample dataset from the working directory into a DataFrame.
df = pd.read_csv('testdata.csv')

# Preview the first five rows (head's default).
df.head(5)

In [None]:
# Build the CausalBert wrapper.
# NOTE(review): g_weight / Q_weight / mlm_weight are presumably loss-term weights
# (treatment head, outcome head, masked-LM) — confirm against CausalBert.py.
causal_bert = CausalBertWrapper(
    g_weight=0.1,
    Q_weight=0.1,
    mlm_weight=1,
    batch_size=32,
)

### What's inside?


```python
class CausalBertWrapper:
    """Model wrapper in charge of training and inference."""

    def __init__(self, g_weight=1.0, Q_weight=0.1, mlm_weight=1.0,
        batch_size=32):
        self.model = CausalBert.from_pretrained(
            "distilbert-base-uncased",
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
        if CUDA:
            self.model = self.model.cuda()
```





---



In [None]:
# Train for two epochs on the text, confounder (C), treatment (T) and outcome (Y) columns.
causal_bert.train(
    texts=df['text'],
    confounds=df['C'],
    treatments=df['T'],
    outcomes=df['Y'],
    epochs=2,
)

In [None]:
# Query the trained wrapper's ATE (presumably the average treatment effect estimate)
# from the confounder column and the raw texts; the value is shown as the cell output.
causal_bert.ATE(C=df['C'], W=df['text'])