In [None]:
import numpy as np
import pandas as pd
from scipy import stats

from dowhy.causal_model import CausalModel

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# PyData NYC 2023

**Nov 2, 2023, New York**

Traditional machine learning methods leverage associations between variables in order to learn the patterns of variability in the dataset of interest.

This is great when we want to predict the next most likely token or classify a data point. Yet, when decision-making is at stake, these models usually cannot provide us with a clear solution.

A person with a relatively high probability of churn might react negatively to promotional content we send them and churn because of this content. This scenario cannot be effectively modeled in a traditional churn prediction framework and requires a causal approach.

In the talk we’ll demonstrate why this is the case. We’ll discuss theoretical and practical underpinnings of causal models and demonstrate how to implement them in Python.

The talk is addressed to people who want to enrich their data science toolbox and learn about one of the currently hottest sub-fields of artificial intelligence.

In the talk we’ll focus on building the practical understanding of the topic and we’ll use a mixture of hands-on and theoretical approaches.

___________________________

A part of [CausalPython](https://causalpython.io) series on causality.

<a href="https://causalpython.io"><img src="img/CausalPython.io__flat.png" width=150 align="left"></a>
<br>

## Create the environment

To run this notebook use `causal_book_py39_cuda117`.

To create: `conda env create -f causal_book_py39_cuda117.yml`

In [None]:
# Hex colour palette for the notebook's plots; `plot_effect` uses entry 0 for
# the scatter points and entry 1 for the reference line.
COLORS = ['#00B0F0', '#FF0000', '#B0F000']

## Define helpers

In [None]:
def plot_effect(effect_true, effect_pred, figsize=(10, 7), ylim=(5000, 22000)):
    """Scatter predicted effects against true effects with a y = x reference line.

    Parameters
    ----------
    effect_true : array-like
        Ground-truth effects, drawn on the x-axis.
    effect_pred : array-like
        Estimated effects, drawn on the y-axis.
    figsize : tuple, default (10, 7)
        Figure size forwarded to ``plt.figure``.
    ylim : tuple or None, default (5000, 22000)
        ``(lower, upper)`` limits of the y-axis. Pass ``None`` to keep
        matplotlib's automatic limits.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Sort once; the reference line is the sorted true effects plotted against themselves.
    effect_sorted = np.sort(effect_true)

    plt.figure(figsize=figsize)
    plt.scatter(effect_true, effect_pred, color=COLORS[0])
    plt.plot(effect_sorted, effect_sorted, color=COLORS[1], alpha=.7, label='Perfect model')

    # Raw strings keep the backslash in the label without relying on an
    # invalid escape sequence (`\ `) inside a regular string literal.
    plt.xlabel(r'$True\ effect$', fontsize=14, alpha=.5)
    plt.ylabel(r'$Predicted\ effect$', fontsize=14, alpha=.5)

    if ylim is not None:
        plt.ylim(ylim[0], ylim[1])

    plt.legend()
    plt.show()

In [None]:
class GPSMemorySCM:
    """Linear structural causal model over `gps`, `hippocampus` and `memory`.

    Structural equations, as implemented in `sample`:

        gps         = u_x + 0.7 * u              (or the intervened value)
        hippocampus = -0.6 * gps + 0.25 * u_z
        memory      = 0.7 * hippocampus + 0.25 * u

    The noise term `u` enters both `gps` and `memory`.
    """

    def __init__(self, random_seed=None):
        """Set up the noise distributions.

        Parameters
        ----------
        random_seed : int or None, default None
            When not ``None``, NumPy's global random state is re-seeded with
            this value at the start of every `sample` call.
        """
        self.random_seed = random_seed
        # `np.inf` rather than the `np.infty` alias, which NumPy 2.0 removed.
        self.u_x = stats.truncnorm(0, np.inf, scale=5)
        self.u_y = stats.norm(scale=2)
        self.u_z = stats.norm(scale=2)
        self.u = stats.truncnorm(0, np.inf, scale=4)

    def sample(self, sample_size=100, treatment_value=None):
        """Samples from the SCM.

        Parameters
        ----------
        sample_size : int, default 100
            Number of units to draw.
        treatment_value : number or None, default None
            When not ``None``, `gps` is fixed to this value for every unit
            instead of being generated from its noise terms. A value of 0 is
            a valid intervention.

        Returns
        -------
        tuple of np.ndarray
            ``(gps, hippocampus, memory)``, each of length `sample_size`.
        """
        # Compare against None so that a seed of 0 is honoured.
        if self.random_seed is not None:
            np.random.seed(self.random_seed)

        u_x = self.u_x.rvs(sample_size)
        # NOTE(review): u_y is drawn but does not enter any equation below.
        # The draw is kept so the random stream (and seeded output) is unchanged.
        u_y = self.u_y.rvs(sample_size)
        u_z = self.u_z.rvs(sample_size)
        u = self.u.rvs(sample_size)

        # Compare against None so that an intervention setting gps to 0 is applied.
        if treatment_value is not None:
            gps = np.array([treatment_value] * sample_size)
        else:
            gps = u_x + 0.7 * u

        hippocampus = -0.6 * gps + 0.25 * u_z
        memory = 0.7 * hippocampus + 0.25 * u

        return gps, hippocampus, memory

    def intervene(self, treatment_value, sample_size=100):
        """Intervenes on the SCM by sampling with `gps` fixed to `treatment_value`."""
        return self.sample(treatment_value=treatment_value, sample_size=sample_size)

## Get the data

In [None]:
# Instantiate the SCM with a fixed seed so the sample (and every estimate
# derived from it) is the same on each Restart & Run All
scm = GPSMemorySCM(random_seed=45)

# Generate observational data (no intervention applied)
gps_obs, hippocampus_obs, memory_obs = scm.sample(600)

# Encode as a pandas df: X = gps, Z = hippocampus, Y = memory
df = pd.DataFrame({'X': gps_obs, 'Z': hippocampus_obs, 'Y': memory_obs})

In [None]:
# Preview the first rows rather than displaying the whole 600-row frame
df.head()

## Step 1: Modeling the problem

### Step 1.1 - Define the graph - `GML`

In [None]:
# Create the graph describing the causal structure (GML format, directed).
# Edges: X -> Z -> Y, plus U -> X and U -> Y.
# U is a node in the graph only: the DataFrame built for this example
# has columns X, Z and Y, and no column for U.
gml_graph = """
graph [
    directed 1
    
    node [
        id "X" 
        label "X"
    ]    
    node [
        id "Z"
        label "Z"
    ]
    node [
        id "Y"
        label "Y"
    ]
    node [
        id "U"
        label "U"
    ]
    
    edge [
        source "X"
        target "Z"
    ]
    edge [
        source "Z"
        target "Y"
    ]
    edge [
        source "U"
        target "X"
    ]
    edge [
        source "U"
        target "Y"
    ]
]
"""

### Step 1.2 - Define the DoWhy model

In [None]:
# Build the DoWhy model from the observational data and the GML graph,
# declaring X as the treatment and Y as the outcome
model = CausalModel(
    data=df,
    treatment='X',
    outcome='Y',
    graph=gml_graph
)

In [None]:
# View the model's graph to check it matches the intended structure
model.view_model()

## Step 2: Identify the estimand

In [None]:
# Identify an estimand for the effect of the treatment on the outcome
# under the declared graph, then print its description
estimand = model.identify_effect()
print(estimand)

## Step 3: Estimate the causal effect

In [None]:
# Estimate the effect for the identified estimand with the estimator
# selected by `method_name` (front-door, two-stage regression)
estimate = model.estimate_effect(
    identified_estimand=estimand,
    method_name='frontdoor.two_stage_regression')

# Report the point estimate
print(f'Estimate of causal effect (linear regression): {estimate.value}')

## Step 4: Run refutation tests

In [None]:
# Refutation test: run the data-subset refuter against the estimate,
# configured with subset_fraction=0.4
refute_subset = model.refute_estimate(
    estimand=estimand, 
    estimate=estimate,
    method_name="data_subset_refuter", 
    subset_fraction=0.4)

In [None]:
# Print the refuter's report
print(refute_subset)

## Heterogeneous Treatment Effects

AKA **Conditional Average Treatment Effects** (**CATE**)

In [None]:
from econml.dml import LinearDML

from sklearn.linear_model import LinearRegression, LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier

from sklearn.metrics import mean_absolute_percentage_error

### Get the data

In [None]:
# Load the train and test splits (in that order) from the book's public repo;
# the two files differ only in the split name at the end of the URL
earnings_interaction_train, earnings_interaction_test = (
    pd.read_csv(
        'https://raw.githubusercontent.com/PacktPublishing/'
        'Causal-Inference-and-Discovery-in-Python/main/data/'
        f'ml_earnings_interaction_{split}.csv'
    )
    for split in ('train', 'test')
)

### Step 1 - Define the problem

In [None]:
# Graph definition (the graph is constant for all iterations)
nodes = ['took_a_course', 'python_proficiency', 'earnings', 'age']
edges = [
    ('took_a_course', 'earnings'),
    ('age', 'took_a_course'),
    ('age', 'earnings'),
    ('python_proficiency', 'earnings'),
]

# Assemble the GML text in one pass: header, one line per node,
# one line per directed edge, closing bracket
gml_string = ''.join([
    'graph [directed 1\n',
    *(f'\tnode [id "{name}" label "{name}"]\n' for name in nodes),
    *(f'\tedge [source "{src}" target "{dst}"]\n' for src, dst in edges),
    ']',
])

In [None]:
# Instantiate the CausalModel on the training set:
# `took_a_course` is the treatment, `earnings` the outcome, and
# `python_proficiency` is declared as an effect modifier.
# NOTE: this rebinds `model`, replacing the model built earlier in the notebook.
model = CausalModel(
    data=earnings_interaction_train,
    treatment='took_a_course',
    outcome='earnings',
    effect_modifiers='python_proficiency',
    graph=gml_string
)

In [None]:
# View the model's graph to check it matches the declared nodes and edges
model.view_model()

### Step 2 - Get the estimand

In [None]:
# Get the estimand for the effect of `took_a_course` on `earnings`
# NOTE: this rebinds `estimand` from the earlier example.
estimand = model.identify_effect()

# Print its description
print(estimand)

### Step 3 - Get the estimate

In [None]:
# Get estimate (DML): EconML's LinearDML, selected through `method_name`.
# `init_params` configures the estimator: an LGBM regressor as the outcome
# model (`model_y`), logistic regression as the treatment model (`model_t`),
# and the treatment flagged as discrete. No extra fit parameters are passed.
estimate = model.estimate_effect(
    identified_estimand=estimand,
    method_name='backdoor.econml.dml.LinearDML',
    target_units='ate',
    method_params={
        'init_params': {
            'model_y': LGBMRegressor(n_estimators=500, max_depth=10),
            'model_t': LogisticRegression(),
            'discrete_treatment': True
        },
        'fit_params': {}
    })

In [None]:
# Mean of the CATE estimates stored on the estimate object
estimate.cate_estimates.mean()

In [None]:
# Inspect the individual CATE estimates
estimate.cate_estimates

### Step 4 - Refute the model

In [None]:
# YOUR TURN
# See https://www.pywhy.org/dowhy/v0.8/user_guide/effect_inference/refute.html for available refuters

### Model validation

This type of model validation is not possible in most real-world scenarios as it uses a synthetic test set with known ground truth.

We usually don't have this luxury in the real world.

To learn more about causal model validation see **Chapter 10** of [**Causal Inference and Discovery in Python**](https://amzn.to/3QGSiuf).

In [None]:
# Ground-truth effects stored in the test set's `true_effect` column
effect_true = earnings_interaction_test['true_effect'].values

# Predicted effects: pass the test set without the ground-truth and
# treatment columns to the fitted estimator
effect_pred = model.causal_estimator.effect(
    earnings_interaction_test.drop(columns=['true_effect', 'took_a_course'])
)


In [None]:
# Compute the error 
mean_absolute_percentage_error(effect_true, effect_pred)

In [None]:
# Plot predicted against true effects, using plot_effect's default
# figure size and y-axis limits
plot_effect(
    effect_true=effect_true,
    effect_pred=effect_pred,
)

## Learn more about Causality (mostly) for free:
* [Free Causal Repo](https://bit.ly/3QIKOIm)
* [Causal Bandits Podcast](https://bit.ly/3FIkEil)
* [Free Weekly emails](https://bit.ly/3QGHYDn)
* [The Causal Python Book](https://amzn.to/3QGSiuf) (this one's not free, but the [examples repo](https://bit.ly/49lHJoy) is)