# Causal Inference with Real TLC Traffic Data

This notebook integrates actual taxi pickup volumes from TLC as a traffic proxy, replacing the synthetic `Traffic_Proxy` with real `traffic_count`.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dowhy import CausalModel
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
%matplotlib inline

## 1. Load Full Panel and TLC Traffic

In [4]:
# Load full panel
print("Loading full H3 panel...")
panel = pd.read_csv('../data/h3_full_panel_res8.csv')
panel['date'] = pd.to_datetime(panel['date'])
panel['datetime'] = panel['date'] + pd.to_timedelta(panel['hour'], unit='h')

print(f"Panel shape: {panel.shape}")
print(f"Columns: {panel.columns.tolist()}")

# Load TLC traffic
print("\nLoading TLC traffic data...")
traffic = pd.read_parquet('../data/traffic_h3_2022_2025_polyfill.parquet')
traffic['match_hour'] = pd.to_datetime(traffic['match_hour'])

print(f"Traffic shape: {traffic.shape}")
print(f"Traffic date range: {traffic['match_hour'].min()} to {traffic['match_hour'].max()}")
print(f"Unique H3 cells: {traffic['h3_index'].nunique()}")

Loading full H3 panel...
Panel shape: (38871480, 14)
Columns: ['h3_index', 'date', 'hour', 'accidents_count', 'accident_indicator', 'day_of_week', 'is_weekend', 'month', 'is_rush_hour', 'Traffic_Proxy', 'Baseline_Risk', 'rain_flag', 'precipitation', 'datetime']

Loading TLC traffic data...
Traffic shape: (11619229, 3)
Traffic date range: 2022-01-01 01:00:00 to 2025-11-01 00:00:00
Unique H3 cells: 1070


## 2. Merge Traffic Data into Panel

In [5]:
# Merge traffic onto panel
print("Merging traffic data...")
df = panel.merge(
    traffic[['h3_index', 'match_hour', 'traffic_count']], 
    left_on=['h3_index', 'datetime'], 
    right_on=['h3_index', 'match_hour'],
    how='left'
)

# Fill missing traffic with 0 (no pickups)
df['traffic_count'] = df['traffic_count'].fillna(0)

print(f"✓ Merged shape: {df.shape}")
print(f"Traffic coverage: {(df['traffic_count'] > 0).sum():,} / {len(df):,} ({(df['traffic_count'] > 0).mean()*100:.2f}%)")

# Drop old Traffic_Proxy and match_hour columns
df.drop(columns=['Traffic_Proxy', 'match_hour'], inplace=True, errors='ignore')

print(f"\nTraffic statistics:")
print(df['traffic_count'].describe())

Merging traffic data...
✓ Merged shape: (38871480, 16)
Traffic coverage: 10,796,648 / 38,871,480 (27.78%)

Traffic statistics:
count    3.887148e+07
mean     3.197427e+00
std      2.120617e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.428571e-01
max      1.239000e+03
Name: traffic_count, dtype: float64


## 3. Prepare Data for DoWhy

In [6]:
# Select analysis variables (replace Traffic_Proxy with traffic_count)
analysis_vars = ['accident_indicator', 'accidents_count', 'rain_flag', 'day_of_week', 'is_weekend', 
                 'month', 'is_rush_hour', 'Baseline_Risk', 'traffic_count']

print("Missing values in analysis variables:")
print(df[analysis_vars].isnull().sum())

# Drop rows with missing Baseline_Risk
df_clean = df[analysis_vars].dropna().copy()

# Ensure binary types
df_clean['accident_indicator'] = df_clean['accident_indicator'].astype(int)
df_clean['rain_flag'] = df_clean['rain_flag'].astype(int)

print(f"\nRows after dropping missing values: {len(df_clean):,} (retained {len(df_clean)/len(df)*100:.2f}%)")

# Class balance
print(f"\nClass balance:")
print(f"  Rain hours: {df_clean['rain_flag'].sum():,} ({df_clean['rain_flag'].mean()*100:.2f}%)")
print(f"  Accident hours: {df_clean['accident_indicator'].sum():,} ({df_clean['accident_indicator'].mean()*100:.2f}%)")
print(f"  Hours with traffic: {(df_clean['traffic_count'] > 0).sum():,} ({(df_clean['traffic_count'] > 0).mean()*100:.2f}%)")

# Cross-tabulation: rain vs accident
print(f"\nCross-tabulation (Rain vs Accident):")
crosstab = pd.crosstab(df_clean['rain_flag'], df_clean['accident_indicator'], 
                       margins=True, normalize='index')
print(crosstab)

Missing values in analysis variables:
accident_indicator       0
accidents_count          0
rain_flag                0
day_of_week              0
is_weekend               0
month                    0
is_rush_hour             0
Baseline_Risk         1135
traffic_count            0
dtype: int64

Rows after dropping missing values: 38,870,345 (retained 100.00%)

Class balance:
  Rain hours: 4,182,475 (10.76%)
  Accident hours: 334,796 (0.86%)
  Hours with traffic: 10,796,648 (27.78%)

Cross-tabulation (Rain vs Accident):
accident_indicator         0         1
rain_flag                             
0                   0.991481  0.008519
1                   0.990605  0.009395
All                 0.991387  0.008613


## 4. Define Causal DAG with Real Traffic

In [7]:
# Updated DAG with traffic_count
causal_graph = """
digraph {
    day_of_week -> rain_flag;
    day_of_week -> traffic_count;
    day_of_week -> accident_indicator;
    
    is_weekend -> rain_flag;
    is_weekend -> traffic_count;
    is_weekend -> accident_indicator;
    
    month -> rain_flag;
    month -> accident_indicator;
    
    is_rush_hour -> traffic_count;
    is_rush_hour -> accident_indicator;
    
    Baseline_Risk -> accident_indicator;
    traffic_count -> accident_indicator;
    
    rain_flag -> accident_indicator;
}
"""

print("✓ Causal DAG defined with real traffic_count")
print("\nDAG structure:")
print("  Confounders → Rain")
print("  Confounders → Traffic (actual TLC pickups)")
print("  Confounders → Accident Indicator")
print("  Rain → Accident Indicator (causal effect of interest)")

✓ Causal DAG defined with real traffic_count

DAG structure:
  Confounders → Rain
  Confounders → Traffic (actual TLC pickups)
  Confounders → Accident Indicator
  Rain → Accident Indicator (causal effect of interest)


## 5. Initialize DoWhy Model

In [8]:
# Initialize causal model with traffic_count
print("Initializing DoWhy causal model...")

model = CausalModel(
    data=df_clean,
    treatment='rain_flag',
    outcome='accident_indicator',
    graph=causal_graph,
    common_causes=['day_of_week', 'is_weekend', 'month', 'is_rush_hour', 
                   'Baseline_Risk', 'traffic_count']
)

print("\n✓ Causal model initialized")
print(f"  Treatment: rain_flag")
print(f"  Outcome: accident_indicator")
print(f"  Confounders: day_of_week, is_weekend, month, is_rush_hour, Baseline_Risk, traffic_count (TLC actual)")

Initializing DoWhy causal model...

✓ Causal model initialized
  Treatment: rain_flag
  Outcome: accident_indicator
  Confounders: day_of_week, is_weekend, month, is_rush_hour, Baseline_Risk, traffic_count (TLC actual)


## 6. Identify Causal Effect

In [10]:
# Identify estimand
print("Identifying causal estimand...")
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

print("\n" + "="*60)
print("IDENTIFIED ESTIMAND")
print("="*60)
print(identified_estimand)

Identifying causal estimand...

IDENTIFIED ESTIMAND
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d                                                          
────────────(E[accident_indicator|day_of_week,month,is_weekend])
d[rain_flag]                                                    
Estimand assumption 1, Unconfoundedness: If U→{rain_flag} and U→accident_indicator then P(accident_indicator|rain_flag,day_of_week,month,is_weekend,U) = P(accident_indicator|rain_flag,day_of_week,month,is_weekend)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!

### Estimand : 4
Estimand name: general_adjustment
Estimand expression:
     d                                                          
────────────(E[accident_indicator|month,is_weekend,day_of_week])
d[rain_flag]                                                    
Estimand assumption 1, Unconf

## 7. Estimate ATE with Propensity Score Weighting

In [11]:
# Estimate using propensity score weighting
print("Estimating causal effect using Propensity Score Weighting...")

estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_weighting",
    target_units="ate"
)

print("\n" + "="*60)
print("CAUSAL EFFECT ESTIMATE (ATE)")
print("="*60)
print(f"Treatment: Rain (rain_flag = 1)")
print(f"Outcome: Accident Indicator (binary)")
print(f"Method: Propensity Score Weighting")
print(f"Confounders: Now includes REAL TLC traffic volume")

print(f"\nAverage Treatment Effect (ATE): {estimate.value:.6f}")

print(f"\nInterpretation:")
if estimate.value > 0:
    print(f"  Rain INCREASES probability of any crash by {estimate.value*100:.4f} percentage points.")
elif estimate.value < 0:
    print(f"  Rain DECREASES probability of any crash by {abs(estimate.value)*100:.4f} percentage points.")
else:
    print(f"  Rain has NO EFFECT on probability of any crash.")
print("="*60)

Estimating causal effect using Propensity Score Weighting...

CAUSAL EFFECT ESTIMATE (ATE)
Treatment: Rain (rain_flag = 1)
Outcome: Accident Indicator (binary)
Method: Propensity Score Weighting
Confounders: Now includes REAL TLC traffic volume

Average Treatment Effect (ATE): 0.000913

Interpretation:
  Rain INCREASES probability of any crash by 0.0913 percentage points.


## 8. Compare Results: Proxy vs Real Traffic

In [13]:
print("\n" + "#"*60)
print("COMPARISON: TRAFFIC PROXY vs REAL TLC DATA")
print("#"*60)

# Previous result from 04_causal_inference.ipynb
ate_with_proxy = 0.000913

print(f"\nPrevious (Traffic_Proxy):  ATE = {ate_with_proxy:.6f} ({ate_with_proxy*100:.4f} pp)")
print(f"Current  (TLC traffic_count): ATE = {estimate.value:.6f} ({estimate.value*100:.4f} pp)")

diff = estimate.value - ate_with_proxy
pct_change = (diff / ate_with_proxy) * 100 if ate_with_proxy != 0 else 0

print(f"\nDifference: {diff:.6f} ({pct_change:+.2f}% change)")

if abs(pct_change) < 5:
    print("\u2192 Results are CONSISTENT - proxy was a good approximation")
elif abs(pct_change) < 20:
    print("\u2192 Results are SIMILAR - real traffic provides refinement")
else:
    print("\u2192 Results DIFFER substantially - real traffic reveals new dynamics")

print("#"*60)


############################################################
COMPARISON: TRAFFIC PROXY vs REAL TLC DATA
############################################################

Previous (Traffic_Proxy):  ATE = 0.000913 (0.0913 pp)
Current  (TLC traffic_count): ATE = 0.000913 (0.0913 pp)

Difference: -0.000000 (-0.05% change)
→ Results are CONSISTENT - proxy was a good approximation
############################################################
