In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
from ipywidgets import interact, widgets
from IPython.display import display, Markdown

# --- Configuration ---
# Plot styling: white-grid theme, larger fonts, high-resolution figures.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'figure.dpi': 150,
    'font.size': 14,
    'axes.titlesize': 'large',
    'axes.labelsize': 'medium',
    'xtick.labelsize': 'small',
    'ytick.labelsize': 'small',
})
# Array printing: fixed-point notation, four decimals, 120-character lines.
np.set_printoptions(precision=4, suppress=True, linewidth=120)

# --- Utility Functions ---
def note(msg, **kwargs):
    """Render ``msg`` as a Markdown info box in the notebook output.

    Args:
        msg: Text (Markdown allowed) shown after the "Note:" label.
        **kwargs: Accepted for call-site compatibility; not used.
    """
    labelled = f"📝 **Note:** {msg}"
    box_html = f"<div class='alert alert-block alert-info'>{labelled}</div>"
    display(Markdown(box_html))
def sec(title):
    """Print ``title`` in upper case between two 100-character '=' rules.

    Args:
        title: Section heading text.
    """
    rule = "=" * 100
    banner_lines = ["", rule, "| " + title.upper() + " |", rule]
    print("\n".join(banner_lines))

# Show an info box once the setup statements above have run.
note("Environment initialized for Advanced Regression Discontinuity Design.")

# Part 6: Econometrics
## Chapter 6.5: Regression Discontinuity Design: A Local Experiment

### Introduction: Exploiting Arbitrary Cutoffs

The **Regression Discontinuity (RD) design** is one of the most credible and transparent quasi-experimental methods. The core idea is to exploit situations where a treatment is assigned based on whether an individual's score on a continuous variable (the **running variable**) is above or below a specific, arbitrary cutoff.

The logic of RD is that individuals who are *just* on either side of the cutoff are likely to be very similar in all other respects. The cutoff effectively creates a **local randomized experiment**. By comparing the outcomes of individuals just to the left and just to the right of the cutoff, we can obtain a highly credible estimate of the causal effect of the treatment for this specific subpopulation.

This chapter provides a PhD-level treatment of RD, covering:
1.  The formal potential outcomes framework for RD.
2.  Estimation using local polynomial methods.
3.  The crucial issue of optimal bandwidth selection.
4.  Fuzzy RD as an instrumental variables strategy.
5.  Validity checks and the Regression Kink Design (RKD) extension.

## 1. The Formal RD Framework

Let $X_i$ be the running variable for individual $i$ and $c$ be the cutoff. In a **Sharp RD**, treatment $D_i$ is a deterministic function: $D_i = \mathbf{1}\{X_i \ge c\}$.

The parameter of interest is the average causal effect at the cutoff:
$$ \tau_{SRD} = E[Y_i(1) - Y_i(0) | X_i = c] $$ 

### 1.1 The Identification Assumption: Continuity
The central identifying assumption is that the potential outcome functions, $E[Y_i(0)|X_i=x]$ and $E[Y_i(1)|X_i=x]$, are **continuous** in the running variable $x$ at the cutoff $c$. This means that in the absence of the treatment, the average outcome for individuals just below the cutoff would have been the same as for individuals just above it. 

Under this assumption, any observed jump, or **discontinuity**, in the actual outcome function at the cutoff must be attributed solely to the causal effect of the treatment:
$$ \tau_{SRD} = \lim_{x \downarrow c} E[Y_i|X_i=x] - \lim_{x \uparrow c} E[Y_i|X_i=x] $$ 
This assumption would be violated if agents could precisely manipulate their running variable to sort themselves non-randomly around the cutoff.

## 2. Estimation: Local Polynomial Regression

The modern approach to RD estimation is **local polynomial regression**. Instead of fitting a global high-order polynomial, which can be sensitive to data far from the cutoff, we fit a simple polynomial (usually linear or quadratic) within a narrow **bandwidth** ($h$) around the cutoff.

For a **local linear regression**, we run the interacted regression:
$$ Y_i = \beta_0 + \beta_1 (X_i - c) + \beta_2 D_i + \beta_3 [D_i \times (X_i - c)] + \epsilon_i $$ 
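This is estimated using only data for which $|X_i - c| < h$. Because the running variable is centered at the cutoff, the coefficient $\boldsymbol{\beta_2}$ is the estimated jump in the outcome at $X_i = c$, i.e., the RD estimate of the treatment effect.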

A more robust, non-parametric approach is to use **kernel weighting**, which gives more weight to observations closer to the cutoff within the bandwidth. The most common is a **triangular kernel**.

### 2.1 The Bias-Variance Trade-off and Optimal Bandwidth Selection

The choice of bandwidth $h$ is crucial and involves a **bias-variance trade-off**:
- **Large Bandwidth:** Reduces the variance of the estimate (more data points) but increases the risk of bias from misspecification if the true underlying function is non-linear.
- **Small Bandwidth:** Reduces bias from non-linearities but increases the variance (fewer data points).

Modern RD practice relies on **data-driven optimal bandwidth selectors**. These methods aim to find the bandwidth $h_{opt}$ that minimizes the Asymptotic Mean Squared Error (AMSE) of the RD estimator. Leading methods include:
- **Imbens-Kalyanaraman (IK, 2012):** A plug-in estimator that involves estimating nuisance parameters related to the curvature of the regression function and the variance at the cutoff.
- **Calonico, Cattaneo, and Titiunik (CCT, 2014):** An alternative approach that provides bias-corrected inference and is now widely used.

These methods automate the difficult choice of bandwidth, making RD analysis more objective and replicable.

In [None]:
sec("Interactive Lab: The Bias-Variance Trade-off in RD")

# Simulated sharp RD: a cubic conditional mean plus a jump of `true_effect`
# at the cutoff, with Gaussian noise (sd = 5).
rng = np.random.default_rng(seed=123)
n_obs, cutoff, true_effect = 1000, 0.0, 25.0

running_var = rng.uniform(-1, 1, n_obs)
treatment = (running_var >= cutoff).astype(int)
outcome = (50 + 15*running_var + 20*running_var**2 - 10*running_var**3
           + true_effect*treatment + rng.normal(0, 5, n_obs))

df = pd.DataFrame({
    'running_var': running_var,
    'outcome': outcome,
    'treatment': treatment,
}).assign(running_var_centered=lambda frame: frame['running_var'] - cutoff)

@interact(bandwidth=widgets.FloatSlider(min=0.05, max=1.0, step=0.05, value=0.5, description='Bandwidth (h)'))
def plot_rd_bandwidth(bandwidth):
    """Fit and plot a local linear RD using observations within ``bandwidth`` of the cutoff.

    Reads the module-level frame ``df`` (columns ``running_var_centered``,
    ``treatment``, ``outcome``) without modifying it. Observations inside the
    window are weighted equally (uniform kernel); the model lets intercept and
    slope differ on each side of the cutoff and uses HC1 robust standard errors.

    Args:
        bandwidth: Half-width of the estimation window, in units of the
            centered running variable.

    Displays the RD plot and an info box with the estimated jump and its
    95% confidence interval.
    """
    centered = df['running_var_centered']
    df_local = df.loc[centered.abs() < bandwidth]

    # The interacted model has four coefficients; refuse to fit it when one
    # side of the cutoff is (nearly) empty instead of raising inside the widget.
    side_counts = df_local['treatment'].value_counts()
    if len(side_counts) < 2 or side_counts.min() < 3:
        note(f"Bandwidth {bandwidth:.2f} leaves too few observations on one side of the cutoff to fit the model.")
        return

    # Binned means over the full sample. The bin labels live in a local Series
    # so the shared `df` is not mutated on every slider move.
    bin_labels = pd.cut(centered, bins=40).rename('binned_rv')
    binned_data = df.groupby(bin_labels, observed=True)['outcome'].mean().reset_index()
    # Plain floats (not a categorical) so the points share the numeric x-axis
    # with the fitted lines.
    binned_data['binned_mid'] = [interval.mid for interval in binned_data['binned_rv']]

    formula = 'outcome ~ running_var_centered * treatment'
    rd_model = smf.ols(formula, data=df_local).fit(cov_type='HC1')

    x_below = np.linspace(-bandwidth, 0, 100)
    x_above = np.linspace(0, bandwidth, 100)
    pred_below = rd_model.predict(pd.DataFrame({'running_var_centered': x_below, 'treatment': 0}))
    pred_above = rd_model.predict(pd.DataFrame({'running_var_centered': x_above, 'treatment': 1}))

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=binned_data, x='binned_mid', y='outcome', ax=ax, color='gray', label='Binned Averages')
    ax.plot(x_below, pred_below, 'b-', lw=3, label='Control Fit')
    ax.plot(x_above, pred_above, 'r-', lw=3, label='Treatment Fit')
    # The x-axis is the centered running variable, so the cutoff sits at 0
    # regardless of what the global `cutoff` holds after later cells run.
    ax.axvline(0.0, color='k', ls='--', label='Cutoff')
    ax.set(title=f'RD Plot (Bandwidth = {bandwidth:.2f})',
           xlabel='Running Variable (centered at cutoff)', ylabel='Outcome')
    ax.legend()
    plt.show()

    ci_low, ci_high = rd_model.conf_int().loc['treatment']
    note(f"Estimated Effect: {rd_model.params['treatment']:.3f} (95% CI: [{ci_low:.3f}, {ci_high:.3f}])")

### 2.2 Fuzzy RD as Instrumental Variables

In a **Fuzzy RD** design, crossing the cutoff only changes the *probability* of receiving treatment, but does not determine it perfectly. This is common in practice, for example, when a program is offered to all individuals above a certain age, but not all eligible individuals take it up.

This setup is perfectly analyzed using the **cutoff as an instrumental variable (IV)** for treatment status. The logic is as follows:
1.  **First Stage:** The instrument (a dummy for being above the cutoff, $Z_i = \mathbf{1}\{X_i \ge c\}$) must be a strong predictor of the actual treatment received, $D_i$. We can test this by examining the discontinuity in the probability of treatment at the cutoff.
2.  **Exclusion Restriction:** The instrument $Z_i$ must affect the outcome $Y_i$ *only* through its effect on the treatment $D_i$. This is guaranteed by the continuity assumption of the RD design.

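The Fuzzy RD estimand is the ratio of the jump in the outcome to the jump in the treatment probability at the cutoff. Under monotonicity (crossing the cutoff never makes anyone less likely to take the treatment), it identifies the **local average treatment effect (LATE)** for compliers at the cutoff: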
$$ \tau_{FRD} = \frac{\lim_{x \downarrow c} E[Y_i|X_i=x] - \lim_{x \uparrow c} E[Y_i|X_i=x]}{\lim_{x \downarrow c} E[D_i|X_i=x] - \lim_{x \uparrow c} E[D_i|X_i=x]} $$

In [None]:
sec("Fuzzy RD Example: Estimating the LATE")

# --- Simulated fuzzy design ---
rng = np.random.default_rng(seed=42)
n_obs, cutoff, true_effect = 2000, 0.0, 15.0
running_var_f = rng.uniform(-1, 1, n_obs)
instrument_z = (running_var_f >= cutoff).astype(int)

# First stage: crossing the cutoff raises the take-up probability from 20% to 80%.
prob_d = 0.2 + 0.6 * instrument_z
treatment_d = rng.binomial(1, prob_d)

outcome_y = 50 + true_effect * treatment_d + 10 * running_var_f + rng.normal(0, 5, n_obs)

# --- 2SLS: `d` instrumented by `z`, with side-specific slopes in the centered running variable ---
df_fuzzy = pd.DataFrame({
    'y': outcome_y,
    'd': treatment_d,
    'z': instrument_z,
    'x': running_var_f,
}).assign(
    x_centered=lambda frame: frame['x'] - cutoff,
    z_x_interact=lambda frame: frame['z'] * frame['x_centered'],
)
formula = 'y ~ 1 + x_centered + z_x_interact + [d ~ z]'
iv_model = IV2SLS.from_formula(formula, data=df_fuzzy).fit(cov_type='robust')

note("Fuzzy RD Results using 2SLS")
display(iv_model.summary)

## 3. Validity Checks

The credibility of any RD design rests on its core assumptions, which must be tested.

1.  **Manipulation Test:** We must check for non-random sorting around the cutoff. A common method is the **McCrary Density Test**, which formally tests for a discontinuity in the density of the running variable at the cutoff. A significant jump suggests manipulation.

2.  **Placebo Tests:** We should test for discontinuities in **predetermined covariates**. These are variables (e.g., gender, race) whose values are determined before the treatment. They should be smooth through the cutoff. Finding a jump in a placebo outcome would be a major red flag.

3.  **Placebo Cutoffs:** We can test for discontinuities at points other than the true cutoff. We should not find any significant effects at these placebo cutoffs.

## 4. Regression Kink Design (RKD)

The **Regression Kink Design (RKD)** is an extension of RD used when the treatment itself does not change discontinuously, but its *slope* with respect to the running variable does. The causal effect is identified by the change in the slope of the outcome variable at the kink point, divided by the change in the slope of the treatment at that point.

**Example:** In many tax systems, the marginal tax rate changes at specific income thresholds. The treatment is the tax liability, which is a function of income (the running variable). This function is continuous, but its slope (the marginal tax rate) changes at the kink points. RKD allows us to estimate behavioral responses to the marginal tax rate, such as the elasticity of taxable income, by comparing the change in the slope of the outcome's conditional mean at a kink point with the change in the slope of the tax schedule there.

In [None]:
sec("Regression Kink Design (RKD) Illustration")
# Generate synthetic data for an RKD
rng = np.random.default_rng(seed=789)
n_obs = 2000; kink_point = 50000
running_var_k = rng.uniform(20000, 80000, n_obs)
running_var_centered_k = running_var_k - kink_point

# Policy variable (e.g., tax liability): continuous in income, but its slope
# rises from `base_rate` to `base_rate + policy_slope_change` at the kink.
base_rate = 0.1
policy_slope_change = 0.15
treatment_k = base_rate * running_var_k + policy_slope_change * np.maximum(0, running_var_centered_k)
# The outcome is linear in the policy variable, so it inherits a kink of
# true_rkd_effect * policy_slope_change in its conditional mean.
true_rkd_effect = 0.5
outcome_k = 1000 + true_rkd_effect * treatment_k + rng.normal(0, 500, n_obs)

df_kink = pd.DataFrame({'income': running_var_k, 'outcome': outcome_k,
                        'income_centered': running_var_centered_k,
                        'above_kink': (running_var_k >= kink_point).astype(int)})

# Separate intercept and slope on each side of the kink; the interaction
# coefficient is the change in the outcome's slope at the kink (reduced form).
# HC1 robust standard errors match the other estimation cells.
rkd_formula = 'outcome ~ income_centered * above_kink'
rkd_model = smf.ols(rkd_formula, data=df_kink).fit(cov_type='HC1')

# Sharp RKD estimate: outcome slope change divided by the known policy slope change.
outcome_slope_change = rkd_model.params['income_centered:above_kink']
rkd_estimate = outcome_slope_change / policy_slope_change

fig, ax = plt.subplots(figsize=(12,8))
sns.scatterplot(data=df_kink, x='income', y='outcome', alpha=0.3, ax=ax)
x_range = np.linspace(df_kink['income'].min(), df_kink['income'].max(), 200)
pred_df = pd.DataFrame({'income': x_range, 'income_centered': x_range - kink_point,
                        'above_kink': (x_range >= kink_point).astype(int)})
ax.plot(x_range, rkd_model.predict(pred_df), 'r-', lw=3, label='RKD Fit')
ax.axvline(kink_point, color='k', ls='--', label='Kink Point')
ax.set(title='Regression Kink Design', xlabel='Running Variable (Income)', ylabel='Outcome')
ax.legend(); plt.show()

note(f"The change in slope is given by the coefficient on the interaction term 'income_centered:above_kink': {outcome_slope_change:.3f}. "
     f"Dividing by the policy slope change ({policy_slope_change:.2f}) gives the RKD estimate: {rkd_estimate:.3f} (true effect: {true_rkd_effect}).")