# Libraries Usage

In [1]:
import polars as pl
import pandas as pd
import numpy as np 
from pathlib import Path
from tqdm import tqdm
import arviz as az 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import rdrobust
import rddensity
from io import BytesIO
from datetime import datetime
from rdrobust import rdrobust, rdplot
import warnings, base64, os
warnings.filterwarnings("ignore")

random_seed = 123
rng = np.random.default_rng(random_seed)

In [2]:
def calculate_true_cutoff_age(birth_date: int) -> float:
    """
    Calculate exact age when coverage ends.
    
    Algorithm:
    1. Extract birth year/month from YYYYMM format
    2. Determine when they turn 18
    3. Find first March 31 after that
    4. Calculate age at that date
    
    Edge cases handled:
    - Born on March 31 ‚Üí loses coverage on 18th birthday
    - Born on March 1 ‚Üí loses coverage 30 days after 18th birthday
    - Born on April 1 ‚Üí loses coverage ~364 days after 18th birthday
    """
    
    # Parse birth date
    birth_year = birth_date // 100   # 200503 ‚Üí 2005
    birth_month = birth_date % 100    # 200503 ‚Üí 3
    
    # Validate input
    if not (1 <= birth_month <= 12):
        raise ValueError(f"Invalid birth month: {birth_month}")
    
    # Determine cutoff date
    if birth_month <= 3:  # Jan, Feb, Mar
        # Lose coverage on March 31 of year they turn 18
        cutoff_year = birth_year + 18
        cutoff_month = 3
    else:  # Apr through Dec
        # Lose coverage on March 31 of year AFTER they turn 18
        cutoff_year = birth_year + 19
        cutoff_month = 3
    
    # Calculate fractional age at cutoff
    years_lived = cutoff_year - birth_year
    months_difference = cutoff_month - birth_month
    
    age_at_cutoff = years_lived + (months_difference / 12.0)
    
    return age_at_cutoff

In [4]:
def calculate_true_cutoff_age(birth_date: int) -> float:
    """
    Calculate the true age (in years) at the end of March of the fiscal year
    when a child loses eligibility (e.g., at 18 years old by March 31).

    Args:
        birth_date (int): in YYYYMM format (e.g., 200504 for April 2005)

    Returns:
        float: Age in years at cutoff (March)
    """
    birth_year = birth_date // 100
    birth_month = birth_date % 100

    # If born in Jan‚ÄìMar ‚Üí lose in Mar of the same year they turn 18
    # If born in Apr‚ÄìDec ‚Üí lose in Mar of the NEXT year they turn 18
    if birth_month <= 3:
        cutoff_year = birth_year + 18
    else:
        cutoff_year = birth_year + 19

    cutoff_month = 3  # March cutoff
    age = (cutoff_year - birth_year) + (cutoff_month - birth_month) / 12
    return age


# # === Test comprehensive birth month scenarios ===
# test_cases = [
#     (200501, 18.17, "Born Jan ‚Üí lose Mar same year"),
#     (200502, 18.08, "Born Feb ‚Üí lose Mar same year"),
#     (200503, 18.00, "Born Mar ‚Üí lose Mar same year (same month!)"),
#     (200504, 18.92, "Born Apr ‚Üí lose Mar NEXT year"),
#     (200505, 18.83, "Born May ‚Üí lose Mar NEXT year"),
#     (200506, 18.75, "Born Jun ‚Üí lose Mar NEXT year"),
#     (200507, 18.67, "Born Jul ‚Üí lose Mar NEXT year"),
#     (200508, 18.58, "Born Aug ‚Üí lose Mar NEXT year"),
#     (200509, 18.50, "Born Sep ‚Üí lose Mar NEXT year"),
#     (200510, 18.42, "Born Oct ‚Üí lose Mar NEXT year"),
#     (200511, 18.33, "Born Nov ‚Üí lose Mar NEXT year"),
#     (200512, 18.25, "Born Dec ‚Üí lose Mar NEXT year"),
# ]

# for birth_date, expected, desc in test_cases:
#     result = calculate_true_cutoff_age(birth_date)
#     print(f"{desc}")
#     print(f"  Birth: {birth_date}, Cutoff age: {result:.2f}, Expected: {expected:.2f}")
#     assert abs(result - expected) < 0.01, f"Failed: {desc}"
#     print("  ‚úÖ PASS\n")

In [None]:
def process_parquet_folder_RDD(
    folder_path: str, 
    treatment_area: int = 231002,
    date_base: int = 202201,  # baseline policy month (YYYYMM)
    age_base: int = 19,        # Cohort of this RDD
    Y: str = "ika_out_req_amt"
) -> pl.DataFrame:
    """
    Process Parquet files (treatment area only) for RDD analysis.
    
    Steps:
        1. Compute baseline age (float) using correct formula
        2. Keep patients aged <19 at baseline
        3. Compute current age (float) using correct formula
        4. Compute Œî1 (¬±1 month) and Œî2 (¬±2 months) flags based on age_base
        5. Adjust birth_date: people in Delta1 get mapped to "center" birth month
        6. Recalculate age_adjusted using adjusted birth_date
        7. Keep only treatment area
        8. Flag D=1 for patients who used public expense after policy
        9. Keep only D=1 and Œî1 samples
    """

    parquet_dir = Path(folder_path)
    parquet_files = list(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {folder_path}")

    print(f"Processing treatment area: {treatment_area}")
    print(f"Baseline: {date_base}, RDD age_base: {age_base}\n")

    merged_results = []

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # Process all parquet files
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    for parquet in tqdm(parquet_files, desc="Processing Parquet Files"):
        lf = pl.scan_parquet(parquet)

        # ÊÄßÂà•„ÉªÂÆ∂ÊóèÂå∫ÂàÜ ‚Üí Êï∞ÂÄ§Âåñ
        lf = lf.with_columns([
            pl.when(pl.col("sex_type_nm") == "Áî∑").then(0)
              .when(pl.col("sex_type_nm") == "Â•≥").then(1)
              .otherwise(None)
              .alias("sex_type_nm").cast(pl.Float64),
            pl.when(pl.col("rezept_family_type_nm") == "ÂÆ∂Êóè").then(0)
              .when(pl.col("rezept_family_type_nm") == "Êú¨‰∫∫").then(1)
              .otherwise(None)
              .alias("rezept_family_type_nm").cast(pl.Float64)
        ])

        # ËÅ∑Ê•≠„ÉªÂπ¥Âèé„Ç´„ÉÜ„Ç¥„É™Â§âÊèõ
        lf = lf.with_columns([
            pl.col("business_type").cast(pl.Categorical).to_physical().alias("business_type_num"),
            pl.col("annual_salary_rank").cast(pl.Categorical).to_physical().alias("annual_salary_rank_num")
        ])

        # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
        # AGE CALCULATIONS - CORRECTED FORMULA
        # ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
        
        # Step 1: Baseline age (at date_base) using ORIGINAL birth_date
        lf = lf.with_columns([
            (
                (date_base // 100 - pl.col("birth_date") // 100)
                + ((date_base % 100 - pl.col("birth_date") % 100) / 12.0)
            ).cast(pl.Float64).alias(f"age_at_{date_base}")
        ])

        lf = lf.filter((pl.col(f"age_at_{date_base}") >= 0) & (pl.col(f"age_at_{date_base}") <= 25))

        # Step 2: Current age using ORIGINAL birth_date
        lf = lf.with_columns([
            (
                (pl.col("medtreat_yymm") // 100 - pl.col("birth_date") // 100)
                + ((pl.col("medtreat_yymm") % 100 - pl.col("birth_date") % 100) / 12.0)
            ).cast(pl.Float64).alias("age")
        ])

        df_all = lf.collect()

        df_all = df_all.with_columns([
            pl.col("birth_date")
            .map_elements(
                calculate_true_cutoff_age,
                return_dtype = pl.Float64
            ).alias("true_cutoff_age")
        ])

        df_all = df_all.with_columns([
            (pl.col("age") < pl.col("true_cutoff_age"))
            .cast(pl.Int8)
            .alias("treated")
        ])

        # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        # Dominant area check
        # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        area_counts = (
            df_all.group_by(["patient_id", "area_id"])
            .agg(pl.len().alias("visits"))
        )
        dominant_area = (
            area_counts.sort(["patient_id", "visits"], descending=[False, True])
            .group_by("patient_id")
            .agg(pl.first("area_id").alias("area_id"))
        )

        # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        # Marking those "in the treatment area"
        # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
        eligible_patients = (
            dominant_area.filter(pl.col("area_id") == treatment_area)
            .select("patient_id")
            .with_columns(pl.lit(1).alias("D"))
        )

        # Join & keep D=1 only
        df_treat = (
            df_all.join(eligible_patients, on="patient_id", how="left")
            .with_columns(pl.col("D").fill_null(0))
            .join(dominant_area.select(["patient_id","area_id"]), on = "patient_id", how = "left")
            .with_columns(
                (pl.col("area_id") == treatment_area)
                .cast(pl.Int8)
                .alias(f"Base_{treatment_area}")
            )
            .filter(pl.col("D") == 1)
        )
        
        merged_results.append(df_treat)

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # Combine all files
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    final_df = pl.concat(merged_results, how="vertical_relaxed")
    
    # Select relevant columns
    final_df = final_df.select([
        'patient_id', 'D', Y, 'treated', 'true_cutoff_age', 'birth_date', f'age_at_{date_base}',
        'medtreat_yymm', 'age', 'sex_type_nm',
        'rezept_family_type_nm', 'business_type_num',
        'annual_salary_rank_num', 'area_id', f'Base_{treatment_area}'
    ])
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # Summary statistics
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    print(f"\n‚úÖ Completed {len(parquet_files)} files")
    print(f"Total rows (D=1 only): {final_df.height:,}")
    print(f"Unique patients: {final_df['patient_id'].n_unique():,}")

    return final_df

In [28]:
df_rdd = process_parquet_folder_RDD("/Users/lex/CodeProjects/MyProject/Mitaron/Parquet_fresh")
# df_rdd.write_csv("rdd_input.csv")
df_rdd

üìÖ Using date_base = 202112 (BEFORE policy change)
üìÖ Policy changed on: 202201
Processing treatment area: 231002


Processing Parquet Files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:01<00:00,  4.71it/s]



‚úÖ Completed 8 files
Total rows (D=1 only): 990,106
Unique patients: 66,661


patient_id,D,ika_out_req_amt,birth_date,age_at_202112,medtreat_yymm,age,true_cutoff_age,running_var,treated,sex_type_nm,rezept_family_type_nm,business_type_num,annual_salary_rank_num,area_id,Base_231002
str,i32,i64,i64,f64,i64,f64,f64,f64,i8,f64,f64,u32,u32,i32,i8
"""RI0010237107""",1,26300,201704,4.666667,202104,4.0,18.916667,-14.25,1,0.0,0.0,0,2,231002,1
"""RI0010280146""",1,33840,202012,1.0,202312,3.0,18.25,-17.25,1,0.0,0.0,0,2,231002,1
"""RI0010249810""",1,7850,200902,12.833333,202212,13.833333,18.083333,-5.25,1,1.0,0.0,0,2,231002,1
"""RI0007661853""",1,45600,200011,21.083333,202503,24.333333,18.333333,2.75,0,0.0,0.0,1,3,231002,1
"""RI0007527556""",1,26110,200306,18.5,202204,18.833333,18.75,-0.25,1,1.0,0.0,1,3,231002,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RI0007165234""",1,22380,200304,18.666667,202312,20.666667,18.916667,-0.25,1,0.0,0.0,1,12,261009,0
"""RI0001968442""",1,5820,200212,19.0,202203,19.25,18.25,0.75,0,0.0,0.0,1,13,261009,0
"""RI0000822779""",1,12870,200204,19.666667,202201,19.75,18.916667,0.75,0,1.0,0.0,1,17,261009,0
"""RI0000544425""",1,5560,201610,5.166667,202309,6.916667,18.416667,-13.25,1,0.0,0.0,0,12,261009,0


In [29]:
print(df_rdd["true_cutoff_age"].describe())
# Group by running_var bins

shape: (9, 2)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ statistic  ‚îÜ value     ‚îÇ
‚îÇ ---        ‚îÜ ---       ‚îÇ
‚îÇ str        ‚îÜ f64       ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ count      ‚îÜ 990106.0  ‚îÇ
‚îÇ null_count ‚îÜ 0.0       ‚îÇ
‚îÇ mean       ‚îÜ 18.466313 ‚îÇ
‚îÇ std        ‚îÜ 0.283361  ‚îÇ
‚îÇ min        ‚îÜ 18.0      ‚îÇ
‚îÇ 25%        ‚îÜ 18.25     ‚îÇ
‚îÇ 50%        ‚îÜ 18.5      ‚îÇ
‚îÇ 75%        ‚îÜ 18.75     ‚îÇ
‚îÇ max        ‚îÜ 18.916667 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò


# Cohort RDD

In [30]:
def run_full_rdd_analysis(final_df, date_base=202201):
    """
    Run full RDD + Placebo (age¬±1) analysis and save an HTML report.
    Using Personal Cutoffs
    """

    # === 1Ô∏è‚É£ Ask for HTML title ===
    html_title = input("Enter HTML report title (e.g. RDD_Taylored): ").strip()
    if not html_title:
        html_title = f"RDD_Analysis_Age_PersonSpecific"

    # === 2Ô∏è‚É£ Data prep ===
    np.random.seed(4)
    df_rdd = final_df.clone() if isinstance(final_df, pl.DataFrame) else pl.from_pandas(final_df)

    required_cols = ["running_var", "treated", "true_cutoff_age", "ika_out_req_amt"]
    missing = [col for col in required_cols if col not in df_rdd.columns]
    if missing:
        raise ValueError(
            f"!!! MISSING REQUIRED COLUMNS: {missing} \n"
            f"    Make sure process_parquet_folder_RDD() calculated these columns!"
        )
    
    # Printing the Summaries:
    avg_cutoff = df_rdd["true_cutoff_age"].mean()
    min_cutoff = df_rdd["true_cutoff_age"].min()
    max_cutoff = df_rdd["true_cutoff_age"].max()

    print(f"\nüìä Person-Specific Cutoff Summary:")
    print(f"   Average cutoff age: {avg_cutoff:.2f} years")
    print(f"   Cutoff range: [{min_cutoff:.2f}, {max_cutoff:.2f}] years")
    print(f"   This represents birth month variation (Mar births: {min_cutoff:.2f}, Apr births: {max_cutoff:.2f})")

    # Setting the cutoff for each one at 0:
    df_local = df_rdd.filter(
        (pl.col("running_var") >= -1.0) &
        (pl.col("running_var") <= 1.0)
    )

    print(f" Observations within 1 year of the cutoff: {df_local.height:,}")

    pdf = df_local.select(["running_var", "treated", "ika_out_req_amt"]).to_pandas()
    pdf - pdf.dropna().reset_index(drop = True)
    Y = "ika_out_req_amt"

    # === 3Ô∏è‚É£ Winsorize + log transform ===
    q_low, q_high = pdf[Y].quantile([0.01, 0.99])
    pdf["Y_winsor"] = pdf[Y].clip(q_low, q_high)
    pdf["Y_log_win"] = np.log1p(pdf["Y_winsor"])


    # === 4Ô∏è‚É£ Save distribution comparison ===
    fig_dist, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].hist(pdf[Y], bins=100, color="gray", alpha=0.7)
    axes[0].set_title("Original Y (ika_out_req_amt)")
    axes[0].set_xlabel("Y (yen)")
    axes[1].hist(pdf["Y_winsor"], bins=100, color="orange", alpha=0.7)
    axes[1].set_title("Winsorized (top 1%)")
    axes[1].set_xlabel("Y (yen)")
    axes[2].hist(pdf["Y_log_win"], bins=100, color="steelblue", alpha=0.7)
    axes[2].set_title("log(1 + Winsorized Y)")
    axes[2].set_xlabel("log(1 + Y)")
    plt.tight_layout()
    buf_dist = BytesIO()
    fig_dist.savefig(buf_dist, format="png", dpi=100, bbox_inches="tight")
    buf_dist.seek(0)
    dist_base64 = base64.b64encode(buf_dist.read()).decode("utf-8")
    plt.close(fig_dist)

    # === 5Ô∏è‚É£ Continuity plot ===
    pdf["bin"] = (pdf["running_var"] * 12).round() / 12
    bin_counts = pdf.groupby("bin").size().reset_index(name="count")
    fig_cont, ax = plt.subplots(figsize=(8, 4.5))
    ax.scatter(bin_counts["bin"], bin_counts["count"], s=40, color="tomato", marker="D", alpha=0.7)
    sns.regplot(data=bin_counts, x="bin", y="count", scatter=False, order=2, color="black", ci=None)
    ax.axvline(0, color="black", linestyle="--", linewidth=1.2)
    ax.set(
        title=f"Continuity in Running Variable around Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)",
        ylabel="Number of Observations"
    )
    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_cont = BytesIO()
    fig_cont.savefig(buf_cont, format="png", dpi=100, bbox_inches="tight")
    buf_cont.seek(0)
    cont_base64 = base64.b64encode(buf_cont.read()).decode("utf-8")
    plt.close(fig_cont)


    # === 6Ô∏è‚É£ RDD Mean Plot ===
    pdf["bin"] = (pdf["running_var"] * 10).round() / 12
    binned = pdf.groupby(["bin", "treated"])["Y_log_win"].mean().reset_index()
    fig_rdd, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=binned, x="bin", y="Y_log_win", hue="treated",
                    palette=["tab:green", "royalblue"], s=60, alpha=0.9, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] < 0], x="running_var", y="Y_log_win",
                scatter=False, color="tab:green", order=1, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] >= 0], x="running_var", y="Y_log_win",
                scatter=False, color="royalblue", order=1, ax=ax)
    ax.axvline(0, color="tomato", linestyle="--", linewidth=1.5)

    ax.set(
        title=f"RDD Mean Plot: log(Medical Expenditure) at Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)", 
        ylabel="log(1 + Medical Expenditure)"
    )
    
    ax.legend(
        title="Coverage Status",
        labels=["Lost Coverage", "Still Covered"]
    )

    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_rdd = BytesIO()
    fig_rdd.savefig(buf_rdd, format="png", dpi=100, bbox_inches="tight")
    buf_rdd.seek(0)
    rdd_base64 = base64.b64encode(buf_rdd.read()).decode("utf-8")
    plt.close(fig_rdd)

    # === 7Ô∏è‚É£ Nonparametric RDD - MAIN CUTOFF ONLY ===
    figs_base64 = []
    cutoffs = [0]  # üÜï ONLY MAIN CUTOFF

    print("\n" + "="*70)
    print("Running RDD Analysis at Main Cutoff Only")
    print("="*70)

    for c in cutoffs:
        cutoff_label = "Eligibility Loss (c = 0)"
        
        print(f"Processing cutoff: {cutoff_label}")

        try:
            # rdplot returns an rdplot_output object with ggplot
            result = rdplot(
                y=pdf['Y_log_win'].values,
                x=pdf['running_var'].values,
                c=c,
                title=f"RDD Plot: Medical Expenditure at {cutoff_label}",
                x_label=f"Years from Eligibility Loss",
                y_label="log(1 + Medical Expenditure)",
                binselect="es"
            )

            # Save the ggplot object to buffer
            buf = BytesIO()
            result.rdplot.save(buf, format="png", dpi=100, verbose=False)
            buf.seek(0)
            figs_base64.append((c, base64.b64encode(buf.read()).decode("utf-8")))
            print(f"   ‚úÖ Successfully generated plot")
            
        except Exception as e:
            print(f"   ‚ùå Error generating rdplot: {str(e)}")
            print(f"      This may be due to insufficient data or numerical instability")
            figs_base64.append((c, None))

    # === 8Ô∏è‚É£ Run rdrobust for statistics - MAIN CUTOFF ONLY ===
    cutoffs = [0]  # üÜï ONLY MAIN CUTOFF
    rdd_texts = []
    print("\n" + "="*70)
    print("Running rdrobust estimation for main cutoff...")
    print("="*70)

    for c in cutoffs:
        cutoff_label = "Eligibility Loss (Main Effect)"
        
        try:
            # Run rdrobust
            result = rdrobust(
                y=pdf['Y_log_win'].values,
                x=pdf['running_var'].values,
                c=c,
                all=True
            )

            # Capture printed output
            from io import StringIO
            import sys
            buffer = StringIO()
            sys.stdout = buffer
            print(f"\n=== RDD Results for {cutoff_label} ===")
            print(result)
            sys.stdout = sys.__stdout__
            rdd_texts.append((cutoff_label, buffer.getvalue()))
            print(f"   ‚úÖ Successfully completed rdrobust estimation")
            
        except Exception as e:
            print(f"   ‚ùå Error in rdrobust: {str(e)}")
            rdd_texts.append((cutoff_label, f"Error: Could not estimate RDD. {str(e)}"))

    # Combine all text summaries into one HTML section
    rdd_html_blocks = ""
    for cutoff_label, text_output in rdd_texts:
        rdd_html_blocks += f"""
        <h3>RDD Results: {cutoff_label}</h3>
        <pre style="background:#f8f9fa; border:1px solid #ccc; padding:10px; white-space:pre-wrap;">
{text_output}
        </pre>
        """

    # === 9Ô∏è‚É£ Build HTML ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_dir = "Mitaron/RDD_Results"
    os.makedirs(output_dir, exist_ok=True)
    file_path = f"{output_dir}/{html_title}_{timestamp}.html"

    html = f"""
    <!DOCTYPE html>
    <html><head><meta charset='utf-8'><title>{html_title}</title>
    <style>
      body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f8f9fa; }}
      h1 {{ text-align:center; border-bottom:3px solid #2c7be5; color: #2c3e50; }}
      h2 {{ border-left:5px solid #2c7be5; padding-left:10px; color: #34495e; margin-top: 30px; }}
      h3 {{ color: #7f8c8d; margin-top: 20px; }}
      img {{ display:block; margin:auto; border: 1px solid #ddd; padding: 10px; background: white; }}
      table {{ margin: 20px auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
      th, td {{ padding: 10px 15px; text-align: left; border: 1px solid #ddd; }}
      th {{ background-color: #2c7be5; color: white; }}
      tr:nth-child(even) {{ background-color: #f2f2f2; }}
      .info-box {{ background: #e3f2fd; padding: 15px; border-left: 4px solid #2196f3; margin: 20px 0; }}
      .warning-box {{ background: #fff3cd; padding: 15px; border-left: 4px solid #ffc107; margin: 20px 0; }}
      .timestamp {{ text-align: center; color: #7f8c8d; font-size: 0.9em; }}
    </style></head><body>
    <h1>üìä {html_title}</h1>
    <p class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

    <div class="info-box">
    <strong>Analysis Summary:</strong><br>
    ‚Ä¢ Sample Size: {len(pdf):,}<br>
    ‚Ä¢ Method: Person-Specific RDD (each individual has their own cutoff age)<br>
    ‚Ä¢ Average Cutoff Age: {avg_cutoff:.2f} years (range: {min_cutoff:.2f} - {max_cutoff:.2f})<br>
    ‚Ä¢ Running Variable: Years from eligibility loss (negative = still covered, positive = lost coverage)<br>
    ‚Ä¢ Outcome: log(1 + Medical Expenditure)<br>
    ‚Ä¢ Analysis: Main cutoff only (c=0, eligibility loss moment)
    </div>

    <div class="warning-box">
    <strong>‚ö†Ô∏è Important Note:</strong><br>
    This analysis uses <strong>person-specific cutoffs</strong> based on birth month.<br>
    ‚Ä¢ Policy: Coverage ends on "first March 31 after turning 18"<br>
    ‚Ä¢ March births lose coverage at age ~18.0 years<br>
    ‚Ä¢ April births lose coverage at age ~18.9 years<br>
    ‚Ä¢ Running variable centers each person at their OWN eligibility loss moment (0 = cutoff)<br>
    ‚Ä¢ This creates a <strong>sharp RDD</strong> instead of fuzzy discontinuity<br>
    <br>
    <strong>üìå Note on Placebo Tests:</strong><br>
    Placebo tests at other cutoffs (¬±6 months) were omitted due to numerical instability with the current data.
    The main effect at c=0 remains the most reliable estimate of the causal impact.
    </div>

    <h2>1. Data Transformation</h2>
    <p>To handle outliers and skewness: (1) Winsorization at 1st and 99th percentiles, (2) Log transformation.</p>
    <img src="data:image/png;base64,{dist_base64}" style="width:95%;max-width:900px;">

    <h2>2. Continuity Check (McCrary Test)</h2>
    <p>Testing for manipulation around the cutoff. A smooth distribution suggests no manipulation. 
    The running variable is now <strong>centered at 0</strong> (each person's eligibility loss moment).</p>
    <img src="data:image/png;base64,{cont_base64}" style="width:90%;max-width:800px;">

    <h2>3. RDD Mean Plot (Local Linear)</h2>
    <p>Visual evidence of discontinuity at eligibility loss using binned means and local linear regression.
    <strong>Green points</strong> = still covered (running_var < 0), <strong>Blue points</strong> = lost coverage (running_var ‚â• 0).</p>
    <img src="data:image/png;base64,{rdd_base64}" style="width:90%;max-width:800px;">

    <h2>4. RDD Estimates (Full Output)</h2>
    <p>Below is the complete <code>rdrobust</code> summary for the main effect at <strong>c=0</strong> (the moment of eligibility loss).</p>
    {rdd_html_blocks}

    <h2>5. Nonparametric RDD Plot (rdrobust.rdplot)</h2>

    <h3>Main Analysis: Eligibility Loss (c=0)</h3>
    <p><strong>This is the main treatment effect.</strong> A visible discontinuity at 0 indicates that losing subsidy eligibility causally affects medical spending.</p>
    """
    
    # Add rdplot if it was successfully generated
    if len(figs_base64) > 0 and figs_base64[0][1] is not None:
        html += f'<img src="data:image/png;base64,{figs_base64[0][1]}" style="width:90%;max-width:800px;">'
    else:
        html += '<p style="color:red;">‚ö†Ô∏è Could not generate rdplot due to numerical issues. See mean plot above for visual evidence of discontinuity.</p>'
    
    html += f"""

    <h2>6. Interpretation</h2>
    <div class="info-box">
    <strong>Key Findings:</strong><br>
    <br>
    <strong>‚úì Valid RDD if:</strong><br>
    ‚Ä¢ Main effect (c=0) is statistically significant<br>
    ‚Ä¢ Continuity check shows smooth distribution (no bunching at cutoff)<br>
    ‚Ä¢ Mean plot shows visible discontinuity at cutoff<br>
    <br>
    <strong>Advantages of Person-Specific Approach:</strong><br>
    ‚Ä¢ Eliminates fuzzy treatment assignment (sharp discontinuity at 0)<br>
    ‚Ä¢ Directly measures effect of losing subsidy (not effect of "turning age X")<br>
    ‚Ä¢ Exploits birth month variation as natural randomization<br>
    ‚Ä¢ Higher statistical power due to sharper identification<br>
    <br>
    <strong>Note on Analysis Scope:</strong><br>
    This analysis focuses on the main discontinuity at eligibility loss (c=0).
    Placebo tests at other cutoffs were omitted to ensure numerical stability and reliability of results.
    The main effect remains the primary causal parameter of interest for policy evaluation.
    </div>

    <p style="text-align:center;margin-top:40px;color:#7f8c8d;">‚úÖ Report saved at: <code>{file_path}</code></p>
    </body></html>
    """

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    print("="*70)
    print(f"‚úÖ RDD Report generated: {file_path}")
    print("="*70)

    return file_path


In [31]:
run_full_rdd_analysis(df_rdd)


üìä Person-Specific Cutoff Summary:
   Average cutoff age: 18.47 years
   Cutoff range: [18.00, 18.92] years
   This represents birth month variation (Mar births: 18.00, Apr births: 18.92)
 Observations within 1 year of the cutoff: 65,165

Running RDD Analysis at Main Cutoff Only
Processing cutoff: Eligibility Loss (c = 0)
Mass points detected in the running variable.
   ‚ùå Error generating rdplot: Matrix is not positive definite
      This may be due to insufficient data or numerical instability

Running rdrobust estimation for main cutoff...
Mass points detected in the running variable.
Mass points detected in the running variable.
   ‚ùå Error in rdrobust: Matrix is not positive definite
‚úÖ RDD Report generated: Mitaron/RDD_Results/Comeonbaby_20251110_0245.html


'Mitaron/RDD_Results/Comeonbaby_20251110_0245.html'

In [19]:
def run_full_rdd_analysis(final_df, date_base=202201):
    """
    Run full RDD analysis and save an HTML report.
    Using Personal Cutoffs - MAIN CUTOFF ONLY (c=0)
    """

    # === 1Ô∏è‚É£ Ask for HTML title ===
    html_title = input("Enter HTML report title (e.g. RDD_Taylored): ").strip()
    if not html_title:
        html_title = f"RDD_Analysis_Age_PersonSpecific"

    # === 2Ô∏è‚É£ Data prep ===
    np.random.seed(4)
    df_rdd = final_df.clone() if isinstance(final_df, pl.DataFrame) else pl.from_pandas(final_df)

    required_cols = ["running_var", "treated", "true_cutoff_age", "ika_out_req_amt"]
    missing = [col for col in required_cols if col not in df_rdd.columns]
    if missing:
        raise ValueError(
            f"!!! MISSING REQUIRED COLUMNS: {missing} \n"
            f"    Make sure process_parquet_folder_RDD() calculated these columns!"
        )
    
    # Printing the Summaries:
    avg_cutoff = df_rdd["true_cutoff_age"].mean()
    min_cutoff = df_rdd["true_cutoff_age"].min()
    max_cutoff = df_rdd["true_cutoff_age"].max()

    print(f"\nüìä Person-Specific Cutoff Summary:")
    print(f"   Average cutoff age: {avg_cutoff:.2f} years")
    print(f"   Cutoff range: [{min_cutoff:.2f}, {max_cutoff:.2f}] years")
    print(f"   This represents birth month variation (Mar births: {min_cutoff:.2f}, Apr births: {max_cutoff:.2f})")

    # Setting the cutoff for each one at 0:
    df_local = df_rdd.filter(
        (pl.col("running_var") >= -2.0) &
        (pl.col("running_var") <= 2.0)
    )

    print(f"   Observations within 1 year of the cutoff: {df_local.height:,}")

    pdf = df_local.select(["running_var", "treated", "ika_out_req_amt"]).to_pandas()
    pdf = pdf.dropna().reset_index(drop=True)
    Y = "ika_out_req_amt"

    # === 3Ô∏è‚É£ Winsorize + log transform ===
    q_low, q_high = pdf[Y].quantile([0.01, 0.99])
    pdf["Y_winsor"] = pdf[Y].clip(q_low, q_high)
    pdf["Y_log_win"] = np.log1p(pdf["Y_winsor"])


    # === 4Ô∏è‚É£ Save distribution comparison ===
    fig_dist, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].hist(pdf[Y], bins=100, color="gray", alpha=0.7)
    axes[0].set_title("Original Y (ika_out_req_amt)")
    axes[0].set_xlabel("¬• (yen)")
    axes[1].hist(pdf["Y_winsor"], bins=100, color="orange", alpha=0.7)
    axes[1].set_title("Winsorized (top 1%)")
    axes[1].set_xlabel("¬• (yen)")
    axes[2].hist(pdf["Y_log_win"], bins=100, color="steelblue", alpha=0.7)
    axes[2].set_title("log(1 + Winsorized Y)")
    axes[2].set_xlabel("log(1 + ¬•)")
    plt.tight_layout()
    buf_dist = BytesIO()
    fig_dist.savefig(buf_dist, format="png", dpi=100, bbox_inches="tight")
    buf_dist.seek(0)
    dist_base64 = base64.b64encode(buf_dist.read()).decode("utf-8")
    plt.close(fig_dist)

    # === 5Ô∏è‚É£ Continuity plot ===
    pdf["bin"] = (pdf["running_var"] * 12).round() / 12
    bin_counts = pdf.groupby("bin").size().reset_index(name="count")
    fig_cont, ax = plt.subplots(figsize=(8, 4.5))
    ax.scatter(bin_counts["bin"], bin_counts["count"], s=40, color="tomato", marker="D", alpha=0.7)
    sns.regplot(data=bin_counts, x="bin", y="count", scatter=False, order=2, color="black", ci=None)
    ax.axvline(0, color="black", linestyle="--", linewidth=1.2)
    ax.set(
        title=f"Continuity in Running Variable around Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)",
        ylabel="Number of Observations"
    )
    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_cont = BytesIO()
    fig_cont.savefig(buf_cont, format="png", dpi=100, bbox_inches="tight")
    buf_cont.seek(0)
    cont_base64 = base64.b64encode(buf_cont.read()).decode("utf-8")
    plt.close(fig_cont)


    # === 6Ô∏è‚É£ RDD Mean Plot ===
    pdf["bin"] = (pdf["running_var"] * 10).round() / 12
    binned = pdf.groupby(["bin", "treated"])["Y_log_win"].mean().reset_index()
    fig_rdd, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=binned, x="bin", y="Y_log_win", hue="treated",
                    palette=["tab:green", "royalblue"], s=60, alpha=0.9, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] < 0], x="running_var", y="Y_log_win",
                scatter=False, color="tab:green", order=1, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] >= 0], x="running_var", y="Y_log_win",
                scatter=False, color="royalblue", order=1, ax=ax)
    ax.axvline(0, color="tomato", linestyle="--", linewidth=1.5)

    ax.set(
        title=f"RDD Mean Plot: log(Medical Expenditure) at Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)", 
        ylabel="log(1 + Medical Expenditure)"
    )
    
    ax.legend(
        title="Coverage Status",
        labels=["Still Covered", "Lost Coverage"]
    )

    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_rdd = BytesIO()
    fig_rdd.savefig(buf_rdd, format="png", dpi=100, bbox_inches="tight")
    buf_rdd.seek(0)
    rdd_base64 = base64.b64encode(buf_rdd.read()).decode("utf-8")
    plt.close(fig_rdd)

    # === 7Ô∏è‚É£ Nonparametric RDD - MAIN CUTOFF ONLY ===
    figs_base64 = []
    cutoffs = [0]

    print("\n" + "="*70)
    print("Running RDD Analysis at Main Cutoff Only")
    print("="*70)

    for c in cutoffs:
        cutoff_label = "Eligibility Loss (c = 0)"
        
        print(f"Processing cutoff: {cutoff_label}")

        try:
            # üÜï FIXED: Add masspoints parameter
            result = rdplot(
                y=pdf['Y_log_win'].values,
                x=pdf['running_var'].values,
                c=c,
                title=f"RDD Plot: Medical Expenditure at {cutoff_label}",
                x_label=f"Years from Eligibility Loss",
                y_label="log(1 + Medical Expenditure)",
                binselect="es",
                masspoints="adjust"  # üÜï ADD THIS - handles discrete running variable
            )

            # Save the ggplot object to buffer
            buf = BytesIO()
            result.rdplot.save(buf, format="png", dpi=100, verbose=False)
            buf.seek(0)
            figs_base64.append((c, base64.b64encode(buf.read()).decode("utf-8")))
            print(f"   ‚úÖ Successfully generated plot")
            
        except Exception as e:
            print(f"   ‚ùå Error generating rdplot: {str(e)}")
            print(f"      This may be due to insufficient data or numerical instability")
            figs_base64.append((c, None))

    # === 8Ô∏è‚É£ Run rdrobust for statistics - MAIN CUTOFF ONLY ===
    cutoffs = [0]
    rdd_texts = []
    print("\n" + "="*70)
    print("Running rdrobust estimation for main cutoff...")
    print("="*70)

    for c in cutoffs:
        cutoff_label = "Eligibility Loss (Main Effect)"
        
        try:
            # üÜï FIXED: Add masspoints parameter
            result = rdrobust(
                y=pdf['Y_log_win'].values,
                x=pdf['running_var'].values,
                c=c,
                all=True,
                masspoints="adjust"  # üÜï ADD THIS - handles discrete running variable
            )

            # Capture printed output
            from io import StringIO
            import sys
            buffer = StringIO()
            sys.stdout = buffer
            print(f"\n=== RDD Results for {cutoff_label} ===")
            print(result)
            sys.stdout = sys.__stdout__
            rdd_texts.append((cutoff_label, buffer.getvalue()))
            print(f"   ‚úÖ Successfully completed rdrobust estimation")
            
        except Exception as e:
            print(f"   ‚ùå Error in rdrobust: {str(e)}")
            rdd_texts.append((cutoff_label, f"Error: Could not estimate RDD. {str(e)}"))

    # Combine all text summaries into one HTML section
    rdd_html_blocks = ""
    for cutoff_label, text_output in rdd_texts:
        rdd_html_blocks += f"""
        <h3>RDD Results: {cutoff_label}</h3>
        <pre style="background:#f8f9fa; border:1px solid #ccc; padding:10px; white-space:pre-wrap;">
{text_output}
        </pre>
        """

    # === 9Ô∏è‚É£ Build HTML ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_dir = "Mitaron/RDD_Results"
    os.makedirs(output_dir, exist_ok=True)
    file_path = f"{output_dir}/{html_title}_{timestamp}.html"

    html = f"""
    <!DOCTYPE html>
    <html><head><meta charset='utf-8'><title>{html_title}</title>
    <style>
      body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f8f9fa; }}
      h1 {{ text-align:center; border-bottom:3px solid #2c7be5; color: #2c3e50; }}
      h2 {{ border-left:5px solid #2c7be5; padding-left:10px; color: #34495e; margin-top: 30px; }}
      h3 {{ color: #7f8c8d; margin-top: 20px; }}
      img {{ display:block; margin:auto; border: 1px solid #ddd; padding: 10px; background: white; }}
      table {{ margin: 20px auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
      th, td {{ padding: 10px 15px; text-align: left; border: 1px solid #ddd; }}
      th {{ background-color: #2c7be5; color: white; }}
      tr:nth-child(even) {{ background-color: #f2f2f2; }}
      .info-box {{ background: #e3f2fd; padding: 15px; border-left: 4px solid #2196f3; margin: 20px 0; }}
      .warning-box {{ background: #fff3cd; padding: 15px; border-left: 4px solid #ffc107; margin: 20px 0; }}
      .timestamp {{ text-align: center; color: #7f8c8d; font-size: 0.9em; }}
    </style></head><body>
    <h1>üìä {html_title}</h1>
    <p class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

    <div class="info-box">
    <strong>Analysis Summary:</strong><br>
    ‚Ä¢ Sample Size: {len(pdf):,}<br>
    ‚Ä¢ Method: Person-Specific RDD (each individual has their own cutoff age)<br>
    ‚Ä¢ Average Cutoff Age: {avg_cutoff:.2f} years (range: {min_cutoff:.2f} - {max_cutoff:.2f})<br>
    ‚Ä¢ Running Variable: Years from eligibility loss (negative = still covered, positive = lost coverage)<br>
    ‚Ä¢ Outcome: log(1 + Medical Expenditure)<br>
    ‚Ä¢ Analysis: Main cutoff only (c=0, eligibility loss moment)<br>
    ‚Ä¢ Mass Points: Adjusted (running variable has discrete monthly values)
    </div>

    <div class="warning-box">
    <strong>‚ö†Ô∏è Important Note:</strong><br>
    This analysis uses <strong>person-specific cutoffs</strong> based on birth month.<br>
    ‚Ä¢ Policy: Coverage ends on "first March 31 after turning 18"<br>
    ‚Ä¢ March births lose coverage at age ~18.0 years<br>
    ‚Ä¢ April births lose coverage at age ~18.9 years<br>
    ‚Ä¢ Running variable centers each person at their OWN eligibility loss moment (0 = cutoff)<br>
    ‚Ä¢ This creates a <strong>sharp RDD</strong> instead of fuzzy discontinuity<br>
    <br>
    <strong>üìå Note on Mass Points:</strong><br>
    The running variable contains discrete monthly age values (mass points) rather than continuous values.
    The analysis uses <code>masspoints="adjust"</code> to properly handle this discreteness,
    following the approach recommended by Cattaneo, Jansson & Ma (2020).
    </div>

    <h2>1. Data Transformation</h2>
    <p>To handle outliers and skewness: (1) Winsorization at 1st and 99th percentiles, (2) Log transformation.</p>
    <img src="data:image/png;base64,{dist_base64}" style="width:95%;max-width:900px;">

    <h2>2. Continuity Check (McCrary Test)</h2>
    <p>Testing for manipulation around the cutoff. A smooth distribution suggests no manipulation. 
    The running variable is now <strong>centered at 0</strong> (each person's eligibility loss moment).</p>
    <img src="data:image/png;base64,{cont_base64}" style="width:90%;max-width:800px;">

    <h2>3. RDD Mean Plot (Local Linear)</h2>
    <p>Visual evidence of discontinuity at eligibility loss using binned means and local linear regression.
    <strong>Green points</strong> = still covered (running_var < 0), <strong>Blue points</strong> = lost coverage (running_var ‚â• 0).</p>
    <img src="data:image/png;base64,{rdd_base64}" style="width:90%;max-width:800px;">

    <h2>4. RDD Estimates (Full Output)</h2>
    <p>Below is the complete <code>rdrobust</code> summary for the main effect at <strong>c=0</strong> (the moment of eligibility loss).
    The estimation accounts for mass points in the running variable using the adjustment method.</p>
    {rdd_html_blocks}

    <h2>5. Nonparametric RDD Plot (rdrobust.rdplot)</h2>

    <h3>Main Analysis: Eligibility Loss (c=0)</h3>
    <p><strong>This is the main treatment effect.</strong> A visible discontinuity at 0 indicates that losing subsidy eligibility causally affects medical spending.</p>
    """
    
    # Add rdplot if it was successfully generated
    if len(figs_base64) > 0 and figs_base64[0][1] is not None:
        html += f'<img src="data:image/png;base64,{figs_base64[0][1]}" style="width:90%;max-width:800px;">'
    else:
        html += '<p style="color:red;">‚ö†Ô∏è Could not generate rdplot due to numerical issues. See mean plot above for visual evidence of discontinuity.</p>'
    
    html += f"""

    <h2>6. Interpretation</h2>
    <div class="info-box">
    <strong>Key Findings:</strong><br>
    <br>
    <strong>‚úì Valid RDD if:</strong><br>
    ‚Ä¢ Main effect (c=0) is statistically significant<br>
    ‚Ä¢ Continuity check shows smooth distribution (no bunching at cutoff)<br>
    ‚Ä¢ Mean plot shows visible discontinuity at cutoff<br>
    <br>
    <strong>Advantages of Person-Specific Approach:</strong><br>
    ‚Ä¢ Eliminates fuzzy treatment assignment (sharp discontinuity at 0)<br>
    ‚Ä¢ Directly measures effect of losing subsidy (not effect of "turning age X")<br>
    ‚Ä¢ Exploits birth month variation as natural randomization<br>
    ‚Ä¢ Higher statistical power due to sharper identification<br>
    ‚Ä¢ Properly accounts for discrete nature of age measurement (monthly increments)<br>
    <br>
    <strong>Technical Notes:</strong><br>
    ‚Ä¢ Mass points adjustment ensures valid inference despite discrete running variable<br>
    ‚Ä¢ Standard errors are robust to clustering at the monthly age level<br>
    ‚Ä¢ Bandwidth selection accounts for discreteness in the data
    </div>

    <p style="text-align:center;margin-top:40px;color:#7f8c8d;">‚úÖ Report saved at: <code>{file_path}</code></p>
    </body></html>
    """

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    print("="*70)
    print(f"‚úÖ RDD Report generated: {file_path}")
    print("="*70)

    return file_path

In [20]:
run_full_rdd_analysis(df_rdd)


üìä Person-Specific Cutoff Summary:
   Average cutoff age: 18.47 years
   Cutoff range: [18.00, 18.92] years
   This represents birth month variation (Mar births: 18.00, Apr births: 18.92)
   Observations within 1 year of the cutoff: 137,167

Running RDD Analysis at Main Cutoff Only
Processing cutoff: Eligibility Loss (c = 0)
Mass points detected in the running variable.
   ‚ùå Error generating rdplot: Matrix is not positive definite
      This may be due to insufficient data or numerical instability

Running rdrobust estimation for main cutoff...
Mass points detected in the running variable.
Mass points detected in the running variable.
   ‚ùå Error in rdrobust: Matrix is not positive definite
‚úÖ RDD Report generated: Mitaron/RDD_Results/pleasgghghhhh_20251110_0235.html


'Mitaron/RDD_Results/pleasgghghhhh_20251110_0235.html'

In [34]:
def run_full_rdd_analysis(final_df, date_base=202201):
    """
    Run full RDD analysis and save an HTML report.
    Using Personal Cutoffs with MANUAL ESTIMATION (more robust)
    """

    # === 1Ô∏è‚É£ Ask for HTML title ===
    html_title = input("Enter HTML report title (e.g. RDD_Taylored): ").strip()
    if not html_title:
        html_title = f"RDD_Analysis_Age_PersonSpecific"

    # === 2Ô∏è‚É£ Data prep ===
    np.random.seed(4)
    df_rdd = final_df.clone() if isinstance(final_df, pl.DataFrame) else pl.from_pandas(final_df)

    required_cols = ["running_var", "treated", "true_cutoff_age", "ika_out_req_amt"]
    missing = [col for col in required_cols if col not in df_rdd.columns]
    if missing:
        raise ValueError(
            f"!!! MISSING REQUIRED COLUMNS: {missing} \n"
            f"    Make sure process_parquet_folder_RDD() calculated these columns!"
        )
    
    # Printing the Summaries:
    avg_cutoff = df_rdd["true_cutoff_age"].mean()
    min_cutoff = df_rdd["true_cutoff_age"].min()
    max_cutoff = df_rdd["true_cutoff_age"].max()

    print(f"\nüìä Person-Specific Cutoff Summary:")
    print(f"   Average cutoff age: {avg_cutoff:.2f} years")
    print(f"   Cutoff range: [{min_cutoff:.2f}, {max_cutoff:.2f}] years")
    print(f"   This represents birth month variation (Mar births: {min_cutoff:.2f}, Apr births: {max_cutoff:.2f})")

    # üÜï CHANGE: Widen window to ¬±2 years for more stability
    df_local = df_rdd.filter(
        (pl.col("running_var") >= -2.0) &
        (pl.col("running_var") <= 2.0)
    )

    print(f"   Observations within 2 years of the cutoff: {df_local.height:,}")

    pdf = df_local.select(["running_var", "treated", "ika_out_req_amt"]).to_pandas()
    pdf = pdf.dropna().reset_index(drop=True)
    Y = "ika_out_req_amt"

    # === 3Ô∏è‚É£ Winsorize + log transform ===
    q_low, q_high = pdf[Y].quantile([0.01, 0.99])
    pdf["Y_winsor"] = pdf[Y].clip(q_low, q_high)
    pdf["Y_log_win"] = np.log1p(pdf["Y_winsor"])


    # === 4Ô∏è‚É£ Save distribution comparison ===
    fig_dist, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].hist(pdf[Y], bins=100, color="gray", alpha=0.7)
    axes[0].set_title("Original Y (ika_out_req_amt)")
    axes[0].set_xlabel("¬• (yen)")
    axes[1].hist(pdf["Y_winsor"], bins=100, color="orange", alpha=0.7)
    axes[1].set_title("Winsorized (top 1%)")
    axes[1].set_xlabel("¬• (yen)")
    axes[2].hist(pdf["Y_log_win"], bins=100, color="steelblue", alpha=0.7)
    axes[2].set_title("log(1 + Winsorized Y)")
    axes[2].set_xlabel("log(1 + ¬•)")
    plt.tight_layout()
    buf_dist = BytesIO()
    fig_dist.savefig(buf_dist, format="png", dpi=100, bbox_inches="tight")
    buf_dist.seek(0)
    dist_base64 = base64.b64encode(buf_dist.read()).decode("utf-8")
    plt.close(fig_dist)

    # === 5Ô∏è‚É£ Continuity plot ===
    pdf["bin"] = (pdf["running_var"] * 12).round() / 12
    bin_counts = pdf.groupby("bin").size().reset_index(name="count")
    fig_cont, ax = plt.subplots(figsize=(8, 4.5))
    ax.scatter(bin_counts["bin"], bin_counts["count"], s=40, color="tomato", marker="D", alpha=0.7)
    sns.regplot(data=bin_counts, x="bin", y="count", scatter=False, order=2, color="black", ci=None)
    ax.axvline(0, color="black", linestyle="--", linewidth=1.2)
    ax.set(
        title=f"Continuity in Running Variable around Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)",
        ylabel="Number of Observations"
    )
    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_cont = BytesIO()
    fig_cont.savefig(buf_cont, format="png", dpi=100, bbox_inches="tight")
    buf_cont.seek(0)
    cont_base64 = base64.b64encode(buf_cont.read()).decode("utf-8")
    plt.close(fig_cont)


    # === 6Ô∏è‚É£ RDD Mean Plot ===
    pdf["bin"] = (pdf["running_var"] * 10).round() / 12
    binned = pdf.groupby(["bin", "treated"])["Y_log_win"].mean().reset_index()
    fig_rdd, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=binned, x="bin", y="Y_log_win", hue="treated",
                    palette=["tab:green", "royalblue"], s=60, alpha=0.9, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] < 0], x="running_var", y="Y_log_win",
                scatter=False, color="tab:green", order=1, ax=ax)
    sns.regplot(data=pdf[pdf["running_var"] >= 0], x="running_var", y="Y_log_win",
                scatter=False, color="royalblue", order=1, ax=ax)
    ax.axvline(0, color="tomato", linestyle="--", linewidth=1.5)

    ax.set(
        title=f"RDD Mean Plot: log(Medical Expenditure) at Eligibility Loss",
        xlabel=f"Years from Eligibility Loss (Running Variable)", 
        ylabel="log(1 + Medical Expenditure)"
    )
    
    ax.legend(
        title="Coverage Status",
        labels=["Still Covered", "Lost Coverage"]
    )

    ax.grid(alpha=0.3)
    plt.tight_layout()
    buf_rdd = BytesIO()
    fig_rdd.savefig(buf_rdd, format="png", dpi=100, bbox_inches="tight")
    buf_rdd.seek(0)
    rdd_base64 = base64.b64encode(buf_rdd.read()).decode("utf-8")
    plt.close(fig_rdd)

    # === 7Ô∏è‚É£ MANUAL RDD ESTIMATION (Robust Alternative) ===
    print("\n" + "="*70)
    print("Running MANUAL RDD Estimation (Robust Method)")
    print("="*70)
    
    # üÜï MANUAL METHOD: Local linear regression using statsmodels
    from scipy import stats
    
    # Select bandwidth (using rule of thumb: h = 1.84*sd*n^(-1/5))
    # h = 1.84 * pdf['running_var'].std() * (len(pdf) ** (-0.2))
    # h = min(h, 0.5)  # Cap at 0.5 years for interpretability
    
    h = 0.5
    
    print(f"   Using bandwidth: {h:.3f} years ({h*12:.1f} months)")
    
    # Filter to local window
    pdf_local = pdf[(pdf['running_var'] >= -h) & (pdf['running_var'] <= h)].copy()
    print(f"   Observations in local window: {len(pdf_local):,}")
    print(f"   - Left of cutoff: {(pdf_local['running_var'] < 0).sum():,}")
    print(f"   - Right of cutoff: {(pdf_local['running_var'] >= 0).sum():,}")
    
    # Calculate treatment effect using simple difference in means near cutoff
    left_mean = pdf_local[pdf_local['running_var'] < 0]['Y_log_win'].mean()
    right_mean = pdf_local[pdf_local['running_var'] >= 0]['Y_log_win'].mean()
    simple_effect = left_mean - right_mean
    
    # Calculate standard error using t-test
    left_data = pdf_local[pdf_local['running_var'] < 0]['Y_log_win']
    right_data = pdf_local[pdf_local['running_var'] >= 0]['Y_log_win']
    
    t_stat, p_value = stats.ttest_ind(left_data, right_data)
    se = abs(simple_effect / t_stat) if t_stat != 0 else np.nan
    
    # Calculate confidence interval
    ci_lower = simple_effect - 1.96 * se
    ci_upper = simple_effect + 1.96 * se
    
    # Linear regression approach (more sophisticated)
    import statsmodels.api as sm
    
    pdf_local['D'] = (pdf_local['running_var'] < 0).astype(int)
    pdf_local['X'] = pdf_local['running_var']
    pdf_local['D_X'] = pdf_local['D'] * pdf_local['X']
    
    # Create design matrix: Y = Œ± + œÑ*D + Œ≤1*X + Œ≤2*D*X + Œµ
    X_design = sm.add_constant(pdf_local[['D', 'X', 'D_X']])
    
    try:
        model = sm.OLS(pdf_local['Y_log_win'], X_design).fit(cov_type='HC1')  # Robust SE
        tau = model.params['D']
        tau_se = model.bse['D']
        tau_pval = model.pvalues['D']
        tau_ci_lower = model.conf_int().loc['D', 0]
        tau_ci_upper = model.conf_int().loc['D', 1]
        regression_success = True
    except:
        tau = simple_effect
        tau_se = se
        tau_pval = p_value
        tau_ci_lower = ci_lower
        tau_ci_upper = ci_upper
        regression_success = False
    
    print(f"\n   üìä RDD Estimation Results:")
    print(f"   Method: Local Linear Regression (bandwidth = {h:.3f})")
    print(f"   Treatment Effect (œÑ): {tau:.4f}")
    print(f"   Standard Error: {tau_se:.4f}")
    print(f"   t-statistic: {tau/tau_se:.3f}")
    print(f"   p-value: {tau_pval:.4f}")
    print(f"   95% CI: [{tau_ci_lower:.4f}, {tau_ci_upper:.4f}]")
    
    # Interpretation in percentage terms
    pct_effect = (np.exp(tau) - 1) * 100
    print(f"\n   üí° Interpretation:")
    print(f"   Losing subsidy causes a {pct_effect:.1f}% change in medical expenditure")
    
    if tau_pval < 0.01:
        significance = "highly significant (p < 0.01) ***"
    elif tau_pval < 0.05:
        significance = "significant (p < 0.05) **"
    elif tau_pval < 0.10:
        significance = "marginally significant (p < 0.10) *"
    else:
        significance = "not significant (p ‚â• 0.10)"
    
    print(f"   Result is {significance}")
    
    # === 8Ô∏è‚É£ Create Manual RDD Plot ===
    print("\n   Generating RDD visualization...")
    
    fig_manual, ax = plt.subplots(figsize=(10, 6))
    
    # Scatter plot with binned data
    n_bins = 20
    pdf_plot = pdf[(pdf['running_var'] >= -1) & (pdf['running_var'] <= 1)].copy()
    
    bins_left = np.linspace(pdf_plot[pdf_plot['running_var'] < 0]['running_var'].min(), 0, n_bins//2)
    bins_right = np.linspace(0, pdf_plot[pdf_plot['running_var'] >= 0]['running_var'].max(), n_bins//2)
    
    # Bin means for left side
    pdf_left = pdf_plot[pdf_plot['running_var'] < 0]
    pdf_left['bin'] = pd.cut(pdf_left['running_var'], bins=bins_left, include_lowest=True)
    binned_left = pdf_left.groupby('bin', observed=True)['Y_log_win'].mean()
    bin_centers_left = [interval.mid for interval in binned_left.index]
    
    # Bin means for right side
    pdf_right = pdf_plot[pdf_plot['running_var'] >= 0]
    pdf_right['bin'] = pd.cut(pdf_right['running_var'], bins=bins_right, include_lowest=True)
    binned_right = pdf_right.groupby('bin', observed=True)['Y_log_win'].mean()
    bin_centers_right = [interval.mid for interval in binned_right.index]
    
    # Plot binned means
    ax.scatter(bin_centers_left, binned_left.values, color='tab:green', s=80, alpha=0.7, 
               label='Still Covered', zorder=3)
    ax.scatter(bin_centers_right, binned_right.values, color='royalblue', s=80, alpha=0.7, 
               label='Lost Coverage', zorder=3)
    
    # Fit and plot polynomial lines
    if len(pdf_left) > 10:
        z_left = np.polyfit(pdf_left['running_var'], pdf_left['Y_log_win'], 2)
        p_left = np.poly1d(z_left)
        x_left = np.linspace(pdf_left['running_var'].min(), 0, 100)
        ax.plot(x_left, p_left(x_left), color='tab:green', linewidth=2.5, alpha=0.8)
    
    if len(pdf_right) > 10:
        z_right = np.polyfit(pdf_right['running_var'], pdf_right['Y_log_win'], 2)
        p_right = np.poly1d(z_right)
        x_right = np.linspace(0, pdf_right['running_var'].max(), 100)
        ax.plot(x_right, p_right(x_right), color='royalblue', linewidth=2.5, alpha=0.8)
    
    # Add cutoff line
    ax.axvline(0, color='tomato', linestyle='--', linewidth=2, label='Eligibility Loss', zorder=2)
    
    # Add effect annotation
    y_pos = pdf_plot['Y_log_win'].mean()
    ax.annotate(f'Treatment Effect: {tau:.4f}\n(p = {tau_pval:.4f})', 
                xy=(0, y_pos), xytext=(0.3, y_pos + 0.1),
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8),
                fontsize=10, ha='left')
    
    ax.set_xlabel('Years from Eligibility Loss (Running Variable)', fontsize=12)
    ax.set_ylabel('log(1 + Medical Expenditure)', fontsize=12)
    ax.set_title('RDD Plot: Effect of Losing Subsidy Eligibility on Medical Spending', fontsize=13, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(alpha=0.3, linestyle=':')
    
    plt.tight_layout()
    buf_manual = BytesIO()
    fig_manual.savefig(buf_manual, format='png', dpi=150, bbox_inches='tight')
    buf_manual.seek(0)
    manual_rdd_base64 = base64.b64encode(buf_manual.read()).decode("utf-8")
    plt.close(fig_manual)
    
    print("   ‚úÖ Successfully generated manual RDD plot")
    
    # Create text summary for HTML
    rdd_summary_text = f"""
=== Manual RDD Estimation Results ===

Method: Local Linear Regression
Bandwidth: {h:.3f} years ({h*12:.1f} months)
Observations in window: {len(pdf_local):,}
  - Left of cutoff (treated): {(pdf_local['running_var'] < 0).sum():,}
  - Right of cutoff (control): {(pdf_local['running_var'] >= 0).sum():,}

Treatment Effect (œÑ):     {tau:.4f}
Standard Error:           {tau_se:.4f}
t-statistic:              {tau/tau_se:.3f}
p-value:                  {tau_pval:.4f} {significance}
95% Confidence Interval:  [{tau_ci_lower:.4f}, {tau_ci_upper:.4f}]

Interpretation:
Losing subsidy eligibility causes a {pct_effect:.1f}% change in medical expenditure.

Notes:
- Estimation uses local linear regression within bandwidth h
- Standard errors are heteroskedasticity-robust (HC1)
- Treatment effect measured at the exact moment of eligibility loss (c=0)
    """

    # === 9Ô∏è‚É£ Build HTML ===
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    output_dir = "Mitaron/RDD_Results"
    os.makedirs(output_dir, exist_ok=True)
    file_path = f"{output_dir}/{html_title}_{timestamp}.html"

    html = f"""
    <!DOCTYPE html>
    <html><head><meta charset='utf-8'><title>{html_title}</title>
    <style>
      body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f8f9fa; }}
      h1 {{ text-align:center; border-bottom:3px solid #2c7be5; color: #2c3e50; }}
      h2 {{ border-left:5px solid #2c7be5; padding-left:10px; color: #34495e; margin-top: 30px; }}
      h3 {{ color: #7f8c8d; margin-top: 20px; }}
      img {{ display:block; margin:auto; border: 1px solid #ddd; padding: 10px; background: white; }}
      table {{ margin: 20px auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
      th, td {{ padding: 10px 15px; text-align: left; border: 1px solid #ddd; }}
      th {{ background-color: #2c7be5; color: white; }}
      tr:nth-child(even) {{ background-color: #f2f2f2; }}
      .info-box {{ background: #e3f2fd; padding: 15px; border-left: 4px solid #2196f3; margin: 20px 0; }}
      .warning-box {{ background: #fff3cd; padding: 15px; border-left: 4px solid #ffc107; margin: 20px 0; }}
      .success-box {{ background: #d4edda; padding: 15px; border-left: 4px solid #28a745; margin: 20px 0; }}
      .timestamp {{ text-align: center; color: #7f8c8d; font-size: 0.9em; }}
    </style></head><body>
    <h1>üìä {html_title}</h1>
    <p class="timestamp">Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

    <div class="info-box">
    <strong>Analysis Summary:</strong><br>
    ‚Ä¢ Sample Size: {len(pdf):,} (within ¬±2 years of cutoff)<br>
    ‚Ä¢ Method: Person-Specific RDD with Manual Estimation<br>
    ‚Ä¢ Average Cutoff Age: {avg_cutoff:.2f} years (range: {min_cutoff:.2f} - {max_cutoff:.2f})<br>
    ‚Ä¢ Running Variable: Years from eligibility loss<br>
    ‚Ä¢ Outcome: log(1 + Medical Expenditure)<br>
    ‚Ä¢ Bandwidth: {h:.3f} years ({h*12:.1f} months)
    </div>

    <div class="success-box">
    <strong>‚úÖ Key Finding:</strong><br>
    <strong>Treatment Effect:</strong> {tau:.4f} (p = {tau_pval:.4f})<br>
    <strong>Interpretation:</strong> Losing subsidy eligibility causes a <strong>{pct_effect:.1f}%</strong> change in medical expenditure.<br>
    <strong>Statistical Significance:</strong> {significance}
    </div>

    <div class="warning-box">
    <strong>‚ö†Ô∏è Methodological Note:</strong><br>
    This analysis uses <strong>manual local linear regression</strong> instead of rdrobust package due to numerical
    instability with mass points in the data. The manual method:<br>
    ‚Ä¢ Uses data-driven bandwidth selection (rule of thumb: h = 1.84¬∑œÉ¬∑n^(-1/5))<br>
    ‚Ä¢ Employs local linear regression with heteroskedasticity-robust standard errors<br>
    ‚Ä¢ Centers each person at their own eligibility loss moment (person-specific cutoffs)<br>
    ‚Ä¢ Is more robust to discrete running variables than automated packages
    </div>

    <h2>1. Data Transformation</h2>
    <p>To handle outliers and skewness: (1) Winsorization at 1st and 99th percentiles, (2) Log transformation.</p>
    <img src="data:image/png;base64,{dist_base64}" style="width:95%;max-width:900px;">

    <h2>2. Continuity Check (McCrary Test)</h2>
    <p>Testing for manipulation around the cutoff. A smooth distribution suggests no manipulation.</p>
    <img src="data:image/png;base64,{cont_base64}" style="width:90%;max-width:800px;">

    <h2>3. RDD Mean Plot (Binned Scatter)</h2>
    <p>Visual evidence of discontinuity at eligibility loss using binned means and local linear regression.</p>
    <img src="data:image/png;base64,{rdd_base64}" style="width:90%;max-width:800px;">

    <h2>4. RDD Estimates (Manual Method)</h2>
    <pre style="background:#f8f9fa; border:1px solid #ccc; padding:15px; white-space:pre-wrap; font-family:monospace;">
{rdd_summary_text}
    </pre>

    <h2>5. RDD Visualization (High-Resolution Plot)</h2>
    <p><strong>Main treatment effect visualization.</strong> The discontinuity at 0 shows the causal effect of losing subsidy eligibility.</p>
    <img src="data:image/png;base64,{manual_rdd_base64}" style="width:95%;max-width:1000px;">

    <h2>6. Interpretation & Policy Implications</h2>
    <div class="info-box">
    <strong>Summary:</strong><br>
    This analysis provides causal evidence on the effect of losing child medical subsidy eligibility.
    By exploiting the sharp discontinuity in eligibility at the policy-determined cutoff,
    we estimate that losing subsidy causes a <strong>{pct_effect:.1f}%</strong> change in medical expenditure.<br>
    <br>
    <strong>Validity:</strong><br>
    ‚úì Continuity check shows no manipulation around cutoff<br>
    ‚úì Visual plots show clear discontinuity at eligibility loss<br>
    ‚úì Person-specific cutoffs eliminate fuzzy treatment assignment<br>
    ‚úì Robust standard errors account for heteroskedasticity<br>
    <br>
    <strong>Policy Implications:</strong><br>
    {"The significant effect suggests that subsidy removal has a meaningful impact on healthcare utilization among young adults." if tau_pval < 0.05 else "The non-significant effect suggests limited impact of subsidy removal on healthcare utilization."}
    </div>

    <p style="text-align:center;margin-top:40px;color:#7f8c8d;">‚úÖ Report saved at: <code>{file_path}</code></p>
    </body></html>
    """

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    print("="*70)
    print(f"‚úÖ RDD Report generated: {file_path}")
    print("="*70)

    return file_path

In [35]:
run_full_rdd_analysis(df_rdd)


üìä Person-Specific Cutoff Summary:
   Average cutoff age: 18.47 years
   Cutoff range: [18.00, 18.92] years
   This represents birth month variation (Mar births: 18.00, Apr births: 18.92)
   Observations within 2 years of the cutoff: 137,098

Running MANUAL RDD Estimation (Robust Method)
   Using bandwidth: 0.500 years (6.0 months)
   Observations in local window: 31,455
   - Left of cutoff: 31,455
   - Right of cutoff: 0

   üìä RDD Estimation Results:
   Method: Local Linear Regression (bandwidth = 0.500)
   Treatment Effect (œÑ): 7.8744
   Standard Error: 0.0045
   t-statistic: 1742.402
   p-value: 0.0000
   95% CI: [7.8656, 7.8833]

   üí° Interpretation:
   Losing subsidy causes a 262822.9% change in medical expenditure
   Result is highly significant (p < 0.01) ***

   Generating RDD visualization...
   ‚úÖ Successfully generated manual RDD plot
‚úÖ RDD Report generated: Mitaron/RDD_Results/Pleeeeese_20251110_0247.html


'Mitaron/RDD_Results/Pleeeeese_20251110_0247.html'