In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore")
import json

import pymc as pm
import arviz as az
import causalpy 
import seaborn as sns
from patsy import build_design_matrices, dmatrices
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from library.synthetic_did_best import *


# Apply the ggplot matplotlib style to every figure created after this point.
plt.style.use('ggplot')

In [8]:
# Label of the estimator under study. NOTE(review): the simulation calls visible
# in this notebook pass the same label explicitly instead of reading this global.
model = 'SyntheticDiff-in-Diff'

# Source panel for the simulations; the file is read with the parquet reader
# even though its extension is ".pt".
real_data = pd.read_parquet('data/data.pt')

In [11]:
def run_synthetic_diff_in_diff_simulation(metric: str,
                                     model: str = "SyntheticDiff-in-Diff",
                                     real_data: pd.DataFrame = real_data,
                                     T0: int = 70,
                                     T1: int = 90,
                                     bootstrap_rounds: int = 100,
                                     effect_sizes: list = [0.05, 0.10, 0.15, 0.20],
                                     output_dir: str = "results"):
    """Estimate Type I / Type II error rates of the synthetic DiD estimator.

    Every unit in ``real_data`` is treated in turn as a placebo "treated" unit
    with the post-treatment period starting at ``time >= T0``:

    * Type I pass: the data is left untouched; a run counts as a false
      positive when the 95% interval from ``estimate_se`` excludes zero.
    * Type II pass (once per entry of ``effect_sizes``): the treated unit's
      ``y`` is multiplied by ``1 - effect_size`` for ``T0 <= time <= T1``;
      a run counts as a false negative when the interval still contains zero.

    The summary is printed, written to
    ``{output_dir}/summary_{model}_{metric}.json`` and returned.

    Parameters
    ----------
    metric : str
        Column of ``real_data`` used as the outcome (renamed to ``y``).
    model : str
        Label stored in the JSON payload and used in the output file name.
    real_data : pd.DataFrame
        Panel with at least the columns ``shopno``, ``time`` and ``metric``.
    T0 : int
        First period flagged as after-treatment.
    T1 : int
        Last period (inclusive) that receives the injected effect in the
        Type II pass. NOTE(review): periods after ``T1`` are still flagged as
        after-treatment but carry no injected effect -- confirm this is intended.
    bootstrap_rounds : int
        Forwarded to ``SyntheticDIDModel``.
    effect_sizes : list
        Relative decreases to inject, e.g. ``0.05`` for a 5% drop.
    output_dir : str
        Directory for the JSON summary; created if it does not exist.

    Returns
    -------
    pd.DataFrame
        One row per effect size with the columns ``effect_size``,
        ``type1_error``, ``type2_error`` and ``mean_rmspe``.

    Raises
    ------
    ValueError
        If ``real_data`` contains no units.
    """
    import os  # local import: only needed to create the output directory

    # Snapshot the sequence so the shared default list can never be mutated.
    effect_sizes = list(effect_sizes)

    template = real_data.rename(columns={'shopno': 'unit', metric: 'y'})[['unit', 'time', 'y']]
    units = template['unit'].unique()
    if len(units) == 0:
        raise ValueError("real_data contains no units to simulate on")

    def _simulate(unit, effect_size=None):
        """Fit the model with `unit` as treated; return (abs(att), is_significant, se).

        When `effect_size` is given, the treated unit's outcome is scaled by
        (1 - effect_size) inside the [T0, T1] window before fitting.
        """
        data_sim = template.copy()
        is_unit = data_sim['unit'] == unit
        data_sim['treated'] = is_unit
        data_sim['after_treatment'] = data_sim['time'] >= T0

        if effect_size is not None:
            in_window = is_unit & (data_sim['time'] >= T0) & (data_sim['time'] <= T1)
            data_sim.loc[in_window, 'y'] *= (1 - effect_size)

        sc = SyntheticDIDModel(
            data=data_sim,
            metric="y",
            period_index="time",
            shopno="unit",
            treated="treated",
            after_treatment="after_treatment",
            bootstrap_rounds=bootstrap_rounds,
            seed=42,
            njobs=-1
        )
        att, _unit_w, _time_w, _model_fit, _intercept = sc.synthetic_diff_in_diff()
        se, ci_low, ci_high = sc.estimate_se(alpha=0.05)
        # Significant means the confidence interval does not cover zero.
        is_significant = (ci_low > 0) or (ci_high < 0)
        return abs(att), is_significant, se

    print("=== Type I error (no effect) ===")
    out1 = [_simulate(u) for u in tqdm(units, desc="Type I sims")]

    _, fp1, se1 = zip(*out1)
    t1_error = sum(fp1) / len(units)
    mean_se = float(np.mean(se1))
    print(f"Type I error rate: {t1_error:.3f}")
    print(f"Mean se (Type I): {mean_se:.3f}\n")

    results = []
    for eff in effect_sizes:
        print(f"=== Effect size {eff:.0%} ===")
        out2 = [_simulate(u, eff) for u in tqdm(units, desc=f"Type II sims {eff:.0%}")]

        # A false negative is a run where the injected effect was not detected.
        fn_count = sum(not significant for _, significant, _ in out2)
        t2_error = fn_count / len(units)
        print(f"Type II error rate: {t2_error:.3f}")

        results.append({
            'effect_size': eff,
            'type1_error': t1_error,
            'type2_error': t2_error,
            # Legacy key name kept for downstream readers: the value is the
            # mean standard error from the Type I pass, not an RMSPE.
            'mean_rmspe': mean_se
        })

    summary = pd.DataFrame(results)
    print("=== Summary ===")
    print(summary.to_string(index=False))

    result_dict = {
        "model": model,
        "metric": metric,
        "summary": summary.to_dict(orient="records")
    }
    # Make sure the target directory exists so a fresh checkout does not fail
    # after the whole (expensive) simulation has already run.
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{output_dir}/summary_{model}_{metric}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=2)
    print(f"Результаты сохранены в {filename}")

    return summary

In [None]:
# Error-rate simulation on the `preprocessed_avg_delivery` column; the returned
# summary frame is the cell output.
run_synthetic_diff_in_diff_simulation(
    "preprocessed_avg_delivery",
    real_data=real_data,
    model="SyntheticDiff-in-Diff",
)

In [2]:
# Error-rate simulation on the `preprocessed_orders_per_courier` column; the
# returned summary frame is the cell output.
run_synthetic_diff_in_diff_simulation(
    "preprocessed_orders_per_courier",
    real_data=real_data,
    model="SyntheticDiff-in-Diff",
)

In [None]:
# Error-rate simulation on the `preprocessed_distance` column; the returned
# summary frame is the cell output.
run_synthetic_diff_in_diff_simulation(
    "preprocessed_distance",
    real_data=real_data,
    model="SyntheticDiff-in-Diff",
)

In [None]:
# Error-rate simulation on the `preprocessed_avg_collection_time` column; the
# returned summary frame is the cell output.
run_synthetic_diff_in_diff_simulation(
    "preprocessed_avg_collection_time",
    real_data=real_data,
    model="SyntheticDiff-in-Diff",
)

In [None]:
# Error-rate simulation on the `preprocessed_percent_late` column; the returned
# summary frame is the cell output.
run_synthetic_diff_in_diff_simulation(
    "preprocessed_percent_late",
    real_data=real_data,
    model="SyntheticDiff-in-Diff",
)