In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore")
import json

import pymc as pm
import arviz as az
import causalpy 
import seaborn as sns
from patsy import build_design_matrices, dmatrices
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

from library.synthetic_control_best import *
from joblib import Parallel, delayed


plt.style.use('ggplot')

In [10]:
# Notebook-level configuration and input data.
# `model` is a free-text label; `real_data` is the panel read from a parquet
# file and is also captured as the default `real_data` argument of
# run_synthetic_control_simulation when that function is defined.
model = "SyntheticControl"
real_data = pd.read_parquet("data/data.pt")

In [11]:
def run_synthetic_control_simulation(metric: str,
                                     model: str = "SyntheticControl",
                                     real_data: pd.DataFrame = real_data,
                                     T0: int = 70,
                                     T1: int = 90,
                                     bootstrap_rounds: int = 100,
                                     effect_sizes: list = [0.05, 0.10, 0.15, 0.20],
                                     output_dir: str = "results") -> pd.DataFrame:
    """Estimate Type I / Type II error rates of the synthetic-control test by simulation.

    Every unit in ``real_data`` is, in turn, labelled as the single treated
    unit. First the estimator is run on untouched data (any "significant"
    result is a false positive); then, for each entry of ``effect_sizes``, the
    treated unit's metric is scaled by ``(1 - effect_size)`` for
    ``T0 <= time <= T1`` and a non-significant result is counted as a false
    negative. A result is called significant when the interval returned by
    ``estimate_se_sc(alpha=0.05)`` excludes zero.

    Parameters
    ----------
    metric : str
        Column of ``real_data`` to analyse; it is renamed to ``'y'`` internally.
    model : str
        Label written to the JSON file and used in its name. It does NOT select
        the estimator: ``SyntheticControl`` is always used.
    real_data : pd.DataFrame
        Panel with columns ``'shopno'`` (renamed to ``'unit'``), ``'time'`` and
        ``metric``. NOTE: the default is the module-level ``real_data`` object
        as it existed when this function was defined; re-run the ``def`` cell
        (or pass the frame explicitly) after reloading the data.
    T0 : int
        First period flagged as ``after_treatment`` (``time >= T0``).
    T1 : int
        Last period (inclusive) in which the synthetic effect is injected.
        Periods after ``T1`` are still flagged as post-treatment but carry no
        injected effect.
    bootstrap_rounds : int
        Forwarded to ``SyntheticControl``.
    effect_sizes : list
        Relative reductions to inject, e.g. ``0.10`` multiplies ``y`` by ``0.90``.
    output_dir : str
        Directory for ``summary_{model}_{metric}.json``; created if missing.

    Returns
    -------
    pd.DataFrame
        One row per effect size with columns ``effect_size``, ``type1_error``,
        ``type2_error`` and ``mean_se`` (``type1_error`` and ``mean_se`` come
        from the no-effect run and are repeated on every row).

    Raises
    ------
    KeyError
        If the unit, time or metric column is absent from ``real_data``.
    ValueError
        If ``real_data`` contains no units.
    """
    import os  # local import so the block does not depend on the notebook's import cell

    renamed = real_data.rename(columns={'shopno': 'unit', metric: 'y'})
    missing = [col for col in ('unit', 'time', 'y') if col not in renamed.columns]
    if missing:
        raise KeyError(
            f"real_data lacks required column(s) {missing} after renaming "
            f"'shopno' -> 'unit' and '{metric}' -> 'y'"
        )
    # Column selection already yields a new frame; each simulation copies it again.
    template = renamed[['unit', 'time', 'y']]
    units = template['unit'].unique()
    if len(units) == 0:
        raise ValueError("real_data contains no units to simulate")

    # Copy so the shared default list can never be mutated through this call.
    effect_sizes = list(effect_sizes)

    # Create the output directory BEFORE the expensive simulations: a missing
    # directory would otherwise only surface at the final open() and discard
    # several minutes of work.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def _labelled_panel(unit):
        """Return a fresh copy of the panel with `unit` flagged as the treated one."""
        data_sim = template.copy()
        data_sim['treated'] = data_sim['unit'] == unit
        data_sim['after_treatment'] = data_sim['time'] >= T0
        return data_sim

    def _fit(data_sim):
        """Fit SyntheticControl and return (att, se, significant-at-alpha=0.05)."""
        sc = SyntheticControl(
            data=data_sim, metric="y", period_index="time",
            shopno="unit", treated="treated",
            after_treatment="after_treatment",
            bootstrap_rounds=bootstrap_rounds, seed=42
        )
        # synthetic_control() is called before estimate_se_sc(), as in the
        # original code; presumably the second call relies on the first.
        att, _ = sc.synthetic_control()
        se, ci_low, ci_high = sc.estimate_se_sc(alpha=0.05)
        significant = bool((ci_low > 0) or (ci_high < 0))
        return att, se, significant

    def simulate_type1(unit):
        """No injected effect: a significant result is a false positive."""
        att, se, significant = _fit(_labelled_panel(unit))
        return abs(att), significant, se

    def simulate_type2(unit, effect_size):
        """Inject a relative drop into `unit`: a non-significant result is a false negative."""
        data_sim = _labelled_panel(unit)
        mask = (
            (data_sim['unit'] == unit) &
            (data_sim['time'] >= T0) &
            (data_sim['time'] <= T1)
        )
        # Multiplicative effect: it only shifts the series when the metric is
        # not centred near zero.
        data_sim.loc[mask, 'y'] *= (1 - effect_size)
        att, _, significant = _fit(data_sim)
        return abs(att), (not significant)

    print("=== Type I error (no treatment effect) ===")
    # NOTE: tqdm wraps the task generator, so the bar tracks task dispatch
    # rather than task completion.
    type1_out = Parallel(n_jobs=-1)(
        delayed(simulate_type1)(u) for u in tqdm(units, desc="Type I sims")
    )
    _, fp_flags, se = zip(*type1_out)
    type1_error = sum(fp_flags) / len(units)
    mean_se = float(np.mean(se))
    print(f"Type I error: {type1_error:.3f}, Mean se: {mean_se:.3f}\n")

    results = []
    for eff in effect_sizes:
        print(f"=== effect_size = {eff:.0%} ===")
        type2_out = Parallel(n_jobs=-1)(
            delayed(simulate_type2)(u, eff) for u in tqdm(units, desc=f"Type II sims {eff:.0%}")
        )
        _, fn_flags = zip(*type2_out)
        type2_error = sum(fn_flags) / len(units)
        print(f"Type II error: {type2_error:.3f}\n")

        results.append({
            "effect_size": eff,
            "type1_error": type1_error,
            "type2_error": type2_error,
            "mean_se": mean_se
        })

    summary = pd.DataFrame(results)
    print("=== Summary ===")
    print(summary.to_string(index=False))

    result_dict = {
        "model": model,
        "metric": metric,
        "summary": summary.to_dict(orient="records")
    }
    filename = f"{output_dir}/summary_{model}_{metric}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=2)
    print(f"Результаты сохранены в {filename}")

    return summary


In [None]:
# Error-rate simulation for the `preprocessed_avg_delivery` column.
# NOTE(review): the saved output of this cell stops part-way through the
# Type I pass, so no summary for this metric is recorded here; re-run it.
run_synthetic_control_simulation(
    "preprocessed_avg_delivery", model="SyntheticControl", real_data=real_data,
)

=== Type I error (no treatment effect) ===


Type I sims:  44%|████▍     | 40/90 [01:49<02:41,  3.23s/it]

In [5]:
# Error-rate simulation for the `preprocessed_orders_per_courier` column.
run_synthetic_control_simulation(
    "preprocessed_orders_per_courier", model="SyntheticControl", real_data=real_data,
)

=== Type I error (no treatment effect) ===


Type I sims: 100%|██████████| 90/90 [01:01<00:00,  1.47it/s]


Type I error: 0.033, Mean se: 1.099

=== effect_size = 5% ===


Type II sims 5%: 100%|██████████| 90/90 [01:01<00:00,  1.47it/s]


Type II error: 0.900

=== effect_size = 10% ===


Type II sims 10%: 100%|██████████| 90/90 [01:00<00:00,  1.48it/s]


Type II error: 0.611

=== effect_size = 15% ===


Type II sims 15%: 100%|██████████| 90/90 [01:00<00:00,  1.48it/s]


Type II error: 0.244

=== effect_size = 20% ===


Type II sims 20%: 100%|██████████| 90/90 [01:01<00:00,  1.46it/s]


Type II error: 0.056

=== Summary ===
 effect_size  type1_error  type2_error  mean_se
        0.05     0.033333     0.900000 1.098505
        0.10     0.033333     0.611111 1.098505
        0.15     0.033333     0.244444 1.098505
        0.20     0.033333     0.055556 1.098505
Результаты сохранены в results/summary_SyntheticControl_preprocessed_orders_per_courier.json


Unnamed: 0,effect_size,type1_error,type2_error,mean_se
0,0.05,0.033333,0.9,1.098505
1,0.1,0.033333,0.611111,1.098505
2,0.15,0.033333,0.244444,1.098505
3,0.2,0.033333,0.055556,1.098505


In [6]:
# Error-rate simulation for the `preprocessed_distance` column.
run_synthetic_control_simulation(
    "preprocessed_distance", model="SyntheticControl", real_data=real_data,
)

=== Type I error (no treatment effect) ===


Type I sims: 100%|██████████| 90/90 [01:56<00:00,  1.29s/it]


Type I error: 0.100, Mean se: 0.067

=== effect_size = 5% ===


Type II sims 5%: 100%|██████████| 90/90 [01:56<00:00,  1.29s/it]


Type II error: 0.922

=== effect_size = 10% ===


Type II sims 10%: 100%|██████████| 90/90 [01:56<00:00,  1.29s/it]


Type II error: 0.456

=== effect_size = 15% ===


Type II sims 15%: 100%|██████████| 90/90 [01:56<00:00,  1.29s/it]


Type II error: 0.033

=== effect_size = 20% ===


Type II sims 20%: 100%|██████████| 90/90 [01:56<00:00,  1.30s/it]


Type II error: 0.000

=== Summary ===
 effect_size  type1_error  type2_error  mean_se
        0.05          0.1     0.922222 0.067414
        0.10          0.1     0.455556 0.067414
        0.15          0.1     0.033333 0.067414
        0.20          0.1     0.000000 0.067414
Результаты сохранены в results/summary_SyntheticControl_preprocessed_distance.json


Unnamed: 0,effect_size,type1_error,type2_error,mean_se
0,0.05,0.1,0.922222,0.067414
1,0.1,0.1,0.455556,0.067414
2,0.15,0.1,0.033333,0.067414
3,0.2,0.1,0.0,0.067414


In [7]:
# Error-rate simulation for the `preprocessed_avg_collection_time` column.
run_synthetic_control_simulation(
    "preprocessed_avg_collection_time", model="SyntheticControl", real_data=real_data,
)

=== Type I error (no treatment effect) ===


Type I sims: 100%|██████████| 90/90 [01:22<00:00,  1.08it/s]


Type I error: 0.067, Mean se: 0.070

=== effect_size = 5% ===


Type II sims 5%: 100%|██████████| 90/90 [01:23<00:00,  1.08it/s]


Type II error: 0.856

=== effect_size = 10% ===


Type II sims 10%: 100%|██████████| 90/90 [01:25<00:00,  1.05it/s]


Type II error: 0.556

=== effect_size = 15% ===


Type II sims 15%: 100%|██████████| 90/90 [01:26<00:00,  1.05it/s]


Type II error: 0.156

=== effect_size = 20% ===


Type II sims 20%: 100%|██████████| 90/90 [01:24<00:00,  1.06it/s]


Type II error: 0.022

=== Summary ===
 effect_size  type1_error  type2_error  mean_se
        0.05     0.066667     0.855556 0.069506
        0.10     0.066667     0.555556 0.069506
        0.15     0.066667     0.155556 0.069506
        0.20     0.066667     0.022222 0.069506
Результаты сохранены в results/summary_SyntheticControl_preprocessed_avg_collection_time.json


Unnamed: 0,effect_size,type1_error,type2_error,mean_se
0,0.05,0.066667,0.855556,0.069506
1,0.1,0.066667,0.555556,0.069506
2,0.15,0.066667,0.155556,0.069506
3,0.2,0.066667,0.022222,0.069506


In [8]:
# Error-rate simulation for the `preprocessed_percent_late` column.
# NOTE(review): in the recorded output the Type II error stays around
# 0.91-0.94 for every effect size instead of falling as the effect grows.
# Possibly the multiplicative effect injection barely moves this metric
# (e.g. values near or around zero) - confirm before using these numbers.
run_synthetic_control_simulation(
    "preprocessed_percent_late", model="SyntheticControl", real_data=real_data,
)

=== Type I error (no treatment effect) ===


Type I sims: 100%|██████████| 90/90 [01:12<00:00,  1.24it/s]


Type I error: 0.100, Mean se: 0.636

=== effect_size = 5% ===


Type II sims 5%: 100%|██████████| 90/90 [01:11<00:00,  1.25it/s]


Type II error: 0.911

=== effect_size = 10% ===


Type II sims 10%: 100%|██████████| 90/90 [01:14<00:00,  1.21it/s]


Type II error: 0.933

=== effect_size = 15% ===


Type II sims 15%: 100%|██████████| 90/90 [01:10<00:00,  1.29it/s]


Type II error: 0.944

=== effect_size = 20% ===


Type II sims 20%: 100%|██████████| 90/90 [01:11<00:00,  1.26it/s]


Type II error: 0.922

=== Summary ===
 effect_size  type1_error  type2_error  mean_se
        0.05          0.1     0.911111 0.635863
        0.10          0.1     0.933333 0.635863
        0.15          0.1     0.944444 0.635863
        0.20          0.1     0.922222 0.635863
Результаты сохранены в results/summary_SyntheticControl_preprocessed_percent_late.json


Unnamed: 0,effect_size,type1_error,type2_error,mean_se
0,0.05,0.1,0.911111,0.635863
1,0.1,0.1,0.933333,0.635863
2,0.15,0.1,0.944444,0.635863
3,0.2,0.1,0.922222,0.635863
