In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings
import pymc as pm
import arviz as az
import causalpy 
import seaborn as sns
from patsy import build_design_matrices, dmatrices
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import json

from library.synthetic_control import *
from library.data_generator import generate_gaussian_process_data
from library.synthetic_did import SyntheticDIDModel
from library.synthetic_bayes import WeightedSumFitter


# Notebook-wide setup: use the ggplot look for every matplotlib figure and
# silence all Python warnings for the remainder of the session.
plt.style.use("ggplot")
warnings.simplefilter("ignore")

In [4]:
# Raw observations table; later cells pivot it on its 'time' and 'shopno'
# columns to build one wide matrix per metric.
real_data = pd.read_parquet(path="data/data.pt")

In [5]:
def run_synthetic_control_analysis(model_name: str,
                                   data: pd.DataFrame,
                                   output_dir: str = "results",
                                   metric: str = "summary",
                                   T0: int = 70,
                                   effect_sizes: list = None,
                                   hdi_prob: float = 0.95):
    """Placebo / power study of a synthetic-control estimator.

    Every column of ``data`` is used in turn as the response, with all other
    columns as predictors, and fitted with ``causalpy.SyntheticControl``.

    1. Placebo pass (no effect injected): a unit is a false positive when the
       HDI of its mean post-period impact excludes zero.  The share of such
       units is reported as the Type I error.
    2. Power pass: for each non-zero effect size the response's rows with
       ``index >= T0`` are multiplied by ``1 + eff / 100`` and the model is
       refitted.  A unit is a false negative when the HDI still contains
       zero (Type II error).  The estimated mean impact is converted to a
       percentage of the untreated post-period mean so it is on the same
       scale as ``eff`` before the RMSPE is computed.

    The summary table is printed, written as JSON to
    ``{output_dir}/summary_{model_name}_{metric}.json`` and returned.

    Parameters
    ----------
    model_name : str
        Label stored in the JSON payload and used in the output file name.
    data : pd.DataFrame
        One column per unit.  Column names are spliced into a model formula
        and the index is compared against ``T0``.
    output_dir : str
        Directory for the JSON file; created if it does not exist.
    metric : str
        Label stored in the JSON payload and used in the output file name.
    T0 : int
        Treatment time: passed as ``treatment_time`` to the model, and rows
        with ``index >= T0`` receive the injected effect.
    effect_sizes : list, optional
        Effect sizes in percent.  Zero entries are skipped in the power pass
        (the placebo pass always runs).  Defaults to ``[0, 5, 10, 15, 20]``.
    hdi_prob : float
        Probability mass of the HDI used for the detection decision.

    Returns
    -------
    pd.DataFrame
        One row per non-zero effect size with columns ``effect_size``,
        ``type1_error``, ``type2_error`` and ``rmspe``.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two columns (no predictor would remain).
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    if effect_sizes is None:
        effect_sizes = [0, 5, 10, 15, 20]

    units = data.columns.tolist()
    if len(units) < 2:
        raise ValueError(
            "data must contain at least two columns: one response and one predictor"
        )

    # Create the output directory first so a long sampling run cannot end in
    # a FileNotFoundError when the results are finally written.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Rows that receive the synthetic effect (same rule for every unit).
    post_mask = data.index >= T0
    results = []

    def make_model():
        # A fresh fitter per fit; the fixed seed makes repeated runs comparable.
        return WeightedSumFitter(
            sample_kwargs={"target_accept": .95, "random_seed": 42, "progressbar": False, "draws": 300, "tune": 400, "chains":2}
        )

    def fit_impact(frame, response):
        """Fit one unit and return (mean, hdi_low, hdi_high) of its mean post-period impact."""
        predictors = [c for c in units if c != response]
        formula = f"{response} ~ 1 + " + " + ".join(predictors)
        fit = causalpy.SyntheticControl(
            frame,
            treatment_time=T0,
            formula=formula,
            model=make_model()
        )
        summ = az.summary(fit.post_impact.mean("obs_ind"), hdi_prob=hdi_prob)
        # The HDI column labels depend on hdi_prob ("hdi_2.5%" only exists for
        # 0.95), so find them by prefix instead of hard-coding the names.
        lower_col, upper_col = [c for c in summ.columns if str(c).startswith("hdi_")]
        row = summ.iloc[0]  # explicit positional access
        return float(row["mean"]), float(row[lower_col]), float(row[upper_col])

    # ---- Placebo pass: nothing injected, any detection is a false positive.
    fp_flags = []
    for response in units:
        _, low, high = fit_impact(data, response)
        fp_flags.append((low > 0) or (high < 0))
    type1_error = sum(fp_flags) / len(units)
    print(f"Type I error (false-positive rate): {type1_error:.3f}\n")

    # ---- Power pass: skip zeros by value rather than assuming that the first
    # list element is the placebo entry.
    for eff in (e for e in effect_sizes if e != 0):
        fn_flags = []
        ests = []
        for response in units:
            dat = data.copy()
            dat.loc[post_mask, response] *= 1 + eff / 100

            mean_eff, low, high = fit_impact(dat, response)

            detected = (low > 0) or (high < 0)
            fn_flags.append(not detected)

            # The impact is in outcome units while eff is in percent: express
            # the estimate relative to the untreated post-period mean so both
            # are percentages.  A zero baseline makes the ratio undefined.
            baseline = float(data.loc[post_mask, response].mean())
            ests.append(100.0 * mean_eff / baseline if baseline else float("nan"))

        type2_error = sum(fn_flags) / len(units)
        errors = np.array(ests) - eff
        rmspe = float(np.sqrt(np.mean(errors**2)))

        print(f"=== Effect {eff}% ===")
        print(f"Type II error (false-negative rate): {type2_error:.3f}")
        print(f"RMSPE of effect estimates: {rmspe:.3f}\n")

        results.append({
            "effect_size": eff,
            "type1_error": type1_error,
            "type2_error": type2_error,
            "rmspe": rmspe,
        })

    summary = pd.DataFrame(results)
    print("=== Summary ===")
    print(summary.to_string(index=False))

    result_dict = {
        "model": model_name,
        "metric": metric,
        "summary": summary.to_dict(orient="records")
    }
    filename = f"{output_dir}/summary_{model_name}_{metric}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=2)
    print(f"Результаты сохранены в {filename}")

    return summary


In [None]:
# Wide matrix for this metric: one row per 'time' value, one column per
# 'shopno'.  The 'time' index is replaced by a 0..n-1 integer index, which is
# what the function's integer T0 is compared against.
data_bayes = (
    real_data
    .pivot(index="time", columns="shopno", values="preprocessed_avg_delivery")
    .reset_index(drop=True)
    .rename_axis(None, axis="columns")
)

run_synthetic_control_analysis(
    data=data_bayes,
    model_name="BayesianSyntheticControl",
    metric="preprocessed_avg_delivery",
)

In [None]:
# Wide matrix for this metric: one row per 'time' value, one column per
# 'shopno'.  The 'time' index is replaced by a 0..n-1 integer index, which is
# what the function's integer T0 is compared against.
data_bayes = (
    real_data
    .pivot(index="time", columns="shopno", values="preprocessed_orders_per_courier")
    .reset_index(drop=True)
    .rename_axis(None, axis="columns")
)

run_synthetic_control_analysis(
    data=data_bayes,
    model_name="BayesianSyntheticControl",
    metric="preprocessed_orders_per_courier",
)

In [None]:
# Wide matrix for this metric: one row per 'time' value, one column per
# 'shopno'.  The 'time' index is replaced by a 0..n-1 integer index, which is
# what the function's integer T0 is compared against.
data_bayes = (
    real_data
    .pivot(index="time", columns="shopno", values="preprocessed_distance")
    .reset_index(drop=True)
    .rename_axis(None, axis="columns")
)

run_synthetic_control_analysis(
    data=data_bayes,
    model_name="BayesianSyntheticControl",
    metric="preprocessed_distance",
)

In [None]:
# Wide matrix for this metric: one row per 'time' value, one column per
# 'shopno'.  The 'time' index is replaced by a 0..n-1 integer index, which is
# what the function's integer T0 is compared against.
data_bayes = (
    real_data
    .pivot(index="time", columns="shopno", values="preprocessed_avg_collection_time")
    .reset_index(drop=True)
    .rename_axis(None, axis="columns")
)

run_synthetic_control_analysis(
    data=data_bayes,
    model_name="BayesianSyntheticControl",
    metric="preprocessed_avg_collection_time",
)

In [None]:
# Wide matrix for this metric: one row per 'time' value, one column per
# 'shopno'.  The 'time' index is replaced by a 0..n-1 integer index, which is
# what the function's integer T0 is compared against.
data_bayes = (
    real_data
    .pivot(index="time", columns="shopno", values="preprocessed_percent_late")
    .reset_index(drop=True)
    .rename_axis(None, axis="columns")
)

run_synthetic_control_analysis(
    data=data_bayes,
    model_name="BayesianSyntheticControl",
    metric="preprocessed_percent_late",
)