In [1]:
from helper_functions import *
from sklearn.model_selection import ParameterGrid
import multiprocessing
from functools import partial
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global seed: fixes NumPy's legacy global RNG and is the default base seed
# of the simulation functions defined in the following cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Models fitted in every replicate, in evaluation order.
models_list = [
    "DCM Model",
    "DSM Model",
    "Cox Regression Model",
    "Cox IPTW Model",
    "RSF Model",
    "RSF IPTW Model",
    "AFT Model",
    "AFT IPTW Model",
]

# Parameter grid per model name. Every option is a single-element list, so each
# grid describes exactly one configuration. Each entry is its own dict object.
params_grids = {
    "DCM Model": dict(k=[3], layers=[[50, 50]], iters=[50], learning_rate=[1e-3]),
    "DCM IPTW Model": dict(k=[3], layers=[[50, 50]], iters=[50], learning_rate=[1e-3]),
    "DSM Model": dict(k=[3], distribution=["Weibull"], layers=[[100]], iters=[50], learning_rate=[1e-3]),
    "DSM IPTW Model": dict(k=[3], distribution=["Weibull"], layers=[[100]], iters=[50], learning_rate=[1e-3]),
    "Cox Regression Model": dict(penalizer=[0.01]),
    "Cox IPTW Model": dict(penalizer=[0.01]),
    "RSF Model": dict(n_estimators=[100], max_depth=[5]),
    "RSF IPTW Model": dict(n_estimators=[100], max_depth=[5]),
    "AFT Model": dict(penalizer=[0.01]),
    "AFT IPTW Model": dict(penalizer=[0.01]),
}

In [3]:
def _run_replicate(params_grids,
                   save_dict=None,
                   rep: int=100,
                   num_group: int=3,
                   num_sample: int=1000,
                   censor_rate: float=0.3,
                   group_hetero_scale: float=2.0,
                   random_seed_base=RANDOM_SEED,
                   models_list=models_list,
                   ):
    """
    Worker function for a single replicate: runs data generation, processing, model fitting, and metrics.
    :param params_grids: dict mapping model name -> parameter grid handed to that model's wrapper
                         (a model without an entry receives ``params_grid=None``)
    :param save_dict: dict of fixed simulation parameters, forwarded as keyword arguments to
                      ``generate_simulated_data``; ``None`` forwards nothing
    :param rep: index of this replicate; added to ``random_seed_base`` to obtain the data seed
    :param num_group: number of latent groups
    :param num_sample: number of samples to simulate
    :param censor_rate: censoring rate
    :param group_hetero_scale: heterogeneity scale between groups
    :param random_seed_base: base random seed shared by all replicates
    :param models_list: names of the models to fit (each must be a key of ``model_dict``)
    :return: dict of {model: metrics} for this replicate
    """
    # A distinct, reproducible seed per replicate.
    seed = random_seed_base + rep

    # ``save_dict`` defaults to None; unpacking None with ** would raise TypeError.
    fixed_params = save_dict if save_dict is not None else {}
    df, _, lambda0, lambda1, Z, _, _ = generate_simulated_data(
        num_sample=num_sample,
        num_group=num_group,
        censor_rate=censor_rate,
        group_hetero_scale=group_hetero_scale,
        random_seed=seed,
        **fixed_params  # Use fixed parameters
    )

    # Identify features: every "X_c*" column plus the treatment indicator is categorical
    categorical_features_list = [col for col in df.columns if col.startswith("X_c")] + ["treatment"]

    # Process data into train / validation / test splits
    (X_train, X_val, X_test), (t_train, t_val, t_test), (e_train, e_val, e_test), _, _ = processing_data_2_DCM(
        df,
        categorical_features_list,
        train_test_val_size=None)

    # Covariates passed to the IPTW models: everything except outcome and treatment columns
    covariates = [col for col in X_train.columns if col not in ["time", "event", "treatment"]]

    # Time grid for predictions: spans the test-set times; endpoint=False drops the
    # maximum itself so every grid point is strictly below the largest test time.
    min_time = np.min(t_test)
    max_time = np.max(t_test)
    times = np.linspace(min_time, max_time, 100, endpoint=False)

    # Test indices for subsetting the simulated per-sample quantities.
    # NOTE(review): assumes Z / lambda0 / lambda1 can be indexed by X_test's index
    # labels (i.e. they are aligned with df's index) - confirm against generate_simulated_data.
    test_idx = X_test.index

    rep_results = {}
    for model_name in models_list:
        print(model_name)
        params_grid = params_grids.get(model_name)

        # Only the IPTW wrappers take the covariate list.
        if 'IPTW' in model_name:
            model_wrapper = model_dict[model_name](params_grid=params_grid,
                                                   covariates=covariates)
        else:
            model_wrapper = model_dict[model_name](params_grid=params_grid)

        model_wrapper.fit((X_train, t_train, e_train), (X_val, t_val, e_val))

        rep_results[model_name] = compute_metrics(model_wrapper,
                                                  X_test, t_test, e_test,
                                                  times,
                                                  Z[test_idx], lambda0[test_idx], lambda1[test_idx])
    return rep_results

In [4]:
def run_simulation(num_replicates: int=100,
                   num_groups_list: list=[1, 3, 5],
                   num_sample: int=1000,
                   censor_rate: float=0.3,
                   group_hetero_scale: float=1.0,
                   models_list=models_list,
                   params_grids={},
                   random_seed_base=RANDOM_SEED,
                   fixed_param_seed=0,
                   num_processes: int=4):
    """
    Run the simulation study: for every number of latent groups, run ``num_replicates``
    replicates of ``_run_replicate`` and average each metric over the replicates.
    :param num_replicates: number of replicates
    :param num_groups_list: list of numbers of latent groups (not modified)
    :param num_sample: number of samples, used for both the fixed-parameter draw and every replicate
    :param censor_rate: censoring rate
    :param group_hetero_scale: heterogeneity scale between groups
    :param models_list: list of models to use
    :param params_grids: dict mapping model name -> parameter grid (not modified)
    :param random_seed_base: base random seed; replicate ``rep`` uses ``random_seed_base + rep``
    :param fixed_param_seed: random seed for the fixed simulation parameters
    :param num_processes: number of worker processes; ``None`` lets ``multiprocessing.Pool``
                          choose, and a value <= 1 runs the replicates serially in this process
    :return: nested dict {num_group: {model: {metric: mean over replicates}}}; a model
             with no replicate results maps to an empty dict
    """
    results = {}
    for num_group in num_groups_list:
        results[num_group] = {model: [] for model in models_list}

        # Generate fixed parameters once per num_group with fixed seed
        _, save_dict, _, _, _, _, _ = generate_simulated_data(
            num_sample=num_sample,
            num_group=num_group,
            censor_rate=censor_rate,
            group_hetero_scale=group_hetero_scale,
            random_seed=fixed_param_seed  # Fixed seed for parameters
        )

        # Arguments shared by every replicate. They are passed by keyword together
        # with ``rep``: handing the replicate index over positionally would bind it
        # to ``_run_replicate``'s first parameter (``params_grids``) and raise
        # "got multiple values for argument 'params_grids'".
        replicate_kwargs = dict(
            params_grids=params_grids,
            save_dict=save_dict,
            num_group=num_group,
            num_sample=num_sample,  # keep replicate sample size in sync with the caller's
            censor_rate=censor_rate,
            group_hetero_scale=group_hetero_scale,
            random_seed_base=random_seed_base,
            models_list=models_list,
        )
        progress_desc = f"Processing replicates for num_group={num_group}"

        if num_processes is not None and num_processes <= 1:
            # Serial path: no pickling, tracebacks surface directly in the notebook.
            rep_outputs = [
                _run_replicate(rep=rep, **replicate_kwargs)
                for rep in tqdm(range(num_replicates), total=num_replicates, desc=progress_desc)
            ]
        else:
            # Parallelize over replicates.
            # NOTE(review): with the "spawn" start method, worker processes must be able
            # to import the worker function; a function defined in a notebook cell may
            # not be importable there. Use num_processes=1 or move _run_replicate into
            # a module if the pool hangs or fails to unpickle.
            with multiprocessing.Pool(processes=num_processes) as pool:
                pending = [
                    pool.apply_async(_run_replicate, kwds={**replicate_kwargs, "rep": rep})
                    for rep in range(num_replicates)
                ]
                # .get() re-raises any exception from the worker; results are
                # collected in replicate order before the pool is torn down.
                rep_outputs = [
                    job.get()
                    for job in tqdm(pending, total=num_replicates, desc=progress_desc)
                ]

        # Collect results
        for rep_result in rep_outputs:
            for model, metrics in rep_result.items():
                results[num_group][model].append(metrics)

    # Aggregate results: mean of every metric over the replicates
    agg_results = {}
    for num_group in num_groups_list:
        agg_results[num_group] = {}
        for model in models_list:
            replicate_metrics = results[num_group][model]
            if not replicate_metrics:
                # Nothing to average (e.g. num_replicates == 0); avoid IndexError on [0].
                agg_results[num_group][model] = {}
                continue
            agg_results[num_group][model] = {
                k: np.mean([m[k] for m in replicate_metrics])
                for k in replicate_metrics[0]
            }

    return agg_results

In [None]:
# Simulation settings for this run. The values below are a tiny smoke-test
# configuration; raise num_replicates / num_sample for a real study.
simulation_settings = dict(
    num_replicates=1,        # replicates per group setting
    num_groups_list=[1],     # numbers of latent groups to evaluate
    num_sample=10,           # sample size
    censor_rate=0.3,         # target censoring rate
    group_hetero_scale=1.0,  # heterogeneity scale between groups
    models_list=models_list,
    params_grids=params_grids,
    random_seed_base=42,     # base seed for the replicates
    fixed_param_seed=0,      # seed for the fixed simulation parameters (save_dict)
)
agg_results = run_simulation(**simulation_settings)

# Tabulate the aggregated metrics and show them
df_results = results_to_dataframe(agg_results)
print(df_results)

Processing replicates for num_group=1:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Optional: Compute standard errors (SE) for each metric per group/model.
# An SE needs the spread of a metric across replicates. When a model's entry is a
# list of per-replicate metric dicts, the SE of the mean is computed from it. When
# the entry is a dict of already-averaged metrics (one number per metric), that
# spread is no longer available, so the SE is reported as NaN rather than failing
# with a TypeError from indexing metric names.
def _metric_standard_errors(model_entry):
    """Return {metric: standard error of the mean}; NaN where it cannot be estimated."""
    if isinstance(model_entry, dict):
        # Aggregated means only: no replicate-level values to measure spread from.
        return {metric: np.nan for metric in model_entry}
    if not model_entry:
        return {}
    standard_errors = {}
    for metric in model_entry[0]:
        values = np.asarray([rep_metrics[metric] for rep_metrics in model_entry], dtype=float)
        if values.size < 2:
            # A single replicate carries no information about variability.
            standard_errors[metric] = np.nan
        else:
            # Sample standard deviation (ddof=1) divided by sqrt(n).
            standard_errors[metric] = values.std(ddof=1) / np.sqrt(values.size)
    return standard_errors


se_results = {
    num_group: {model: _metric_standard_errors(entry) for model, entry in per_model.items()}
    for num_group, per_model in agg_results.items()
}
df_se = pd.DataFrame.from_dict(
    {
        (num_group, model): model_se
        for num_group, per_model in se_results.items()
        for model, model_se in per_model.items()
    },
    orient='index',
)
print("Standard Errors:\n", df_se)