In [1]:
import pandas as pd
import numpy as np

def generate_ipsw_test_data(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Generate a synthetic dataset for testing Inverse Propensity Score Weighting (IPSW).

    Data-generating process:
      * ``covariate_1`` ~ Normal(0, 1)
      * ``covariate_2`` ~ Uniform(-2, 2)
      * ``covariate_3`` ~ Bernoulli(0.4)
      * ``treated``     ~ Bernoulli(p), where p is the logistic function of
        ``0.5 * covariate_1 + 0.8 * covariate_2 + 1.2 * covariate_3``
      * ``outcome``     = ``1.5 * covariate_1 + 2 * covariate_2 - covariate_3
        + 2.5 * treated`` plus Normal(0, 2) noise

    Treatment assignment and outcome both depend on the covariates, so the
    data is confounded; the additive treatment effect built into the
    outcome is 2.5.

    Random numbers are drawn from a private ``np.random.RandomState`` seeded
    with ``seed``, so calling this function does not reseed or advance
    NumPy's global random state.

    Args:
        n_samples: The number of samples (rows) to generate.
        seed: Random seed for reproducibility.

    Returns:
        A DataFrame with columns ``covariate_1``, ``covariate_2``,
        ``covariate_3``, ``treated``, ``outcome`` and ``ID``.
    """
    # A local generator keeps the function free of global side effects.
    # RandomState(seed) yields the same stream that np.random.seed(seed)
    # followed by the module-level np.random.* calls would produce.
    rng = np.random.RandomState(seed)

    # Generate covariates (features)
    covariate_1 = rng.normal(0, 1, n_samples)
    covariate_2 = rng.uniform(-2, 2, n_samples)
    covariate_3 = rng.binomial(1, 0.4, n_samples)  # Binary covariate

    # Simulate treatment assignment: the probability of treatment
    # (propensity score) is a logistic function of the covariates.
    logit = 0.5 * covariate_1 + 0.8 * covariate_2 + 1.2 * covariate_3
    propensity_score = 1 / (1 + np.exp(-logit))
    treatment = rng.binomial(1, propensity_score, n_samples)

    # Simulate the outcome: linear in covariates and treatment, plus noise.
    treatment_effect = 2.5
    noise = rng.normal(0, 2, n_samples)
    outcome = (
        1.5 * covariate_1
        + 2 * covariate_2
        - 1 * covariate_3
        + treatment_effect * treatment
        + noise
    )

    # Create DataFrame
    data = pd.DataFrame({
        'covariate_1': covariate_1,
        'covariate_2': covariate_2,
        'covariate_3': covariate_3,
        'treated': treatment,
        'outcome': outcome,
    })

    # Add an ID column that does not influence anything, for better testing
    data['ID'] = range(n_samples)

    return data

# Example usage: build a 1000-row dataset with a fixed seed and write it
# to a CSV file in the current working directory (row index omitted).
test_data = generate_ipsw_test_data(seed=42, n_samples=1000)
test_data.to_csv(
    "ipsw_test_data.csv",
    index=False,
)

print("IPSW test data generated and saved to 'ipsw_test_data.csv'")


IPSW test data generated and saved to 'ipsw_test_data.csv'
