In [1]:
import numpy as np
import pandas as pd

In [None]:
# Generate stationary X 
def generate_scenario_stationary(n_months=360, n_features=5, lag=4, seed=42):
    """Simulate monthly data with i.i.d. standard-normal features and two targets.

    The frame starts at 1990-01-01 with one row per month. Two monthly targets
    are generated from the features and then aggregated to quarterly sums:

    * Scenario A ("MLP-advantaged"): a nonlinear function of the *current*
      month's ``X_0``, ``X_1`` and ``X_2`` plus Gaussian noise and an offset
      of 100.
    * Scenario B ("LSTM-advantaged"): a function of the mean of ``X_0`` and
      ``X_1`` over the previous ``lag`` months (current month excluded) plus
      Gaussian noise and an offset of 100. The first ``lag`` months have no
      full history, so they use the current month's values instead.

    Both monthly targets are floored at 1.

    Parameters
    ----------
    n_months : int
        Number of monthly rows to generate.
    n_features : int
        Number of feature columns ``X_0 .. X_{n_features-1}``. Must be at
        least 3 because Scenario A reads ``X_0``, ``X_1`` and ``X_2``.
    lag : int
        Length (in months) of the look-back window for Scenario B. Must be
        at least 1.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator, which also affects random draws made afterwards.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE``, ``quarter``, ``X_0..``, ``y_true_A``, ``y_true_B``,
        ``Y_q_A``, ``Y_q_B``. ``Y_q_*`` holds the quarterly sum of the
        matching monthly target on the first month of every complete
        (3-month) quarter and NaN on every other row.

    Raises
    ------
    ValueError
        If ``lag < 1`` or ``n_features < 3``.
    """
    if lag < 1:
        # lag == 0 would average empty windows (NaN targets); negative lags
        # would select meaningless windows through negative slicing.
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    if n_features < 3:
        raise ValueError(
            f"n_features must be at least 3 (X_0..X_2 are used), got {n_features!r}"
        )

    # Global seeding is kept so the generated numbers match earlier runs exactly.
    np.random.seed(seed)

    # Date and quarter
    date_range = pd.date_range(start="1990-01-01", periods=n_months, freq="MS")
    df = pd.DataFrame({'DATE': date_range})
    df['quarter'] = df['DATE'].dt.to_period('Q')

    # Explanatory variables; the draw order (X_0, X_1, ...) fixes the random stream.
    feature_cols = [f'X_{i}' for i in range(n_features)]
    for col in feature_cols:
        df[col] = np.random.randn(n_months)
    X = df[feature_cols].values

    # Scenario A: MLP-advantaged (contemporaneous, nonlinear)
    noise_A = np.random.normal(0, 1, n_months)
    y_true_A = (
        1.5 * np.sin(X[:, 0]) +
        0.8 * X[:, 1]**2 -
        0.5 * X[:, 2] +
        noise_A +
        100
    )
    y_true_A = np.maximum(y_true_A, 1)

    # Scenario B: LSTM-advantaged (depends on the trailing window)
    y_true_B = np.zeros(n_months)
    noise_B = np.random.normal(0, 1, n_months)
    # Clamp the warm-up length so a lag longer than the sample cannot index
    # past the end of the arrays.
    warmup = min(lag, n_months)
    for m in range(warmup):
        # Not enough history yet: fall back to the current month's features.
        y_true_B[m] = (
            2.0 * np.cos(X[m, 0]) +
            1.0 * X[m, 1] +
            noise_B[m] +
            100
        )
    for m in range(warmup, n_months):
        # Window covers months m-lag .. m-1 (the current month is excluded).
        past_mean_0 = np.mean(X[m-lag:m, 0])
        past_mean_1 = np.mean(X[m-lag:m, 1])
        y_true_B[m] = (
            2.0 * np.cos(past_mean_0) +
            1.0 * past_mean_1 +
            noise_B[m] +
            100
        )
    y_true_B = np.maximum(y_true_B, 1)

    df['y_true_A'] = y_true_A
    df['y_true_B'] = y_true_B

    # Quarterly Y_q: total of the three monthly values, stored on the first
    # month of the quarter. Incomplete quarters stay NaN.
    df['Y_q_A'] = np.nan
    df['Y_q_B'] = np.nan
    # Group a column subset (a separate object) in a single pass instead of
    # re-filtering the whole frame once per quarter.
    monthly = df[['quarter', 'y_true_A', 'y_true_B']]
    for _, months in monthly.groupby('quarter', sort=False):
        if len(months) == 3:
            first_row = months.index[0]
            df.loc[first_row, 'Y_q_A'] = months['y_true_A'].sum()
            df.loc[first_row, 'Y_q_B'] = months['y_true_B'].sum()

    return df

# Build the stationary dataset, then export each scenario next to its monthly ground truth
if __name__ == '__main__':
    df = generate_scenario_stationary(n_months=12*30, n_features=5)

    X_cols = ['X_' + str(i) for i in range(5)]

    # Scenario A: features plus the quarterly target, exposed under the generic name 'Y_q'
    scenario_A = df.loc[:, ['DATE', *X_cols, 'Y_q_A']].rename(columns={'Y_q_A': 'Y_q'})
    scenario_A.to_csv('scenario_A_stationary.csv', index=False)

    # Monthly ground truth for Scenario A
    y_monthly_A = df.loc[:, ['DATE', 'y_true_A']].rename(columns={'y_true_A': 'y_true'})
    y_monthly_A.to_csv('monthly_y_true_A_stationary.csv', index=False)

    # Scenario B: same layout, using the lag-driven target
    scenario_B = df.loc[:, ['DATE', *X_cols, 'Y_q_B']].rename(columns={'Y_q_B': 'Y_q'})
    scenario_B.to_csv('scenario_B_stationary.csv', index=False)

    # Monthly ground truth for Scenario B
    y_monthly_B = df.loc[:, ['DATE', 'y_true_B']].rename(columns={'y_true_B': 'y_true'})
    y_monthly_B.to_csv('monthly_y_true_B_stationary.csv', index=False)

All files saved:
- scenario_A_stationary.csv
- scenario_B_stationary.csv
- monthly_y_true_A_stationary.csv
- monthly_y_true_B_stationary.csv


In [2]:
# Generate nonstationary X 
def generate_scenario_nonstationary(n_months=360, n_features=5, lag=4, seed=42):
    """Simulate monthly data with random-walk features and two targets.

    The frame starts at 1990-01-01 with one row per month. Every feature is
    the cumulative sum of standard-normal shocks (a random walk), shifted so
    that its minimum is 1 and all values are strictly positive. Two monthly
    targets are generated from the features and then aggregated to quarterly
    sums:

    * Scenario A ("MLP-advantaged"): a nonlinear function of the *current*
      month's ``X_0``, ``X_1`` and ``X_2`` plus Gaussian noise and an offset
      of 100.
    * Scenario B ("LSTM-advantaged"): a function of the mean of ``X_0`` and
      ``X_1`` over the previous ``lag`` months (current month excluded) plus
      Gaussian noise and an offset of 100. The first ``lag`` months have no
      full history, so they use the current month's values instead.

    Both monthly targets are floored at 1.

    Parameters
    ----------
    n_months : int
        Number of monthly rows to generate.
    n_features : int
        Number of feature columns ``X_0 .. X_{n_features-1}``. Must be at
        least 3 because Scenario A reads ``X_0``, ``X_1`` and ``X_2``.
    lag : int
        Length (in months) of the look-back window for Scenario B. Must be
        at least 1.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator, which also affects random draws made afterwards.

    Returns
    -------
    pandas.DataFrame
        Columns ``DATE``, ``quarter``, ``X_0..``, ``y_true_A``, ``y_true_B``,
        ``Y_q_A``, ``Y_q_B``. ``Y_q_*`` holds the quarterly sum of the
        matching monthly target on the first month of every complete
        (3-month) quarter and NaN on every other row.

    Raises
    ------
    ValueError
        If ``lag < 1`` or ``n_features < 3``.
    """
    if lag < 1:
        # lag == 0 would average empty windows (NaN targets); negative lags
        # would select meaningless windows through negative slicing.
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    if n_features < 3:
        raise ValueError(
            f"n_features must be at least 3 (X_0..X_2 are used), got {n_features!r}"
        )

    # Global seeding is kept so the generated numbers match earlier runs exactly.
    np.random.seed(seed)

    # Date and quarter
    date_range = pd.date_range(start="1990-01-01", periods=n_months, freq="MS")
    df = pd.DataFrame({'DATE': date_range})
    df['quarter'] = df['DATE'].dt.to_period('Q')

    # Non-stationary explanatory variables; the draw order (X_0, X_1, ...)
    # fixes the random stream.
    X = np.zeros((n_months, n_features))
    for i in range(n_features):
        shocks = np.random.normal(0, 1, n_months)
        X[:, i] = np.cumsum(shocks)  # random walk process
        X[:, i] -= X[:, i].min() - 1  # shift so the smallest value becomes 1

    # Assign to DataFrame
    for i in range(n_features):
        df[f'X_{i}'] = X[:, i]

    # Scenario A: MLP-advantaged (contemporaneous, nonlinear)
    noise_A = np.random.normal(0, 1, n_months)
    y_true_A = (
        1.5 * np.sin(X[:, 0]) +
        0.8 * X[:, 1]**2 -
        0.5 * X[:, 2] +
        noise_A +
        100
    )
    y_true_A = np.maximum(y_true_A, 1)

    # Scenario B: LSTM-advantaged (depends on the trailing window)
    y_true_B = np.zeros(n_months)
    noise_B = np.random.normal(0, 1, n_months)
    # Clamp the warm-up length so a lag longer than the sample cannot index
    # past the end of the arrays.
    warmup = min(lag, n_months)
    for m in range(warmup):
        # Not enough history yet: fall back to the current month's features.
        y_true_B[m] = (
            2.0 * np.cos(X[m, 0]) +
            1.0 * X[m, 1] +
            noise_B[m] +
            100
        )
    for m in range(warmup, n_months):
        # Window covers months m-lag .. m-1 (the current month is excluded).
        past_mean_0 = np.mean(X[m-lag:m, 0])
        past_mean_1 = np.mean(X[m-lag:m, 1])
        y_true_B[m] = (
            2.0 * np.cos(past_mean_0) +
            1.0 * past_mean_1 +
            noise_B[m] +
            100
        )
    y_true_B = np.maximum(y_true_B, 1)

    df['y_true_A'] = y_true_A
    df['y_true_B'] = y_true_B

    # Quarterly Y_q: total of the three monthly values, stored on the first
    # month of the quarter. Incomplete quarters stay NaN.
    df['Y_q_A'] = np.nan
    df['Y_q_B'] = np.nan
    # Group a column subset (a separate object) in a single pass instead of
    # re-filtering the whole frame once per quarter.
    monthly = df[['quarter', 'y_true_A', 'y_true_B']]
    for _, months in monthly.groupby('quarter', sort=False):
        if len(months) == 3:
            first_row = months.index[0]
            df.loc[first_row, 'Y_q_A'] = months['y_true_A'].sum()
            df.loc[first_row, 'Y_q_B'] = months['y_true_B'].sum()

    return df

# Build the non-stationary dataset, then export each scenario next to its monthly ground truth
if __name__ == '__main__':
    df = generate_scenario_nonstationary(n_months=12*30, n_features=5)

    X_cols = ['X_' + str(i) for i in range(5)]

    # Scenario A: features plus the quarterly target, exposed under the generic name 'Y_q'
    scenario_A = df.loc[:, ['DATE', *X_cols, 'Y_q_A']].rename(columns={'Y_q_A': 'Y_q'})
    scenario_A.to_csv('scenario_A_nonstationary.csv', index=False)

    # Monthly ground truth for Scenario A
    y_monthly_A = df.loc[:, ['DATE', 'y_true_A']].rename(columns={'y_true_A': 'y_true'})
    y_monthly_A.to_csv('monthly_y_true_A_nonstationary.csv', index=False)

    # Scenario B: same layout, using the lag-driven target
    scenario_B = df.loc[:, ['DATE', *X_cols, 'Y_q_B']].rename(columns={'Y_q_B': 'Y_q'})
    scenario_B.to_csv('scenario_B_nonstationary.csv', index=False)

    # Monthly ground truth for Scenario B
    y_monthly_B = df.loc[:, ['DATE', 'y_true_B']].rename(columns={'y_true_B': 'y_true'})
    y_monthly_B.to_csv('monthly_y_true_B_nonstationary.csv', index=False)