In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
from utils import *

In [2]:
# Experiment configuration: output folders, the simulation grid (percentages of
# missingness, number of replicates, ...) and the sample sizes.

experiment_name = "ExpC"
experiment_data_folder = os.path.join("data", experiment_name)

# exist_ok=True keeps this cell re-runnable and avoids the check-then-create
# race of testing os.path.exists() before calling makedirs.
os.makedirs(experiment_data_folder, exist_ok=True)
for data_subfolder in ("original_data", "test_data", "pred_data", "bayes_data"):
    os.makedirs(os.path.join(experiment_data_folder, data_subfolder), exist_ok=True)


missingness_percentages = [0.25]  # each value is passed to generate_mask as `prc`
n_replicates = 1                  # number of passes of the outer `rep` loop
Ds = [4]                          # each value is used as `d` (length of beta0)
Corrs = [0.33]                    # each value is passed to generate_X (corr_rate) and toep
PropOfY1 = [0.50]                 # each value is passed to generate_X as `prop`
n_train = 20000                   # the first n_train rows are kept out of the saved test split
n_test = 15000
n = n_train + n_test

# One set-up is generated per (rep, d, corr, prop1, prc) combination, so the
# number of dimensions in Ds has to be part of the product as well.
print("# of setups = ", n_replicates * len(Ds) * len(missingness_percentages) * len(Corrs) * len(PropOfY1))

N_MC = 100  # passed to get_y_prob_bayes as `n_mc`

# Simulate one dataset per (rep, d, corr, prop1, prc) combination, save the
# full data, the test split and the Bayes probabilities as .npz files, and
# write a table describing every set-up to set_up.csv.
#
# The set-up dataframe should contain:
# - rep
# - n
# - trueProp1
# - true_beta
# - center_X
# - set_up (name)
# [The rest will be part of **kwargs throughout the code]
SETUP_COLUMNS = [
    "rep",
    "n",
    "trueProp1",
    "true_beta",
    "center_X",
    "set_up",
    "d",
    "corr",
    "prcNA",
    "prop1",
]


def _tag(value):
    """Return ``str(value)`` with every "." removed, for use inside file names."""
    return str(value).replace(".", "")


# One dict per set-up. The table is built once after the loops instead of
# concatenating a new column onto an (initially empty) frame at every
# iteration, which is quadratic and relies on concat-with-empty behaviour.
setup_records = []

# Seed both generators before any draw; the order of the random calls below
# is part of what makes the generated files reproducible.
np.random.seed(1)
random.seed(1)

for rep in range(n_replicates):
    print("REP", rep)
    for d in Ds:

        # Coefficients are drawn once per (rep, d) and shared by every
        # corr / prop1 / prc combination below.
        beta0 = np.random.normal(0, 1.0, d)

        for corr in Corrs:

            corr_str = _tag(corr)

            for prop1 in PropOfY1:

                prop1_str = _tag(prop1)

                # generate_X comes from utils; it returns the covariates and a
                # centre that is later handed to get_y_prob_bayes as full_mu.
                X_full, center_X = generate_X(d=d, corr_rate=corr, n=n,
                                              beta0=beta0, prop=prop1)

                # sigma comes from utils (presumably the logistic function --
                # TODO confirm); its output is used as Bernoulli probabilities.
                y_probs = sigma(X_full @ beta0)
                y = np.random.binomial(n=1, p=y_probs)
                propY1_true = np.mean(y)

                for prc in missingness_percentages:

                    prc_str = _tag(prc)
                    set_up = f"LOG_n{n}_d{d}_corr{corr_str}_prcNA{prc_str}_prop1{prop1_str}_rep{rep}"

                    M = generate_mask(n, d, prc) # don't allow fully missing

                    # Entries flagged with 1 in the mask are hidden as NaN.
                    X_obs = X_full.copy()
                    X_obs[M == 1] = np.nan

                    setup_records.append({
                        "rep": rep,
                        "n": n,
                        "d": d,
                        "corr": corr,
                        "prcNA": prc,
                        "prop1": prop1,
                        "trueProp1": propY1_true,
                        "true_beta": beta0,
                        "center_X": center_X,
                        "set_up": set_up,
                    })

                    data_to_save = {
                        "X_obs": X_obs,
                        "M": M,
                        "y": y,
                        "y_probs": y_probs,
                        "X_full": X_full
                    }

                    # save
                    np.savez(os.path.join(experiment_data_folder, "original_data", f"{set_up}.npz"), **data_to_save)

                    # test data: every row from index n_train onwards
                    X_test = X_obs[n_train:]
                    y_test = y[n_train:]
                    y_probs_test = y_probs[n_train:]
                    M_test = M[n_train:]
                    data_to_save_test = {
                        "X_obs": X_test,
                        "M": M_test,
                        "y": y_test,
                        "y_probs": y_probs_test,
                        "X_full": X_full[n_train:]
                    }
                    np.savez(os.path.join(experiment_data_folder, "test_data", f"{set_up}.npz"), **data_to_save_test)

                    # bayes: get_y_prob_bayes (utils) is evaluated on the test
                    # rows with N_MC as n_mc, then averaged over axis 1.
                    y_probs_bayes = get_y_prob_bayes(X_test, full_mu=center_X, full_cov=toep(d, corr), true_beta=beta0, n_mc=N_MC)
                    y_probs_bayes = y_probs_bayes.mean(axis=1)

                    data_to_save_bayes = {
                        "y_probs_bayes": y_probs_bayes
                    }

                    np.savez(os.path.join(experiment_data_folder, "bayes_data", f"{set_up}.npz"), **data_to_save_bayes)


# Build the table in one go. `df_set_up` keeps the orientation this cell has
# always left behind (one row per field, one column per set-up).
df_set_up = pd.DataFrame(setup_records, columns=SETUP_COLUMNS).T

# save df set-up (transposed back so the CSV has one row per set-up)
df_set_up.T.to_csv(os.path.join(experiment_data_folder, "set_up.csv"), index=False)

# of setups =  1
REP 0
