In [2]:
import os
import random

import numpy as np

from functions.data_simulation import data_simulation8

# One seed value shared by every source of randomness configured below.
seed_value = 1234

# Publish the hash seed through the environment, then seed Python's built-in
# generator and NumPy's global generator with the same value.
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# Simulate the two samples (presumably experimental and observational, going
# by the names) and stack them row-wise into one array.
data_exp, data_obs = data_simulation8(1000, beta0=-3.0, beta1=-3.0)
data_full = np.concatenate((data_exp, data_obs), axis=0)

# Model for the probability of participation

In [4]:
from sklearn import linear_model
# Logistic regression model for the probability of participating in the trial.
# The response is the indicator S in column 0 (S=1 is trial participation,
# S=0 is non participation); the single predictor is the covariate in column 1.
covariate = data_full[:, 1].reshape(-1, 1)
logr = linear_model.LogisticRegression()
logr.fit(covariate, data_full[:, 0])

# predict_proba returns one column per class: column 0 is taken as
# P(S=0 | X) and column 1 as P(S=1 | X).
prob = logr.predict_proba(covariate)
prob_part = prob[:, 1]
prob_nonpart = prob[:, 0]

# Store the inverse participation probability as column 4, which is where the
# tuning cell reads the weights from. Rebuilding from the first four columns
# (S, X, T, Y) keeps this cell idempotent: re-running it replaces the weight
# column instead of appending another one each time.
data_full = np.c_[data_full[:, :4], 1 / prob_part]

# Tune rho

In [None]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import mean_squared_error
import numpy.random as nprand
import matplotlib.pyplot as plt
from functions.model_training import ICM
from functions.weightedMSE import weighted_mean_squared_error

seed_value = 1234

# Candidate rho values to search over: 0.0, 0.1, ..., 1.0
rho_values = np.round(np.arange(0, 1.05, 0.1), 1)

# Number of folds for cross-validation
num_folds = 5

# Result containers. cv_rmse_values and rho_rmse_values are not filled in by
# this cell; they are kept in case other cells reference them.
cv_rmse_values = []
rho_rmse_values = {}
min_avg_wmse = float('inf')
best_rho = None
wmse_per_rho = {}  # rho -> list of per-fold weighted MSE values

# A fixed random_state makes every kf.split(...) call return the same shuffled
# partition. Without it each rho would be scored on a different random set of
# folds, so differences in average WMSE would mix the effect of rho with fold
# noise, and the selected rho would change from run to run.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed_value)

for rho in rho_values:

    print(f"Rho: {rho}")
    wmse_per_fold = []

    for fold_num, (train_index, val_index) in enumerate(kf.split(data_full)):
        # The folds partition the pooled data; validation then keeps only the
        # rows with S == 1 (column 0), i.e. the trial participants.
        train_data, val_data = data_full[train_index], data_full[val_index]
        val_data = val_data[val_data[:, 0] == 1, :]
        X_val = val_data[:, 1]
        T_val = val_data[:, 2]
        Y_val = val_data[:, 3]
        w = val_data[:, 4]  # inverse participation-probability weights

        # Split the training rows by S once instead of re-masking per argument.
        train_exp = train_data[train_data[:, 0] == 1]
        train_obs = train_data[train_data[:, 0] == 0]

        m0, m1 = ICM(X_E=train_exp[:, 1], X_O=train_obs[:, 1],
                     T_E=train_exp[:, 2], T_O=train_obs[:, 2],
                     Y_E=train_exp[:, 3], Y_O=train_obs[:, 3],
                     r=2, ID=1, AD=0, rho=rho)

        # Prediction input: the covariate plus a constant index column of 0,
        # the same input the original built separately for each model.
        X_val_indexed = np.c_[X_val.reshape(-1, 1), np.zeros(X_val.shape[0])]
        predY0_exp = m0.predict_noiseless(X_val_indexed)
        predY1_exp = m1.predict_noiseless(X_val_indexed)
        # Element 0 of each prediction is used as the mean and element 1 as
        # the variance; the CATE summaries are not used in the WMSE below.
        predCATE_exp = predY1_exp[0] - predY0_exp[0]
        varCATE_exp = predY1_exp[1] + predY0_exp[1]

        wmse = weighted_mean_squared_error(weight=w, t=T_val, y_true=Y_val,
                                           y_pred0=predY0_exp[0], y_pred1=predY1_exp[0])
        wmse_per_fold.append(wmse)

        print(f"Fold {fold_num + 1} WMSE: {wmse}")

    avg_wmse = sum(wmse_per_fold) / num_folds
    print(f"Average WMSE for Rho {rho}: {avg_wmse}")
    wmse_per_rho[rho] = wmse_per_fold

    # Keep the rho with the lowest average WMSE seen so far
    if avg_wmse < min_avg_wmse:
        min_avg_wmse = avg_wmse
        best_rho = rho

# Print the best rho and its corresponding min_avg_wmse
print(f"Best Rho: {best_rho}, Minimum Average WMSE: {min_avg_wmse}")


# Save results

In [7]:
import pandas as pd
# Persist the selected rho as a one-row, one-column CSV in the working directory.
sim8_best_rho_df = pd.DataFrame({'best rho': [best_rho]})
sim8_best_rho_df.to_csv('sim8_best_rho.csv', index=False)