In [3]:
import pandas as pd
import numpy as np
import os
import GPy
import contextlib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

### Functions

In [4]:
def R2(df_xtest, df_ytest, model):
    """Return the coefficient of determination of ``model`` on a held-out set.

    The score is ``1 - MSE / Var(y)``, with both the mean squared error and
    the variance taken over every element of the test targets.

    Parameters
    ----------
    df_xtest : array-like
        Test inputs, passed unchanged to ``model.predict``.
    df_ytest : array-like
        Observed test targets.
    model : object
        Any object whose ``predict(x)`` returns a ``(mean, second)`` pair.
        Only the mean is used. For GPy regression models the second element
        is documented as the predictive variance (not the standard deviation).

    Returns
    -------
    float
        R2 score; ``nan``/``-inf`` if the test targets have zero variance.
    """
    pred_mean, _ = model.predict(df_xtest)

    y_true = np.asarray(df_ytest, dtype=float)
    y_pred = np.asarray(pred_mean, dtype=float)

    # A GP returns an (n, 1) column; against 1-D targets the subtraction would
    # broadcast to (n, n) and silently produce a wrong score, so align shapes.
    if y_pred.shape != y_true.shape and y_pred.size == y_true.size:
        y_pred = y_pred.reshape(y_true.shape)

    mse = np.mean((y_pred - y_true) ** 2)
    return 1. - mse / np.var(y_true)

In [5]:
def emulate_RBF(input, output, percentage, restarts, suppress=False, random_state=42):
    """Fit a GP emulator with an ARD RBF kernel and score it on a held-out set.

    Parameters
    ----------
    input : pandas.DataFrame
        Emulator inputs, one row per sample.
    output : pandas.DataFrame
        Emulator targets. Rows are selected with ``input``'s index labels, so
        this assumes ``output`` shares the same index as ``input``.
    percentage : float
        Fraction of the rows held out as the test set.
    restarts : int
        Number of optimisation restarts passed to ``optimize_restarts``.
    suppress : bool, default False
        If True, stdout is silenced during optimisation and nothing is
        printed; otherwise the fitted lengthscales, the model summary and the
        test R2 are printed.
    random_state : int, default 42
        Seed for drawing the test rows (the default reproduces the previously
        hard-coded split).

    Returns
    -------
    (GPy.models.GPRegression, float)
        The fitted model and its R2 on the held-out rows.

    Raises
    ------
    ValueError
        If ``percentage`` leaves no rows for training.
    """
    # Number of rows held out for testing
    sample_size = int(len(input) * percentage)
    if sample_size >= len(input):
        raise ValueError(
            f"percentage={percentage} leaves no training rows "
            f"({sample_size} test rows out of {len(input)})"
        )

    # Draw the test rows reproducibly; everything else is used for training
    test_idx = input.sample(n=sample_size, random_state=random_state).index
    train_idx = input.index.difference(test_idx)

    xtest, ytest = input.loc[test_idx].values, output.loc[test_idx].values
    xtrain, ytrain = input.loc[train_idx].values, output.loc[train_idx].values

    # RBF kernel with one lengthscale per input dimension (ARD)
    kernel = GPy.kern.RBF(input_dim=xtrain.shape[1], ARD=True)
    model = GPy.models.GPRegression(xtrain, ytrain, kernel)

    # Optimise the hyper-parameters, optionally hiding the optimiser's stdout
    if suppress:
        with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
            model.optimize_restarts(restarts, messages=False, num_processes=0)
    else:
        model.optimize_restarts(restarts, messages=False)

    # Score once and reuse the value for both the report and the return
    r2 = R2(xtest, ytest, model)

    if not suppress:
        print(model.kern.lengthscale)
        print(model)
        for _ in range(3):
            print()
        print(f'R2 = {r2}')

    return model, r2

In [6]:
# Root folder (relative path) under which the inputs and outputs are read and written
main_path = "../Emulation"
main_path

'../Emulation'

### Building emulators for various outputs

In [7]:
# Simulator input table
df_x = pd.read_csv(f'{main_path}/Input/input_100.csv')

# Keep the first five input columns only.
# (Alternative, currently disabled: keep every column that takes more than one
#  value, i.e. df_x.loc[:, df_x.nunique() > 1].)
filtered_input = df_x.iloc[:, :5]

# PCA tables, stored per component under the keys y_PCk, all_y_PCk and CO_y_PCk.
# Insertion order matters: later cells iterate over this dictionary.
dataframes = {}
for i in range(3):
    df_name = f'y_PC{i+1}'
    dataframes.update({
        df_name: pd.read_csv(f'{main_path}/Outputs/Out_fixed/PCA/PC{i+1}.csv'),
        f'all_{df_name}': pd.read_csv(f'{main_path}/Outputs/Out_fixed/PCA/all_PC{i+1}.csv'),
        f'CO_{df_name}': pd.read_csv(f'{main_path}/Outputs/Out_fixed/PCA/CO_PC{i+1}.csv'),
    })

# Convenience aliases for the individual tables
y_PC1, y_PC2, y_PC3 = dataframes['y_PC1'], dataframes['y_PC2'], dataframes['y_PC3']
all_y_PC1, all_y_PC2, all_y_PC3 = (
    dataframes['all_y_PC1'], dataframes['all_y_PC2'], dataframes['all_y_PC3']
)
CO_y_PC1, CO_y_PC2, CO_y_PC3 = (
    dataframes['CO_y_PC1'], dataframes['CO_y_PC2'], dataframes['CO_y_PC3']
)

# Pressure-trace table: the first 100 columns are summarised row-wise below and
# the column at position 100 is kept (as a one-column frame) as `cardiac_output`
df_pressure = pd.read_csv(f'{main_path}/Outputs/Out_fixed/pressure_traces_r_pat/all_pressure_traces.csv')
cardiac_output = df_pressure.iloc[:, 100:101]

# Row-wise mean / max / min over the first 100 columns, each as a one-column frame
pressure_traces = df_pressure.iloc[:, :100]
mean_press = pressure_traces.mean(axis=1).to_frame(name='mean_press')
max_press = pressure_traces.max(axis=1).to_frame(name='max_press')
min_press = pressure_traces.min(axis=1).to_frame(name='min_press')


 ### Set up 


In [8]:
# Number of optimiser restarts and hold-out fraction used by the emulator runs
restarts = 10
test_perc = 0.2

# Output emulated in the single test run. Available choices:
# y_PC1, y_PC2, y_PC3, all_y_PC1, all_y_PC2, all_y_PC3,
# CO_y_PC1, CO_y_PC2, CO_y_PC3, cardiac_output, df_pressure
output_data = max_press



### Test Emulator

In [9]:
# Single quiet run on the selected output; the cell displays (model, R2)
emulate_RBF(
    input=filtered_input,
    output=output_data,
    percentage=test_perc,
    restarts=restarts,
    suppress=True,
)

(<GPy.models.gp_regression.GPRegression at 0x16c026930>, 0.997769588906109)

### Run RBF emulators for all outputs

In [10]:
# Display the root path that the emulator output folder is created under
main_path

'../Emulation'

In [11]:
# Create the emulator output folder (and any missing parents) without shelling out;
# this is a no-op if the folder already exists
os.makedirs(f'{main_path}/Outputs/Emulators', exist_ok=True)


0

In [12]:

# Fit one GP (ARD RBF) emulator per output and collect hold-out R2 scores and models
r2_scores = {}
fitted_models = {}

# Scalar outputs emulated in addition to the PCA tables held in `dataframes`
scalar_outputs = {
    'CO': cardiac_output,
    'mean_press': mean_press,
    'max_press': max_press,
    'min_press': min_press,
}

# PCA tables first, then the scalar outputs. A dedicated loop variable keeps
# the `output_data` chosen in the set-up cell from being overwritten.
for key, target in {**dataframes, **scalar_outputs}.items():
    model, r2 = emulate_RBF(input=filtered_input, output=target, percentage=test_perc, restarts=restarts, suppress=True)
    r2_scores[key] = r2
    fitted_models[key] = model

# One row per output: its R2 score and the fitted model object
results_df = pd.DataFrame({
    'R2_Score': pd.Series(r2_scores),
    'Model': pd.Series(fitted_models)
})

print(results_df)

# Human-readable summary: scores only (model objects cannot be stored in a CSV)
results_df[['R2_Score']].to_csv(f'{main_path}/Outputs/Emulators/RBF_models_and_r2_scores.csv')

# Full table including the model objects. Written to a separate .pkl file so
# it no longer overwrites the CSV saved just above.
results_df.to_pickle(f'{main_path}/Outputs/Emulators/RBF_models_and_r2_scores.pkl')



            R2_Score                                              Model
y_PC1       0.998637  \nName : GP regression\nObjective : 82.9228280...
all_y_PC1   0.998650  \nName : GP regression\nObjective : 82.6900306...
CO_y_PC1    0.998637  \nName : GP regression\nObjective : 82.9228280...
y_PC2       0.997759  \nName : GP regression\nObjective : -8.5241813...
all_y_PC2   0.997584  \nName : GP regression\nObjective : -9.3995655...
CO_y_PC2    0.997759  \nName : GP regression\nObjective : -8.5241813...
y_PC3       0.984687  \nName : GP regression\nObjective : -19.126323...
all_y_PC3   0.991417  \nName : GP regression\nObjective : -30.904616...
CO_y_PC3    0.984686  \nName : GP regression\nObjective : -19.126323...
CO          0.986648  \nName : GP regression\nObjective : -126.19235...
mean_press  0.998630  \nName : GP regression\nObjective : 26.4208676...
max_press   0.997770  \nName : GP regression\nObjective : 48.2499971...
min_press   0.998254  \nName : GP regression\nObjective : 34.987

### Test a linear regression model

In [13]:
# Example input and output data
X = filtered_input
Y = max_press

# Initialize the model
linear_model = LinearRegression()

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the model to the training data
linear_model.fit(X_train, y_train)

# Predict the output for the test data
y_pred = linear_model.predict(X_test)

# Compute R² score for the predictions versus actual test data
r2 = r2_score(y_test.iloc[:,0], y_pred)
# Output R² score
print("R² score:", r2)

R² score: 0.980858506720121


### Build linear emulator for all outputs and save results


In [14]:
def emulate_linear(input, output, percentage):
    """Fit a linear-regression emulator and score it on a held-out split.

    Parameters
    ----------
    input : array-like or DataFrame
        Emulator inputs, one row per sample.
    output : array-like or DataFrame
        Emulator targets, row-aligned with ``input``.
    percentage : float
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    (LinearRegression, float)
        The fitted model and its R2 score on the test split.
    """
    # Fixed seed so that repeated calls use the same split
    X_train, X_test, y_train, y_test = train_test_split(
        input, output, test_size=percentage, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # R2 of the test-set predictions against the observed targets
    r2 = r2_score(y_test, model.predict(X_test))
    return model, r2

In [15]:
# Quick check of the linear emulator on max_press; the cell displays (model, R2)
emulate_linear(input=filtered_input, output=max_press, percentage=0.2)

(LinearRegression(), 0.980858506720121)

In [16]:
# Fit one linear emulator per output and collect hold-out R2 scores and models
linear_r2_scores = {}
fitted_models = {}

# Scalar outputs emulated in addition to the PCA tables held in `dataframes`
scalar_outputs = {
    'CO': cardiac_output,
    'mean_press': mean_press,
    'max_press': max_press,
    'min_press': min_press,
}

# PCA tables first, then the scalar outputs. Uses the shared `test_perc` so the
# hold-out fraction matches the one used for the RBF emulators.
for key, target in {**dataframes, **scalar_outputs}.items():
    model, r2 = emulate_linear(input=filtered_input, output=target, percentage=test_perc)
    linear_r2_scores[key] = r2
    fitted_models[key] = model

# One row per output: its R2 score and the fitted model object
results_df = pd.DataFrame({
    'R2_Score': pd.Series(linear_r2_scores),
    'Model': pd.Series(fitted_models)
})

print(results_df)

# Human-readable summary: scores only (model objects cannot be stored in a CSV)
results_df[['R2_Score']].to_csv(f'{main_path}/Outputs/Emulators/linear_models_and_r2_scores.csv')

# Full table including the model objects. Written to a separate .pkl file so
# it no longer overwrites the CSV saved just above.
results_df.to_pickle(f'{main_path}/Outputs/Emulators/linear_models_and_r2_scores.pkl')



            R2_Score               Model
y_PC1       0.987922  LinearRegression()
all_y_PC1   0.987971  LinearRegression()
CO_y_PC1    0.987922  LinearRegression()
y_PC2       0.882277  LinearRegression()
all_y_PC2   0.888562  LinearRegression()
CO_y_PC2    0.882277  LinearRegression()
y_PC3       0.647736  LinearRegression()
all_y_PC3   0.873160  LinearRegression()
CO_y_PC3    0.647736  LinearRegression()
CO          0.940900  LinearRegression()
mean_press  0.987789  LinearRegression()
max_press   0.980859  LinearRegression()
min_press   0.986044  LinearRegression()


In [17]:
# Inspect the first stored model. The index holds string labels, so select by
# position explicitly instead of relying on the deprecated integer fallback of [].
results_df['Model'].iloc[0]

