In [1]:
import pandas as pd
import numpy as np
import os
import GPy
import contextlib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Root directory of the emulation project; data paths are built relative to it.
folder_path = "../Emulation"

In [3]:
# --- Inputs -----------------------------------------------------------------
df_x = pd.read_csv(f'{folder_path}/Input/input_100.csv')

# Keep the first five input columns only.
# (Disabled alternative: keep every column that takes more than one value,
#  i.e. df_x.loc[:, df_x.nunique() > 1].)
filtered_input = df_x.iloc[:, :5]

# --- PCA outputs ------------------------------------------------------------
# For each of the three principal components, load the three file variants
# (no prefix, 'all_', 'CO_'). Keys are inserted per component in that order:
# 'y_PC1', 'all_y_PC1', 'CO_y_PC1', 'y_PC2', ...
pca_dir = f'{folder_path}/Outputs/Out_fixed/PCA'
dataframes = {}

for i in range(3):
    df_name = f'y_PC{i+1}'
    for prefix in ('', 'all_', 'CO_'):
        dataframes[f'{prefix}{df_name}'] = pd.read_csv(f'{pca_dir}/{prefix}PC{i+1}.csv')

# Convenience aliases for the individual frames.
y_PC1, y_PC2, y_PC3 = (dataframes[f'y_PC{k}'] for k in (1, 2, 3))
all_y_PC1, all_y_PC2, all_y_PC3 = (dataframes[f'all_y_PC{k}'] for k in (1, 2, 3))
CO_y_PC1, CO_y_PC2, CO_y_PC3 = (dataframes[f'CO_y_PC{k}'] for k in (1, 2, 3))

# --- Pressure traces --------------------------------------------------------
df_pressure = pd.read_csv(f'{folder_path}/Outputs/Out_fixed/pressure_traces_r_pat/all_pressure_traces.csv')

# Column at position 100 is stored under 'CO' (kept as a one-column frame).
dataframes['CO'] = df_pressure.iloc[:, 100:101]

# Row-wise summary statistics over the first 100 columns.
trace_block = df_pressure.iloc[:, :100]
dataframes['mean_press'] = trace_block.mean(axis=1).to_frame(name='mean_press')
dataframes['max_press'] = trace_block.max(axis=1).to_frame(name='max_press')
dataframes['min_press'] = trace_block.min(axis=1).to_frame(name='min_press')

# Show which output datasets are available.
list(dataframes)

['y_PC1',
 'all_y_PC1',
 'CO_y_PC1',
 'y_PC2',
 'all_y_PC2',
 'CO_y_PC2',
 'y_PC3',
 'all_y_PC3',
 'CO_y_PC3',
 'CO',
 'mean_press',
 'max_press',
 'min_press']

In [4]:
# Display the selected input columns (rich notebook output of the DataFrame).
filtered_input

Unnamed: 0,# svn.c,pat.r,pat.c,rv.E_act,T
0,27.687002,0.443723,3.278327,1.623082,0.478369
1,10.935100,0.243385,4.286711,2.717826,0.805784
2,18.671167,0.356841,2.549143,0.822205,0.950488
3,24.992088,0.155291,5.101243,3.333386,0.614289
4,22.700741,0.326891,3.862239,2.779415,0.551895
...,...,...,...,...,...
95,27.406567,0.196824,3.333852,1.704984,0.646159
96,26.960391,0.395660,2.043640,0.645535,0.810764
97,10.379113,0.271468,5.547834,3.156299,0.473453
98,17.956148,0.327547,3.782205,1.440794,0.618088


In [5]:
# Read in dataframe of fitted linear models
# NOTE(review): the file name ends in .csv but it is loaded with read_pickle,
# so it must actually be a pickle file. Unpickling can execute arbitrary code;
# only load this file if it comes from a trusted source.
model_dataframes = pd.read_pickle(f'{folder_path}/Outputs/Emulators/linear_models_and_r2_scores.csv')

# Display the loaded table.
model_dataframes

Unnamed: 0,R2_Score,Model
y_PC1,0.987922,LinearRegression()
all_y_PC1,0.987971,LinearRegression()
CO_y_PC1,0.987922,LinearRegression()
y_PC2,0.882277,LinearRegression()
all_y_PC2,0.888562,LinearRegression()
CO_y_PC2,0.882277,LinearRegression()
y_PC3,0.647736,LinearRegression()
all_y_PC3,0.87316,LinearRegression()
CO_y_PC3,0.647736,LinearRegression()
CO,0.9409,LinearRegression()


In [6]:
# Minimum-norm inversion of every fitted linear emulator on its held-out rows.
#
# For a single-output linear model  y = x @ c + b  (c: column vector of
# coefficients, b: intercept) the minimum-norm input that reproduces an
# observation y_obs is  x_hat = (y_obs - b) @ pinv(c).
X = filtered_input.copy()

# Results keyed by 'model_<row label of model_dataframes>'.
inverse_results_min_norm = {}  # true test inputs side by side with recovered inputs
inverse_results_y = {}         # observed outputs side by side with re-predicted outputs

# Loop through each row in the model_dataframes
for index, row in model_dataframes.iterrows():
    model = row['Model']  # the fitted estimator is stored in the 'Model' column

    # The row label doubles as the key of the matching output dataset.
    Y = dataframes[index].copy()

    # Split the data into training and testing sets (80% train, 20% test).
    # Only the test part is used below; the fixed random_state keeps the split
    # identical for every model.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # coef_ may be 1-D (model fitted on a 1-D target) or 2-D
    # (n_targets, n_features). Normalise to 2-D and keep the first target as a
    # column vector of shape (n_features, 1).
    coeffs = np.atleast_2d(model.coef_)[:1].T
    # Matching intercept of the first target, as a scalar.
    intercept = np.ravel(model.intercept_)[0]

    # Remove the intercept from the observations.
    scale_y_obs = y_test - intercept

    # Moore-Penrose pseudo-inverse of the coefficient vector, shape
    # (1, n_features). pinv is SVD-based, so it avoids forming and inverting
    # c.T @ c explicitly and does not raise when all coefficients are zero.
    beta_inv = np.linalg.pinv(coeffs)
    x_hat = scale_y_obs @ beta_inv

    # Label recovered inputs with the original column name plus a combining
    # circumflex ("hat").
    x_hat_headers = [f'{col}_\u0302' for col in X_test.columns]
    x_hat.columns = x_hat_headers

    # Combine X_test and x_hat into a dataframe (aligned on the row index).
    df = pd.concat([X_test, x_hat], axis=1)
    inverse_results_min_norm[f'model_{index}'] = df

    # Push the recovered inputs back through the linear model.
    # NOTE: for a single output with non-zero coefficients pinv(c) @ c == 1, so
    # y_calibrated equals y_test up to round-off by construction. Any error
    # computed between the two is therefore ~0 and says nothing about how well
    # the inputs were recovered; compare X_test with x_hat for that.
    y_calibrated = (x_hat @ coeffs) + intercept
    y_compare = pd.concat([y_test, y_calibrated], axis=1)
    y_compare.columns = ("y_true", "y_calibrated")
    inverse_results_y[f'model_{index}'] = y_compare

In [7]:
# Export the observed vs. re-predicted comparison for the 'max_press' model.
# The row index is not written to the file.
calibrated_max_press = pd.DataFrame(inverse_results_y['model_max_press'])
calibrated_max_press.to_csv('calibrated_max_press.csv', index=False)


In [8]:
# Mean squared difference between 'y_calibrated' and 'y_true' for each model.
mse_results = []

for index, comparison in inverse_results_y.items():
    print(index)
    residual = comparison.loc[:, 'y_calibrated'] - comparison.loc[:, 'y_true']
    mse = np.mean(residual**2)
    mse_results.append(mse)

# One row per model, labelled with the same keys as inverse_results_y.
mse_df = pd.DataFrame(mse_results, index=list(inverse_results_y), columns=['MSE'])

# Persist the table, then show it.
mse_df.to_csv('linear_model_calibration_mse_results.csv')

print(mse_df)

model_y_PC1
model_all_y_PC1
model_CO_y_PC1
model_y_PC2
model_all_y_PC2
model_CO_y_PC2
model_y_PC3
model_all_y_PC3
model_CO_y_PC3
model_CO
model_mean_press
model_max_press
model_min_press
                           MSE
model_y_PC1       6.557406e-30
model_all_y_PC1   3.540260e-29
model_CO_y_PC1    6.557406e-30
model_y_PC2       1.117425e-31
model_all_y_PC2   5.669938e-32
model_CO_y_PC2    1.117425e-31
model_y_PC3       8.017652e-31
model_all_y_PC3   1.369875e-30
model_CO_y_PC3    8.017652e-31
model_CO          1.873545e-31
model_mean_press  1.893266e-30
model_max_press   3.313216e-30
model_min_press   4.575393e-30
