### Get the simulated data from NONMEM

In [None]:
# Import packages
# Basic
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
#models
from sklearn.linear_model import LinearRegression, Lasso
import xgboost as xgb

In [None]:
# Read the CSV data file exported from NONMEM
df = pd.read_csv('../data_from_NONMEM.csv')
# Rename the columns to the names used in the rest of this notebook
df = df.rename(columns={'ID': 'CID','AMT': 'Dose', 'DV': 'Concn'})
# Keep the patients whose CID lies in 1..500 (inclusive). Take an explicit copy
# so that the column assignments in the following cells operate on an
# independent frame rather than on a slice of the raw data (this avoids pandas'
# SettingWithCopyWarning).
df = df[df['CID'].between(1, 500)].copy()

In [None]:
# Coerce every column to a numeric dtype. With errors='coerce', entries that
# cannot be parsed become NaN; every NaN is then replaced by 0.
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [None]:
# Sampling times to retain
time_points = [0, 1, 2, 4, 6, 8, 12, 18, 24, 36, 48, 72, 96, 120, 144]

# Keep only the rows whose 'TIME' value is one of the sampling times above
at_sampling_time = df['TIME'].isin(time_points)
data = df.loc[at_sampling_time]
data

In [None]:
# Save the filtered dataset to a new CSV file (row index omitted).
# The preprocessing section below reads 'data_ready.csv' back in.
data.to_csv('data_ready.csv', index=False)

### Data preprocessing for ML

In [None]:
# Notebook-wide settings
PT = 100 # number of patients in the ML subset (CIDs 1..PT are kept)
SEED = 12 # random seed, passed to the XGBoost model so its results are reproducible
METRIC = 'neg_root_mean_squared_error' # scikit-learn scoring name used by the grid search

In [None]:
# Read the dataset written by the previous section
df = pd.read_csv('data_ready.csv')
# Keep the patients whose CID lies in 1..PT (inclusive). The explicit copy makes
# the subset an independent frame, so the in-place sort and the new columns
# added in the next cell do not trigger pandas' SettingWithCopyWarning.
df = df[df['CID'].between(1, PT)].copy()

In [None]:
# Order the rows by patient and then by time so that the per-patient
# operations below see each patient's observations chronologically
df = df.sort_values(by=['CID', 'TIME'])

# Time elapsed since the same patient's previous observation
# (the first observation of each patient has no predecessor and gets 0)
time_since_previous = df.groupby('CID')['TIME'].diff()
df['Time_diff'] = time_since_previous.fillna(0)

# Target variable: the same patient's concentration at the following
# observation (shift is applied within each CID group)
df['Next_Concn'] = df.groupby('CID')['Concn'].shift(-1)

# The last observation of each patient has no following value (NaN target),
# so those rows are removed
df = df.dropna(subset=['Next_Concn'])

In [None]:
# Unique patients (CIDs)
unique_cids = df['CID'].unique()
# Shuffle the CIDs with a generator seeded by SEED so that the patient-level
# split is reproducible from run to run
rng = np.random.default_rng(SEED)
unique_cids = rng.permutation(unique_cids)

# Split the CIDs into training (70%) and testing (remaining 30%)
train_size = int(0.7 * len(unique_cids))
train_cids = unique_cids[:train_size]
test_cids = unique_cids[train_size:]

# Select the rows of each patient group. Copies are taken so that columns can
# be added to either frame later without pandas' SettingWithCopyWarning.
train_data = df[df['CID'].isin(train_cids)].copy()
test_data = df[df['CID'].isin(test_cids)].copy()

In [None]:
# Define normalization function
def normalize_column(column, scale_range):
    """Linearly rescale *column* from its own [min, max] onto *scale_range*.

    Args:
        column: Array-like supporting ``.min()``, ``.max()`` and element-wise
            arithmetic (e.g. a pandas Series or a NumPy array).
        scale_range: ``(low, high)`` pair giving the target interval.

    Returns:
        An object of the same kind as *column* in which the column minimum
        maps to ``low`` and the column maximum maps to ``high``. A constant
        column (max == min) maps entirely to ``low``.
    """
    low, high = scale_range
    min_val = column.min()
    max_val = column.max()
    span = max_val - min_val
    if span == 0:
        # A constant column would otherwise evaluate 0/0 and become all-NaN;
        # pin it to the lower bound of the target interval instead.
        return (column - min_val) * 0.0 + low
    normalized = (column - min_val) / span * (high - low) + low
    return normalized

In [None]:
# Rescale the time-difference and concentration columns onto the dose range.
# Every statistic used here (the target dose range and each column's min/max)
# is taken from the training set only and then applied unchanged to the test
# set, so both sets live on the same scale and no test-set information enters
# the preprocessing.
train_data = train_data.copy()  # independent frames: safe to add columns
test_data = test_data.copy()

dose_low, dose_high = train_data['Dose'].min(), train_data['Dose'].max()

for source_col, scaled_col in [('Time_diff', 'td'), ('Concn', 'con'), ('Next_Concn', 'next')]:
    train_min = train_data[source_col].min()
    train_max = train_data[source_col].max()

    train_data[scaled_col] = normalize_column(train_data[source_col], (dose_low, dose_high))
    # Same linear map as normalize_column, but with the training min/max;
    # test values outside the training range simply fall outside the dose range.
    test_data[scaled_col] = (
        (test_data[source_col] - train_min) / (train_max - train_min) * (dose_high - dose_low) + dose_low
    )

In [None]:
# Model inputs are the scaled time difference, the dose and the scaled current
# concentration; the model output is the scaled next concentration
feature_cols = ['td', 'Dose', 'con']
target_col = 'next'

X_train, y_train = train_data[feature_cols], train_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

### Run ML methods to get the predictions

In [None]:
# Define ML models, keyed by the display name used in the printed reports and
# as column names of the prediction table
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    # random_state is the scikit-learn-style seed parameter of XGBRegressor
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=SEED)
}

In [None]:
# Hyperparameter candidates per model; an empty grid means the model is only
# cross-validated with its default settings
param_grids = {
    'Linear Regression': {},
    'Lasso': {
        'alpha': [0.01, 0.1, 1],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 300],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.05, 0.1, 0.3],
    },
}

# Run a 10-fold grid search for every model and keep the best estimator found.
# NOTE(review): cv=10 splits rows, whereas the train/test split above is done
# per patient (CID) — confirm row-level folds are acceptable here.
best_estimators = {}
for name, model in models.items():
    print(f"Grid search for {name}")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring=METRIC,
        cv=10,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

In [None]:
# Define reverse normalization function
def reverse_normalize_column(column, original_column, scale_range=None):
    """Map values scaled by ``normalize_column`` back to their original units.

    This is the exact inverse of
    ``(x - min) / (max - min) * (high - low) + low``, where ``min``/``max``
    come from *original_column* and ``(low, high)`` is *scale_range*.

    Args:
        column: Scaled values (e.g. a NumPy array of predictions).
        original_column: The unscaled column whose ``.min()`` / ``.max()``
            defined the forward transform.
        scale_range: ``(low, high)`` interval the values were scaled onto.
            When omitted, the dose range of the notebook-level ``df`` is used,
            looked up at call time.

    Returns:
        The values of *column* expressed on the scale of *original_column*.
    """
    if scale_range is None:
        # Resolve the default lazily so it reflects `df` at call time rather
        # than whatever `df` held when this function was defined.
        scale_range = (df['Dose'].min(), df['Dose'].max())
    low, high = scale_range
    min_val = original_column.min()
    max_val = original_column.max()
    # Remove the lower-bound offset first; without it the result is shifted
    # whenever the scaled interval does not start at 0.
    reversed_values = (column - low) * (max_val - min_val) / (high - low) + min_val
    return reversed_values

In [None]:
# Fit models with best hyperparameters and evaluate
metrics = {}  # model name -> {'RMSE', 'MAE', 'R^2'} computed on the scaled target
preds = {}    # model name -> predictions mapped back to concentration units

# The models are trained on y_train, i.e. 'Next_Concn' of the training set
# scaled onto the training dose range. Their outputs are therefore on that
# scale, and the back-transform must use the same training statistics.
train_dose_range = (train_data['Dose'].min(), train_data['Dose'].max())

for name, model in best_estimators.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred_real = reverse_normalize_column(
        pred, train_data['Next_Concn'], scale_range=train_dose_range
    )
    preds[name] = pred_real

    metrics[name] = {
        'RMSE': root_mean_squared_error(y_test, pred),
        'MAE': mean_absolute_error(y_test, pred),
        'R^2': r2_score(y_test, pred)
    }
    print(f"Evaluation for {name}: RMSE={metrics[name]['RMSE']:.4f}, MAE={metrics[name]['MAE']:.4f}, R^2={metrics[name]['R^2']:.4f}")

### Get AUCs for evaluation

In [None]:
# Collect the back-transformed predictions into one DataFrame
# (one column per model name) and preview the first rows
pred_df = pd.DataFrame(preds)
pred_df.head()

In [None]:
# Give both frames a fresh 0..n-1 index so that their rows pair up by position
test_data_reset, pred_df_reset = (
    frame.reset_index(drop=True) for frame in (test_data, pred_df)
)
# Put the prediction columns to the right of the test-set columns
res = pd.concat(objs=[test_data_reset, pred_df_reset], axis='columns')

In [None]:
# List of predicted concentration columns
predicted_columns = ['Linear Regression', 'Lasso', 'XGBoost']

# NumPy 2.0 renamed np.trapz to np.trapezoid and deprecated the old name;
# use whichever one this NumPy version provides.
trapezoid = getattr(np, 'trapezoid', None) or np.trapz

# Per-patient AUCs for the real curve and for each model
auc_results = {label: [] for label in ['Real', *predicted_columns]}

# Iterate over each patient (groups are visited in order of first appearance)
# NOTE(review): the model columns hold predictions of 'Next_Concn' (the
# following observation), while 'Real' integrates 'Concn' at the current TIME —
# confirm this pairing is the intended comparison.
for _, patient_data in res.groupby('CID', sort=False):
    time = patient_data['TIME']

    # Trapezoidal AUC of the real concentration curve
    auc_results['Real'].append(trapezoid(patient_data['Concn'], time))

    # Trapezoidal AUC of each model's predicted curve over the same time axis
    for col in predicted_columns:
        auc_results[col].append(trapezoid(patient_data[col], time))

# Average AUC and (population) standard deviation across patients, per curve
results = [
    f"Avg. AUC(std) for {label}: {np.mean(aucs):.2f} ({np.std(aucs):.2f})"
    for label, aucs in auc_results.items()
]

# Print the formatted results
for result in results:
    print(result)