In [1]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from collections import deque
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from sklearn.metrics import mean_squared_error

In [2]:
# Import everything from glucose_transformer
from glucose_transformer import (
    TimeSeriesDataset,
    TransformerEncoder_version2,
    TransformerEncoder,
    load_ohio_series_train,
    create_population_splits,
    create_loocv_splits,
    create_4fold_splits,
    split_into_continuous_series,
    create_train_val_datasets,
    train_model,
    evaluate_model,
    evaluate_and_save_metrics_population,
    evaluate_and_save_metrics,
    save_model,
    load_model
)

# Train on the OhioT1DM dataset

## Population data

In [None]:
def _xml_names(folder):
    """Return the entries of ``folder`` whose name ends with '.xml', in os.listdir order."""
    return [entry for entry in os.listdir(folder) if entry.endswith('.xml')]


# Training folders for the two cohorts and the XML files found in each.
folder_path_train_2018 = "./OhioT1DM 2020/2018/train"
folder_path_train_2020 = "./OhioT1DM 2020/2020/train"
train_files_2018 = _xml_names(folder_path_train_2018)
train_files_2020 = _xml_names(folder_path_train_2020)

# Matching test folders and their XML files.
folder_path_test_2018 = "./OhioT1DM 2020/2018/test"
folder_path_test_2020 = "./OhioT1DM 2020/2020/test"
test_files_2018 = _xml_names(folder_path_test_2018)
test_files_2020 = _xml_names(folder_path_test_2020)

# Build the population-level split from the four folder/file-list pairs
# (train arguments first, then test arguments).
population_splits = create_population_splits(
    folder_path_train_2018, folder_path_train_2020,
    train_files_2018, train_files_2020,
    folder_path_test_2018, folder_path_test_2020,
    test_files_2018, test_files_2020,
)

print(population_splits)

Test file: ['./OhioT1DM 2020/2018/test/559-ws-testing.xml', './OhioT1DM 2020/2018/test/588-ws-testing.xml', './OhioT1DM 2020/2018/test/570-ws-testing.xml', './OhioT1DM 2020/2018/test/563-ws-testing.xml', './OhioT1DM 2020/2018/test/591-ws-testing.xml', './OhioT1DM 2020/2018/test/575-ws-testing.xml', './OhioT1DM 2020/2020/test/552-ws-testing.xml', './OhioT1DM 2020/2020/test/540-ws-testing.xml', './OhioT1DM 2020/2020/test/544-ws-testing.xml', './OhioT1DM 2020/2020/test/596-ws-testing.xml', './OhioT1DM 2020/2020/test/584-ws-testing.xml', './OhioT1DM 2020/2020/test/567-ws-testing.xml']
Number of training files: 12
Training files:
  575-ws-training.xml
  563-ws-training.xml
  559-ws-training.xml
{'train': ['./OhioT1DM 2020/2018/train/575-ws-training.xml', './OhioT1DM 2020/2018/train/563-ws-training.xml', './OhioT1DM 2020/2018/train/559-ws-training.xml', './OhioT1DM 2020/2018/train/588-ws-training.xml', './OhioT1DM 2020/2018/train/570-ws-training.xml', './OhioT1DM 2020/2018/train/591-ws-train

In [4]:
# Train one population-level model on all training files listed in
# population_splits['train'].

# Seed the global torch and NumPy RNGs before the model is built so that
# weight initialisation and the shuffled DataLoader order are repeatable
# between runs.
# NOTE(review): if the glucose_transformer helpers draw from another RNG
# (e.g. Python's `random`), seed that too.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
past_sequence_length = 12
future_offset = 6
batch_size = 64
max_interval_minutes = 30

# Build the model and move it to the selected device
model = TransformerEncoder_version2(
    past_seq_len=past_sequence_length,
    num_layers=1,
    d_model=512,
    nhead=4,
    input_dim=1,
    dropout=0.2
)
model = model.to(device)

# Load every training file, parse its timestamps, and split it into
# continuous series; the series from all files are pooled together.
train_dfs = []
train_series_list = []
for train_file in population_splits['train']:
    df = load_ohio_series_train(train_file, "glucose_level", "value")
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    train_dfs.append(df)
    train_series_list.extend(
        split_into_continuous_series(df, past_sequence_length, future_offset, max_interval_minutes)
    )

# Split the pooled series into training and validation datasets (80/20 ratio)
train_dataset, val_dataset = create_train_val_datasets(
    train_series_list,
    train_ratio=0.8,
    past_seq_len=past_sequence_length,
    future_offset=future_offset
)

# Create data loaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Train model
train_losses, val_losses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=200,
    learning_rate=1e-3
)




Epoch [1/200], Train Loss: 4342.886330, Val Loss: 3841.350531
Epoch [6/200], Train Loss: 3613.103631, Val Loss: 3871.530792
Epoch [11/200], Train Loss: 3614.899665, Val Loss: 3803.747021
Epoch [16/200], Train Loss: 851.203074, Val Loss: 698.923365
Epoch [21/200], Train Loss: 775.662844, Val Loss: 819.003807
Epoch [26/200], Train Loss: 700.592793, Val Loss: 844.026089
Epoch [31/200], Train Loss: 670.113331, Val Loss: 537.026765
Epoch [36/200], Train Loss: 605.341914, Val Loss: 503.041718
Epoch [41/200], Train Loss: 610.231651, Val Loss: 507.908971


KeyboardInterrupt: 

In [None]:
# Evaluation settings (same values the training cell uses).
past_sequence_length, future_offset = 12, 6
batch_size, max_interval_minutes = 64, 30
test_eval = []

# Evaluate the in-memory population model on the test entry of the
# population split; 'evaluation_metrics' is passed as the save directory.
population_eval_kwargs = dict(
    model=model,
    test_file_path=population_splits['test'],
    save_dir='evaluation_metrics',
    past_sequence_length=past_sequence_length,
    future_offset=future_offset,
    batch_size=batch_size,
    max_interval_minutes=max_interval_minutes,
)
metrics = evaluate_and_save_metrics_population(**population_eval_kwargs)

# Persist the trained weights under the name 'population_version2'.
save_model(model, 'population_version2', save_dir='saved_models_original_ohio')

# Report the metrics computed over the whole test set.
print("\n".join([
    f"RMSE: {metrics['rmse']:.2f}",
    f"MAE: {metrics['mae']:.2f}",
    f"MAPE: {metrics['mape']:.2f}%",
]))

In [None]:
# Evaluate the saved population model on each test file separately and collect
# one [patient id, RMSE, MAE, MAPE] row per file in `test_eval`.

# Load the saved model
# NOTE(review): the weights saved as 'population_version2' come from a
# TransformerEncoder_version2 built in the training cell; confirm that
# `load_model` rebuilds that same architecture before loading the weights.
model = load_model('population_version2', save_dir='saved_models_original_ohio')

# Evaluation settings (same values the training cell uses)
past_sequence_length = 12
future_offset = 6
batch_size = 64
max_interval_minutes = 30
test_eval = []

for test in population_splits['test']:
    print(test)
    # Evaluate on this test file individually
    metrics = evaluate_and_save_metrics(
        model=model,
        test_file_path=test,
        save_dir='evaluation_metrics',
        past_sequence_length=past_sequence_length,
        future_offset=future_offset,
        batch_size=batch_size,
        max_interval_minutes=max_interval_minutes
    )

    # The patient id is the part of the file name before the first '-'
    # (e.g. '559' for '559-ws-testing.xml'). Named `patient_id` rather than
    # `id` so the builtin `id()` is not shadowed for the rest of the session.
    patient_id = os.path.basename(test).split('-')[0]
    test_eval.append([
        patient_id,
        round(metrics['rmse'], 2),
        round(metrics['mae'], 2),
        round(metrics['mape'], 2),
    ])

    print(f"RMSE: {metrics['rmse']:.2f}")
    print(f"MAE: {metrics['mae']:.2f}")
    print(f"MAPE: {metrics['mape']:.2f}%")

In [None]:
# Write the per-patient metrics collected in `test_eval` to a CSV file.
print(test_eval)

# Use a dedicated name instead of re-binding `df`, which other cells use for
# the per-file glucose data frames.
test_eval_df = pd.DataFrame(test_eval, columns=['test patient', 'RMSE', 'MAE', 'MAPE'])

# Make sure the target directory exists so to_csv cannot fail on a missing folder.
test_eval_csv_path = './evaluation_metrics/individual_test_eval2.csv'
os.makedirs(os.path.dirname(test_eval_csv_path), exist_ok=True)
test_eval_df.to_csv(test_eval_csv_path, index=False)

## Leave-one-out cross-validation (LOOCV)

In [None]:
# Leave-one-out cross-validation: for each processed fold, train a fresh
# TransformerEncoder on the fold's training files, evaluate it on the fold's
# test file, and save the weights.

# 1. Set up paths and get file lists
folder_path_train_2018 = "./OhioT1DM 2020/2018/train"
folder_path_train_2020 = "./OhioT1DM 2020/2020/train"
train_files_2018 = [f for f in os.listdir(folder_path_train_2018) if f.endswith('.xml')]
train_files_2020 = [f for f in os.listdir(folder_path_train_2020) if f.endswith('.xml')]

# 2. Create LOOCV splits
loocv_splits = create_loocv_splits(
    folder_path_train_2018,
    folder_path_train_2020,
    train_files_2018,
    train_files_2020
)

# 3. Select the device; the model itself is re-created for every fold inside
#    the loop so no weights carry over between folds.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4. Set hyperparameters
past_sequence_length = 12
future_offset = 6
batch_size = 64
max_interval_minutes = 30

# Drop the first entry of loocv_splits and process every remaining fold.
# NOTE(review): this skips exactly one fold; if the intent is to resume from a
# specific fold (e.g. 'fold9'), select the folds by key instead.
fold_items = dict(list(loocv_splits.items())[1:])

# 5. Train and evaluate for each fold
for fold_name, fold_data in fold_items.items():
    model = TransformerEncoder(
        num_layers=1,
        d_model=512,
        nhead=4,
        input_dim=1,
        dim_feedforward=256,
        dropout=0.2
    )
    model = model.to(device)
    print(f"\nProcessing {fold_name}...")

    # Load each training file, parse its timestamps, and split it into
    # continuous series; the series from all files are pooled together.
    train_dfs = []
    train_series_list = []
    for train_file in fold_data['train']:
        df = load_ohio_series_train(train_file, "glucose_level", "value")
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        train_dfs.append(df)
        train_series_list.extend(
            split_into_continuous_series(df, past_sequence_length, future_offset, max_interval_minutes)
        )

    # Split the pooled series into training and validation datasets (80/20 ratio)
    train_dataset, val_dataset = create_train_val_datasets(
        train_series_list,
        train_ratio=0.8,
        past_seq_len=past_sequence_length,
        future_offset=future_offset
    )

    # Create data loaders (only the training loader is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Train model
    train_losses, val_losses = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        num_epochs=200,
        learning_rate=1e-3
    )

    # Evaluate on the held-out file of this fold
    metrics = evaluate_and_save_metrics(
        model=model,
        test_file_path=fold_data['test'],
        save_dir='evaluation_metrics',
        past_sequence_length=past_sequence_length,
        future_offset=future_offset,
        batch_size=batch_size,
        max_interval_minutes=max_interval_minutes
    )

    # Save the trained model under the held-out file's name. basename() replaces
    # a fixed "last 19 characters" slice: it yields the same name for files
    # called 'NNN-ws-training.xml' and stays correct for other name lengths.
    # Assumes fold_data['test'] is a single path string; TODO confirm.
    held_out_name = os.path.basename(fold_data['test'])
    save_model(model, held_out_name, save_dir='saved_models_original_ohio')

    print(f"\nResults for {fold_name}:")
    print(f"RMSE: {metrics['rmse']:.2f}")
    print(f"MAE: {metrics['mae']:.2f}")
    print(f"MAPE: {metrics['mape']:.2f}%")

In [None]:
# Scratch check: display the last 19 items of the current fold's test entry
# (the last 19 characters, assuming it is a path string; TODO confirm).
# Relies on `fold_data` still being defined by a previously executed cell.
fold_data['test'][-19:]

In [None]:
# Manually save the model currently in memory and re-print its metrics.
# Relies on `model`, `fold_data`, `fold_name` and `metrics` still being defined
# by a previously executed cell.
model_tag = fold_data['test'][-19:]
save_model(model, model_tag, save_dir='saved_models_original_ohio')

print("\n".join([
    f"\nResults for {fold_name}:",
    f"RMSE: {metrics['rmse']:.2f}",
    f"MAE: {metrics['mae']:.2f}",
    f"MAPE: {metrics['mape']:.2f}%",
]))

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, which one is current.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    for line in (
        f"Current device: {torch.cuda.current_device()}",
        f"Device name: {torch.cuda.get_device_name()}",
    ):
        print(line)

In [None]:
# Persist a model's weights (called after training each fold)
def save_model(model, fold_name, save_dir='saved_models'):
    """Write ``model.state_dict()`` to ``<save_dir>/model_<fold_name>.pth``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        fold_name: Label embedded in the file name.
        save_dir: Target directory; created if it does not exist yet.

    Returns:
        None. The destination path is printed.
    """
    destination = os.path.join(save_dir, f'model_{fold_name}.pth')
    os.makedirs(save_dir, exist_ok=True)
    weights = model.state_dict()
    torch.save(weights, destination)
    print(f"Model saved to {destination}")

# Load model for evaluation
def load_model(fold_name, model_class=TransformerEncoder, save_dir='saved_models',
               model_kwargs=None):
    """Rebuild a model and load the weights stored for ``fold_name``.

    The weights are read from ``<save_dir>/model_<fold_name>.pth``, i.e. the
    path that ``save_model`` writes for the same arguments.

    Args:
        fold_name: Label that was embedded in the checkpoint file name.
        model_class: Callable that builds the model; it is called with the
            keyword arguments described under ``model_kwargs``.
        save_dir: Directory containing the checkpoint file.
        model_kwargs: Optional dict of constructor keyword arguments. It must
            describe the same architecture the checkpoint was trained with.
            When ``None``, the original fixed configuration is used
            (num_layers=3, d_model=64, nhead=4, input_dim=1,
            dim_feedforward=256, dropout=0.1).

    Returns:
        The model, moved to CUDA when available (otherwise CPU) and switched
        to evaluation mode.

    Raises:
        Whatever ``torch.load`` raises for a missing or unreadable file, and
        whatever ``load_state_dict`` raises when the checkpoint does not match
        the constructed architecture.
    """
    model_path = os.path.join(save_dir, f'model_{fold_name}.pth')

    # Build a new model; the architecture has to match the saved weights, so
    # callers can pass the constructor arguments that were used for training.
    if model_kwargs is None:
        model_kwargs = dict(
            num_layers=3,
            d_model=64,
            nhead=4,
            input_dim=1,
            dim_feedforward=256,
            dropout=0.1,
        )
    model = model_class(**model_kwargs)

    # Load the saved weights onto whichever device is available.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()

    print(f"Model loaded from {model_path}")
    return model

In [None]:
# Scratch check: display the current value of `fold_name`
# (must already be defined by a previously executed cell).
fold_name

In [None]:
# Set up the LOOCV splits and settings used by the single-fold evaluation.

# 1. Set up paths and get file lists
# The data root defaults to the original absolute location; set the
# OHIO_DATA_ROOT environment variable to run this cell on another machine
# without editing the notebook.
OHIO_DATA_ROOT = os.environ.get(
    "OHIO_DATA_ROOT",
    "/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/OhioT1DM",
)
folder_path_train_2018 = f"{OHIO_DATA_ROOT}/2018/train"
folder_path_train_2020 = f"{OHIO_DATA_ROOT}/2020/train"
train_files_2018 = [f for f in os.listdir(folder_path_train_2018) if f.endswith('.xml')]
train_files_2020 = [f for f in os.listdir(folder_path_train_2020) if f.endswith('.xml')]

# 2. Create LOOCV splits
loocv_splits = create_loocv_splits(
    folder_path_train_2018,
    folder_path_train_2020,
    train_files_2018,
    train_files_2020
)

# 3. Select the device (no model is created or moved in this cell)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 4. Set hyperparameters
# NOTE(review): these differ from the training cells in this notebook
# (past_sequence_length=12, batch_size=64); confirm they match the checkpoint
# that is evaluated with them.
past_sequence_length = 7
future_offset = 6
batch_size = 32
max_interval_minutes = 30

# Fold number to start from (only assigned here, not used in this cell)
starting_fold = 9

In [None]:
# Evaluate the stored model of one particular fold.
fold_name = 'fold9'
fold_data = loocv_splits[fold_name]

# Restore the weights saved for this fold (default save_dir of load_model).
# NOTE(review): the LOOCV training cell saves under 'saved_models_original_ohio'
# using a file-name-based label; confirm a checkpoint named after `fold_name`
# exists in the default directory.
model = load_model(fold_name)

fold_eval_kwargs = dict(
    model=model,
    test_file_path=fold_data['test'],
    save_dir='evaluation_metrics',
    past_sequence_length=past_sequence_length,
    future_offset=future_offset,
    batch_size=batch_size,
    max_interval_minutes=max_interval_minutes,
)
metrics = evaluate_and_save_metrics(**fold_eval_kwargs)

print("\n".join([
    f"\nResults for {fold_name}:",
    f"RMSE: {metrics['rmse']:.2f}",
    f"MAE: {metrics['mae']:.2f}",
    f"MAPE: {metrics['mape']:.2f}%",
]))