In [1]:
import os
print(os.getcwd())

/content


In [2]:
import torch as t
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt

In [3]:
import zipfile
import os
import pandas as pd
import torch as t  # optional, used later in ML if needed

# Step 1: Define the zip file path
zip_path = '/content/Ohio Data.zip'

# Step 2: Define the extraction target path
extract_to = '/content'

# Step 3: Extract if not already done
extracted_main_folder = os.path.join(extract_to, 'Ohio Data')  # because zip contains "Ohio Data/" folder
if not os.path.exists(extracted_main_folder):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)  # extracts to /content/Ohio Data

# Step 4: Define paths to 2018 and 2020 data folders
folder_2018 = os.path.join(extracted_main_folder, 'Ohio2018_processed')
folder_2020 = os.path.join(extracted_main_folder, 'Ohio2020_processed')

# Step 5: Print check info
print("✅ Folders Exist?")
print("2018 Folder:", folder_2018, "| Exists:", os.path.exists(folder_2018))
print("2020 Folder:", folder_2020, "| Exists:", os.path.exists(folder_2020))

# Step 6: List contents of main folders
print("\n📂 Contents of Ohio2018_processed:", os.listdir(folder_2018))
print("📂 Contents of Ohio2020_processed:", os.listdir(folder_2020))

# Step 7: Define train/test paths
train_2018_path = os.path.join(folder_2018, 'train')
test_2018_path = os.path.join(folder_2018, 'test')
train_2020_path = os.path.join(folder_2020, 'train')
test_2020_path = os.path.join(folder_2020, 'test')

# Step 8: List CSV files in each
train_files_2018 = os.listdir(train_2018_path)
test_files_2018 = os.listdir(test_2018_path)
train_files_2020 = os.listdir(train_2020_path)
test_files_2020 = os.listdir(test_2020_path)

print("\n📄 Train 2018 files:", train_files_2018)
print("📄 Test 2018 files:", test_files_2018)
print("📄 Train 2020 files:", train_files_2020)
print("📄 Test 2020 files:", test_files_2020)

# Step 9: Load one training CSV from 2018
sample_file_path = os.path.join(train_2018_path, train_files_2018[0])
sample_data = pd.read_csv(sample_file_path)

# Step 10: Show the last 100 rows
print("\n📊 Sample CSV preview (last 100 rows):")
sample_data.tail(100)

✅ Folders Exist?
2018 Folder: /content/Ohio Data/Ohio2018_processed | Exists: True
2020 Folder: /content/Ohio Data/Ohio2020_processed | Exists: True

📂 Contents of Ohio2018_processed: ['train', 'test']
📂 Contents of Ohio2020_processed: ['train', 'test']

📄 Train 2018 files: ['588-ws-training_processed.csv', '559-ws-training_processed.csv', '563-ws-training_processed.csv', '570-ws-training_processed.csv', '575-ws-training_processed.csv', '591-ws-training_processed.csv']
📄 Test 2018 files: ['588-ws-testing_processed.csv', '559-ws-testing_processed.csv', '570-ws-testing_processed.csv', '575-ws-testing_processed.csv', '563-ws-testing_processed.csv', '591-ws-testing_processed.csv']
📄 Train 2020 files: ['567-ws-training_processed.csv', '544-ws-training_processed.csv', '540-ws-training_processed.csv', '552-ws-training_processed.csv', '596-ws-training_processed.csv', '584-ws-training_processed.csv']
📄 Test 2020 files: ['552-ws-testing_processed.csv', '540-ws-testing_processed.csv', '596-ws-tes

Unnamed: 0,5minute_intervals_timestamp,missing_cbg,cbg,finger,basal,hr,gsr,carbInput,bolus
13005,5.447421e+06,0.0,102.0,,1.25,98.0,0.000059,,
13006,5.447422e+06,0.0,110.0,,1.25,79.0,0.000057,,
13007,5.447423e+06,0.0,118.0,,1.25,75.0,0.000058,,
13008,5.447424e+06,0.0,125.0,,1.25,77.0,0.000058,,
13009,5.447425e+06,0.0,128.0,,1.25,83.0,0.000057,,
...,...,...,...,...,...,...,...,...,...
13100,5.447516e+06,0.0,150.0,,1.25,63.0,5.370000,,
13101,5.447517e+06,0.0,144.0,,1.25,63.0,6.562000,,
13102,5.447518e+06,0.0,140.0,,1.25,63.0,9.904000,,
13103,5.447519e+06,0.0,137.0,,1.25,60.0,8.846000,,


In [4]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

def get_preprocessor(data_df):
    data_df1 = data_df.copy()

    # Replace missing cbg values (where 'missing_cbg' == 1) using cubic spline interpolation
    if 'cbg' in data_df1.columns and 'missing_cbg' in data_df1.columns:
        valid_indices = data_df1[data_df1['missing_cbg'] == 0].index
        valid_timestamps = data_df1.loc[valid_indices, '5minute_intervals_timestamp']
        valid_cbg = data_df1.loc[valid_indices, 'cbg']

        # Apply spline interpolation only if we have enough points
        if len(valid_cbg) > 3:
            spline = CubicSpline(valid_timestamps, valid_cbg)
            missing_indices = data_df1[data_df1['missing_cbg'] == 1].index
            missing_timestamps = data_df1.loc[missing_indices, '5minute_intervals_timestamp']
            data_df1.loc[missing_indices, 'cbg'] = spline(missing_timestamps)

    # Move 'cbg' to the end
    cbg = data_df1.pop('cbg')
    data_df1 = data_df1.assign(cbg=cbg)

    # Drop time column
    data_df1 = data_df1.drop(columns=['5minute_intervals_timestamp'])

    # Fill NaNs with (min - 1% of abs(min))
    column_mins = data_df1.min()
    fill_values = column_mins - 0.01 * np.abs(column_mins)
    data_df2 = data_df1.fillna(fill_values)

    # Fit MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(data_df2)

    return scaler

In [5]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline

def interpolate_cbg(data_df):
    data_df = data_df.copy()

    # Perform cubic spline interpolation on 'cbg' where 'missing_cbg' == 1
    missing_cbg_indices = data_df[data_df['missing_cbg'] == 1].index
    valid_indices = data_df[data_df['missing_cbg'] == 0].index

    if len(valid_indices) > 3:  # CubicSpline needs > 3 points
        cs = CubicSpline(valid_indices, data_df.loc[valid_indices, 'cbg'])
        data_df.loc[missing_cbg_indices, 'cbg'] = cs(missing_cbg_indices)

    return data_df

def move_cbg_to_end(df):
    cbg = df.pop('cbg')
    return df.assign(cbg=cbg)

def get_scaler(data_df):
    df = interpolate_cbg(data_df)

    # Drop unnecessary columns if they exist
    df = df.drop(columns=[col for col in ['5minute_intervals_timestamp', 'missing_cbg', 'index'] if col in df.columns])

    df = move_cbg_to_end(df)

    column_mins = df.min()
    fill_values = column_mins - 0.01 * np.abs(column_mins)
    df_filled = df.fillna(fill_values)

    scaler = MinMaxScaler()
    scaler.fit(df_filled)

    return scaler, fill_values

def preprocess(scaler, fill_values, data_df):
    df = interpolate_cbg(data_df)

    df = df.drop(columns=[col for col in ['5minute_intervals_timestamp', 'missing_cbg', 'index'] if col in df.columns])
    df = move_cbg_to_end(df)

    values = df.values
    values = np.where(np.isnan(values), fill_values.values, values)

    # Final check
    if np.isnan(values).sum() > 0:
        raise ValueError("NaNs still present after fill.")

    df_filled = pd.DataFrame(values, columns=df.columns)
    df_scaled = pd.DataFrame(scaler.transform(df_filled), columns=df.columns)

    return df_scaled


In [6]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
import torch as t

class OhioT1DMDataset(Dataset):
    def __init__(self, data_dirs, seq_length):
        self.seq_length = seq_length
        dataframes = []

        # Load data from multiple files
        for data_dir in data_dirs:
            for subdir, dirs, files in os.walk(data_dir):
                for file in files:
                    file_path = os.path.join(subdir, file)
                    data_df = pd.read_csv(file_path)
                    dataframes.append(data_df)

        merged_data = pd.concat(dataframes, ignore_index=True)

        # Apply preprocessing
        scaler, fill_values = get_scaler(merged_data)
        self.scaler = scaler
        self.preprocessed_dfs = [preprocess(scaler, fill_values, df) for df in dataframes]
        self.data = [t.tensor(df.values, dtype=t.float32) for df in self.preprocessed_dfs]

    def __len__(self):
        return sum(len(d) - self.seq_length + 1 for d in self.data)

    def __getitem__(self, index):
        data_idx = 0
        while index >= len(self.data[data_idx]) - self.seq_length + 1:
            index -= len(self.data[data_idx]) - self.seq_length + 1
            data_idx += 1

        sequence = self.data[data_idx][index:index+self.seq_length]
        inputs = sequence[:-1, :]
        target = sequence[-1, -1]  # Only predict cbg
        return inputs, target

def create_dataloader(data_dirs, seq_length, batch_size):
    dataset = OhioT1DMDataset(data_dirs, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def unscale(data):
        flat = data.reshape(-1, data.shape[-1])
        df = pd.DataFrame(flat, columns=dataset.preprocessed_dfs[0].columns)
        unscaled = dataset.scaler.inverse_transform(df)
        return t.tensor(unscaled).reshape(data.shape)

    dataloader.__dict__['unscale'] = unscale
    return dataloader


In [7]:
class SimpleTransformerModel(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, max_seq_len, output_dim):
        super(SimpleTransformerModel, self).__init__()
        self.model_dim = model_dim
        self.embedding = nn.Linear(input_dim, model_dim)
        self.pos_embedding = nn.Parameter(t.randn(1, max_seq_len, model_dim))  # Large enough

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=4*model_dim,
            dropout=0.1,
            activation='relu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.regressor = nn.Linear(model_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        pos = self.pos_embedding[:, :x.size(1), :]  # Slice to match
        x = x + pos
        x = self.transformer_encoder(x)
        x = x[:, -1, :]
        return self.regressor(x)


In [8]:
def create_dataloader(data_dirs, seq_length, batch_size, split_val=False, val_split_ratio=0.1):
    dataset = OhioT1DMDataset(data_dirs, seq_length)

    if split_val:
        total_len = len(dataset)
        val_len = int(val_split_ratio * total_len)
        train_len = total_len - val_len
        train_dataset, val_dataset = random_split(dataset, [train_len, val_len])
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    else:
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        val_loader = None

    def unscale(data):
        flat = data.reshape(-1, 1)
        dummy = np.zeros((flat.shape[0], len(dataset.preprocessed_dfs[0].columns)))
        dummy[:, 0] = flat.squeeze()
        unscaled = dataset.scaler.inverse_transform(dummy)
        return t.tensor(unscaled[:, 0]).reshape(data.shape)

    train_loader.__dict__['unscale'] = lambda data: unscale(data)
    if val_loader:
        val_loader.__dict__['unscale'] = lambda data: unscale(data)

    return train_loader, val_loader


In [9]:
def train_transformer(model, train_loader, val_loader, test_loader, num_epochs=100, lr=0.001):
    device = t.device("cuda" if t.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_losses, val_losses, test_losses = [], [], []
    best_val_loss = float('inf')
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        epoch_train_loss = []

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets.squeeze())
            loss.backward()
            optimizer.step()
            epoch_train_loss.append(loss.item())

        train_losses.append(np.mean(epoch_train_loss))

        model.eval()
        with t.no_grad():
            val_loss = [criterion(model(x.to(device)).squeeze(), y.to(device).squeeze()).item() for x, y in val_loader]
            test_loss = [criterion(model(x.to(device)).squeeze(), y.to(device).squeeze()).item() for x, y in test_loader]

        val_losses.append(np.mean(val_loss))
        test_losses.append(np.mean(test_loss))

        print(f"Epoch {epoch+1}/{num_epochs} | Train: {train_losses[-1]:.6f} | Val: {val_losses[-1]:.6f} | Test: {test_losses[-1]:.6f}")

        if val_losses[-1] < best_val_loss:
            best_val_loss = val_losses[-1]
            best_model = model.state_dict()

    model.load_state_dict(best_model)
    t.save(model.state_dict(), "transformer_model.pth")
    return model


In [10]:
import os
path = os.getcwd()

train_data_dirs = [
    os.path.join(path, "Ohio Data", "Ohio2018_processed", "train"),
    os.path.join(path, "Ohio Data", "Ohio2020_processed", "train")
]

test_data_dirs = [
    os.path.join(path, "Ohio Data", "Ohio2018_processed", "test"),
    os.path.join(path, "Ohio Data", "Ohio2020_processed", "test")
]

In [13]:
# Adjust input_dim according to your dataset (usually 7)
input_dim = 7
model_dim = 64
num_heads = 4
num_layers = 2
seq_len = 25
output_dim = 1

transformer_model = SimpleTransformerModel(input_dim, model_dim, num_heads, num_layers, seq_len, output_dim)

# Load data
train_loader, val_loader = create_dataloader(train_data_dirs, seq_len, batch_size=500, split_val=True)
test_loader, _ = create_dataloader(test_data_dirs, seq_len, batch_size=500, split_val=False)

# Train
transformer_model = train_transformer(transformer_model, train_loader, val_loader, test_loader, num_epochs=150, lr=0.001)


Epoch 1/150 | Train: 0.014086 | Val: 0.000856 | Test: 0.007240
Epoch 2/150 | Train: 0.001139 | Val: 0.000184 | Test: 0.000554
Epoch 3/150 | Train: 0.000522 | Val: 0.000289 | Test: 0.000397
Epoch 4/150 | Train: 0.000378 | Val: 0.000356 | Test: 0.000433
Epoch 5/150 | Train: 0.000299 | Val: 0.000084 | Test: 0.000100
Epoch 6/150 | Train: 0.000234 | Val: 0.000124 | Test: 0.000170
Epoch 7/150 | Train: 0.000192 | Val: 0.000072 | Test: 0.000120
Epoch 8/150 | Train: 0.000166 | Val: 0.000043 | Test: 0.000054
Epoch 9/150 | Train: 0.000136 | Val: 0.000041 | Test: 0.000045
Epoch 10/150 | Train: 0.000114 | Val: 0.000031 | Test: 0.000035
Epoch 11/150 | Train: 0.000098 | Val: 0.000032 | Test: 0.000033
Epoch 12/150 | Train: 0.000094 | Val: 0.000027 | Test: 0.000034
Epoch 13/150 | Train: 0.000079 | Val: 0.000116 | Test: 0.000169
Epoch 14/150 | Train: 0.000080 | Val: 0.000108 | Test: 0.000096
Epoch 15/150 | Train: 0.000078 | Val: 0.000018 | Test: 0.000019
Epoch 16/150 | Train: 0.000071 | Val: 0.000016 | 

In [16]:
import numpy as np
import torch as t
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, dataloader):
    device = t.device("cuda" if t.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    y_true, y_pred = [], []

    with t.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            y_true.extend(targets.squeeze().cpu().numpy())
            y_pred.extend(outputs.squeeze().cpu().numpy())

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return mse, rmse, mae, r2


In [17]:
mse, rmse, mae, r2 = evaluate_model(transformer_model, val_loader)
print(f"MSE: {mse:.4f}\nRMSE: {rmse:.4f}\nMAE: {mae:.4f}\nR2 Score: {r2:.4f}")


MSE: 0.0000
RMSE: 0.0027
MAE: 0.0018
R2 Score: 0.9958


In [18]:
# Save the model state_dict
t.save(transformer_model.state_dict(), 'transformer_model.pth')
