In [2]:
import pandas as pd

df = pd.read_csv("/Users/akki/Desktop/AKKI/college/AI in Healthcare/Project/Blood Glucose Prediction/Diabetes-Data/patient_data.csv")
print(df.shape)
print(df.head())

(29330, 7)
         Date   Time  Code Value  patient_id             datetime record_type
0  04-21-1991   9:09    58   100           1  1991-04-21 09:09:00  Electronic
1  04-21-1991   9:09    33     9           1  1991-04-21 09:09:00  Electronic
2  04-21-1991   9:09    34    13           1  1991-04-21 09:09:00  Electronic
3  04-21-1991  17:08    62   119           1  1991-04-21 17:08:00  Electronic
4  04-21-1991  17:08    33     7           1  1991-04-21 17:08:00  Electronic


In [3]:
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

In [4]:
bg_codes = [58, 60, 62]
bg_df = df[df['Code'].isin(bg_codes)].copy()

In [5]:
# Mark event type columns
df['is_insulin'] = df['Code'].isin([33, 34, 35]).astype(int)
df['is_meal']    = df['Code'].isin([66, 67, 68]).astype(int)
df['is_exercise']= df['Code'].isin([69, 70, 71]).astype(int)

# For each datetime, aggregate
agg_df = df.groupby(['patient_id','datetime']).agg({
    'Value':'mean',
    'is_insulin':'max',
    'is_meal':'max',
    'is_exercise':'max'
}).reset_index()

In [6]:
agg_df = agg_df.sort_values(['patient_id','datetime'])
agg_df['prev_bg'] = agg_df.groupby('patient_id')['Value'].shift(1)
agg_df['prev_insulin'] = agg_df.groupby('patient_id')['is_insulin'].shift(1)
agg_df['prev_meal'] = agg_df.groupby('patient_id')['is_meal'].shift(1)
agg_df['prev_exercise'] = agg_df.groupby('patient_id')['is_exercise'].shift(1)

In [7]:
agg_df['target_bg'] = agg_df.groupby('patient_id')['Value'].shift(-1)
model_df = agg_df.dropna()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

X = model_df[['Value','prev_bg','prev_insulin','prev_meal','prev_exercise']]
y = model_df['target_bg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))

MAE: 74.51928788746062
RMSE: 105.91294400865351


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

patient_models = {}
results = {}

for pid, pdf in agg_df.groupby("patient_id"):
    # Drop rows with missing features
    pdf = pdf.dropna(subset=["Value", "prev_bg"])
    
    if len(pdf) < 50:  # skip very small datasets
        continue
    
    X = pdf[["prev_bg", 'prev_insulin','prev_meal','prev_exercise']]
    y = pdf["Value"]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    patient_models[pid] = model
    results[pid] = {"MAE": mae, "RMSE": rmse}

# Show results
for pid, metrics in results.items():
    print(f"Patient {pid}: MAE={metrics['MAE']:.2f}, RMSE={metrics['RMSE']:.2f}")

Patient 1: MAE=52.08, RMSE=64.84
Patient 2: MAE=32.79, RMSE=41.60
Patient 3: MAE=78.61, RMSE=100.38
Patient 4: MAE=79.09, RMSE=103.67
Patient 5: MAE=85.04, RMSE=103.71
Patient 6: MAE=45.99, RMSE=49.69
Patient 7: MAE=38.08, RMSE=50.92
Patient 8: MAE=99.21, RMSE=116.74
Patient 9: MAE=71.00, RMSE=105.78
Patient 10: MAE=58.28, RMSE=79.12
Patient 11: MAE=37.62, RMSE=63.60
Patient 12: MAE=60.35, RMSE=86.57
Patient 13: MAE=66.19, RMSE=99.46
Patient 14: MAE=52.81, RMSE=92.24
Patient 15: MAE=52.14, RMSE=74.13
Patient 16: MAE=60.39, RMSE=99.03
Patient 17: MAE=65.49, RMSE=86.82
Patient 18: MAE=54.39, RMSE=75.96
Patient 19: MAE=72.14, RMSE=108.97
Patient 20: MAE=41.67, RMSE=48.61
Patient 21: MAE=64.70, RMSE=87.10
Patient 22: MAE=56.37, RMSE=80.59
Patient 23: MAE=53.96, RMSE=87.07
Patient 24: MAE=60.34, RMSE=86.64
Patient 25: MAE=80.45, RMSE=100.50
Patient 26: MAE=45.73, RMSE=55.98
Patient 27: MAE=30.84, RMSE=42.12
Patient 28: MAE=30.86, RMSE=42.45
Patient 29: MAE=35.58, RMSE=46.28
Patient 30: MAE=

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 1. Create sequence dataset
class BGDataset(Dataset):
    def __init__(self, df, seq_len=5):
        self.seq_len = seq_len
        self.features = df[["Value", "is_insulin", "is_meal", "is_exercise"]].values
        self.targets = df["Value"].values

    def __len__(self):
        return len(self.features) - self.seq_len

    def __getitem__(self, idx):
        X = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# 2. LSTM Model
class BGLSTM(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, num_layers=2):
        super(BGLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # last time step
        return out.squeeze()

# 3. Train function
def train_lstm(df, seq_len=5, epochs=200, lr=0.001):
    dataset = BGDataset(df, seq_len)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    model = BGLSTM()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X, y in loader:
            optimizer.zero_grad()
            preds = model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss={total_loss/len(loader):.4f}")

    return model

In [39]:
df

Unnamed: 0,Date,Time,Code,Value,patient_id,datetime,record_type,is_insulin,is_meal,is_exercise
0,04-21-1991,9:09,58,100.0,1,1991-04-21 09:09:00,Electronic,0,0,0
1,04-21-1991,9:09,33,9.0,1,1991-04-21 09:09:00,Electronic,1,0,0
2,04-21-1991,9:09,34,13.0,1,1991-04-21 09:09:00,Electronic,1,0,0
3,04-21-1991,17:08,62,119.0,1,1991-04-21 17:08:00,Electronic,0,0,0
4,04-21-1991,17:08,33,7.0,1,1991-04-21 17:08:00,Electronic,1,0,0
...,...,...,...,...,...,...,...,...,...,...
29325,05-09-1989,08:00,33,1.0,70,1989-05-09 08:00:00,Paper,1,0,0
29326,05-09-1989,08:00,34,7.0,70,1989-05-09 08:00:00,Paper,1,0,0
29327,05-10-1989,08:00,34,7.0,70,1989-05-10 08:00:00,Paper,1,0,0
29328,05-11-1989,08:00,34,7.0,70,1989-05-11 08:00:00,Paper,1,0,0


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 1. Create sequence dataset
class BGDataset(Dataset):
    def __init__(self, df, seq_len=5):
        self.seq_len = seq_len
        self.features = df[["Value", "is_insulin", "is_meal", "is_exercise"]].values
        self.targets = df["Value"].values

    def __len__(self):
        return len(self.features) - self.seq_len

    def __getitem__(self, idx):
        X = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# 2. LSTM Model
class BGLSTM(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, num_layers=2):
        super(BGLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # last time step
        return out.squeeze()

# 3. Train function with validation
def train_lstm(df, seq_len=5, epochs=200, lr=0.001):
    # Split into train and validation (80/20 chronologically)
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:]

    train_loader = DataLoader(BGDataset(train_df, seq_len), batch_size=32, shuffle=False)
    val_loader = DataLoader(BGDataset(val_df, seq_len), batch_size=32, shuffle=False)

    model = BGLSTM()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        total_loss = 0
        for X, y in train_loader:
            optimizer.zero_grad()
            preds = model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # ---- Validation ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X, y in val_loader:
                preds = model(X)
                loss = criterion(preds, y)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}")

    return model

In [38]:
# Select one patient (example: patient 1)
# pdf = pdf.dropna(subset=["Value", "prev_bg"])
pdf = agg_df[agg_df["patient_id"] == 1].sort_values("datetime")
print(pdf)
# Train LSTM
model = train_lstm(pdf, seq_len=10, epochs=120)

     patient_id             datetime       Value  is_insulin  is_meal  \
0             1  1991-04-21 09:09:00   40.666667           1        0   
1             1  1991-04-21 17:08:00   63.000000           1        0   
2             1  1991-04-21 22:51:00  123.000000           0        0   
3             1  1991-04-22 07:35:00   79.666667           1        0   
4             1  1991-04-22 13:40:00    2.000000           1        0   
..          ...                  ...         ...         ...      ...   
505           1  1991-09-02 08:51:00   64.333333           1        0   
506           1  1991-09-02 13:00:00    4.000000           1        0   
507           1  1991-09-02 17:30:00   34.000000           1        0   
508           1  1991-09-02 23:00:00  155.000000           0        0   
509           1  1991-09-03 07:20:00   45.000000           1        0   

     is_exercise     prev_bg  prev_insulin  prev_meal  prev_exercise  \
0              0         NaN           NaN        N

In [45]:
import pandas as pd
import numpy as np

def enrich_features(df):
    df = df.copy()
    df["datetime"] = pd.to_datetime(df["datetime"])
    df = df.sort_values(["patient_id", "datetime"])
    
    # Base indicators
    df["is_insulin"] = df["Code"].isin([33, 34, 35]).astype(int)
    df["is_meal"] = df["Code"].isin([66, 67, 68, 72]).astype(int)
    df["is_exercise"] = df["Code"].isin([69, 70, 71]).astype(int)
    
    # Insulin dose
    df["insulin_dose"] = np.where(df["is_insulin"] == 1, df["Value"], 0)

    # Meal size mapping
    meal_map = {66: 1, 67: 2, 68: -1, 72: 0}
    df["meal_size"] = df["Code"].map(meal_map).fillna(0)

    # Exercise level mapping
    ex_map = {69: 1, 70: 2, 71: -1}
    df["exercise_level"] = df["Code"].map(ex_map).fillna(0)

    # Time since last reading per patient
    df["time_since_last"] = (
        df.groupby("patient_id")["datetime"].diff().dt.total_seconds() / 60
    ).fillna(0)

    # Normalize BG per patient
    df["Value_norm"] = (
        df.groupby("patient_id")["Value"].transform(lambda x: (x - x.mean()) / x.std(ddof=0))
    )

    # Fill any remaining NaNs
    df = df.fillna(0)

    return df

df = enrich_features(df)

In [57]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 1. Create sequence dataset
class BGDataset(Dataset):
    def __init__(self, df, seq_len=5):
        self.seq_len = seq_len
        self.features = df[["Value", "is_insulin", "is_meal", "meal_size", "is_exercise", "exercise_level", "time_since_last"]].values
        self.targets = df["Value"].values

    def __len__(self):
        return len(self.features) - self.seq_len

    def __getitem__(self, idx):
        X = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# 2. LSTM Model
class BGLSTM(nn.Module):
    def __init__(self, input_size=8, hidden_size=64, num_layers=2):
        super(BGLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # last time step
        return out.squeeze()

# 3. Train function with validation
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

def train_lstm(df, seq_len=5, epochs=200, lr=0.001, patience=50):
    # ---- Chronological 80/20 split ----
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:]

    train_loader = DataLoader(BGDataset(train_df, seq_len), batch_size=32, shuffle=False)
    val_loader = DataLoader(BGDataset(val_df, seq_len), batch_size=32, shuffle=False)

    model = BGLSTM(input_size=7)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_rmse = float("inf")
    epochs_no_improve = 0
    best_model_state = None

    for epoch in range(epochs):
        # ---- Training ----
        model.train()
        total_loss = 0
        for X, y in train_loader:
            optimizer.zero_grad()
            preds = model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_rmse = np.sqrt(avg_train_loss)

        # ---- Validation ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X, y in val_loader:
                preds = model(X)
                loss = criterion(preds, y)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        val_rmse = np.sqrt(avg_val_loss)

        # ---- Early Stopping ----
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = model.state_dict()
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"⏹️ Early stopping at epoch {epoch+1} (no improvement for {patience} epochs)")
            break

        print(f"Epoch {epoch+1}/{epochs}, Train RMSE={train_rmse:.4f}, Val RMSE={val_rmse:.4f}")

    # Restore best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    print(f"Best Val RMSE: {best_val_rmse:.4f}")
    return model


In [58]:
# Select one patient (example: patient 1)
# pdf = pdf.dropna(subset=["Value", "prev_bg"])
pdf = df[df["patient_id"] == 1].sort_values("datetime")
print(pdf)
# Train LSTM
model = train_lstm(pdf, seq_len=10, epochs=200)

           Date   Time  Code  Value  patient_id             datetime  \
0    04-21-1991   9:09    58  100.0           1  1991-04-21 09:09:00   
1    04-21-1991   9:09    33    9.0           1  1991-04-21 09:09:00   
2    04-21-1991   9:09    34   13.0           1  1991-04-21 09:09:00   
3    04-21-1991  17:08    62  119.0           1  1991-04-21 17:08:00   
4    04-21-1991  17:08    33    7.0           1  1991-04-21 17:08:00   
..          ...    ...   ...    ...         ...                  ...   
937  09-02-1991  17:30    62   61.0           1  1991-09-02 17:30:00   
939  09-02-1991  23:00    48  155.0           1  1991-09-02 23:00:00   
941  09-03-1991   7:20    33    9.0           1  1991-09-03 07:20:00   
940  09-03-1991   7:20    58  110.0           1  1991-09-03 07:20:00   
942  09-03-1991   7:20    34   16.0           1  1991-09-03 07:20:00   

    record_type  is_insulin  is_meal  is_exercise  insulin_dose  meal_size  \
0    Electronic           0        0            0        

In [64]:
def train_tcn(df, seq_len=10, epochs=500, lr=0.001, patience=20, min_delta=1e-4):
    dataset = BGDataset(df, seq_len)
    n = len(dataset)
    if n < 50:
        raise ValueError("Dataset too small for training")

    train_size = int(0.8 * n)
    val_size = n - train_size

    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    model = TCNModel()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_loss = float("inf")
    epochs_no_improve = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X, y in train_loader:
            optimizer.zero_grad()
            preds = model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X, y in val_loader:
                preds = model(X)
                val_loss += criterion(preds, y).item()
        val_loss /= len(val_loader)

        # Early Stopping logic
        if best_val_loss - val_loss > min_delta:
            best_val_loss = val_loss
            epochs_no_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_no_improve += 1

        print(f"Epoch {epoch+1}/{epochs}, Train Loss={total_loss/len(train_loader):.4f}, Val Loss={val_loss:.4f}")

        if epochs_no_improve >= patience:
            print(f"\nEarly stopping triggered after {epoch+1} epochs. Best Val Loss={best_val_loss:.4f}")
            break

    # Restore best model weights
    if best_model_state:
        model.load_state_dict(best_model_state)

    # --- Compute true RMSE in mg/dL ---
    bg_std = df["Value"].std()  # standard deviation in mg/dL
    val_rmse_mgdl = (best_val_loss ** 0.5) * bg_std
    print(f"\nFinal Validation RMSE ≈ {val_rmse_mgdl:.2f} mg/dL")

    return model

In [66]:
pdf = df[df["patient_id"] == 1].sort_values("datetime")
model = train_tcn(df, seq_len=20, epochs=500, patience=20)

Epoch 1/500, Train Loss=3937.5957, Val Loss=50.2669
Epoch 2/500, Train Loss=135.2427, Val Loss=6.5182
Epoch 3/500, Train Loss=109.5938, Val Loss=2.6934
Epoch 4/500, Train Loss=50.4568, Val Loss=1.2888
Epoch 5/500, Train Loss=31.2680, Val Loss=4.8166
Epoch 6/500, Train Loss=5.1114, Val Loss=1.0759
Epoch 7/500, Train Loss=5.9161, Val Loss=0.9990
Epoch 8/500, Train Loss=5.0175, Val Loss=1.0171
Epoch 9/500, Train Loss=11.6311, Val Loss=0.9906
Epoch 10/500, Train Loss=2.2615, Val Loss=0.9953
Epoch 11/500, Train Loss=1.6084, Val Loss=0.9752
Epoch 12/500, Train Loss=1.3699, Val Loss=1.0297
Epoch 13/500, Train Loss=1.1331, Val Loss=0.9846
Epoch 14/500, Train Loss=1.2915, Val Loss=0.9965
Epoch 15/500, Train Loss=1.5798, Val Loss=0.9959
Epoch 16/500, Train Loss=4.0096, Val Loss=0.9956
Epoch 17/500, Train Loss=15.7582, Val Loss=0.9960
Epoch 18/500, Train Loss=1.4457, Val Loss=0.9956
Epoch 19/500, Train Loss=1.1049, Val Loss=0.9956
Epoch 20/500, Train Loss=1.0097, Val Loss=0.9956
Epoch 21/500, Tra