In [3]:
import pandas as pd

df = pd.read_csv("/Users/akki/Desktop/AKKI/college/AI in Healthcare/Project/Blood Glucose Prediction/Diabetes-Data/patient_data.csv")
print(df.shape)
print(df.head())

(29330, 7)
         Date   Time  Code Value  patient_id             datetime record_type
0  04-21-1991   9:09    58   100           1  1991-04-21 09:09:00  Electronic
1  04-21-1991   9:09    33     9           1  1991-04-21 09:09:00  Electronic
2  04-21-1991   9:09    34    13           1  1991-04-21 09:09:00  Electronic
3  04-21-1991  17:08    62   119           1  1991-04-21 17:08:00  Electronic
4  04-21-1991  17:08    33     7           1  1991-04-21 17:08:00  Electronic


In [4]:
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

In [5]:
bg_codes = [58, 60, 62]
bg_df = df[df['Code'].isin(bg_codes)].copy()

In [8]:
# Mark event type columns
df['is_insulin'] = df['Code'].isin([33, 34, 35]).astype(int)
df['is_meal']    = df['Code'].isin([66, 67, 68]).astype(int)
df['is_exercise']= df['Code'].isin([69, 70, 71]).astype(int)

# For each datetime, aggregate
agg_df = df.groupby(['patient_id','datetime']).agg({
    'Value':'mean',
    'is_insulin':'max',
    'is_meal':'max',
    'is_exercise':'max'
}).reset_index()

In [9]:
agg_df = agg_df.sort_values(['patient_id','datetime'])
agg_df['prev_bg'] = agg_df.groupby('patient_id')['Value'].shift(1)
agg_df['prev_insulin'] = agg_df.groupby('patient_id')['is_insulin'].shift(1)
agg_df['prev_meal'] = agg_df.groupby('patient_id')['is_meal'].shift(1)
agg_df['prev_exercise'] = agg_df.groupby('patient_id')['is_exercise'].shift(1)

In [10]:
agg_df['target_bg'] = agg_df.groupby('patient_id')['Value'].shift(-1)
model_df = agg_df.dropna()

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

X = model_df[['Value','prev_bg','prev_insulin','prev_meal','prev_exercise']]
y = model_df['target_bg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))

MAE: 74.51928788746062
RMSE: 105.91294400865351


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

patient_models = {}
results = {}

for pid, pdf in agg_df.groupby("patient_id"):
    # Drop rows with missing features
    pdf = pdf.dropna(subset=["Value", "prev_bg"])
    
    if len(pdf) < 50:  # skip very small datasets
        continue
    
    X = pdf[["prev_bg", 'prev_insulin','prev_meal','prev_exercise']]
    y = pdf["Value"]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    patient_models[pid] = model
    results[pid] = {"MAE": mae, "RMSE": rmse}

# Show results
for pid, metrics in results.items():
    print(f"Patient {pid}: MAE={metrics['MAE']:.2f}, RMSE={metrics['RMSE']:.2f}")

Patient 1: MAE=52.08, RMSE=64.84
Patient 2: MAE=32.79, RMSE=41.60
Patient 3: MAE=78.61, RMSE=100.38
Patient 4: MAE=79.09, RMSE=103.67
Patient 5: MAE=85.04, RMSE=103.71
Patient 6: MAE=45.99, RMSE=49.69
Patient 7: MAE=38.08, RMSE=50.92
Patient 8: MAE=99.21, RMSE=116.74
Patient 9: MAE=71.00, RMSE=105.78
Patient 10: MAE=58.28, RMSE=79.12
Patient 11: MAE=37.62, RMSE=63.60
Patient 12: MAE=60.35, RMSE=86.57
Patient 13: MAE=66.19, RMSE=99.46
Patient 14: MAE=52.81, RMSE=92.24
Patient 15: MAE=52.14, RMSE=74.13
Patient 16: MAE=60.39, RMSE=99.03
Patient 17: MAE=65.49, RMSE=86.82
Patient 18: MAE=54.39, RMSE=75.96
Patient 19: MAE=72.14, RMSE=108.97
Patient 20: MAE=41.67, RMSE=48.61
Patient 21: MAE=64.70, RMSE=87.10
Patient 22: MAE=56.37, RMSE=80.59
Patient 23: MAE=53.96, RMSE=87.07
Patient 24: MAE=60.34, RMSE=86.64
Patient 25: MAE=80.45, RMSE=100.50
Patient 26: MAE=45.73, RMSE=55.98
Patient 27: MAE=30.84, RMSE=42.12
Patient 28: MAE=30.86, RMSE=42.45
Patient 29: MAE=35.58, RMSE=46.28
Patient 30: MAE=

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 1. Create sequence dataset
class BGDataset(Dataset):
    def __init__(self, df, seq_len=5):
        self.seq_len = seq_len
        self.features = df[["Value", "is_insulin", "is_meal", "is_exercise"]].values
        self.targets = df["Value"].values

    def __len__(self):
        return len(self.features) - self.seq_len

    def __getitem__(self, idx):
        X = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# 2. LSTM Model
class BGLSTM(nn.Module):
    def __init__(self, input_size=4, hidden_size=64, num_layers=2):
        super(BGLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # last time step
        return out.squeeze()

# 3. Train function
def train_lstm(df, seq_len=5, epochs=20, lr=0.001):
    dataset = BGDataset(df, seq_len)
    loader = DataLoader(dataset, batch_size=32, shuffle=False)

    model = BGLSTM()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X, y in loader:
            optimizer.zero_grad()
            preds = model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss={total_loss/len(loader):.4f}")

    return model

In [18]:
# Select one patient (example: patient 1)
pdf = pdf.dropna(subset=["Value", "prev_bg"])
pdf = agg_df[agg_df["patient_id"] == 1].sort_values("datetime")

# Train LSTM
model = train_lstm(pdf, seq_len=5, epochs=20)

Epoch 1/20, Loss=nan
Epoch 2/20, Loss=nan
Epoch 3/20, Loss=nan
Epoch 4/20, Loss=nan
Epoch 5/20, Loss=nan
Epoch 6/20, Loss=nan
Epoch 7/20, Loss=nan
Epoch 8/20, Loss=nan
Epoch 9/20, Loss=nan
Epoch 10/20, Loss=nan
Epoch 11/20, Loss=nan
Epoch 12/20, Loss=nan
Epoch 13/20, Loss=nan
Epoch 14/20, Loss=nan
Epoch 15/20, Loss=nan
Epoch 16/20, Loss=nan
Epoch 17/20, Loss=nan
Epoch 18/20, Loss=nan
Epoch 19/20, Loss=nan
Epoch 20/20, Loss=nan
