## Data Preparation

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# --- Configuration shared by the cells below ---
# Column the model is trained to predict.
target = "CGM_glucose"
# Input columns supplied to the model at every timestep.
features = ["rate", "Carbs", "IOB"]
# Number of consecutive rows that make up one input window.
sequence_length = 10
# Root directory; its sub-directories are scanned for patient CSV files.
data_dir = "data/openAPS_patient"

def load_and_prepare_data(data_dir):
    """Load every patient CSV under ``data_dir`` and build scaled LSTM windows.

    Each sub-directory of ``data_dir`` is scanned for ``.csv`` files. Every
    file is parsed, sorted by its ``Time`` column, stripped of ``' null'``
    placeholders and of rows that contain any missing value. The module-level
    ``features`` and ``target`` columns are then coerced to numeric, min-max
    scaled together (one scaler fitted on all files), and cut into sliding
    windows of ``sequence_length`` rows.

    Windows are built per file and never span two files, so a window cannot
    mix rows from different recordings or different patients.

    Args:
        data_dir: Directory whose sub-directories hold the CSV files.

    Returns:
        A tuple ``(X, y, scaler)``:
        ``X`` has shape ``(n_windows, sequence_length, len(features))``;
        ``y`` has shape ``(n_windows,)`` and holds the scaled target value of
        the row immediately following each window;
        ``scaler`` is the fitted ``MinMaxScaler`` whose columns are
        ``features + [target]``, in that order.

    Raises:
        ValueError: If no usable rows are found under ``data_dir``.
    """
    columns = features + [target]
    frames = []

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the returned windows (and any seeded split of them) reproducible.
    for patient_folder in sorted(os.listdir(data_dir)):
        patient_path = os.path.join(data_dir, patient_folder)
        if not os.path.isdir(patient_path):  # Skip files like .DS_Store
            continue
        for file in sorted(os.listdir(patient_path)):
            if not file.endswith(".csv"):  # Only CSV files are processed
                continue
            df = pd.read_csv(os.path.join(patient_path, file))
            # Unparseable timestamps become NaT and are dropped below.
            df['Time'] = pd.to_datetime(df['Time'], format='mixed', errors='coerce')
            df = df.sort_values('Time')
            # Treat the ' null' placeholder as missing, then drop incomplete rows.
            df = df.replace(' null', np.nan).dropna()
            if not df.empty:
                frames.append(df)

    if not frames:
        raise ValueError(f"No usable CSV data found under {data_dir!r}")

    # `keys` adds an outer index level identifying the source file, so the
    # windows can later be cut per file instead of across file boundaries.
    combined_data = pd.concat(frames, axis=0, keys=list(range(len(frames))))

    # Ensure numeric columns; anything non-numeric becomes NaN and is dropped.
    for col in columns:
        combined_data[col] = pd.to_numeric(combined_data[col], errors='coerce')
    combined_data = combined_data.dropna(subset=columns)
    if combined_data.empty:
        raise ValueError(
            f"No rows with numeric {columns} values found under {data_dir!r}"
        )

    # Scale features and target together; the target stays the last column,
    # which is the layout callers must use with scaler.inverse_transform.
    scaler = MinMaxScaler()
    combined_data[columns] = scaler.fit_transform(combined_data[columns])

    # Create sequences, one source file at a time.
    X, y = [], []
    for _, series in combined_data.groupby(level=0, sort=False):
        # Convert once per file rather than slicing the DataFrame per window.
        feature_values = series[features].to_numpy()
        target_values = series[target].to_numpy()
        for start in range(len(series) - sequence_length):
            stop = start + sequence_length
            X.append(feature_values[start:stop])
            y.append(target_values[stop])

    if not X:
        # Every file was too short for a window: return correctly shaped
        # empty arrays rather than shapeless ones.
        return np.empty((0, sequence_length, len(features))), np.empty((0,)), scaler

    return np.array(X), np.array(y), scaler



# Build the windowed dataset; the fitted scaler is kept so predictions can be
# mapped back to the original target units later.
X, y, scaler = load_and_prepare_data(data_dir)

# Hold out 20% of the windows for testing, with a fixed seed for repeatability.
# NOTE(review): train_test_split shuffles by default and neighbouring windows
# share rows, so test windows may overlap training windows - confirm that a
# random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


## LSTM Model Definition

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# LSTM regressor factory
def build_lstm_model(input_shape):
    """Create and compile the LSTM regression network.

    The stack is: a 64-unit LSTM with tanh activation that returns only its
    final output, a dropout layer with rate 0.2, a 32-unit ReLU dense layer,
    and a single-unit dense output. The model is compiled with the Adam
    optimizer, mean-squared-error loss and mean-absolute-error as a metric.

    Args:
        input_shape: Shape of one input sample, forwarded to the LSTM layer's
            ``input_shape`` argument.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(
        LSTM(64, activation='tanh', input_shape=input_shape, return_sequences=False)
    )
    network.add(Dropout(0.2))
    network.add(Dense(32, activation='relu'))
    network.add(Dense(1))  # Single output value: CGM_glucose
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Instantiate the network for the training data's per-sample shape:
# axis 1 of X_train is the window length, axis 2 the number of features.
input_shape = (
    X_train.shape[1],
    X_train.shape[2],
)
model = build_lstm_model(input_shape)
# Print the layer-by-layer overview.
model.summary()


## Model Training

In [None]:
# Fit for 20 epochs in mini-batches of 32; 20% of the training windows are
# set aside by Keras for validation. The returned history object is kept.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)

## Model Evaluation

In [None]:
# Score the model on the held-out windows (values are in scaled units).
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")


def _restore_target_scale(scaled_column):
    """Map a column of scaled target values back to original units.

    The scaler was fitted on ``features + [target]``, so ``inverse_transform``
    needs that many columns. The feature columns are filled with zeros as
    placeholders and only the last (target) column of the result is returned.

    Args:
        scaled_column: 2-D array of shape ``(n_samples, 1)``.

    Returns:
        1-D array of the inverse-transformed target values.
    """
    placeholder = np.zeros((scaled_column.shape[0], len(features)))
    full_width = np.concatenate((placeholder, scaled_column), axis=1)
    return scaler.inverse_transform(full_width)[:, -1]


# Predict on the test windows and undo the scaling for both series.
y_pred = model.predict(X_test)
y_pred_rescaled = _restore_target_scale(y_pred)
y_test_rescaled = _restore_target_scale(y_test.reshape(-1, 1))

# Overlay actual and predicted values in test-set order.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
for series, series_label in ((y_test_rescaled, 'Actual'), (y_pred_rescaled, 'Predicted')):
    plt.plot(series, label=series_label)
plt.legend()
plt.show()