# In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, accuracy_score, f1_score
from sklearn.ensemble import IsolationForest
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import os

# Configuration
# Both locations are read from environment variables of the same name so the
# script can be pointed at a dataset without editing the source.
# NOTE(review): the fallback values below are placeholders -- replace them
# with the project's real dataset path and output directory.
DATA_PATH = os.environ.get('DATA_PATH', 'data.json')
OUTPUT_DIR = os.environ.get('OUTPUT_DIR', 'output')
# Create the output directory up front so saving the model cannot fail on a
# missing folder after a long training run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed the NumPy and TensorFlow global random generators with one shared value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data
# Pin the encoding: without it, open() uses the platform's locale encoding,
# which can mis-decode a UTF-8 JSON file on some systems.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    records = json.load(f)

# Prepare data
PPG_WINDOW_LEN = 100  # every PPG window is truncated/padded to this many samples


def _to_finite_float(value):
    """Return ``value`` as a finite float, or NaN when it is not a usable number.

    Booleans, values that ``float()`` rejects, NaN and +/-inf all map to NaN
    so the caller can reject the record with a single ``isnan`` check.
    """
    if isinstance(value, bool):
        return np.nan
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    return number if np.isfinite(number) else np.nan


ppg_signals = []
clinical_features = []
labels = []
glucose_values = []
skipped = 0  # number of records rejected for any reason

for r in records:
    # Parse and validate EVERY field before appending anything: a record that
    # fails part-way (e.g. a missing or malformed 'preop_dm') must not leave
    # the four lists with different lengths, which would misalign all later
    # samples with their labels.
    try:
        ppg = r['window_data']
        if not isinstance(ppg, list) or not ppg:
            skipped += 1
            continue
        # Truncate long windows; pad short ones by repeating the last sample.
        ppg = [float(x) for x in ppg[:PPG_WINDOW_LEN]]
        ppg += [ppg[-1]] * (PPG_WINDOW_LEN - len(ppg))

        hr = _to_finite_float(r['hr_data'])
        glucose = _to_finite_float(r['glucose'])
        label = int(r['preop_dm'])
    except (KeyError, IndexError, TypeError, ValueError, OverflowError):
        # Only data-shape problems are treated as "skip this record";
        # anything else (KeyboardInterrupt, MemoryError, ...) propagates.
        skipped += 1
        continue

    # Reject records with unusable scalars or non-finite PPG samples; NaN/inf
    # would otherwise reach the outlier detector and the scalers.
    if np.isnan(hr) or np.isnan(glucose) or not np.all(np.isfinite(ppg)):
        skipped += 1
        continue

    ppg_signals.append(ppg)
    clinical_features.append([hr])
    labels.append(label)
    glucose_values.append(glucose)

ppg_signals = np.array(ppg_signals, dtype=np.float32)
clinical_features = np.array(clinical_features, dtype=np.float32)
labels = np.array(labels, dtype=np.int32)
glucose_values = np.array(glucose_values, dtype=np.float32)

# Remove outliers
def remove_outliers(X_ppg, X_clin, y, glucose, contamination=0.15, random_state=42):
    """Drop the samples that an Isolation Forest flags as outliers.

    The detector is fit on a single feature matrix built per sample from the
    flattened PPG window, the clinical features and the glucose value.

    Args:
        X_ppg: array of PPG windows; the first axis indexes samples and the
            remaining axes are flattened for the detector.
        X_clin: 2-D array of clinical features, one row per sample.
        y: array of labels, one entry per sample.
        glucose: 1-D array of glucose values, one entry per sample.
        contamination: expected fraction of outliers, forwarded to
            ``IsolationForest``.
        random_state: seed forwarded to ``IsolationForest``.

    Returns:
        Tuple ``(X_ppg, X_clin, y, glucose)`` restricted to the samples the
        detector labelled as inliers.
    """
    if len(X_ppg) == 0:
        # Nothing to fit on: reshape(0, -1) is ambiguous and the detector
        # rejects empty input, so hand the empty arrays back unchanged.
        return X_ppg, X_clin, y, glucose

    X = np.hstack([X_ppg.reshape(len(X_ppg), -1), X_clin, glucose.reshape(-1, 1)])
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict labels inliers as 1 and outliers as -1.
    mask = iso.fit_predict(X) == 1
    return X_ppg[mask], X_clin[mask], y[mask], glucose[mask]

ppg_signals, clinical_features, labels, glucose_values = remove_outliers(
    ppg_signals, clinical_features, labels, glucose_values
)

# Balance classes
# Undersample the larger class (without replacement) down to the size of the
# smaller one. On a tie, class 1 is treated as the minority.
mask0 = labels == 0
mask1 = labels == 1
if np.count_nonzero(mask0) < np.count_nonzero(mask1):
    minority, majority = mask0, mask1
else:
    minority, majority = mask1, mask0

arrays = (ppg_signals, clinical_features, labels, glucose_values)
min_ppg, min_clin, min_y, min_g = (a[minority] for a in arrays)
maj_ppg, maj_clin, maj_y, maj_g = (a[majority] for a in arrays)

# Draw as many majority samples as there are minority samples.
n = len(min_ppg)
idx = np.random.choice(len(maj_ppg), n, replace=False)
maj_ppg, maj_clin, maj_y, maj_g = maj_ppg[idx], maj_clin[idx], maj_y[idx], maj_g[idx]

# Minority samples first, then the drawn majority samples.
ppg_bal = np.vstack([min_ppg, maj_ppg])
clin_bal = np.vstack([min_clin, maj_clin])
y_bal = np.hstack([min_y, maj_y])
glucose_bal = np.hstack([min_g, maj_g])

# Shuffle, split, then normalize
# The scalers are fit on the TRAINING split only. Fitting them on the full
# dataset before splitting would leak validation/test statistics into
# training and make the reported metrics optimistic.
idx = np.random.permutation(len(y_bal))
ppg_shuf = ppg_bal[idx]
clin_shuf = clin_bal[idx]
y_bal = y_bal[idx]
glucose_bal = glucose_bal[idx]

# 70 % train, then the remaining 30 % is halved into validation and test.
X_ppg_tr, X_ppg_temp, X_clin_tr, X_clin_temp, y_tr, y_temp, g_tr, g_temp = train_test_split(
    ppg_shuf, clin_shuf, y_bal, glucose_bal, test_size=0.3, random_state=42, stratify=y_bal)
X_ppg_val, X_ppg_test, X_clin_val, X_clin_test, y_val, y_test, g_val, g_test = train_test_split(
    X_ppg_temp, X_clin_temp, y_temp, g_temp, test_size=0.5, random_state=42, stratify=y_temp)

# One global mean/std over all PPG samples (not per time step), hence the
# flatten to a single column before fitting.
ppg_scaler = StandardScaler().fit(X_ppg_tr.reshape(-1, 1))
clin_scaler = StandardScaler().fit(X_clin_tr)

window_len = ppg_bal.shape[1]


def _scale_ppg(windows):
    """Scale PPG windows with the train-fit scaler; return (n, window_len, 1)."""
    scaled = ppg_scaler.transform(windows.reshape(-1, 1))
    return scaled.reshape(-1, window_len, 1)


X_ppg_tr, X_ppg_val, X_ppg_test = (
    _scale_ppg(part) for part in (X_ppg_tr, X_ppg_val, X_ppg_test)
)
X_clin_tr, X_clin_val, X_clin_test = (
    clin_scaler.transform(part) for part in (X_clin_tr, X_clin_val, X_clin_test)
)

# Full shuffled arrays scaled with the train-fit scalers, kept under their
# original names for any later code that reads them.
ppg_norm = _scale_ppg(ppg_shuf)
clin_norm = clin_scaler.transform(clin_shuf)

# Build lightweight GRU model
def build_gru(seq_len=100, n_clin_features=1):
    """Build the two-input binary classifier (PPG sequence + clinical features).

    The PPG branch is a bidirectional GRU that returns only its final state,
    followed by a small dense layer. The clinical branch is a dense layer
    with batch normalization. Both are concatenated and passed through two
    dense layers to a single sigmoid output.

    Args:
        seq_len: number of time steps in each PPG window; the PPG input has
            shape ``(seq_len, 1)``.
        n_clin_features: number of clinical features per sample; the clinical
            input has shape ``(n_clin_features,)``.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[ppg, clinical]`` inputs
        and producing one sigmoid probability per sample.
    """
    # PPG branch
    ppg_in = layers.Input(shape=(seq_len, 1))
    x = layers.Bidirectional(layers.GRU(32, return_sequences=False))(ppg_in)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(16, activation='relu')(x)
    x = layers.Dropout(0.2)(x)

    # Clinical branch
    clin_in = layers.Input(shape=(n_clin_features,))
    c = layers.Dense(16, activation='relu')(clin_in)
    c = layers.BatchNormalization()(c)
    c = layers.Dropout(0.2)(c)

    # Fusion head
    merged = layers.Concatenate()([x, c])
    merged = layers.Dense(32, activation='relu')(merged)
    merged = layers.BatchNormalization()(merged)
    merged = layers.Dropout(0.2)(merged)
    merged = layers.Dense(16, activation='relu')(merged)
    out = layers.Dense(1, activation='sigmoid')(merged)

    model = tf.keras.Model([ppg_in, clin_in], out)
    return model

model = build_gru()

# Class weights
# Inverse-frequency weighting: n_samples / (n_classes * class_count).
n_train = len(y_tr)
w0 = n_train / (2 * np.sum(y_tr == 0))
w1 = n_train / (2 * np.sum(y_tr == 1))
class_weight = {0: w0, 1: w1}

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

# Train
# Both callbacks watch validation loss: one halves the learning rate after 8
# stagnant epochs, the other stops after 20 and restores the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=8,
    min_lr=1e-6,
)

train_inputs = [X_ppg_tr, X_clin_tr]
val_inputs = [X_ppg_val, X_clin_val]
model.fit(
    train_inputs,
    y_tr,
    validation_data=(val_inputs, y_val),
    epochs=100,
    batch_size=32,
    class_weight=class_weight,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)

# Evaluate
# Pick the decision threshold on the VALIDATION split (maximizing Youden's J,
# i.e. TPR - FPR) and only then apply it to the test split. Tuning the
# threshold on the test labels themselves would bias every thresholded
# metric below upwards.
y_val_proba = model.predict([X_ppg_val, X_clin_val]).flatten()
val_fpr, val_tpr, val_thr = roc_curve(y_val, y_val_proba)
optimal_idx = np.argmax(val_tpr - val_fpr)
thr_opt = val_thr[optimal_idx]

y_pred_proba = model.predict([X_ppg_test, X_clin_test]).flatten()
# The test ROC curve is still needed for the threshold-free AUC.
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)
y_pred = (y_pred_proba >= thr_opt).astype(int)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# labels=[0, 1] forces a 2x2 matrix even if a class is absent from y_pred,
# so ravel() always yields (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
roc_auc = auc(fpr, tpr)

print("\n" + "="*50)
print("GRU MODEL RESULTS")
print("="*50)
print(classification_report(y_test, y_pred, target_names=['No Diabetes', 'Diabetes']))
print(f"Accuracy:    {accuracy:.4f}")
print(f"Precision:   {precision:.4f}")
print(f"Recall:      {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1-Score:    {f1:.4f}")
print(f"AUC:         {roc_auc:.4f}")
print(f"Optimal Threshold: {thr_opt:.3f}")

# Save model
# Write the trained network into the output directory in HDF5 (.h5) format.
model_path = os.path.join(OUTPUT_DIR, 'gru_diabetes_model.h5')
model.save(model_path)
print(f"Model saved to {OUTPUT_DIR}")