# Machine Learning Pipeline for Fatigue Prediction in the full IBD Cohort

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
    auc,
    roc_curve,
    roc_auc_score,
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import shap

from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    GroupShuffleSplit,
    GroupKFold,
)
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import datetime
from tensorflow.keras.metrics import AUC

  from .autonotebook import tqdm as notebook_tqdm


## Pipeline Configuration

In [2]:
random_seed = 42  # Random seed to ensure reproducibility
output_path = "output/tensorflow/"
cmap = "seismic"  # Colormap for SHAP plots use "seismic" for full cohort and "berlin" for biochem remission cohort
file_prefix="tensorflow"
# file_prefix = "biochem_remission"

## Data Loading

In [3]:
df = pd.read_csv("working_data/all_ibd_ml_input.csv")
# df = df[df["aggregate_disease_activity_Biochemical remission"] == 1] # Uncomment to run biochem remission pipeline

## Further Data Pre-Processing

In [4]:
# convert categorical columns to numerical
df["sex"] = df["sex"].map({"Male": 1, "Female": 0})
df["fatigue_outcome"] = df["fatigue_outcome"].map({"fatigue": 1, "no_fatigue": 0})

In [5]:
# These columns are not features we want to use in the model
# Aggregate disease activity in some ways is a reflection of the other raw variables
columns_to_drop = [
    "aggregate_disease_activity_Active",
    "aggregate_disease_activity_Biochemical remission",
    "aggregate_disease_activity_Remission",
    "season_no_data",
    "study",
    # "redcap_event_name_timepoint_1",
    # "redcap_event_name_timepoint_2",
    # "redcap_event_name_timepoint_3",
    # "redcap_event_name_timepoint_4",
    # "redcap_event_name_timepoint_5",
]

df.drop(columns=columns_to_drop, inplace=True)

In [6]:
# This column is dropped as all the values are 0.
columns_to_drop = [
    "baseline_eims_pyoderma_gangrenosum",
]

df.drop(columns=columns_to_drop, inplace=True)

In [7]:
numerical_features = [
    "age",
    "height",
    "weight",
    "bmi",
    "age_at_diagnosis",
    "albumin",
    "crp",
    "haemoglobin",
    "red_cell_count",
    "haematocrit",
    "white_cell_count",
    "neutrophils",
    "lymphocytes",
    "monocytes",
    "eosinophils",
    "basophils",
    "platelets",
    "urea",
    "creatinine",
    "sodium",
    "potassium",
    "calprotectin",
    "ada_drug_level",
    "ifx_drug_level",
    "diagnosis_year",
    "disease_duration_weeks",
]
scaler = StandardScaler()  # We have tried a variety of scaling methods and they did not affect final model output
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

## Create Train and Test Datasets

GroupShuffleSplit used to ensure same participant only appears in either train or test set.

In [8]:
# Create Train Validate and Test Datasets

# First split into train and temp 70% train, 30% temp which will be split 50:50 into 15% val and 15% test

# GroupShuffleSplit
splitter = GroupShuffleSplit(test_size=0.3, n_splits=1, random_state=random_seed)

# Perform the split
for train_idx, test_idx in splitter.split(df, groups=df["study_id"]):
    train_data = df.iloc[train_idx]
    test_data = df.iloc[test_idx]

# Drop 'study_id' from X_train and X_test as it's not a feature
X_train = train_data.drop(columns=["fatigue_outcome", "study_id"])
y_train = train_data["fatigue_outcome"]

# We convert study_id hashes into integers and then to a numpy array
groups = train_data["study_id"] 
groups = groups.astype('category')
groups = groups.cat.codes
groups = groups.to_numpy()



X_test = test_data.drop(columns=["fatigue_outcome", "study_id"])
y_test = test_data["fatigue_outcome"]


In [9]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (834, 85)
Test shape: (381, 85)


In [10]:
# GroupKFold
cross_validator = GroupKFold(n_splits=5)

for i, (train_index, val_index) in enumerate(cross_validator.split(X_train, y_train, groups)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index[:5]}, group={groups[train_index[:5]]}")
    print(f"  Validation:  index={val_index[:5]}, group={groups[val_index[:5]]}")

Fold 0:
  Train: index=[0 1 2 3 4], group=[320 320 320 320 320]
  Validation:  index=[20 21 22 23 24], group=[88 88 88 88 88]
Fold 1:
  Train: index=[10 11 12 13 14], group=[362 362 362 362 362]
  Validation:  index=[0 1 2 3 4], group=[320 320 320 320 320]
Fold 2:
  Train: index=[0 1 2 3 4], group=[320 320 320 320 320]
  Validation:  index=[35 36 37 38 39], group=[361 361 361 361 361]
Fold 3:
  Train: index=[0 1 2 3 4], group=[320 320 320 320 320]
  Validation:  index=[25 26 27 28 29], group=[78 78 78 78 78]
Fold 4:
  Train: index=[0 1 2 3 4], group=[320 320 320 320 320]
  Validation:  index=[10 11 12 13 14], group=[362 362 362 362 362]


## Deep Learning with TensorFlow


### Model Definition

In [11]:


log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

### Compiling the Model

In [12]:
def build_model():
    model = keras.Sequential(
        [
            layers.Dense(85, activation="relu"),
            layers.Dropout(0.3),
            layers.Dense(85, activation="relu"),
            layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy", AUC()])
    return model

In [19]:
results_list=[]

for i, (train_index, val_index) in enumerate(cross_validator.split(X_train, y_train, groups)):
    print("Processing fold", i)
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_val_fold = y_train.iloc[val_index]

    model = build_model()

    history = model.fit(
        X_train_fold,
        y_train_fold,
        epochs=50,
        batch_size=32,
        validation_data=(X_val_fold, y_val_fold),
        callbacks=[tensorboard_callback],
        verbose=0,
    )

    results_list.append(history.history)


print(results_list)


Processing fold 0
Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
[{'accuracy': [0.5862069129943848, 0.6566716432571411, 0.6716641783714294, 0.6971514225006104, 0.7271364331245422, 0.7316341996192932, 0.7376312017440796, 0.7481259107589722, 0.7466266751289368, 0.782608687877655, 0.7811094522476196, 0.791604220867157, 0.7991004586219788, 0.7961019277572632, 0.7961019277572632, 0.8170914649963379, 0.8275862336158752, 0.8425787091255188, 0.8440779447555542, 0.8365817070007324, 0.8440779447555542, 0.8665667176246643, 0.8650674819946289, 0.8635681867599487, 0.8950524926185608, 0.8785607218742371, 0.8950524926185608, 0.8800599575042725, 0.8890554904937744, 0.8815591931343079, 0.8965517282485962, 0.8860569596290588, 0.8935531973838806, 0.9010494947433472, 0.9025487303733826, 0.9115442037582397, 0.9280359745025635, 0.9235382080078125, 0.931034505367279, 0.9295352101325989, 0.9280359745025635, 0.9205397367477417, 0.9430285096168518, 0.9415292143821716, 0.9130434989929199

In [20]:
results_df = pd.DataFrame(results_list)
results_df['average_val_accuracy'] = results_df['val_accuracy'].apply(np.mean)
print(f"Average Validation Accuracy: {results_df['average_val_accuracy']}")

Average Validation Accuracy: 0    0.648383
1    0.597725
2    0.684311
3    0.643114
4    0.704578
Name: average_val_accuracy, dtype: float64
