In [None]:
# EHR Patient Outcome Prediction (Text (clinical notes) + Tabular features)
# TensorFlow 2.x full runnable example (synthetic data)
# Run in Google Colab or local environment with TF installed.

import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

SEED = 42
# Seed every RNG the notebook draws from (Python's `random`, NumPy's global
# generator, TensorFlow) with the same value so reruns are repeatable.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(SEED)

In [None]:
# -------------------------
# 1) Synthetic dataset
# -------------------------
def synth_clinical_note(label):
    """Compose one synthetic clinical note whose wording is correlated with `label`.

    The note has three parts joined by ". ": a base complaint sentence, two
    outcome phrases (sampled with replacement from a label-specific pool and
    joined by a space), and one background phrase drawn independently of the
    label.

    Args:
        label: 1 selects the readmission-flavoured phrase pool; any other
            value selects the stable/discharged pool.

    Returns:
        The assembled note as a single string.
    """
    complaints = (
        "patient complains of chest pain and shortness of breath",
        "fever, cough, and sore throat for three days",
        "elevated blood pressure, headache, dizziness",
        "postoperative follow up, incision clean, no drainage",
        "diabetic patient with high blood sugar and neuropathy",
        "severe infection, septic, required IV antibiotics",
        "mild abdominal pain, tolerating oral intake",
        "fall with hip pain, imaging recommended",
        "chest x-ray shows infiltrates consistent with pneumonia",
        "stable vitals, ambulating, discharge planned",
    )
    readmit_phrases = (
        "recurrent", "worsening", "unstable", "requires readmission",
        "acute deterioration", "sepsis", "respiratory failure",
    )
    stable_phrases = (
        "stable", "improved", "discharged", "outpatient follow up",
        "no complications", "stable for discharge",
    )
    background_facts = (
        "history of hypertension", "allergic to penicillin", "smoker",
        "no known drug allergies", "family history of diabetes", "on metformin",
    )

    # Phrase pool biased towards the outcome (readmission = 1).
    outcome_pool = readmit_phrases if label == 1 else stable_phrases

    # Draw order (complaint, outcome pair, background) determines the text
    # produced for a given `random` seed, so keep it fixed.
    complaint = random.choice(complaints)
    outcome_pair = random.choices(outcome_pool, k=2)
    background = random.choice(background_facts)

    return f"{complaint}. {' '.join(outcome_pair)}. {background}"

N = 8000  # dataset size
# Build N synthetic patient rows. Each row holds five structured features
# (age, gender, num_prior, los, lab_score), a free-text note and the binary
# readmit label. The order of the random draws below fixes the generated data
# for a given seed; reordering them changes the dataset.
rows = []
for i in range(N):
    # Structured features, drawn from NumPy's global generator.
    age = np.random.randint(18, 95)  # upper bound is exclusive -> ages 18..94
    gender = np.random.choice([0, 1])  # 0 female, 1 male
    num_prior = np.random.poisson(0.8)  # prior admissions, Poisson with mean 0.8
    los = max(1, int(np.random.exponential(3)))  # length of stay: truncated to int, floored at 1
    # Lab score: normal with sd 1.0 whose mean rises with age (relative to 60)
    # and with the number of prior admissions, then clipped to [-3, 3].
    lab_score = np.clip(np.random.normal(0.0 + 0.03*(age-60) + 0.7*(num_prior), 1.0), -3, 3)
    # Probability of readmission from a synthetic logistic model over the
    # features; (los>5) is a boolean, so it adds 0.6 only when los exceeds 5.
    logit = -3.0 + 0.02*(age) + 0.7*(num_prior) + 0.6*(los>5) + 0.9*lab_score + 0.2*gender
    prob = 1.0 / (1.0 + np.exp(-logit))
    label = np.random.binomial(1, prob)  # one Bernoulli draw -> 0 or 1
    # The note is generated from the label, so its wording is label-correlated.
    note = synth_clinical_note(label)
    rows.append({
        "note": note,
        "age": age,
        "gender": gender,
        "num_prior": num_prior,
        "los": los,
        "lab_score": lab_score,
        "readmit": label
    })

# One DataFrame built from the list of row dicts (a single construction,
# rather than growing a frame inside the loop).
df = pd.DataFrame(rows)
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (8000, 7)
                                                note  age  gender  num_prior  \
0  fever, cough, and sore throat for three days. ...   69       0          0   
1  elevated blood pressure, headache, dizziness. ...   92       1          0   
2  fever, cough, and sore throat for three days. ...   70       1          2   
3  postoperative follow up, incision clean, no dr...   66       0          1   
4  postoperative follow up, incision clean, no dr...   79       1          1   

   los  lab_score  readmit  
0    4  -0.927806        0  
1    1   3.000000        1  
2   14   3.000000        1  
3    1   2.418715        1  
4    2   0.725617        0  


In [None]:
# -------------------------
# 2) Train/Val/Test split
# -------------------------
# Two-stage stratified split: hold out 15% of all rows for test, then 15% of
# the remainder for validation. Both stages preserve the readmit class ratio.
train_val_df, test_df = train_test_split(
    df, test_size=0.15, random_state=SEED, stratify=df["readmit"]
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.15, random_state=SEED, stratify=train_val_df["readmit"]
)

tabular_cols = ["age", "gender", "num_prior", "los", "lab_score"]

# Per split: note strings, tabular feature matrix, and target vector.
X_train_text, X_val_text, X_test_text = (
    part["note"].astype(str).to_numpy() for part in (train_df, val_df, test_df)
)
X_train_tab, X_val_tab, X_test_tab = (
    part[tabular_cols].to_numpy() for part in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    part["readmit"].to_numpy() for part in (train_df, val_df, test_df)
)

# Standardize tabular data: statistics come from the training split only and
# are then applied unchanged to validation and test.
scaler = StandardScaler().fit(X_train_tab)
X_train_tab, X_val_tab, X_test_tab = (
    scaler.transform(tab) for tab in (X_train_tab, X_val_tab, X_test_tab)
)

In [None]:
# -------------------------
# 3) Text preprocessing: TextVectorization
# -------------------------
# Vocabulary cap and the fixed number of token ids emitted per note.
max_tokens, max_len = 20000, 120

# Maps each raw note string to a length-`max_len` sequence of integer token ids.
text_vectorizer = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_len,
)
# Learn the vocabulary from the training notes only, so validation/test text
# does not influence the token index.
text_vectorizer.adapt(X_train_text)

In [None]:
# -------------------------
# 4) Build TF Dataset pipelines
# -------------------------
batch_size = 64


def make_dataset(texts, tabs, labels, shuffle=False, batch_size=32):
    """Wrap parallel arrays in a batched, prefetching `tf.data.Dataset`.

    Every element has the structure `((text, tab_row), label)`: a two-part
    feature tuple followed by its target.

    Args:
        texts: per-example notes.
        tabs: per-example tabular rows, aligned with `texts`.
        labels: per-example targets; `len(labels)` also sizes the shuffle buffer.
        shuffle: when true, shuffle with a buffer covering the whole dataset,
            seeded with the module-level `SEED`.
        batch_size: number of examples per batch.

    Returns:
        The batched dataset with `tf.data.AUTOTUNE` prefetching.
    """
    features = (texts, tabs)
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(labels), seed=SEED)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training stream is shuffled; validation and test keep row order so
# predictions line up with y_val / y_test.
train_ds = make_dataset(X_train_text, X_train_tab, y_train, shuffle=True, batch_size=batch_size)
val_ds = make_dataset(X_val_text, X_val_tab, y_val, batch_size=batch_size)
test_ds = make_dataset(X_test_text, X_test_tab, y_test, batch_size=batch_size)

In [None]:
# EHR Patient Outcome Prediction (Text (clinical notes) + Tabular features)
# Model-building cell.
#
# Sections 1-4 (synthetic dataset, train/val/test split + scaling,
# TextVectorization, tf.data pipelines) are defined once, in the cells above.
# This cell does not repeat them: it only reads `text_vectorizer` and
# `tabular_cols` from those cells, so run them first. Keeping a single
# definition of synth_clinical_note / make_dataset / df / scaler avoids a later
# copy silently shadowing the earlier one.

import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Re-seed right before the model is built so re-running this cell starts the
# random generators from the same state each time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# -------------------------
# 5) Model: Text branch (Embedding + BiLSTM), Tabular branch (Dense), combine
# -------------------------
# Text branch: raw note string -> token ids -> embeddings -> BiLSTM -> max-pool.
# The vectorizer is part of the graph, so this model consumes raw strings.
text_input = keras.Input(shape=(), dtype="string", name="note")
token_ids = text_vectorizer(text_input)
vocab_size = int(text_vectorizer.vocabulary_size())
embed_dim = 128
# NOTE(review): per the Keras docs vocabulary_size() already counts the padding
# and OOV ids, so the "+ 1" presumably just leaves one spare embedding row;
# confirm before changing it, since it alters the embedding weight shape.
token_embeddings = layers.Embedding(
    input_dim=vocab_size + 1,
    output_dim=embed_dim,
    mask_zero=True,
)(token_ids)
sequence_states = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(token_embeddings)
text_features = layers.GlobalMaxPool1D()(sequence_states)
text_features = layers.Dense(64, activation="relu")(text_features)
text_features = layers.Dropout(0.3)(text_features)

# Tabular branch: standardized features -> small MLP.
tab_input = keras.Input(shape=(len(tabular_cols),), dtype="float32", name="tabular")
tab_features = layers.Dense(64, activation="relu")(tab_input)
tab_features = layers.BatchNormalization()(tab_features)
tab_features = layers.Dropout(0.2)(tab_features)
tab_features = layers.Dense(32, activation="relu")(tab_features)

# Fusion head: concatenate both branches and map to one sigmoid probability.
fused = layers.Concatenate()([text_features, tab_features])
fused = layers.Dense(64, activation="relu")(fused)
fused = layers.Dropout(0.4)(fused)
fused = layers.Dense(32, activation="relu")(fused)
readmit_prob = layers.Dense(1, activation="sigmoid", name="readmit")(fused)

model = keras.Model(
    inputs=[text_input, tab_input],
    outputs=readmit_prob,
    name="ehr_text_tab_model",
)
model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        keras.metrics.BinaryAccuracy(name="accuracy"),
        keras.metrics.AUC(name="auc"),
    ],
)
model.summary()

Dataset shape: (8000, 7)
                                                note  age  gender  num_prior  \
0  fever, cough, and sore throat for three days. ...   69       0          0   
1  elevated blood pressure, headache, dizziness. ...   92       1          0   
2  fever, cough, and sore throat for three days. ...   70       1          2   
3  postoperative follow up, incision clean, no dr...   66       0          1   
4  postoperative follow up, incision clean, no dr...   79       1          1   

   los  lab_score  readmit  
0    4  -0.927806        0  
1    1   3.000000        1  
2   14   3.000000        1  
3    1   2.418715        1  
4    2   0.725617        0  




In [None]:
# -------------------------
# 7) Callbacks
# -------------------------
callbacks = [
    # Halve the learning rate after 3 epochs without val_loss improvement.
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, verbose=1),
    # Stop after 6 stagnant epochs and roll the model back to its best weights.
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True, verbose=1),
    # Keep the best model (lowest val_loss) on disk. The native `.keras` format
    # is used instead of legacy HDF5: the model embeds an adapted
    # TextVectorization layer, whose vocabulary is state that an HDF5 weights
    # dump is not expected to carry.
    keras.callbacks.ModelCheckpoint("ehr_text_tab_model.keras", save_best_only=True, monitor="val_loss"),
]


NameError: name 'keras' is not defined

In [None]:
# -------------------------
# 8) Train (with class weights for the imbalanced readmit label)
# -------------------------
# Balanced weighting: weight_c = total_samples / (num_classes * count_c), so
# the less frequent class contributes more to the loss.
class_counts = train_df["readmit"].value_counts()
total_samples = class_counts.sum()
num_classes = len(class_counts)
class_weight = {
    cls: total_samples / (num_classes * n_cls)
    for cls, n_cls in class_counts.items()
}

print("Class weights:", class_weight)

# Up to 30 epochs; the callbacks may lower the learning rate or stop earlier.
history = model.fit(
    train_ds,
    epochs=30,
    validation_data=val_ds,
    callbacks=callbacks,
    class_weight=class_weight,
)

NameError: name 'train_df' is not defined

In [None]:
# -------------------------
# 6) Calculate class weights for imbalanced data
# -------------------------
# Per-class example counts in the training split.
class_counts = train_df["readmit"].value_counts()
num_classes = len(class_counts)
total_samples = class_counts.sum()

# Balanced weighting: weight_c = total_samples / (num_classes * count_c).
class_weight = dict(
    (cls, total_samples / (num_classes * n_cls))
    for cls, n_cls in class_counts.items()
)

print("Class weights:", class_weight)

Class weights: {0: np.float64(0.8065866592241139), 1: np.float64(1.3154301319981794)}


In [None]:
# -------------------------
# 9) Evaluate on test set
# -------------------------
# Loss followed by the compiled metrics (accuracy, AUC) on the test batches.
results = model.evaluate(test_ds, verbose=1)
print("Test loss, Test acc, Test AUC:", results)

# Hard labels from the sigmoid outputs at a 0.5 threshold. test_ds is built
# without shuffling, so the predictions are in the same order as y_test.
y_pred_prob = model.predict(test_ds)
is_positive = y_pred_prob.ravel() >= 0.5
y_pred = is_positive.astype(int)

print(
    "\nClassification Report (Test):",
    classification_report(y_test, y_pred, digits=4),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 268ms/step - accuracy: 0.3840 - auc: 0.4612 - loss: 0.6959
Test loss, Test acc, Test AUC: [0.6961308717727661, 0.3916666805744171, 0.45164090394973755]
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 187ms/step

Classification Report (Test):
              precision    recall  f1-score   support

           0     0.5201    0.2433    0.3315       744
           1     0.3392    0.6338    0.4419       456

    accuracy                         0.3917      1200
   macro avg     0.4297    0.4385    0.3867      1200
weighted avg     0.4514    0.3917    0.3735      1200

Confusion Matrix:
[[181 563]
 [167 289]]


In [None]:
# -------------------------
# 10) Example inference function
# -------------------------
def predict_single(note_text, tab_features_array):
    """Predict the readmission probability for one patient and print a risk summary.

    The note is passed to the model in whatever form the model's first input
    expects: as a raw string when the model vectorizes internally (string
    input), or as token ids from `text_vectorizer` when the model was built on
    integer sequences. Vectorizing unconditionally would hand integer ids to a
    string-input model.

    Args:
        note_text: the clinical note as a plain string.
        tab_features_array: array-like of length len(tabular_cols), ordered as
            tabular_cols and in original scale (before StandardScaler); it is
            standardized here with the fitted `scaler`.

    Returns:
        The predicted probability of readmission as a Python float.
    """
    tab_arr = np.asarray(tab_features_array, dtype=float).reshape(1, -1)
    tab_arr_scaled = scaler.transform(tab_arr)

    # Batch of one note.
    text_batch = tf.constant([note_text])

    # The note is the first model input (inputs=[text_input, tab_input]).
    # Depending on the Keras version the dtype is a plain string or a tf.DType
    # (which exposes `.name`); normalise both to a string before comparing.
    note_dtype = model.inputs[0].dtype
    if getattr(note_dtype, "name", note_dtype) != "string":
        text_batch = text_vectorizer(text_batch)

    # Inputs are keyed by the names given to the keras.Input layers.
    prob = model.predict({"note": text_batch, "tabular": tab_arr_scaled})
    prob_val = float(prob[0][0])
    percent = round(prob_val * 100, 2)

    risk_level = "HIGH" if prob_val >= 0.5 else "LOW"
    print(f"Prediction: {risk_level} RISK of readmission ({percent}% probability).")
    print(f"➡ Out of 100 similar patients, about {percent} may be readmitted.")

    return prob_val

# Example
example_note = "Patient with fever and cough, worsening shortness of breath. Recurrent admissions for pneumonia."
example_tab = [72, 1, 2, 6, 1.5]  # age, gender, num_prior, los, lab_score
print("\nExample prediction (prob of readmit):", predict_single(example_note, example_tab))

Vectorized text dtype: <dtype: 'int64'>
Vectorized text shape: (1, 120)
Tabular tensor dtype: float64
Tabular tensor shape: (1, 5)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Prediction: LOW RISK of readmission (48.6% probability).
➡ Out of 100 similar patients, about 48.6 may be readmitted.

Example prediction (prob of readmit): 0.4860389232635498


In [None]:
# -------------------------
# 11) Save preprocessing objects (vectorizer & scaler) and model
# -------------------------
# Persist the pieces needed to rebuild the predictor: architecture (JSON),
# weights (HDF5), tokenizer vocabulary and scaler statistics.
model_json = model.to_json()
with open("ehr_text_tab_model.json", "w") as arch_file:
    arch_file.write(model_json)

model.save_weights("ehr_text_tab_model_weights.weights.h5")

# Vocabulary as returned by the vectorizer, one token per line.
# NOTE(review): get_vocabulary() presumably includes the padding/OOV entries
# at the start of the list; confirm before feeding this file back into
# set_vocabulary().
vocab = text_vectorizer.get_vocabulary()
with open("text_vectorizer_vocab.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.write("\n".join(vocab))

# Fitted StandardScaler statistics, enough to reapply (x - mean) / scale.
np.savez("tabular_scaler_params.npz", mean=scaler.mean_, scale=scaler.scale_)

print(
    "\nSaved model architecture to 'ehr_text_tab_model.json', "
    "weights to 'ehr_text_tab_model_weights.weights.h5', "
    "vocab to 'text_vectorizer_vocab.txt', "
    "scaler to 'tabular_scaler_params.npz'"
)


Saved model architecture to 'ehr_text_tab_model.json', weights to 'ehr_text_tab_model_weights.weights.h5', vocab to 'text_vectorizer_vocab.txt', scaler to 'tabular_scaler_params.npz'


In [None]:
# -------------------------
# 5) Model: Text branch (Embedding + BiLSTM), Tabular branch (Dense), combine
# -------------------------
# Variant of the model that takes pre-vectorized notes: (max_len,) int64 token
# ids instead of raw strings. Reuses vocab_size / embed_dim from the earlier
# model cell.
# NOTE(review): this rebinds `model` to a newly constructed network, so it
# starts from fresh initial weights rather than anything fitted earlier, and
# train_ds / val_ds / test_ds are built from raw note strings — they would
# need to be passed through text_vectorizer before feeding this model.
# Presumably unintended; confirm.
text_input = keras.Input(shape=(max_len,), dtype="int64", name="note")
token_embeddings = layers.Embedding(
    input_dim=vocab_size + 1,
    output_dim=embed_dim,
    mask_zero=True,
)(text_input)
sequence_states = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(token_embeddings)
text_features = layers.GlobalMaxPool1D()(sequence_states)
text_features = layers.Dense(64, activation="relu")(text_features)
text_features = layers.Dropout(0.3)(text_features)

# Tabular branch: standardized features -> small MLP.
tab_input = keras.Input(shape=(len(tabular_cols),), dtype="float32", name="tabular")
tab_features = layers.Dense(64, activation="relu")(tab_input)
tab_features = layers.BatchNormalization()(tab_features)
tab_features = layers.Dropout(0.2)(tab_features)
tab_features = layers.Dense(32, activation="relu")(tab_features)

# Fusion head: concatenate both branches and map to one sigmoid probability.
fused = layers.Concatenate()([text_features, tab_features])
fused = layers.Dense(64, activation="relu")(fused)
fused = layers.Dropout(0.4)(fused)
fused = layers.Dense(32, activation="relu")(fused)
readmit_prob = layers.Dense(1, activation="sigmoid", name="readmit")(fused)

model = keras.Model(
    inputs=[text_input, tab_input],
    outputs=readmit_prob,
    name="ehr_text_tab_model",
)
model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        keras.metrics.BinaryAccuracy(name="accuracy"),
        keras.metrics.AUC(name="auc"),
    ],
)
model.summary()



In [None]:
# -------------------------
# 10) Example inference function
# -------------------------
# predict_single is defined once, in the earlier "10) Example inference
# function" cell, and is reused here rather than redefined: a second identical
# `def` would only shadow the first and let the two copies drift apart.

# Example
example_note = "Patient with fever and cough, worsening shortness of breath. Recurrent admissions for pneumonia."
example_tab = [72, 1, 2, 6, 1.5]  # age, gender, num_prior, los, lab_score
print("\nExample prediction (prob of readmit):", predict_single(example_note, example_tab))

Vectorized text dtype: <dtype: 'int64'>
Vectorized text shape: (1, 120)
Tabular tensor dtype: float64
Tabular tensor shape: (1, 5)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Prediction: LOW RISK of readmission (48.6% probability).
➡ Out of 100 similar patients, about 48.6 may be readmitted.

Example prediction (prob of readmit): 0.4860389232635498
