In [56]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate, MultiHeadAttention, LayerNormalization, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
# Directory holding the cohort extracts. The original workstation path is kept as the
# default so existing runs behave the same; set DISSERTATION_DATA_DIR to point the
# notebook at another location without editing this cell.
data_path = os.environ.get(
    'DISSERTATION_DATA_DIR',
    r'C:\Users\aaani\OneDrive - Birmingham City University\Postgrad\Dissertation\Data',
)

# Both cohorts are gzip-compressed CSV files; os.path.join avoids mixing path separators.
historical = pd.read_csv(os.path.join(data_path, 'historical_cohort.csv.gz'))
contemporary = pd.read_csv(os.path.join(data_path, 'contemporary_cohort.csv.gz'))

In [3]:
# Name of the outcome column that the model will predict
target_variable = 'readmission_30d'

# Split each cohort into its target series (y) and the remaining feature frame (X)
y_historical = historical[target_variable]
X_historical = historical.drop(columns=[target_variable])

y_contemporary = contemporary[target_variable]
X_contemporary = contemporary.drop(columns=[target_variable])

# Sanity check: report the shape of every piece that was just created
for description, piece in (
    ("Historical Features (X_historical)", X_historical),
    ("Historical Target (y_historical)", y_historical),
    ("Contemporary Features (X_contemporary)", X_contemporary),
    ("Contemporary Target (y_contemporary)", y_contemporary),
):
    print(f"{description} shape: {piece.shape}")

Historical Features (X_historical) shape: (5052629, 73)
Historical Target (y_historical) shape: (5052629,)
Contemporary Features (X_contemporary) shape: (1546728, 73)
Contemporary Target (y_contemporary) shape: (1546728,)


In [4]:
# One-hot encode the categorical 'gender' column in both cohorts
historical_x_encoded = pd.get_dummies(X_historical, columns=['gender'], prefix='gender', dtype=int)
contemporary_x_encoded = pd.get_dummies(X_contemporary, columns=['gender'], prefix='gender', dtype=int)

# get_dummies only creates a column for each category present in the frame it is given,
# so the two cohorts can end up with different gender_* columns. The historical cohort
# defines the feature layout the model is trained on, so the contemporary frame is
# aligned to it: missing dummy columns are added as all-zero, columns that exist only
# in the contemporary frame are dropped, and the column order is made identical.
missing_feature_cols = [
    col for col in historical_x_encoded.columns
    if col not in contemporary_x_encoded.columns and not col.startswith('gender_')
]
if missing_feature_cols:
    # Zero-filling a genuine feature would silently corrupt the evaluation data.
    raise ValueError(
        f"Contemporary cohort is missing feature columns: {missing_feature_cols}"
    )
contemporary_x_encoded = contemporary_x_encoded.reindex(
    columns=historical_x_encoded.columns, fill_value=0
)

In [5]:
# --- race: integer codes learned from the historical cohort only ---
race_encoder = LabelEncoder()
historical_x_encoded['race_encoded'] = race_encoder.fit_transform(historical_x_encoded['race'])
# NOTE(review): LabelEncoder.transform raises on labels it did not see during fit, so a
# race value that appears only in the contemporary cohort will fail here; the ICD path
# below has an explicit fallback for unseen codes but this one does not.
contemporary_x_encoded['race_encoded'] = race_encoder.transform(contemporary_x_encoded['race'])

# --- icd_code: integer codes learned from the historical cohort only ---
icd_encoder = LabelEncoder()
historical_x_encoded['icd_code_encoded'] = icd_encoder.fit_transform(historical_x_encoded['icd_code'])
n_icd_codes = len(icd_encoder.classes_)

# Lookup table from each known ICD code to its integer index
icd_mapping = dict(zip(icd_encoder.classes_, range(n_icd_codes)))

# Codes absent from the historical cohort map to NaN and are then assigned the single
# extra index n_icd_codes (one past the last known code).
contemporary_icd_lookup = contemporary_x_encoded['icd_code'].map(icd_mapping)
contemporary_x_encoded['icd_code_encoded'] = contemporary_icd_lookup.fillna(n_icd_codes).astype(int)

In [6]:
# Remove the raw 'race' and 'icd_code' columns from both cohorts
encoded_source_cols = ['race', 'icd_code']
historical_x_encoded = historical_x_encoded.drop(encoded_source_cols, axis=1)
contemporary_x_encoded = contemporary_x_encoded.drop(encoded_source_cols, axis=1)

In [17]:
# Columns that must keep their raw values and are therefore left out of scaling
# (judging by their names: ID-like columns, flags and already-encoded categoricals)
cols_to_exclude = [
    'subject_id', 'hadm_id', 'stay_id',
    'anchor_year_group', 'hospital_expire_flag',
    'admitted_to_icu', 'gender_F', 'gender_M',
    'race_encoded', 'icd_code_encoded', 'icd_version'
]

# Every numeric column of the historical frame that is not excluded gets scaled
numerical_cols_to_scale = []
for candidate in historical_x_encoded.select_dtypes(include=np.number).columns:
    if candidate not in cols_to_exclude:
        numerical_cols_to_scale.append(candidate)

# Learn mean/variance from the historical cohort only, so no statistics from the
# contemporary cohort leak into preprocessing
scaler = StandardScaler().fit(historical_x_encoded[numerical_cols_to_scale])

# Work on copies so the unscaled frames stay available
historical_x_scaled = historical_x_encoded.copy()
historical_x_scaled[numerical_cols_to_scale] = scaler.transform(
    historical_x_encoded[numerical_cols_to_scale]
)

contemporary_x_scaled = contemporary_x_encoded.copy()
contemporary_x_scaled[numerical_cols_to_scale] = scaler.transform(
    contemporary_x_encoded[numerical_cols_to_scale]
)

In [None]:
# The year group is kept aside for later comparison and then removed from the feature
# frames, so the model cannot discriminate on it.
year_group_col = 'anchor_year_group'

historical_year_group = historical_x_scaled[year_group_col]
contemporary_year_group = contemporary_x_scaled[year_group_col]

historical_x_scaled = historical_x_scaled.drop(year_group_col, axis=1)
contemporary_x_scaled = contemporary_x_scaled.drop(year_group_col, axis=1)

In [31]:
# 98/1/1 train/validation/test split: the dataset is large, and the contemporary cohort
# provides further testing, so small hold-out fractions are enough.
SPLIT_SEED = 42

# Step 1: hold out 1% of the historical cohort as the test set (stratified on the target
# so the class balance is the same on both sides)
X_train, X_test, y_train, y_test = train_test_split(
    historical_x_scaled,
    y_historical,
    test_size=0.01,
    stratify=y_historical,
    random_state=SPLIT_SEED,
)

# Step 2: carve the validation set out of the remaining 99%; a fraction of 0.0101 of
# that remainder is about 1% of the original data (again stratified on the target)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.0101,
    stratify=y_train,
    random_state=SPLIT_SEED,
)

# Report the size of each split
for split_name, split_frame, share in (
    ("Training", X_train, "~98%"),
    ("Validation", X_val, "~1%"),
    ("Test", X_test, "~1%"),
):
    print(f"{split_name} samples: {len(split_frame)} ({share})")

Training samples: 4951580 (~98%)
Validation samples: 50522 (~1%)
Test samples: 50527 (~1%)


In [None]:
# Number of training rows per target value at this point in the notebook
train_class_counts = y_train.value_counts()
print(train_class_counts)

In [32]:
# Random undersampler with a fixed seed so the resampled training set is repeatable
rus = RandomUnderSampler(random_state=42)

# Only the training split is resampled; validation and test keep their original
# class distribution
resampled_split = rus.fit_resample(X_train, y_train)
X_train, y_train = resampled_split

# Show the class counts after resampling
print(y_train.value_counts())

readmission_30d
0    1161471
1    1161471
Name: count, dtype: int64


In [None]:
# Integer-coded columns that are routed through embedding layers
categorical_features = ['race_encoded', 'icd_code_encoded']

# Row identifiers are deliberately excluded from scaling earlier in the notebook, so they
# would otherwise reach the network as raw, unscaled ID values. They are not predictive
# features and are kept out of the numeric input. Names that are not present in X_train
# are simply never matched.
identifier_features = ['subject_id', 'hadm_id', 'stay_id']

# Everything that is neither embedded nor an identifier is a plain numeric feature
numerical_features = [
    col for col in X_train.columns
    if col not in categorical_features and col not in identifier_features
]

# NumPy views of the training split
X_train_cat = X_train[categorical_features].values
X_train_num = X_train[numerical_features].values

# NumPy views of the validation split, using the same column layout
X_val_cat = X_val[categorical_features].values
X_val_num = X_val[numerical_features].values


In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one encoder block: self-attention followed by a two-layer feed-forward net.

    Each of the two sub-blocks ends with a residual addition and a layer normalisation.

    Args:
        inputs: Tensor fed to the block; it is used as both query and value of the
            attention layer.
        head_size: Passed to MultiHeadAttention as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Number of units in the hidden feed-forward Dense layer.
        dropout: Dropout rate used inside the attention layer, after the attention
            output and after the hidden feed-forward layer.

    Returns:
        The normalised block output; the feed-forward projection restores the last
        dimension of ``inputs``.
    """
    # --- Self-attention sub-block ---
    self_attention = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout)(attended)
    normed = LayerNormalization(epsilon=1e-6)(attended + inputs)  # residual + norm

    # --- Feed-forward sub-block ---
    hidden = Dense(ff_dim, activation="relu")(normed)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(inputs.shape[-1])(hidden)  # project back to the input width

    return LayerNormalization(epsilon=1e-6)(normed + projected)  # residual + norm

# One integer column per categorical feature
categorical_input = Input(shape=(X_train_cat.shape[1],), name='categorical_input', dtype='int64')
embeddings = []

# Hyper-parameters of the categorical transformer branch
embedding_dim = 32
num_transformer_blocks = 2
num_heads = 4
ff_dim = 32
mlp_units = [128, 64]
dropout_rate = 0.2

# An Embedding needs input_dim > the largest index it will ever receive. Counting the
# distinct values in a sample (nunique) under-estimates that whenever some codes are
# absent from the sample, so the sizes are taken from the fitted encoders instead. ICD
# gets one extra slot because unseen codes are mapped to index len(classes_).
categorical_vocab_sizes = {
    'race_encoded': len(race_encoder.classes_),
    'icd_code_encoded': len(icd_encoder.classes_) + 1,
}

for i, feature in enumerate(categorical_features):
    n_unique = categorical_vocab_sizes.get(feature)
    if n_unique is None:
        # Fallback for features without a fitted encoder: largest observed index + 1
        n_unique = int(max(X_train[feature].max(), X_val[feature].max())) + 1
    # Slicing with i:i + 1 keeps a length-1 axis, so each embedding has shape
    # (batch, 1, embedding_dim) and acts as one token.
    emb = Embedding(input_dim=n_unique, output_dim=embedding_dim)(categorical_input[:, i:i + 1])
    embeddings.append(emb)

# Stack the per-feature tokens along axis 1 -> (batch, n_features, embedding_dim), which
# gives the attention layers a sequence axis to attend over. Concatenating on the last
# axis would instead merge all features into a single vector.
x_cat = Concatenate(axis=1)(embeddings)
for _ in range(num_transformer_blocks):
    x_cat = transformer_encoder(x_cat, embedding_dim, num_heads, ff_dim, dropout_rate)
x_cat = Flatten()(x_cat)

In [46]:
# Columns fed through embedding layers rather than the dense numeric input
embedded_cols = ['race_encoded', 'icd_code_encoded']

# Row identifiers are excluded from scaling earlier in the notebook, so without this
# step they would enter the network as raw, unscaled ID values alongside standardised
# features. They carry no predictive meaning and are removed from the numeric input.
identifier_cols = ['subject_id', 'hadm_id', 'stay_id']


def _split_model_inputs(frame):
    """Split one feature frame into the three arrays the model expects.

    Args:
        frame: Feature DataFrame containing 'race_encoded' and 'icd_code_encoded'.

    Returns:
        Tuple ``(numerical, race, icd)`` of NumPy arrays: the numeric feature matrix
        (embedded and identifier columns removed), the race codes and the ICD codes.
    """
    numeric_frame = frame.drop(columns=embedded_cols)
    # errors='ignore': an identifier column that is absent is not a problem
    numeric_frame = numeric_frame.drop(columns=identifier_cols, errors='ignore')
    return (
        numeric_frame.values,
        frame['race_encoded'].values,
        frame['icd_code_encoded'].values,
    )


X_train_numerical, X_train_race, X_train_icd = _split_model_inputs(X_train)
X_val_numerical, X_val_race, X_val_icd = _split_model_inputs(X_val)
X_test_numerical, X_test_race, X_test_icd = _split_model_inputs(X_test)

In [47]:

# Three model inputs: the numeric feature matrix plus one code per row for race and ICD
numerical_input_layer = Input(shape=(X_train_numerical.shape[1],), name='X_train_numerical')
race_input_layer = Input(shape=(1,), name='X_train_race')
icd_input_layer = Input(shape=(1,), name='X_train_icd')

# Race: an 8-dimensional learned vector per race code
n_unique_races = historical_x_encoded['race_encoded'].nunique()
race_embedding_layer = Embedding(input_dim=n_unique_races, output_dim=8, name='race_embedding')
race_embedding = race_embedding_layer(race_input_layer)
race_flat = Flatten()(race_embedding)

# ICD: a 50-dimensional learned vector per code; the vocabulary has one slot more than
# the number of classes known to the encoder
n_unique_icd = len(icd_encoder.classes_) + 1
icd_embedding_layer = Embedding(input_dim=n_unique_icd, output_dim=50, name='icd_embedding')
icd_embedding = icd_embedding_layer(icd_input_layer)
icd_flat = Flatten()(icd_embedding)

# Join the numeric features and both flattened embeddings into one feature vector
model_feature_blocks = [numerical_input_layer, race_flat, icd_flat]
concatenated_inputs = Concatenate()(model_feature_blocks)


In [54]:
# Hidden stack: two Dense -> Dropout -> BatchNormalization blocks of decreasing width
x = concatenated_inputs
for units in (256, 128):
    x = Dense(units, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)

# Single sigmoid unit producing the predicted probability
output = Dense(1, activation='sigmoid')(x)

# Assemble the model from the three inputs and the output
model = Model(
    inputs=[numerical_input_layer, race_input_layer, icd_input_layer],
    outputs=output,
)

print("Model Summary:")
model.summary()

# Adam optimiser with binary cross-entropy loss; AUC is tracked under the name 'auc'
optimizer = Adam(learning_rate=0.01)
auc_metric = tf.keras.metrics.AUC(name='auc')

model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=[auc_metric],
)


Model Summary:


In [55]:
# Stop once the validation loss has not improved for 10 consecutive epochs and roll the
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Inputs are passed in the same order as the model's input layers:
# numeric features, race codes, ICD codes
train_inputs = [X_train_numerical, X_train_race, X_train_icd]
val_inputs = [X_val_numerical, X_val_race, X_val_icd]

# Train the model. epochs=100 is only an upper bound: the early-stopping callback is
# now actually passed to fit, so a stalled run ends on its own instead of needing a
# manual interrupt, and the best weights are kept.
history = model.fit(
    train_inputs, y_train,
    epochs=100,
    batch_size=256,
    validation_data=(val_inputs, y_val),
    callbacks=[early_stopping],
    verbose=1
)

print("\nModel training complete.")

Epoch 1/100
[1m9074/9074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 16ms/step - auc: 0.5072 - loss: 0.6942 - val_auc: 0.5088 - val_loss: 0.6937
Epoch 2/100
[1m9074/9074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 16ms/step - auc: 0.5073 - loss: 0.6933 - val_auc: 0.5077 - val_loss: 0.6941
Epoch 3/100
[1m9074/9074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 16ms/step - auc: 0.5073 - loss: 0.6933 - val_auc: 0.5072 - val_loss: 0.7027
Epoch 4/100
[1m9074/9074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 16ms/step - auc: 0.5070 - loss: 0.6933 - val_auc: 0.5073 - val_loss: 0.6867
Epoch 5/100
[1m8615/9074[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m8s[0m 19ms/step - auc: 0.5069 - loss: 0.6934

KeyboardInterrupt: 