In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVR, SVR
from torch.utils.data import DataLoader

In [None]:
import sys
sys.path.append('..')
from libs.eval import (
    print_classification_performance_summary,
    print_regression_performance_summary,
)

In [None]:
# Single seed reused by the under-samplers and SVC estimators below.
SEED = 240302
# Also seed NumPy's legacy global RNG.
np.random.seed(SEED)

In [None]:
def is_binary(series, allow_na=False):
    """Return True when the distinct values of ``series`` are exactly 0 and 1.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect. It is never modified.
    allow_na : bool, default False
        When True, missing values are ignored before the check. When False,
        any missing value makes the result False, because the set of unique
        values is then no longer just 0 and 1.

    Returns
    -------
    bool
    """
    if allow_na:
        # Drop NaNs on a copy: an in-place dropna would silently remove rows
        # from the caller's Series.
        series = series.dropna()
    return sorted(series.unique()) == [0, 1]

def is_binary_cols(df, inv=False):
    """List the column names of ``df`` that are binary according to ``is_binary``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are tested one by one.
    inv : bool, default False
        When True, return the columns that are NOT binary instead.

    Returns
    -------
    list
        Column names, in their original order.
    """
    flags = df.apply(is_binary, axis=0)
    selected = ~flags if inv else flags
    # Convert the boolean flags to positions, then look the names up directly.
    return df.columns[np.flatnonzero(selected)].tolist()

def train_target(df, outcome='LVM.group'):
    """Split ``df`` into a feature frame and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the outcome column alongside the features.
    outcome : str, default 'LVM.group'
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the outcome column and ``y``
        holds the outcome column's values.
    """
    features = df.drop(columns=[outcome])
    target = df[outcome].values
    return features, target

In [None]:
# Load the feature table and recode the outcome as 2 - value.
df = pd.read_parquet('ECG.orig_val.with_ecg.parquet.gzip')
df['LVM.group'] = 2 - df['LVM.group']

# One-hot encode smoking status (all levels kept) and swap it in for the raw column.
smoking_dummies = pd.get_dummies(df['smoking.status'], prefix='smoking.status', drop_first=False).astype(int)
df = pd.concat([df.drop(columns='smoking.status'), smoking_dummies], axis=1)

# Attach the continuous indexed LVM from the second parquet file, joined on participant ID.
ilvm_lookup = pd.read_parquet('ECG.orig_val.parquet.gzip')[['f.eid', 'indexed.LVM']]
df = df.merge(ilvm_lookup, on='f.eid')

In [None]:
# Participant IDs of the train / validation / test splits, read from list files.
TRAIN_FEIDS, VAL_FEIDS, TEST_FEIDS = (
    pd.read_csv(split_path)['f.eid'].values
    for split_path in (
        'ukb_feids.train_split.list',
        'ukb_feids.val_split.list',
        'ukb_feids.test_split.list',
    )
)

In [None]:
# Partition the columns into binary and non-binary ("continuous") ones.
binary_cols = is_binary_cols(df)
cont_cols = is_binary_cols(df, inv=True)
# The participant ID and the regression target are not features to be scaled.
for non_feature in ('f.eid', 'indexed.LVM'):
    cont_cols.remove(non_feature)
n_accounted = len(binary_cols) + len(cont_cols) + 2  # Removed FEID/iLVM
assert df.shape[1] == n_accounted

In [None]:
# Accumulates test-set outputs column by column; written to CSV in the final cell.
results = {}

# Benchmark (classification)

In [None]:
# Columns that must not reach the classifier: participant ID and the regression target.
non_feature_cols = ['f.eid', 'indexed.LVM']

df_train = df[df['f.eid'].isin(TRAIN_FEIDS)].drop(columns=non_feature_cols)
df_val = df[df['f.eid'].isin(VAL_FEIDS)].drop(columns=non_feature_cols)

# Record the test-set identifiers and ground truth before dropping those columns.
df_test = df[df['f.eid'].isin(TEST_FEIDS)]
results.update({
    'f.eid': df_test['f.eid'].values,
    'is_m': (df_test['Sex'] == 1).astype(int).values,
    'lvh_true': df_test['LVM.group'].values,
    'ilvm_true': df_test['indexed.LVM'].values,
})
df_test = df_test.drop(columns=non_feature_cols)

In [None]:
# Feature / label split for each partition (classification outcome).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    train_target(split_df, outcome='LVM.group')
    for split_df in (df_train, df_val, df_test)
)

# Keep copies of the class labels: y_train / y_val / y_test are rebound to the
# regression target later in the notebook.
y_train_lvh, y_val_lvh, y_test_lvh = y_train.copy(), y_val.copy(), y_test.copy()

In [None]:
# Standardise the continuous columns using statistics from the training split only.
scaler = StandardScaler().fit(X_train[cont_cols])
for split_X in (X_train, X_val, X_test):
    split_X[cont_cols] = scaler.transform(split_X[cont_cols])

### Replication of Naderi et al. (2023)

In [None]:
# Randomly under-sample every class except the minority one.
sampler = RandomUnderSampler(
    sampling_strategy='not minority',
    random_state=SEED,
)
X_train_under, y_train_under = sampler.fit_resample(X_train, y_train)

In [None]:
''' MATLAB model from Naderi et al. (2023)
 classificationSVM = fitcsvm(...
    predictors, ...
    response, ...
    'KernelFunction', 'gaussian', ...
    'PolynomialOrder', [], ...
    'KernelScale', 43, ...
    'BoxConstraint', 1, ...
    'Standardize', true, ...
    'ClassNames', [1; 2]);
'''
# NOTE(review): the MATLAB snippet above fixes KernelScale at 43, whereas this
# SVC uses gamma='scale'; confirm the two kernel widths are meant to correspond.
svm_clf = SVC(
    C=1,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=SEED,
)
svm_clf.fit(X_train_under, y_train_under)

# Class probabilities and hard labels on the untouched (not resampled) test set.
y_pred_svm_proba = svm_clf.predict_proba(X_test)
y_pred_svm = svm_clf.predict(X_test)
print(Counter(y_pred_svm))

_ = print_classification_performance_summary(y_test, y_pred_svm, y_pred_svm_proba[:, 1], multi_class=False)

In [None]:
# Store the replication SVM's second probability column (index 1) for the test set.
results['lvh_proba'] = y_pred_svm_proba[:, 1]

In [None]:
# Bayes-rule adjustment for prior in training set
P_Y1 = np.mean(y_train)
P_Y0 = 1 - P_Y1
p_adjusted = (P_Y1 * y_pred_svm_proba[:, 1]) / (P_Y1 * y_pred_svm_proba[:, 1] + P_Y0 * (1 - y_pred_svm_proba[:, 1]))

In [None]:
# Store the prior-adjusted probabilities of the replication SVM.
results['lvh_proba_prioradj'] = p_adjusted

### Re-optimised

In [None]:
# Hyper-parameter grid for the re-optimised classifier.
param_grid = dict(
    C=[0.1, 0.5, 1, 5, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# 5-fold cross-validated search, run on the validation split.
svm_clf = SVC(probability=True, random_state=SEED)
grid_search = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_val, y_val)
best_svm = grid_search.best_estimator_

In [None]:
# Display the hyper-parameters selected by the grid search.
grid_search.best_params_

In [None]:
# Refit the selected configuration on the under-sampled training data, then
# score it on the test set.
best_svm.fit(X_train_under, y_train_under)
y_pred_svm_proba = best_svm.predict_proba(X_test)
y_pred_svm = best_svm.predict(X_test)
print(Counter(y_pred_svm))
_ = print_classification_performance_summary(
    y_test,
    y_pred_svm,
    y_pred_svm_proba[:, 1],
    multi_class=False,
)

### ECG-only

In [None]:
# Positional slice: skips the first 12 and the last 4 feature columns.
# NOTE(review): presumably the remaining columns are the ECG-derived features —
# confirm against the parquet schema; this breaks silently if column order changes.
ecg_cols = X_train.columns.values[12:-4]
ecg_cols

In [None]:
# Restrict every split to the selected ECG columns.
X_train_ecg_only, X_val_ecg_only, X_test_ecg_only = (
    split_X[ecg_cols] for split_X in (X_train, X_val, X_test)
)

In [None]:
# Under-sample every class except the minority one, using the ECG-only features.
sampler = RandomUnderSampler(sampling_strategy='not minority', random_state=SEED)
X_train_under_ecg_only, y_train_under_ecg_only = sampler.fit_resample(X_train_ecg_only, y_train)
# Show the class balance of THIS resample (previously the labels of the earlier
# full-feature resample, y_train_under, were displayed by mistake).
Counter(y_train_under_ecg_only)

In [None]:
# Same RBF SVM as the replication model, here with balanced class weights and
# trained on the ECG-only feature subset.
svm_clf = SVC(
    C=1,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=SEED,
)
svm_clf.fit(X_train_under_ecg_only, y_train_under_ecg_only)

y_pred_svm_proba = svm_clf.predict_proba(X_test_ecg_only)
y_pred_svm = svm_clf.predict(X_test_ecg_only)
print(Counter(y_pred_svm))

_ = print_classification_performance_summary(y_test, y_pred_svm, y_pred_svm_proba[:, 1])

In [None]:
# Store the ECG-only SVM's second probability column (index 1) for the test set.
results['lvh_proba_ecgonly'] = y_pred_svm_proba[:, 1]

# Benchmark (regression)

In [None]:
# Regression splits: drop the participant ID and the class label, keep indexed.LVM.
df_train = df[df['f.eid'].isin(TRAIN_FEIDS)].drop(columns=['f.eid', 'LVM.group'])
df_val = df[df['f.eid'].isin(VAL_FEIDS)].drop(columns=['f.eid', 'LVM.group'])

# Take an explicit copy: assigning a column on a filtered slice of `df` is
# chained assignment and raises SettingWithCopyWarning.
df_test = df[df['f.eid'].isin(TEST_FEIDS)].copy()
# Re-order the rows to follow the participant order already stored in `results`,
# so the regression outputs line up with the classification outputs.
df_test['f.eid'] = pd.Categorical(df_test['f.eid'], categories=results['f.eid'], ordered=True)
df_test = df_test.sort_values('f.eid')
assert np.array_equal(df_test['f.eid'].to_numpy(), results['f.eid']), \
    "test rows are not aligned with results['f.eid']"
df_test = df_test.drop(columns=['f.eid', 'LVM.group'])

In [None]:
# Feature / target split for each partition (continuous indexed LVM outcome).
# This rebinds X_* / y_* from the classification section.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    train_target(split_df, outcome='indexed.LVM')
    for split_df in (df_train, df_val, df_test)
)

In [None]:
# Standardise the continuous columns using statistics from the training split only.
scaler = StandardScaler().fit(X_train[cont_cols])
for split_X in (X_train, X_val, X_test):
    split_X[cont_cols] = scaler.transform(split_X[cont_cols])

In [None]:
# Hyper-parameter grid for the support-vector regressor.
param_grid = dict(
    C=[0.1, 1, 10],
    epsilon=[0.01, 0.1, 0.5],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

svm_reg = SVR() # 1GB
# 5-fold cross-validated search on the validation split, scored by (negated) MAE.
grid_search = GridSearchCV(
    estimator=svm_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(X_val, y_val)
best_svm = grid_search.best_estimator_

In [None]:
# Display the hyper-parameters selected by the SVR grid search.
grid_search.best_params_

In [None]:
# Refit the selected SVR on the full training split and predict indexed LVM on the test set.
y_pred_svr = best_svm.fit(X_train, y_train).predict(X_test)
print_regression_performance_summary(y_test, y_pred_svr)

In [None]:
# Store the SVR's test-set predictions of indexed LVM.
results['ilvm_pred'] = y_pred_svr

In [None]:
# Binarise the predicted indexed LVM with a sex-specific threshold:
# 70 where Sex == 1, 55 otherwise.
sex_cutoff = np.where(df_test['Sex'] == 1, 70, 55)
results['lvh_pred_cutoff'] = (y_pred_svr >= sex_cutoff).astype(int)

In [None]:
# SVR predictions on the validation split; used below to fit the logistic mapping.
y_pred_svr_val = best_svm.predict(X_val)

In [None]:
# Map the SVR's predicted indexed LVM to a class probability with a
# one-feature logistic regression fitted on the validation split.
# NOTE(review): random_state here (20240302) differs from SEED (240302) — confirm intent.
lr_clf = LogisticRegression(class_weight="balanced", max_iter=25000, random_state=20240302)

# scikit-learn expects 2-D inputs, so reshape the 1-D predictions into one column.
svr_val_feature = y_pred_svr_val.reshape(-1, 1)
svr_test_feature = y_pred_svr.reshape(-1, 1)

lr_clf.fit(svr_val_feature, y_val_lvh)
y_pred_svr_proba = lr_clf.predict_proba(svr_test_feature)

In [None]:
# Evaluate the SVR -> logistic-regression probabilities computed in the previous cell.
# (This previously scored y_pred_svm_proba, i.e. the ECG-only SVM classifier.)
_ = print_classification_performance_summary(y_test_lvh, y_pred_svr_proba[:, 1] > 0.5, y_pred_svr_proba[:, 1])

In [None]:
# Store the logistic-regression probabilities derived from the SVR predictions.
results['lvh_proba_lr'] = y_pred_svr_proba[:, 1]

In [None]:
# Bayes-rule adjustment for prior in training set
P_Y1 = np.mean(y_train_lvh)
P_Y0 = 1 - P_Y1
p_adjusted = (P_Y1 * y_pred_svr_proba[:, 1]) / (P_Y1 * y_pred_svr_proba[:, 1] + P_Y0 * (1 - y_pred_svr_proba[:, 1]))

In [None]:
# Evaluate the prior-adjusted probabilities, thresholded at 0.5 for hard labels.
_ = print_classification_performance_summary(y_test_lvh, p_adjusted > 0.5, p_adjusted)

In [None]:
# Store the prior-adjusted probabilities of the SVR -> logistic-regression pipeline.
results['lvh_proba_prioradj_lr'] = p_adjusted

# Benchmark (traditional ECG criteria)

In [None]:
# Rule-based ECG criteria columns, evaluated as-is on the test participants.
rule_cols = ['Pathological_Q_wave', 'Sokolov_Lyon', 'Cornell_voltage']
in_test_split = df['f.eid'].isin(TEST_FEIDS)
df_test_rule = df.loc[in_test_split, ['LVM.group'] + rule_cols]

In [None]:
# Score each rule against the true label; the rule column is passed both as the
# hard prediction and as the score argument.
for col in rule_cols:
    rule_output = df_test_rule[col]
    print(f"> {col}")
    _ = print_classification_performance_summary(df_test_rule['LVM.group'], rule_output, rule_output)
    print()

In [None]:
# Store each rule's test-set output under a lower-cased key.
results.update({
    f'lvh_pred_{col.lower()}': df_test_rule[col].values
    for col in rule_cols
})

# Save

In [None]:
# Write all collected test-set outputs to one CSV, one column per `results` key, without the index.
pd.DataFrame(results).to_csv('UKB_benchmarks.20250402.results.csv', index=False)