In [None]:
import pandas as pd
import numpy as np
import random

import pickle
# import seaborn as sns

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, roc_auc_score, confusion_matrix, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
%load_ext autoreload
%autoreload 2

from tqdm import tqdm

from model import Patient
# Update to new DB
from datetime import datetime, timedelta

# Seed reused further down for random.seed() and for the baseline RandomForestClassifier.
random_seed = 42

In [None]:
# Lookup table for get_first_diagnosis, which filters it on 'Patient' and reads 'EntryDate'.
first_diag = pd.read_csv('../../data/first_diagnosis.csv')

In [None]:
def get_first_diagnosis(pid):
    """Look up the diagnosis entry date stored for a patient.

    Args:
        pid: Patient identifier compared against ``first_diag['Patient']``.

    Returns:
        A ``datetime.date`` parsed (format ``%Y-%m-%d``) from the first matching
        ``EntryDate`` value, or ``None`` when no row matches ``pid``.
    """
    entry_dates = first_diag.loc[first_diag['Patient'] == pid, 'EntryDate']
    if entry_dates.empty:
        return None
    # Only the first matching row is used, in file order.
    return datetime.strptime(entry_dates.iloc[0], '%Y-%m-%d').date()

In [None]:
# Manual check of the lookup for one hard-coded patient id.
print(get_first_diagnosis(194430))

In [None]:
def patient_to_row(patient, CKD):
    """Flatten a patient object into a single 1-D feature row.

    Args:
        patient: Object exposing ``patient_id``, ``ckd_stage``, ``sex``, ``age``,
            the lab attributes listed below, and the sequences ``diagnoses`` and
            ``transplants``.
        CKD: Label for the patient; stored as ``bool(CKD)``.

    Returns:
        The array produced by ``np.concatenate`` over the scalar fields, the
        diagnoses and the transplants, in that order. All entries share the
        single dtype NumPy picks for the concatenation.
    """
    lab_attributes = (
        'egfr', 'average_egfr',
        'uacr', 'average_uacr',
        'pu', 'average_pu',
        'upcr', 'average_upcr',
    )
    leading = [patient.patient_id, bool(CKD), patient.ckd_stage, patient.sex, patient.age]
    labs = [getattr(patient, attribute) for attribute in lab_attributes]
    return np.concatenate((leading + labs, patient.diagnoses, patient.transplants))


In [None]:
# Load the two training id files; the neg file names its id column
# 'Patient_id', so it is renamed to 'Patient' before stacking.
train_ids_pos = pd.read_csv('../../data/TRN/pos_trn.csv')
train_ids_neg = pd.read_csv('../../data/TRN/neg_trn.csv').rename(columns={'Patient_id': 'Patient'})
train_ids = (
    pd.concat([train_ids_pos, train_ids_neg])
    .assign(Patient=lambda ids: ids['Patient'].astype(int))
)

In [None]:
# Load the two test id files; the neg file names its id column
# 'Patient_id', so it is renamed to 'Patient' before stacking.
test_ids_pos = pd.read_csv('../../data/TST/pos_tst.csv')
test_ids_neg = pd.read_csv('../../data/TST/neg_tst.csv').rename(columns={'Patient_id': 'Patient'})
test_ids = (
    pd.concat([test_ids_pos, test_ids_neg])
    .assign(Patient=lambda ids: ids['Patient'].astype(int))
)

In [None]:
# Preview the first rows of the table read from pos_trn.csv.
train_ids_pos.head()

In [None]:
# Replace the id DataFrames with plain Python lists of patient ids.
# NOTE(review): the names are rebound from DataFrame to list, so this cell is not
# idempotent: running it a second time without re-running the loading cells fails,
# because a list cannot be indexed with ['Patient'].
train_ids = train_ids['Patient'].to_list()
test_ids = test_ids['Patient'].to_list()

In [None]:
# Seed Python's RNG, then shuffle both id lists in place. The two shuffles draw
# from the same RNG stream, so their order affects the resulting permutations.
random.seed(random_seed)
random.shuffle(train_ids)
random.shuffle(test_ids)

In [None]:
def patients_to_df(id_list, delta=365, crop_when_diagnosed=True):
    """Build a feature DataFrame with one row per patient id.

    Args:
        id_list: Iterable of patient ids.
        delta: Number of days subtracted from the first-diagnosis date to obtain
            the cutoff date handed to ``Patient``.
        crop_when_diagnosed: When False, ``None`` is passed to ``Patient`` as the
            cutoff for every patient, whether or not a diagnosis date exists.

    Returns:
        DataFrame with the columns listed in ``columns``. 'CKD' is a boolean that
        is True exactly when ``get_first_diagnosis`` returned a date.
    """
    columns=['ID', 'CKD', 'CKD_stage', 'sex', 'age', 'last_egfr', 'avg_egfr', 'last_uacr', 'average_uacr', 'last_pu', 'average_pu',
            'last_upcr', 'average_upcr', 'Obesity', 'Hypertension', 'Aldosteronism', 'Hyperuricemia', 'CKD_mild', 'CKD_DB', 'kidney_failure_not_CKD', 'kidney_transplant', 'dialysis', 'cardiovascular', 'diabetes',
             'kidney', 'heart', 'liver', 'pancreas', 'islets', 'veins', 'uterus', 'small_intestine']
    rows = []
    labels = []
    for pid in tqdm(id_list):
        first_diagnosis = get_first_diagnosis(pid)
        has_ckd = first_diagnosis is not None
        if has_ckd and crop_when_diagnosed:
            cutoff = first_diagnosis - timedelta(days=delta)
        else:
            cutoff = None
        rows.append(patient_to_row(Patient(pid, cutoff), has_ckd))
        labels.append(has_ckd)
    df_pat = pd.DataFrame(rows, columns=columns)
    # Take the label from the collected booleans instead of casting the row value:
    # patient_to_row sends every field through np.concatenate, which coerces a row
    # to one common dtype, and e.g. the string 'False' would cast to True.
    df_pat['CKD'] = np.asarray(labels, dtype=bool)
    return df_pat

In [None]:
# Training rows: delta=0, so the cutoff handed to Patient is the first-diagnosis date itself.
# NOTE(review): how Patient uses that cutoff is defined in model.py — confirm there.
df_train = patients_to_df(train_ids, 0)

In [None]:
# Test rows with the cutoff set 365 days before the first-diagnosis date.
df_test = patients_to_df(test_ids, 365)

In [None]:
# Same test ids, with the cutoff set 2*365 days before the first-diagnosis date.
df_test_2yrs = patients_to_df(test_ids, 2*365)

In [None]:
# Keep only the patients that have a 'last_egfr' value, in all three frames.
df_train, df_test, df_test_2yrs = (
    frame[frame['last_egfr'].notna()]
    for frame in (df_train, df_test, df_test_2yrs)
)

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(df.drop(columns = ['CDK', 'ID', 'Unnamed: 0']), df['CDK'], test_size=0.2, random_state=random_seed)

In [None]:
# Target is the 'CKD' flag; the id column and the two CKD diagnosis columns are
# removed from the features.
# NOTE(review): 'CKD_stage' stays in the feature set — confirm it does not leak the label.
y_train = df_train['CKD']
X_train = df_train.drop(columns=['CKD', 'ID', 'CKD_DB', 'CKD_mild'])

In [None]:
# Same feature/target split as for training, applied to both test horizons.
y_test, y_test_2yrs = df_test['CKD'], df_test_2yrs['CKD']
X_test = df_test.drop(columns=['CKD', 'ID', 'CKD_DB', 'CKD_mild'])
X_test_2yrs = df_test_2yrs.drop(columns=['CKD', 'ID', 'CKD_DB', 'CKD_mild'])

In [None]:
# Baseline model: shallow (max_depth=3) random forest with balanced class weights,
# all other hyper-parameters left at their defaults.
clf = RandomForestClassifier(max_depth=3, random_state=random_seed, class_weight="balanced" )
clf.fit(X_train, y_train)

In [None]:
# Persist the baseline model. Pickle files should only be loaded from trusted sources.
with open("../ML_models/RF_labs_diagnoses_unoptimised.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Hard class predictions of the baseline model on the training set and both test horizons.
pred_train, pred_test, pred_test_2yrs = (
    clf.predict(features)
    for features in (X_train, X_test, X_test_2yrs)
)

In [None]:
# Show the training rows whose 'average_upcr' is missing.
X_train[X_train['average_upcr'].isna()]

In [None]:
# Preview the first rows of the training feature matrix.
X_train.head()

In [None]:
# Baseline model metrics on the training set and the 365-day test set.
# ROC AUC is a ranking metric, so it is scored with the predicted probability of
# the positive class instead of the hard predictions. predict_proba columns follow
# clf.classes_, so column 1 is the larger label (True for the boolean target).
proba_train = clf.predict_proba(X_train)[:, 1]
proba_test = clf.predict_proba(X_test)[:, 1]

print(f"train recall_score: {recall_score(y_train, pred_train)}")
print(f"test recall_score: {recall_score(y_test, pred_test)}")
print(f"train accuracy_score: {accuracy_score(y_train, pred_train)}")
print(f"test accuracy_score: {accuracy_score(y_test, pred_test)}")
print(f"train f1_score: {f1_score(y_train, pred_train)}")
print(f"test f1_score: {f1_score(y_test, pred_test)}")
print(f"train roc_auc_score: {roc_auc_score(y_train, proba_train)}")
print(f"test roc_auc_score: {roc_auc_score(y_test, proba_test)}")
print(f"train precision_score: {precision_score(y_train, pred_train)}")
print(f"test precision_score: {precision_score(y_test, pred_test)}")

In [None]:
# Baseline model metrics again; here the "test" lines refer to the 2*365-day test set.
# ROC AUC is scored with the predicted probability of the positive class
# (column 1 of predict_proba, following clf.classes_) instead of hard predictions.
proba_train = clf.predict_proba(X_train)[:, 1]
proba_test_2yrs = clf.predict_proba(X_test_2yrs)[:, 1]

print(f"train recall_score: {recall_score(y_train, pred_train)}")
print(f"test recall_score: {recall_score(y_test_2yrs, pred_test_2yrs)}")
print(f"train accuracy_score: {accuracy_score(y_train, pred_train)}")
print(f"test accuracy_score: {accuracy_score(y_test_2yrs, pred_test_2yrs)}")
print(f"train f1_score: {f1_score(y_train, pred_train)}")
print(f"test f1_score: {f1_score(y_test_2yrs, pred_test_2yrs)}")
print(f"train roc_auc_score: {roc_auc_score(y_train, proba_train)}")
print(f"test roc_auc_score: {roc_auc_score(y_test_2yrs, proba_test_2yrs)}")
print(f"train precision_score: {precision_score(y_train, pred_train)}")
print(f"test precision_score: {precision_score(y_test_2yrs, pred_test_2yrs)}")

In [None]:
# Confusion matrix on the 365-day test set (scikit-learn layout: rows = true class,
# columns = predicted class).
print(confusion_matrix(y_test, pred_test))

In [None]:
# Number of test samples, then how many are truly negative and how many are
# predicted negative (total minus the count of True values).
print(len(pred_test))
print('actually false: ', len(y_test) - y_test.sum())
print('pred as false: ', len(pred_test) - pred_test.sum())

In [None]:
# for i, tree in enumerate(clf.estimators_):
#     plt.figure(figsize=(10, 6))
#     plot_tree(tree, filled=True, feature_names=X_train.columns, class_names=["False", "True"])
#     plt.title(f"Decision Tree {i+1}")
#     plt.show()

In [None]:
# Hyper-parameter grid: 2 * 3 * 3 * 3 = 54 candidate settings.
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": ["balanced"]  # Ensure class balancing
}

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model selection optimises recall.
scorer = make_scorer(recall_score)

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring=scorer,  # Optimize for recall
    cv=cv,
    n_jobs=-1,  # Use all available CPU cores
    verbose=2  # Progress is reported through scikit-learn's own logging
)

# Fit exactly once. The search used to be fitted twice in a row, doubling the
# run time, inside a tqdm bar that nothing ever advanced.
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_
print("Best Recall Score:", grid_search.best_score_)
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Train a fresh forest with the best hyper-parameters found by the grid search.
# NOTE(review): with GridSearchCV's default refit=True, grid_search.best_estimator_
# should already be an equivalent model fitted on the full training set — confirm
# before removing this refit.
best_clf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_clf.fit(X_train, y_train)

In [None]:
# Predictions of the tuned model. This rebinds pred_train / pred_test, replacing
# the baseline model's predictions stored under the same names.
pred_train = best_clf.predict(X_train)
pred_test = best_clf.predict(X_test)

In [None]:
# Tuned model metrics on the training set and the 365-day test set.
# ROC AUC is scored with the predicted probability of the positive class
# (column 1 of predict_proba, following best_clf.classes_) instead of hard predictions.
proba_train = best_clf.predict_proba(X_train)[:, 1]
proba_test = best_clf.predict_proba(X_test)[:, 1]

print(f"train recall_score: {recall_score(y_train, pred_train)}")
print(f"test recall_score: {recall_score(y_test, pred_test)}")
print(f"train accuracy_score: {accuracy_score(y_train, pred_train)}")
print(f"test accuracy_score: {accuracy_score(y_test, pred_test)}")
print(f"train f1_score: {f1_score(y_train, pred_train)}")
print(f"test f1_score: {f1_score(y_test, pred_test)}")
print(f"train roc_auc_score: {roc_auc_score(y_train, proba_train)}")
print(f"test roc_auc_score: {roc_auc_score(y_test, proba_test)}")
print(f"train precision_score: {precision_score(y_train, pred_train)}")
print(f"test precision_score: {precision_score(y_test, pred_test)}")

In [None]:
# Confusion matrix of the tuned model on the 365-day test set
# (scikit-learn layout: rows = true class, columns = predicted class).
confusion_matrix(y_test, pred_test)

In [None]:
# Persist the tuned model. Pickle files should only be loaded from trusted sources.
with open("../ML_models/RF_labs_diagnoses_optimised.pkl", "wb") as model_file:
    pickle.dump(best_clf, model_file)
