In [143]:
import pandas as pd
import numpy as np
import random

import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
%load_ext autoreload
%autoreload 2

from tqdm import tqdm

from model import Patient

from datetime import datetime, timedelta

random_seed = 42

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
first_diag = pd.read_csv('../../data/first_diagnosis.csv')

In [99]:
def get_first_diagnosis(pid):
    # print(pid)
    res = first_diag[(first_diag['Patient'] == pid)]['EntryDate']
    # print(len(res))
    if(len(res) == 0): return None
    # # else: return res[0]
    else: 
        # print(type(res),res.iloc[0])
        # print(res.iloc[0])
        return datetime.strptime(res.iloc[0], '%Y-%m-%d').date()
    # if len(res) == 0: return None \
    # else: return datetime.strptime(get_first_diagnosis(res.iloc[0]), '%Y-%m-%d').date()

In [100]:
print(get_first_diagnosis(194430))

2023-08-02


In [101]:
def patient_to_row(patient, CKD):
    return [patient.patient_id, CKD, patient.ckd_stage, patient.sex, patient.age, patient.egfr, patient.average_egfr, 
            patient.uacr, patient.average_uacr, patient.pu, patient.average_pu, patient.upcr, patient.average_upcr]
    

In [102]:
train_ids_pos = pd.read_csv('../../data/TRN/pos_trn.csv')
train_ids_neg = pd.read_csv('../../data/TRN/neg_trn.csv')
train_ids_neg.rename(columns={'Patient_id': 'Patient'}, inplace=True)
train_ids = pd.concat([train_ids_pos, train_ids_neg])
train_ids['Patient'] = train_ids['Patient'].astype(int)

In [103]:
train_ids = train_ids['Patient'].to_list()

In [104]:
random.seed(random_seed)
random.shuffle(train_ids)

In [146]:
def patients_to_df(id_list):
    columns=['ID', 'CKD', 'CKD_stage', 'sex', 'age', 'last_egfr', 'avg_egfr', 'last_uacr', 'average_uacr', 'last_pu', 'average_pu',
            'last_upcr', 'average_upcr']
    rows = []
    for pid in tqdm(id_list):
        fd = get_first_diagnosis(pid)
        if fd is not None:
            fd -= timedelta(days=365)
        CKD = fd == None
        rows.append(patient_to_row(Patient(pid, fd), CKD))
    return pd.DataFrame(rows, columns=columns)

In [148]:
# df = patients_to_df(train_ids)
df = pd.read_csv('../../data/processed_data.csv')

  3%|██▌                                                                           | 373/11267 [00:11<05:26, 33.36it/s]


KeyboardInterrupt: 

In [110]:
df.count()

Unnamed: 0      11267
ID              11267
CDK             11267
CDK_stage         640
sex             11267
age             11266
last_egfr        7856
avg_egfr         7857
last_uacr         646
average_uacr      721
last_pu           281
average_pu        290
last_upcr         246
average_upcr      252
dtype: int64

In [111]:
df = df.dropna(subset=['last_egfr'], how='all')

In [112]:
df.count()

Unnamed: 0      7856
ID              7856
CDK             7856
CDK_stage        640
sex             7856
age             7856
last_egfr       7856
avg_egfr        7856
last_uacr        640
average_uacr     715
last_pu          279
average_pu       288
last_upcr        244
average_upcr     250
dtype: int64

In [130]:
df.dtypes

Unnamed: 0        int64
ID                int64
CDK                bool
CDK_stage       float64
sex               int64
age             float64
last_egfr       float64
avg_egfr        float64
last_uacr       float64
average_uacr    float64
last_pu         float64
average_pu      float64
last_upcr       float64
average_upcr    float64
dtype: object

In [122]:
X_train, X_val, y_train, y_val = train_test_split(df.drop(columns = ['CDK', 'ID']), df['CDK'], test_size=0.2, random_state=random_seed)

In [123]:
clf = RandomForestClassifier(max_depth=3, random_state=random_seed)
clf.fit(X_train, y_train)

In [125]:
pred_train = clf.predict(X_train)
pred_val = clf.predict(X_val)

In [142]:
print(f"train recall_score: {recall_score(y_train,pred_train)}")
print(f"val recall_score: {recall_score(y_val, pred_val)}")
print(f"train accuracy_score: {accuracy_score(y_train,pred_train)}")
print(f"val accuracy_score: {accuracy_score(y_val, pred_val)}")
print(f"train f1_score: {f1_score(y_train,pred_train)}")
print(f"val f1_score: {f1_score(y_val, pred_val)}")
print(f"train roc_auc_score: {roc_auc_score(y_train,pred_train)}")
print(f"val roc_auc_score: {roc_auc_score(y_val, pred_val)}")
print(f"train precision_score: {precision_score(y_train,pred_train)}")
print(f"val precision_score: {precision_score(y_val, pred_val)}")

train recall_score: 0.9989495798319328
val recall_score: 0.9992917847025495
train accuracy_score: 0.9237746658179503
val accuracy_score: 0.9141221374045801
train f1_score: 0.959717433352956
val f1_score: 0.9543456205613797
train roc_auc_score: 0.586013251454428
val roc_auc_score: 0.5808958923512748
train precision_score: 0.9234503965042887
val precision_score: 0.9132686084142395


In [None]:
def test_models(X_train, X_val, y_train, y_val):
    param_grid = { 
    'criterion': ['friedman_mse'],
    # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    # 'min_samples_split': range(3,8),
    # 'min_samples_leaf': range(1,5)
    }
    param_comb = ParameterGrid(param_grid)

    best_rmse = float("inf")
    best_params = None
    best_depth = None
    best_n_estimators = None
    best_max_samples = None

    rmses = []
    best_reg = None
    
    # n_estimators, max_samples, max_depth
    for tqdm(n_estim in range(10, 100, 10)):
        for depth in range(2, 6):
            for param in param_comb:
                clf = RandomForestClassifier(n_estimators=n_estim, max_depth=depth, **param)
                clf.fit(X_train, y_train)
                rmse = root_mean_squared_error(clf.predict(X_val), y_val)
                if(rmse < best_rmse):
                    best_depth = depth
                    best_n_estimators = n_estim
                    best_params = param
                    best_rmse = rmse
                    rmses.append(rmse)
                    best_clf = clf
        


    print(f"best_rmse: {best_rmse}")
    print(f"best_depth: {best_depth}")
    print(f"best_n_estimators: {best_n_estimators}")
    print(f"best_params: {best_params}")

    return best_rmse, best_depth, best_n_estimators, best_params, rmses, best_clf