In [1]:
import pandas as pd
import numpy as np
import random

import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterGrid
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
%load_ext autoreload
%autoreload 2

from tqdm import tqdm

from model import Patient

from datetime import datetime

random_seed = 42

In [2]:
# Table of first-diagnosis records; the lookup helper below reads its
# 'Patient' and 'EntryDate' columns.
first_diag = pd.read_csv(filepath_or_buffer='../../data/first_diagnosis.csv')

In [82]:
def get_first_diagnosis(pid):
    """Look up the first recorded diagnosis date for one patient.

    Selects the rows of the module-level ``first_diag`` table whose 'Patient'
    column equals ``pid`` and parses the first matching 'EntryDate' value,
    which must be a 'YYYY-MM-DD' string.

    Returns:
        A ``datetime.date``, or ``None`` when no row matches ``pid``.
    """
    entry_dates = first_diag.loc[first_diag['Patient'] == pid, 'EntryDate']
    if entry_dates.empty:
        return None
    return datetime.strptime(entry_dates.iloc[0], '%Y-%m-%d').date()

In [84]:
# Spot check: print the parsed first-diagnosis date for a single patient ID.
print(get_first_diagnosis(pid=194430))

2023-08-02


In [94]:
def patient_to_row(patient, CKD):
    """Flatten a patient object into one list of feature values.

    Args:
        patient: object exposing ``patient_id`` plus the attributes listed in
            ``feature_attrs`` below.
        CKD: label value stored in the second position of the row.

    Returns:
        A 13-element list: patient ID, ``CKD``, then the attributes in
        ``feature_attrs`` order.
    """
    feature_attrs = (
        'ckd_stage', 'sex', 'age',
        'egfr', 'average_egfr',
        'uacr', 'average_uacr',
        'pu', 'average_pu',
        'upcr', 'average_upcr',
    )
    row = [patient.patient_id, CKD]
    row.extend(getattr(patient, attr) for attr in feature_attrs)
    return row
    

In [86]:
# Read the positive and negative training cohorts. The negative file's
# 'Patient_id' column is renamed to 'Patient' so both frames share the column
# that is read after concatenation.
train_ids_pos = pd.read_csv('../../data/TRN/pos_trn.csv')
train_ids_neg = pd.read_csv('../../data/TRN/neg_trn.csv').rename(columns={'Patient_id': 'Patient'})

# Stack both cohorts and force the ID column to integers.
train_ids = pd.concat([train_ids_pos, train_ids_neg]).astype({'Patient': int})

In [87]:
# From here on `train_ids` is a plain Python list of patient IDs, not a DataFrame.
train_ids = train_ids['Patient'].tolist()

In [88]:
# Seed Python's global RNG immediately before shuffling so the in-place
# shuffle of the ID list gives the same order on every run.
random.seed(random_seed)
random.shuffle(train_ids)

In [95]:
def patients_to_df(id_list):
    """Build one feature row per patient ID and return them as a DataFrame.

    For every ID the first-diagnosis date is looked up with
    ``get_first_diagnosis``, a ``Patient`` is constructed from the ID and that
    date, and ``patient_to_row`` flattens it into a 13-value row.

    Args:
        id_list: iterable of patient IDs.

    Returns:
        A DataFrame with 13 columns. 'CKD' is True when a first-diagnosis
        date exists for the patient and False otherwise.
    """
    # One name per value produced by patient_to_row, in the same order.
    # (The commas after 'CKD_stage' and 'average_pu' matter: without them
    # Python silently concatenates the adjacent string literals.)
    columns = [
        'ID', 'CKD', 'CKD_stage', 'sex', 'age',
        'last_egfr', 'avg_egfr',
        'last_uacr', 'average_uacr',
        'last_pu', 'average_pu',
        'last_upcr', 'average_upcr',
    ]
    rows = []
    for pid in tqdm(id_list):
        fd = get_first_diagnosis(pid)
        # A patient counts as positive when a diagnosis date was found.
        CKD = fd is not None
        rows.append(patient_to_row(Patient(pid, fd), CKD))
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Build the feature table for every training ID. This is slow: the recorded
# progress bar ran at ~2.3 patients/s and estimated well over an hour in total.
df = patients_to_df(id_list=train_ids)

  3%|██▏                                                                         | 327/11267 [02:34<1:19:57,  2.28it/s]

In [None]:
# Hold out 20% of the patients for validation. The patient identifier is
# removed together with the label: it is a bookkeeping key, not a clinical
# measurement, and because positive and negative IDs come from separate files
# it could leak the label into the model.
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns=['ID', 'CKD']),
    df['CKD'],
    test_size=0.2,
    random_state=random_seed,
)

In [None]:
# Baseline model: a random forest with tree depth capped at 3, seeded so the
# fit is repeatable.
clf = RandomForestClassifier(random_state=random_seed, max_depth=3)
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline classifier fitted above (`clf`). Labels and predictions
# are cast to float first so boolean targets can be subtracted inside the
# RMSE computation.
train_rmse = metrics.root_mean_squared_error(
    np.asarray(y_train, dtype=float),
    np.asarray(clf.predict(X_train), dtype=float),
)
val_rmse = metrics.root_mean_squared_error(
    np.asarray(y_val, dtype=float),
    np.asarray(clf.predict(X_val), dtype=float),
)
print(f"train RMSE: {train_rmse}")
print(f"val RMSE: {val_rmse}")

In [None]:
def test_models(X_train, X_val, y_train, y_val):
    param_grid = { 
    'criterion': ['friedman_mse'],
    # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    # 'min_samples_split': range(3,8),
    # 'min_samples_leaf': range(1,5)
    }
    param_comb = ParameterGrid(param_grid)

    best_rmse = float("inf")
    best_params = None
    best_depth = None
    best_n_estimators = None
    best_max_samples = None

    rmses = []
    best_reg = None
    
    # n_estimators, max_samples, max_depth
    for tqdm(n_estim in range(10, 100, 10)):
        for depth in range(2, 6):
            for param in param_comb:
                clf = RandomForestClassifier(n_estimators=n_estim, max_depth=depth, **param)
                clf.fit(X_train, y_train)
                rmse = root_mean_squared_error(clf.predict(X_val), y_val)
                if(rmse < best_rmse):
                    best_depth = depth
                    best_n_estimators = n_estim
                    best_params = param
                    best_rmse = rmse
                    rmses.append(rmse)
                    best_clf = clf
        


    print(f"best_rmse: {best_rmse}")
    print(f"best_depth: {best_depth}")
    print(f"best_n_estimators: {best_n_estimators}")
    print(f"best_params: {best_params}")

    return best_rmse, best_depth, best_n_estimators, best_params, rmses, best_clf