In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the raw CKD export from the local 'sets' folder; the file is decoded as ISO-8859-1.
ori_csv_data = pd.read_csv(
    os.path.join('sets', '165818_CKD-origin.csv'),
    encoding='ISO-8859-1',
)

# Quick sanity report: frame dimensions first, then the available column names.
print("Data shape (rows, columns) =  ", ori_csv_data.shape)
print("Columns: " + str(ori_csv_data.columns.tolist()))

In [None]:
###################################
# data pre-processing
###################################
# Step 1: keep only the rows that carry a dCKD label.
dat_proc_1 = ori_csv_data.dropna(subset=['dCKD'])

# Label vector for the remaining rows, plus a quick look at the class balance.
data_labels = dat_proc_1['dCKD'].values
label_values = data_labels.tolist()
print("Global TRUE rate: ", data_labels.sum()/len(data_labels))
print("TRUE count: ", label_values.count(1))
print("FALSE count: ", label_values.count(0))

# Step 2: restrict the frame to the predictor columns used for modelling.
# NOTE: 'rRPE ' is written with a trailing space; keep the key exactly as it is.
# Alternative subsets that were tried previously:
#   - the list below without 'rShort', 'LE', 'lSPA', 'DSR', 'DLR'
#   - the list below without 'DSR', 'DLR'
#   - the list below without 'DSK', 'DLK'
dat_proc_2 = dat_proc_1.loc[:, [
    'age', 'height', 'weight',
    'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE', 'rKME', 'rKUPE', 'rRPE ',
    'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA', 'lCKE', 'lRUPE', 'lRPE',
    'DSK', 'DLK', 'DSR', 'DLR',
]]

#step3-6: replace illegal data with the column median
# '#DIV/0!' cells and missing values are first turned into the placeholder 0 so the
# frame can be cast to float32.
dat_proc_3 = dat_proc_2.replace('#DIV/0!', 0)
dat_proc_4 = dat_proc_3.fillna(0)
dat_proc_5 = dat_proc_4.astype('float32')

# Every 0 is treated as "invalid" (same convention as before). The zeros are masked out
# BEFORE taking the median: computing the median with the placeholder zeros still in the
# column drags it toward 0, and leaves the zeros untouched whenever more than half of a
# column is invalid.
dat_proc_5_valid = dat_proc_5.mask(dat_proc_5 == 0)
dat_proc_6 = (
    dat_proc_5_valid
    .fillna(dat_proc_5_valid.median())  # column-wise: the median Series is aligned on column names
    .fillna(0)                          # a column with no valid value at all keeps its zeros
    .astype('float32')
)
#print(dat_proc_6.head(10))

# Step 7: feature scaling. Two alternative scalings of the cleaned frame are prepared.

# Min-max scaling: (value - column min) / (column max - column min).
data = dat_proc_6.sub(dat_proc_6.min()).div(dat_proc_6.max().sub(dat_proc_6.min()))

# Z-score scaling: (value - column mean) / column standard deviation.
data1 = dat_proc_6.sub(dat_proc_6.mean()).div(dat_proc_6.std())


In [None]:
import numpy as np
# List the distinct values of every column of the cleaned (un-scaled) frame.
for column_name in dat_proc_6.columns:
    distinct_values = np.unique(dat_proc_6[column_name])
    print(column_name, distinct_values, sep=":")


In [None]:
# List the distinct values of every column after min-max scaling.
for column_name in data.columns:
    distinct_values = np.unique(data[column_name])
    print(column_name, distinct_values, sep=":")

In [None]:
# List the distinct values of every column after z-score scaling.
for column_name in data1.columns:
    distinct_values = np.unique(data1[column_name])
    print(column_name, distinct_values, sep=":")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib as mpl  
import matplotlib.pyplot as plt

def plot_roc(labels, predict_prob):
    """Draw a ROC curve for the given scores and display it.

    Args:
        labels: ground-truth labels, forwarded to ``roc_curve``.
        predict_prob: predicted scores for the samples, in the same order as ``labels``.

    The area under the curve is written into the legend, and a red dashed
    diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    fpr, tpr, _ = roc_curve(labels, predict_prob)
    area = auc(fpr, tpr)
    legend_text = 'AUC = %0.4f' % area

    plt.title('ROC')
    plt.plot(fpr, tpr, 'b', label=legend_text)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.show()
    

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pylab as plt
%matplotlib inline

# Hold out half of the z-score scaled samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(
    data1, data_labels, test_size=0.5, random_state=10
)

# Report size and positive rate of each half.
print("training data shape (rows, columns) =  ", train_data.shape)
print("TRUE rate: ", train_labels.sum()/len(train_labels))

print("testing data shape (rows, columns) =  ", test_data.shape)
print("TRUE rate: ", test_labels.sum()/len(test_labels))

# Short aliases for the training half, reused by the tuning cells.
X, y = train_data, train_labels

# Baseline forest with default hyper-parameters; its out-of-bag score is printed.
rf0 = RandomForestClassifier(random_state=10, oob_score=True).fit(X, y)
print(rf0.oob_score_)

# Scores from probability column 1, evaluated on the same samples the forest was fitted on.
y_predprob = rf0.predict_proba(X)[:, 1]
print("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

plot_roc(y, y_predprob)

In [None]:
from sklearn.model_selection import GridSearchCV # old version from sklearn.grid_search import GridSearchCV

# Tune n_estimators: candidates 30, 50, ..., 190, scored by ROC AUC with 6-fold cross-validation.
param_test1 = {'n_estimators': [n_trees for n_trees in range(30, 201, 20)]}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        max_depth=10,
        max_features='sqrt',
        min_samples_leaf=20,
        min_samples_split=10,
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=6,
    n_jobs=-1,
)
gsearch1.fit(X, y)

# Cell output: full CV table, winning parameters and their mean score.
(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)


In [None]:
# Tune max_depth (4..14, step 2) together with min_samples_split (10..60, step 10),
# scored by ROC AUC with 6-fold cross-validation on the training split.
# The former `iid=False` argument is gone on purpose: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24 (passing it raises TypeError there); since 0.22
# the default already matches iid=False. The first search in this notebook omits it too.
param_test2= {'max_depth':list(range(4,15,2)), 'min_samples_split':list(range(10,61,10))}
gsearch2= GridSearchCV(estimator = RandomForestClassifier(n_estimators=150,min_samples_leaf=20,max_features='sqrt',
                                                          oob_score=True,random_state=10),
                       param_grid = param_test2,scoring='roc_auc', cv=6, n_jobs=-1)
gsearch2.fit(X,y)
# Cell output: full CV table, winning parameters and their mean score.
gsearch2.cv_results_,gsearch2.best_params_, gsearch2.best_score_


In [None]:
# Refit a forest with the tuned n_estimators / max_depth / min_samples_split.
# The model is fitted on the TRAINING split (X, y), the same data the grid searches
# used, and then scored on the held-out test split. Previously it was fitted on the test
# split and scored on the training split, while the printed label said "Train".
rf1= RandomForestClassifier(n_estimators=150, max_depth=8, min_samples_split=10,
                            min_samples_leaf=20, max_features='sqrt' ,oob_score=True,random_state=10)
rf1.fit(X,y)
print(rf1.oob_score_)

# Scores from probability column 1 for the held-out samples.
y_predprob = rf1.predict_proba(test_data)[:,1]
print("AUC Score (Test): %f" % metrics.roc_auc_score(test_labels,y_predprob))
plot_roc(test_labels, y_predprob)

In [None]:
# Tune min_samples_split (2..20, step 2) together with min_samples_leaf (10..60, step 10),
# scored by ROC AUC with 6-fold cross-validation on the training split.
# The former `iid=False` argument is gone on purpose: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24 (passing it raises TypeError there); since 0.22
# the default already matches iid=False.
param_test3= {'min_samples_split':list(range(2,21,2)), 'min_samples_leaf':list(range(10,61,10))}
gsearch3= GridSearchCV(estimator = RandomForestClassifier(n_estimators=150,max_depth=8,
                                                          max_features='sqrt',oob_score=True,random_state=10),
                       param_grid = param_test3,scoring='roc_auc', cv=6, n_jobs=-1)
gsearch3.fit(X,y)
# Cell output: full CV table, winning parameters and their mean score.
gsearch3.cv_results_,gsearch3.best_params_, gsearch3.best_score_


In [None]:
# Tune max_features: odd candidate counts starting at 5, scored by ROC AUC with 6-fold CV.
# The upper bound follows the number of columns actually present in X, because an integer
# max_features larger than the feature count is rejected by the forest. With the current
# 23 selected columns this yields the same candidates as before (5, 7, ..., 23).
# The former `iid=False` argument is gone on purpose: GridSearchCV deprecated `iid` in
# scikit-learn 0.22 and removed it in 0.24 (passing it raises TypeError there); since 0.22
# the default already matches iid=False.
param_test4= {'max_features':list(range(5,X.shape[1]+1,2))}
gsearch4= GridSearchCV(estimator = RandomForestClassifier(n_estimators=150,max_depth=8, min_samples_split=5,
                                 min_samples_leaf=10 ,oob_score=True, random_state=10),
   param_grid = param_test4,scoring='roc_auc', cv=6, n_jobs=-1)
gsearch4.fit(X,y)
# Cell output: full CV table, winning parameters and their mean score.
gsearch4.cv_results_,gsearch4.best_params_, gsearch4.best_score_


In [None]:
# Second candidate configuration (fewer trees, max_features=5).
# The model is fitted on the TRAINING split (X, y) and scored on the held-out test split.
# Previously it was fitted on the test split and scored on the training split, while the
# printed label said "Train".
rf2= RandomForestClassifier(n_estimators=90, max_depth=10, min_samples_split=25,
                                 min_samples_leaf=10,max_features=5 ,oob_score=True, random_state=10)
rf2.fit(X,y)
print(rf2.oob_score_)

# Scores from probability column 1 for the held-out samples.
y_predprob = rf2.predict_proba(test_data)[:,1]
print("AUC Score (Test): %f" % metrics.roc_auc_score(test_labels,y_predprob))
plot_roc(test_labels, y_predprob)