In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

train_ckd = pd.read_csv(os.path.join('sets', 'CKD_train.csv'))
test_ckd = pd.read_csv(os.path.join('sets', 'CKD_test.csv'))

#print columns in the table
print("Columns in 165818_CKD-pure.csv: " + str(list(train_ckd.columns)))

#remove NA dCKD row
train_datanota = train_ckd[train_ckd['dCKD'].notna()]
test_datanota = test_ckd[test_ckd['dCKD'].notna()]

#set the labels
train_labels = train_datanota.dCKD.values
test_labels = test_datanota.dCKD.values

#fetch the columns that we'll be using for our models
_train_data = train_datanota[[ 'age', 'height', 'weight', 'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE', 'rKME', 'rKUPE', 'rRPE ', 'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA', 'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK']]
print("Data shape (rows, columns) =  ", _train_data.shape)

_test_data = test_datanota[[ 'age', 'height', 'weight', 'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE', 'rKME', 'rKUPE', 'rRPE ', 'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA', 'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK']]
print("Data shape (rows, columns) =  ", _test_data.shape)

#step3-6: replace illigal data to median 
dat_proc_3 = _train_data.replace('#DIV/0!', 0)
dat_proc_4 = dat_proc_3.fillna(0)
dat_proc_5 = dat_proc_4.astype('float32')
dat_proc_6 = dat_proc_5.replace(0, dat_proc_5.median())

dat_proc_31 = _test_data.replace('#DIV/0!', 0)
dat_proc_41 = dat_proc_31.fillna(0)
dat_proc_51 = dat_proc_41.astype('float32')
dat_proc_61 = dat_proc_51.replace(0, dat_proc_51.median())
#print(dat_proc_6.head(10))

#step 7: normalization
# max-min normalization
#train_data = (dat_proc_6 - dat_proc_6.min())/(dat_proc_6.max() - dat_proc_6.min())
#test_data = (dat_proc_61 - dat_proc_61.min())/(dat_proc_61.max() - dat_proc_61.min())
#print(data.head(10))

# z-score normalization
#train_data = (dat_proc_6 - dat_proc_6.mean())/(dat_proc_6.std())  
#test_data = (dat_proc_61 - dat_proc_61.mean())/(dat_proc_61.std()) 
#print(data1.head(10))

train_data = dat_proc_6
test_data = dat_proc_61

print("training data shape (rows, columns) =  ", train_data.shape)
print("TRUE rate: ", train_labels.sum()/len(train_labels))

print("testing data shape (rows, columns) =  ", test_data.shape)
print("TRUE rate: ", test_labels.sum()/len(test_labels))

Columns in 165818_CKD-pure.csv: ['id', 'gender', 'age', 'height', 'weight', 'history', '24hPro', 'SCr', 'BUN', 'Albamin', 'GFR', 'CKDstage', 'dCKD', 'USET', 'rLen', 'rShort', 'rThick', 'rPT', 'rPTLA', 'rPTLPA', 'rPTLA.1', 'rPTSA', 'rPTSPA', 'rPSPA', 'Unnamed: 24', 'LE', 'rKME', 'rKUPE', 'rRPE ', 'Unnamed: 29', 'lLen', 'lShort', 'lThick', 'lPT', 'lPTA', 'lPGA', 'lPLA', 'lTSPA', 'lSPA', 'lPSA', 'Unnamed: 40', 'SE', 'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK', 'rAR', 'lAR', 'rRP', 'lRP', 'PDT', 'PP', 'PK', 'PL', 'GloNumGM', 'GloNumIM', 'SW', 'SWR']
Data shape (rows, columns) =   (804, 21)
Data shape (rows, columns) =   (98, 21)
training data shape (rows, columns) =   (804, 21)
TRUE rate:  0.48009950248756217
testing data shape (rows, columns) =   (98, 21)
TRUE rate:  0.3163265306122449


In [2]:
from sklearn.model_selection import train_test_split
# Split training dataset to 2 group
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.1, random_state=10)

from sklearn.svm import LinearSVC
clf = LinearSVC() #random_state=2000)
clf.fit(tr_data, tr_labels)

print("Weight vector: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))

Weight vector: [[ 0.01131223 -0.00081988  0.01386523 -0.08541791  0.01442923 -0.07223228
   0.01540985  0.03890614  0.03326543  0.02647279 -0.03763575 -0.09030079
   0.0049682   0.07355653 -0.1446754  -0.0892364   0.03178513  0.02560994
  -0.05386437 -0.00730449 -0.07928107]]
Bias: [0.00270649]
Prediction accuracy: 0.7283950617283951
Prediction test accuracy: 0.8469387755102041




In [3]:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Set the parameters by cross-validation
tuned_parameters = [{'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1, 10, 100, 1000, 10000]}]

print("# Tuning hyper-parameters for accuracy")
clf = GridSearchCV(LinearSVC(), tuned_parameters, cv=5, scoring='accuracy')
clf.fit(train_data, train_labels)

print("Best parameters set found on development set:")
print(clf.best_params_)
print("Grid scores on development set:")

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.25, random_state=10)

#train and test by using best parameters.
clf = LinearSVC(random_state=10, C=clf.best_params_['C'])
clf.fit(tr_data, tr_labels)

print("Weights: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))

# Tuning hyper-parameters for accuracy
Best parameters set found on development set:
{'C': 0.01}
Grid scores on development set:
0.792 (+/-0.055) for {'C': 0.0001}
0.800 (+/-0.048) for {'C': 0.001}
0.803 (+/-0.040) for {'C': 0.01}
0.723 (+/-0.159) for {'C': 0.1}
0.669 (+/-0.226) for {'C': 0.5}
0.664 (+/-0.145) for {'C': 1}
0.695 (+/-0.217) for {'C': 10}
0.729 (+/-0.109) for {'C': 100}
0.750 (+/-0.159) for {'C': 1000}
0.693 (+/-0.095) for {'C': 10000}
Weights: [[ 0.0156459   0.00268355  0.00882215 -0.08200557 -0.01847702 -0.06631001
   0.01402201  0.01030253  0.06087526  0.01647691 -0.05620585 -0.07752089
   0.00443179  0.03279666 -0.10909268 -0.03479505  0.03758792  0.02945431
  -0.04602959 -0.0006635  -0.05089783]]
Bias: [0.00822612]
Prediction accuracy: 0.7064676616915423
Prediction test accuracy: 0.7653061224489796


In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Perceptron

tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=10)
clf = Perceptron(random_state=1000)
scores = cross_val_score(clf, train_data, train_labels,  cv=5, scoring='f1')
print('Scores max', scores.max())
print('Scores min', scores.min())
print('Scores mean', scores.mean())

#train and test on split A
clf.fit(tr_data, tr_labels)

#print("Setting A")
print("Perceptron weights for setting A: " + str(clf.coef_))
print("Perceptron bias for setting A: " + str(clf.intercept_))
print("Prediction accuracy for setting A:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))


Scores max 0.8121212121212121
Scores min 0.6271186440677966
Scores mean 0.7039560665882856
Perceptron weights for setting A: [[  326.           -95.           194.          -711.47394323
    -74.1020093   -135.91600722   148.05995846    53.90595651
   1490.97003651  1064.14999866 -1253.93893337  -125.85700446
   -101.37802982  -737.633986    -197.16202807  -651.41998911
    956.77293158   899.30708432 -1437.25508595  -181.4220304
  -1506.03000941]]
Perceptron bias for setting A: [-25.]
Prediction accuracy for setting A: 0.7391304347826086
Prediction test accuracy: 0.8571428571428571


In [5]:
from sklearn.tree import DecisionTreeClassifier

tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=10)
clf = DecisionTreeClassifier(criterion='entropy')

scores = cross_val_score(clf, train_data, train_labels,  cv=5, scoring='f1')
print('Scores max', scores.max())
print('Scores min', scores.min())
print('Scores mean', scores.mean())

clf.fit(tr_data, tr_labels)

print("Accuracy", clf.score(ts_data, ts_labels))
print("The depth of this tree is", clf.tree_.max_depth)

#graph = graphviz.Source(tree.export_graphviz(clf, out_file=None)) 
#graph.render('graphs/tweet_dt_gini')

clf.fit(train_data, train_labels)
#print("Prediction eval accuracy:", clf.score(eval_data, eval_labels))
print("Prediction test accuracy:", clf.score(test_data, test_labels))


Scores max 0.7466666666666667
Scores min 0.6184210526315789
Scores mean 0.6726974158084287
Accuracy 0.6708074534161491
The depth of this tree is 15
Prediction test accuracy: 0.7755102040816326


In [6]:
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(tr_data, tr_labels)
#DecisionTreeClassifier()

print("Accuracy", clf.score(ts_data, ts_labels))
print("The depth of this tree is", clf.tree_.max_depth)

#graph = graphviz.Source(tree.export_graphviz(clf, out_file=None)) 
#graph.render('graphs/dt_entropy')

clf.fit(train_data, train_labels)
#DecisionTreeClassifier()
#print("Prediction eval accuracy:", clf.score(eval_data, eval_labels))
print("Prediction test accuracy:", clf.score(test_data, test_labels))

Accuracy 0.7204968944099379
The depth of this tree is 18
Prediction test accuracy: 0.7448979591836735


In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=None, random_state=0, n_estimators=100, criterion='gini') 

scores = cross_val_score(clf, train_data, train_labels,  cv=5, scoring='f1')
print('Scores max', scores.max())
print('Scores min', scores.min())
print('Scores mean', scores.mean())

clf.fit(train_data, train_labels) 
print(str(list(train_data.columns)))
print(clf.feature_importances_)

#print("Prediction eval accuracy:", clf.score(eval_data, eval_labels)) 
print("Prediction test accuracy:", clf.score(test_data, test_labels))

Scores max 0.8211920529801324
Scores min 0.7333333333333334
Scores mean 0.7637349787982514
['age', 'height', 'weight', 'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE', 'rKME', 'rKUPE', 'rRPE ', 'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA', 'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK']
[0.04288739 0.02517597 0.02833008 0.07840926 0.03460021 0.04731739
 0.03113707 0.03548976 0.10083155 0.05939716 0.04008755 0.04336505
 0.02256336 0.03719496 0.02782718 0.06711636 0.05707216 0.04097391
 0.03865806 0.04555186 0.09601372]
Prediction test accuracy: 0.8469387755102041


In [8]:
from sklearn.linear_model import LogisticRegression

tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=1)
clf = LogisticRegression(penalty= 'l2')

#train and test on setting A
clf.fit(tr_data, tr_labels)
print('Prediction Accuracy:', clf.score(ts_data, ts_labels))

print(str(list(tr_data.columns)))
print(clf.coef_)
print("bias: " + str(clf.intercept_))

#train and test on setting B
#clf.fit(tr_data, tr_labels)

print('Prediction Accuracy:', clf.score(test_data, test_labels))

Prediction Accuracy: 0.7888198757763976
['age', 'height', 'weight', 'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE', 'rKME', 'rKUPE', 'rRPE ', 'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA', 'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK']
[[ 0.037688    0.00186248  0.03143257 -0.34211067  0.01020101 -0.16919795
   0.08136874  0.07052542  0.10559258  0.05130595 -0.10626961 -0.11447872
  -0.01918391  0.09076781 -0.27553906 -0.10596605  0.09685108  0.06657779
  -0.15263384 -0.02336118 -0.1792175 ]]
bias: [0.03136938]
Prediction Accuracy: 0.8877551020408163


In [9]:

tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.1, random_state=1)

tuned_parameters = [{'C': [1e-3, 1e-2, 0.1, 0.25, 0.5, 0.75, 1, 10]}]

clf = GridSearchCV(LogisticRegression(penalty= 'l2'), tuned_parameters, cv=5, scoring='accuracy')
clf.fit(train_data, train_labels)

print("Best parameters set found on development set:")
print(clf.best_params_)
print("Grid scores on development set:")

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

#train and test by using best parameters.
clf = LogisticRegression(penalty= 'l2', C=clf.best_params_['C'])

clf.fit(tr_data, tr_labels)

print("Weights: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))

Best parameters set found on development set:
{'C': 0.1}
Grid scores on development set:
0.779 (+/-0.048) for {'C': 0.001}
0.800 (+/-0.053) for {'C': 0.01}
0.808 (+/-0.050) for {'C': 0.1}
0.802 (+/-0.054) for {'C': 0.25}
0.799 (+/-0.054) for {'C': 0.5}
0.805 (+/-0.042) for {'C': 0.75}
0.800 (+/-0.044) for {'C': 1}
0.803 (+/-0.047) for {'C': 10}
Weights: [[ 0.03198731  0.00936145  0.02366814 -0.24812165  0.04660478 -0.15155316
   0.05269361  0.06210675  0.10897433  0.05581873 -0.14069675 -0.14927122
  -0.02313437  0.09703739 -0.23153881 -0.13536961  0.09991703  0.05636766
  -0.14871567 -0.01518453 -0.1681132 ]]
Bias: [0.02369321]
Prediction accuracy: 0.8888888888888888
Prediction test accuracy: 0.8877551020408163
