In [None]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the pre-split CKD training and hold-out test tables.
train_ckd = pd.read_csv(os.path.join('sets', 'CKD_train.csv'))
test_ckd = pd.read_csv(os.path.join('sets', 'CKD_test.csv'))

# Print the columns available in the training table.
print("Columns in CKD_train.csv: " + str(list(train_ckd.columns)))

# Drop rows whose dCKD label is missing.
train_datanota = train_ckd[train_ckd['dCKD'].notna()]
test_datanota = test_ckd[test_ckd['dCKD'].notna()]

# Labels for both sets.
train_labels = train_datanota.dCKD.values
test_labels = test_datanota.dCKD.values

# Feature columns used by every model below; defined once so the train and
# test selections cannot drift apart.
# NOTE(review): 'rRPE ' carries a trailing space, presumably matching the CSV
# header exactly -- confirm against the data files before "fixing" it.
FEATURE_COLUMNS = [
    'age', 'height', 'weight', 'rLen', 'rShort', 'rPT', 'rPTLPA', 'LE',
    'rKME', 'rKUPE', 'rRPE ', 'lPT', 'lPGA', 'lTSPA', 'lSPA', 'lPSA',
    'lCKE', 'lRUPE', 'lRPE', 'DSK', 'DLK',
]

_train_data = train_datanota[FEATURE_COLUMNS]
print("Data shape (rows, columns) =  ", _train_data.shape)

_test_data = test_datanota[FEATURE_COLUMNS]
print("Data shape (rows, columns) =  ", _test_data.shape)


def _invalid_to_nan(frame):
    """Return ``frame`` as float32 with every invalid entry set to NaN.

    Invalid entries are the spreadsheet error marker '#DIV/0!', values that
    are already missing, and zeros (the original pipeline treated 0 as an
    illegal measurement and replaced it with the column median).
    """
    numeric = frame.replace('#DIV/0!', np.nan).astype('float32')
    # Keep only non-zero values; zeros become NaN, existing NaN stay NaN.
    return numeric.where(numeric != 0)


# Steps 3-6: replace illegal data with the column median.
# The median is taken from the valid TRAINING values only:
#   * placeholder zeros no longer drag the median down, and
#   * the test set is imputed with training statistics, so no information
#     from the test set leaks into preprocessing.
train_with_nan = _invalid_to_nan(_train_data)
test_with_nan = _invalid_to_nan(_test_data)

# A column with no valid training value has a NaN median; fall back to 0 so
# the models never receive NaN.
train_medians = train_with_nan.median().fillna(0)

dat_proc_6 = train_with_nan.fillna(train_medians)
dat_proc_61 = test_with_nan.fillna(train_medians)
#print(dat_proc_6.head(10))

# Step 7: normalization (currently disabled; raw imputed values are used).
# If enabled, scale BOTH sets with statistics from the training set only.
# max-min normalization
#train_data = (dat_proc_6 - dat_proc_6.min())/(dat_proc_6.max() - dat_proc_6.min())
#test_data = (dat_proc_61 - dat_proc_6.min())/(dat_proc_6.max() - dat_proc_6.min())

# z-score normalization
#train_data = (dat_proc_6 - dat_proc_6.mean())/(dat_proc_6.std())
#test_data = (dat_proc_61 - dat_proc_6.mean())/(dat_proc_6.std())

train_data = dat_proc_6
test_data = dat_proc_61

print("training data shape (rows, columns) =  ", train_data.shape)
print("TRUE rate: ", train_labels.sum()/len(train_labels))

print("testing data shape (rows, columns) =  ", test_data.shape)
print("TRUE rate: ", test_labels.sum()/len(test_labels))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Hold out 10% of the training set as a validation split.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.1, random_state=10)

# Baseline linear SVM with default hyper-parameters.
# random_state is fixed because LinearSVC's solver uses a random number
# generator; without a seed the weights and scores change from run to run.
clf = LinearSVC(random_state=10)
clf.fit(tr_data, tr_labels)

print("Weight vector: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
# Accuracy on the 10% validation split carved out of the training set.
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

# Accuracy on the separate test set.
print("Prediction test accuracy:", clf.score(test_data, test_labels))

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Split FIRST, so the validation part (ts_data) is never seen while the
# hyper-parameter is being chosen. Tuning on the full training set and then
# scoring on a slice of it would report an optimistic validation accuracy.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.25, random_state=10)

# Candidate values of the regularisation strength C, tried by cross-validation.
tuned_parameters = [{'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1, 10, 100, 1000, 10000]}]

print("# Tuning hyper-parameters for accuracy")
# The estimator is seeded so the search is reproducible and uses the same
# seed as the final model fitted below.
grid_search = GridSearchCV(LinearSVC(random_state=10), tuned_parameters, cv=5, scoring='accuracy')
grid_search.fit(tr_data, tr_labels)

print("Best parameters set found on development set:")
print(grid_search.best_params_)
print("Grid scores on development set:")

# Mean accuracy (+/- two standard deviations) across the 5 folds per candidate.
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Train and test using the best parameters.
clf = LinearSVC(random_state=10, C=grid_search.best_params_['C'])
clf.fit(tr_data, tr_labels)

print("Weights: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated F1 of a seeded perceptron on the full training set.
clf = Perceptron(random_state=1000)
scores = cross_val_score(clf, train_data, train_labels, cv=5, scoring='f1')
for stat_name, stat_value in (
    ('max', scores.max()),
    ('min', scores.min()),
    ('mean', scores.mean()),
):
    print('Scores ' + stat_name, stat_value)

# Split A: 80% for fitting, 20% held out for validation.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=10
)
clf.fit(tr_data, tr_labels)

print(f"Perceptron weights for setting A: {clf.coef_}")
print(f"Perceptron bias for setting A: {clf.intercept_}")
validation_accuracy = clf.score(ts_data, ts_labels)
print("Prediction accuracy for setting A:", validation_accuracy)

# Score the same fitted model on the separate test set.
test_accuracy = clf.score(test_data, test_labels)
print("Prediction test accuracy:", test_accuracy)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# 80/20 split of the training set for a quick validation score.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=10)

# Entropy-criterion decision tree.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed the tree, its depth and its scores can
# differ between runs.
clf = DecisionTreeClassifier(criterion='entropy', random_state=10)

# 5-fold cross-validated F1 on the full training set.
scores = cross_val_score(clf, train_data, train_labels,  cv=5, scoring='f1')
print('Scores max', scores.max())
print('Scores min', scores.min())
print('Scores mean', scores.mean())

# Fit on the 80% part and score on the held-out 20%.
clf.fit(tr_data, tr_labels)

print("Accuracy", clf.score(ts_data, ts_labels))
print("The depth of this tree is", clf.tree_.max_depth)

# Optional tree export (requires `graphviz` and `from sklearn import tree`).
#graph = graphviz.Source(tree.export_graphviz(clf, out_file=None))
#graph.render('graphs/dt_entropy')

# Refit on the whole training set before scoring on the separate test set.
clf.fit(train_data, train_labels)
print("Prediction test accuracy:", clf.score(test_data, test_labels))


In [None]:
# Recreate the 80/20 split here instead of relying on whichever split an
# earlier cell happened to leave behind; the parameters match the split used
# for the entropy tree, so both trees are compared on identical data.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=10)

# Gini-criterion decision tree, seeded so repeated runs build the same tree
# (scikit-learn permutes the candidate features at every split).
clf = DecisionTreeClassifier(criterion='gini', random_state=10)
clf.fit(tr_data, tr_labels)

print("Accuracy", clf.score(ts_data, ts_labels))
print("The depth of this tree is", clf.tree_.max_depth)

# Optional tree export (requires `graphviz` and `from sklearn import tree`).
#graph = graphviz.Source(tree.export_graphviz(clf, out_file=None))
#graph.render('graphs/dt_gini')

# Refit on the whole training set before scoring on the separate test set.
clf.fit(train_data, train_labels)
print("Prediction test accuracy:", clf.score(test_data, test_labels))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 100 unrestricted-depth Gini trees.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    random_state=0,
)

# 5-fold cross-validated F1 on the full training set.
scores = cross_val_score(clf, train_data, train_labels, cv=5, scoring='f1')
for stat_name, stat_value in (
    ('max', scores.max()),
    ('min', scores.min()),
    ('mean', scores.mean()),
):
    print('Scores ' + stat_name, stat_value)

# Fit on the whole training set, then list the feature names followed by
# their importances (both in column order).
clf.fit(train_data, train_labels)
feature_names = list(train_data.columns)
print(feature_names)
print(clf.feature_importances_)

# Accuracy on the separate test set.
test_accuracy = clf.score(test_data, test_labels)
print("Prediction test accuracy:", test_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression

# 80/20 split of the training set (note: seed 1 here, not 10).
tr_data, ts_data, tr_labels, ts_labels = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=1
)

# L2-regularised logistic regression fitted on the 80% part.
clf = LogisticRegression(penalty='l2')
clf.fit(tr_data, tr_labels)

# First score: the held-out 20% of the training set.
validation_accuracy = clf.score(ts_data, ts_labels)
print('Prediction Accuracy:', validation_accuracy)

# Feature names followed by the fitted coefficients and intercept.
print(list(tr_data.columns))
print(clf.coef_)
print(f"bias: {clf.intercept_}")

# Second score: the separate test set, using the same fitted model
# (no refit happens between the two scores).
test_accuracy = clf.score(test_data, test_labels)
print('Prediction Accuracy:', test_accuracy)

In [None]:

# Hold out 10% of the training set for validation BEFORE tuning, so the
# validation rows play no part in choosing C.
tr_data, ts_data, tr_labels, ts_labels = train_test_split(train_data, train_labels, test_size=0.1, random_state=1)

# Candidate values of the inverse regularisation strength C.
tuned_parameters = [{'C': [1e-3, 1e-2, 0.1, 0.25, 0.5, 0.75, 1, 10]}]

# Tune on the fitting part only. Searching over the full training set and
# then scoring on ts_data (a slice of it) would make the reported validation
# accuracy optimistic.
grid_search = GridSearchCV(LogisticRegression(penalty='l2'), tuned_parameters, cv=5, scoring='accuracy')
grid_search.fit(tr_data, tr_labels)

print("Best parameters set found on development set:")
print(grid_search.best_params_)
print("Grid scores on development set:")

# Mean accuracy (+/- two standard deviations) across the 5 folds per candidate.
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Train and test using the best parameters.
clf = LogisticRegression(penalty='l2', C=grid_search.best_params_['C'])

clf.fit(tr_data, tr_labels)

print("Weights: " + str(clf.coef_))
print("Bias: " + str(clf.intercept_))
print("Prediction accuracy:", clf.score(ts_data, ts_labels))

print("Prediction test accuracy:", clf.score(test_data, test_labels))