In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

In [2]:
# Load the pneumonia dataset and recode its text-valued columns as 0/1 integers.
# NOTE(review): this absolute path only exists on one machine — consider moving it
# to a configurable data directory.
pneumonia_data = pd.read_csv("C:/Users/coryg/OneDrive/Desktop/STAT_574_Data_Mining/pneumonia_data.csv")

# Text-to-integer lookup tables, one per categorical column.
code_gender = dict(M=1, F=0)
code_tobacco_use = dict(yes=1, no=0)
code_pneumonia = dict(yes=1, no=0)

# Series.map yields NaN for any value that has no entry in its lookup table.
pneumonia_data = pneumonia_data.assign(
    gender=pneumonia_data['gender'].map(code_gender),
    tobacco_use=pneumonia_data['tobacco_use'].map(code_tobacco_use),
    pneumonia=pneumonia_data['pneumonia'].map(code_pneumonia),
)

# Positional split: the first four columns are the predictors and the fifth is the
# response (presumably the 'pneumonia' column — verify against the CSV layout).
X = pneumonia_data.iloc[:, :4].values
y = pneumonia_data.iloc[:, 4].values

In [3]:
#Splitting data into 80% training and 20% testing.
# The fixed random_state makes the partition reproducible from run to run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=199233)

#Fitting binary tree with Gini Splitting Criterion
# max_leaf_nodes=6 limits the tree to at most six terminal nodes.

gini_tree = DecisionTreeClassifier(max_leaf_nodes=6, criterion='gini', random_state=786756)
# fit() returns the fitted estimator itself, so the result is bound back to the
# variable. It must not be stored on the `fit` attribute: that would replace the
# method with the estimator object and make any later gini_tree.fit(...) call fail.
gini_tree = gini_tree.fit(X_train, y_train)

In [4]:
#Computing confusion matrix and performance measures for testing set

def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# One row of class probabilities per test observation.
y_pred = gini_tree.predict_proba(X_test)

total = len(y_pred)

# The second column is used as the probability of class 1.
# (assumes the fitted classes are [0, 1] in that order — TODO confirm via gini_tree.classes_)
# A row is predicted positive only when that probability exceeds 0.5. A tie at
# exactly 0.5 counts as a predicted negative, so every row labelled 0 or 1 lands
# in exactly one confusion-matrix cell and the four counts add up to `total`.
predicted_pos = y_pred[:, 1] > 0.5
actual = np.asarray(y_test)
actual_pos = actual == 1
actual_neg = actual == 0

# 0/1 indicator arrays, one entry per test row, for each confusion-matrix cell.
tpos = (predicted_pos & actual_pos).astype(int)
fpos = (predicted_pos & actual_neg).astype(int)
tneg = (~predicted_pos & actual_neg).astype(int)
fneg = (~predicted_pos & actual_pos).astype(int)

# Plain Python ints so the printed values and the ratios below are ordinary
# int/float objects rather than NumPy scalars.
tp = int(tpos.sum())
fp = int(fpos.sum())
tn = int(tneg.sum())
fn = int(fneg.sum())

print('tp:', tp)
print('fp:', fp)
print('tn:', tn)
print('fn:', fn)

# Each measure is NaN (instead of raising ZeroDivisionError) when its
# denominator is zero, e.g. when no row was predicted positive.
accuracy = _ratio(tp + tn, total)
misclassrate = _ratio(fp + fn, total)
sensitivity = _ratio(tp, tp + fn)
FNR = _ratio(fn, tp + fn)
specificity = _ratio(tn, fp + tn)
FPR = _ratio(fp, fp + tn)
precision = _ratio(tp, tp + fp)
NPV = _ratio(tn, fn + tn)
F1score = _ratio(2 * tp, 2 * tp + fn + fp)

print("accuracy:", accuracy)
print("misclassrate:", misclassrate)
print("sensitivity:", sensitivity)
print("FNR:", FNR)
print("specificity:", specificity)
print("FPR:", FPR)
print("precision:", precision)
print("NPV:", NPV)
print("F1score:", F1score)

tp: 81
fp: 45
tn: 176
fn: 44
accuracy: 0.7427745664739884
misclassrate: 0.25722543352601157
sensitivity: 0.648
FNR: 0.352
specificity: 0.7963800904977375
FPR: 0.20361990950226244
precision: 0.6428571428571429
NPV: 0.8
F1score: 0.6454183266932271
