# Acute Inflammations Data Set
Multivariate(MultiLabel) example, the resultant is 2 feature vector.

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

<b>Dataset Description</b><br>
Temperature of patient { 35C-42C }<br>
a2 Occurrence of nausea { yes, no }<br>
a3 Lumbar pain { yes, no }<br>
a4 Urine pushing (continuous need for urination) { yes, no }<br>
a5 Micturition pains { yes, no }<br>
a6 Burning of urethra, itch, swelling of urethra outlet { yes, no }<br>
d1 decision: Inflammation of urinary bladder { yes, no }<br>
d2 decision: Nephritis of renal pelvis origin { yes, no }<br>

In [2]:
#load data
path = "./datasets/diagnosis.csv"
data = pd.DataFrame(pd.read_csv(path,names=['Temp','a2','a3','a4','a5','a6','d1','d2']))
y_label1, y_label2 = data.pop('d1'), data.pop('d2')
y_data = pd.DataFrame({'d1':y_label1,
                  'd2':y_label2})
#X = data.drop('d1','d2')
#y = pd.DataFrame(Y['d1','d2'])

In [23]:
data.head(10)

Unnamed: 0,Temp,a2,a3,a4,a5,a6
0,35.5,0,1,0,0,0
1,35.9,0,0,1,1,1
2,35.9,0,1,0,0,0
3,36.0,0,0,1,1,1
4,36.0,0,1,0,0,0
5,36.0,0,1,0,0,0
6,36.2,0,0,1,1,1
7,36.2,0,1,0,0,0
8,36.3,0,0,1,1,1
9,36.6,0,0,1,1,1


In [3]:
# checking if condition is ever both yes
y_data[(y_data['d1'] == 'yes') & (y_data['d2'] == 'yes')].head(5)

Unnamed: 0,d1,d2
70,yes,yes
71,yes,yes
72,yes,yes
78,yes,yes
79,yes,yes


In [4]:
#data cleaning
import re
for col in data:
    data[col] = data[col].replace("yes","1")
    data[col] = data[col].replace("no","0")

ss = np.array(data['Temp'])    
i=0    
for val in ss:
    ss[i] = re.sub(',', '.', val)
    i +=1
data['Temp'] = pd.DataFrame(ss)    

for col in y_data:
    y_data[col] = y_data[col].replace("yes","1")
    y_data[col] = y_data[col].replace("no","0")


#y_label1, y_label2 = y_label1.replace("yes","1"), y_label2.replace("yes","1")
#y_label1, y_label2 = y_label1.replace("no","0"), y_label2.replace("yes","1")    

In [5]:
data.shape, y_data.shape

((120, 6), (120, 2))

In [6]:
data.head()

Unnamed: 0,Temp,a2,a3,a4,a5,a6
0,35.5,0,1,0,0,0
1,35.9,0,0,1,1,1
2,35.9,0,1,0,0,0
3,36.0,0,0,1,1,1
4,36.0,0,1,0,0,0


In [7]:
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y_data, test_size=0.15, random_state=42)

In [8]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train,y_train)
pred_tree = tree_model.predict(X_test)

As we know currently there is no metric for multiclass accuracy verification, so we can atleast create function to manually verify accuracy the best.


In [9]:
sample = X_test.iloc[3]
tree_model.predict([sample])

array([['1', '0']], dtype=object)

In [10]:
#check
y_test.iloc[3]

d1    1
d2    0
Name: 55, dtype: object

In [11]:
def count_correction(predict, actual,right=0,wrong=0):
    for i in range(len(predict)):
        if actual[i][0] == predict[i][0]:
            if actual[i][1] == predict[i][1]:     
                right+=1
        else:
            wrong+=1 
            
            
    return right,wrong            

In [12]:
r, w = count_correction(pred_tree, y_test.values)
print('Number of Correct',r,' Wrong',w)

Number of Correct 18  Wrong 0


Shows our model is already outperforming this dataset

<b>Another way to handle multivariate is to divide them into two seperate classes(y1, y2) and prepare a model for each to predict on them.</b>

# Model 1

In [13]:
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y_data.d1.astype(int), test_size=0.15, random_state=42)

In [14]:
tree_model = DecisionTreeClassifier() #normal
tree_model.fit(X_train,y_train)
pred_tree = tree_model.predict(X_test)

In [15]:
sample = X_test
tree_model.predict(sample)

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0])

In [16]:
print("Acuuracy % .2f%% " % accuracy_score(y_test, pred_tree))

Acuuracy  1.00% 


In [17]:
cv_score = cross_val_score(tree_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross val score% .2f%% " % cv_score.mean())

Cross val score 1.00% 


# Model 2

In [18]:
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y_data.d2.astype(int), test_size=0.15, random_state=42)

In [19]:
tree_model = DecisionTreeClassifier() #normal
tree_model.fit(X_train,y_train)
pred_tree = tree_model.predict(X_test)

In [20]:
sample = X_test
tree_model.predict(sample)

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0])

In [21]:
print("Acuuracy % .2f%% " % accuracy_score(y_test, pred_tree))

Acuuracy  1.00% 


In [None]:
cv_score = cross_val_score(tree_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross val score% .2f%% " % cv_score.mean())

<b> This is probably overfitting anyways</b>