In [39]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

In [40]:
# Load the raw data and collect the distinct values of its first column in sorted order.
df = pd.read_csv("Disease_Symptoms.csv")
disease = sorted(set(df.iloc[:, 0]))
disease

['Alcoholic_hepatitis',
 'Cervical_spondylosis',
 'Dengue',
 'Dimorphic_hemmorhoids',
 'Drug_Reaction',
 'Heart_Attack',
 'Hypertension',
 'Malaria',
 'Migraine',
 'Paralysis',
 'Pneumonia']

# Data Preprocessing (Label encoding)

In [41]:
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
# Replace the first column by label instead of ``df.iloc[:, 0] = ...``:
# on pandas >= 2.0 positional assignment writes into the existing column in
# place, so the encoded labels would stay in an object-dtype column.
df[df.columns[0]] = labelencoder_Y.fit_transform(df.iloc[:,0].values)

# Pass the path (not an already-open text handle) so pandas applies the
# requested encoding and line ending itself. ``line_terminator`` was renamed
# to ``lineterminator`` in pandas 1.5 and the old spelling was removed in 2.0.
df.to_csv("Coded.csv", lineterminator="\n", encoding="ISO-8859-1")
print(df.dtypes)
df.shape

Disease                  int32
bloody_stools            int64
fecal_leakage            int64
swelling                 int64
dizziness                int64
confusion                int64
fatigue                  int64
itching                  int64
vomiting                 int64
arm_pain                 int64
cough                    int64
muscle_pain              int64
depression               int64
fever                    int64
painful_bowel_moments    int64
urine_blood              int64
sweating                 int64
nausea                   int64
stiff_neck               int64
decreased_appetite       int64
weak                     int64
wheezing                 int64
bleeding                 int64
hives                    int64
bleed                    int64
headache                 int64
dry_mouth                int64
sweat                    int64
stomach_pain             int64
stool_pressure           int64
anxiety                  int64
shoulder_pain            int64
anus_itc

(5324, 57)

In [42]:
# print(df.head())

# Split the dataset into independent (X) and dependent (Y) variables

In [43]:
# Features: every column after the first. The open-ended slice replaces the
# hard-coded upper bound (58), which overshot the 57 columns of the frame and
# would silently drop features if more columns were ever added.
X = df.iloc[:, 1:].values
# Target: the first column.
Y =df.iloc[:,0].values
print(X.shape)
print(Y.shape)
X.dtype

(5324, 56)
(5324,)


dtype('int64')

# Splitting the dataset into training and testing samples

In [44]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# random_state is fixed so the same split is produced on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Creating Models ( Decision Tree , Random Forest )

In [45]:
# Create a function for the models


def models(X_train,Y_train):
    """Fit a decision tree and a random forest on the training data.

    Both classifiers use the entropy criterion and ``random_state=0``; the
    forest is built with 10 estimators. The accuracy of each fitted model on
    the training data itself is printed before returning.

    Parameters
    ----------
    X_train
        Training feature matrix passed to each classifier's ``fit``/``score``.
    Y_train
        Training labels passed to each classifier's ``fit``/``score``.

    Returns
    -------
    tuple
        ``(decision_tree, random_forest)``, both already fitted.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # Settings common to both estimators.
    shared_params = {"criterion": "entropy", "random_state": 0}

    dt_clf = DecisionTreeClassifier(**shared_params)
    dt_clf.fit(X_train, Y_train)

    rf_clf = RandomForestClassifier(n_estimators=10, **shared_params)
    rf_clf.fit(X_train, Y_train)

    fitted = (dt_clf, rf_clf)
    labels = ('[0] Decision Tree Classifier', '[1] Random Forest Classifier')

    # Report how well each model reproduces the data it was trained on.
    for label, clf in zip(labels, fitted):
        print(label + ' Training Accuracy : ', clf.score(X_train, Y_train))

    return fitted
    
    

In [46]:
# Getting all the models
# ``model`` holds the tuple returned by models(): index 0 is the decision
# tree and index 1 is the random forest.

model = models(X_train,Y_train)

[0] Decision Tree Classifier Training Accuracy :  1.0
[1] Random Forest Classifier Training Accuracy :  1.0


# Testing Model Accuracy by Confusion Matrix

In [47]:
# Evaluate every fitted model on the held-out test data with a confusion matrix.

from sklearn.metrics import confusion_matrix

for i, fitted in enumerate(model):
    print("Model ", i)

    # cm[r][c] counts test samples whose true class is r and predicted class is c.
    cm = confusion_matrix(Y_test, fitted.predict(X_test))

    # This is a multi-class problem, so the binary TP/TN/FP/FN cells
    # (cm[0][0], cm[0][1], cm[1][0], cm[1][1]) would only cover classes 0 and 1.
    # Overall accuracy is the number of correct predictions (the diagonal)
    # divided by the number of test samples (the sum of all cells).
    correct = np.trace(cm)
    total = cm.sum()

    print(cm)
    print("Testing Accuracy =  ", correct / total)
    print()


Model  0
[[ 93   0   0   0   0   0   0   0   0   0   0]
 [  0  85   0   0   0   0   0   0   0   0   0]
 [  0   0 101   0   0   0   0   0   0   0   0]
 [  0   0   0  90   0   0   0   0   0   0   0]
 [  0   0   0   0  87   0   0   0   0   0   0]
 [  0   0   0   0   0 107   0   0   0   0   0]
 [  0   0   0   0   0   0  89   0   0   0   0]
 [  0   0   0   0   0   0   0  99   0   0   0]
 [  0   0   0   0   0   0   0   0  91   0   0]
 [  0   0   0   0   0   0   0   0   0 122   0]
 [  0   0   0   0   0   0   0   0   0   0 101]]
Testing Accuracy =   1.0

Model  1
[[ 93   0   0   0   0   0   0   0   0   0   0]
 [  0  85   0   0   0   0   0   0   0   0   0]
 [  0   0 101   0   0   0   0   0   0   0   0]
 [  0   0   0  90   0   0   0   0   0   0   0]
 [  0   0   0   0  87   0   0   0   0   0   0]
 [  0   0   0   0   0 107   0   0   0   0   0]
 [  0   0   0   0   0   0  89   0   0   0   0]
 [  0   0   0   0   0   0   0  99   0   0   0]
 [  0   0   0   0   0   0   0   0  91   0   0]
 [  0   0   0  

# Testing accuracy by another method


In [48]:
# Cross-check the models with scikit-learn's built-in metrics.

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


for i, fitted in enumerate(model):
    # Predict once per model and reuse the labels for both metrics instead of
    # running predict() on the test set twice.
    Y_pred = fitted.predict(X_test)

    print("Model ", i)
    print(classification_report(Y_test, Y_pred))
    print(accuracy_score(Y_test, Y_pred))
    print()

Model  0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        93
           1       1.00      1.00      1.00        85
           2       1.00      1.00      1.00       101
           3       1.00      1.00      1.00        90
           4       1.00      1.00      1.00        87
           5       1.00      1.00      1.00       107
           6       1.00      1.00      1.00        89
           7       1.00      1.00      1.00        99
           8       1.00      1.00      1.00        91
           9       1.00      1.00      1.00       122
          10       1.00      1.00      1.00       101

    accuracy                           1.00      1065
   macro avg       1.00      1.00      1.00      1065
weighted avg       1.00      1.00      1.00      1065

1.0

Model  1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        93
           1       1.00      1.00      1.00        85
 

In [49]:
# Show the dimensions of the held-out test feature matrix.
X_test.shape

(1065, 56)

# Saving models into file

In [52]:
def saveModel(estimator=None, path="DiseasePrediction(Rand)"):
    """Pickle a fitted model to disk.

    Parameters
    ----------
    estimator : object, optional
        The object to serialise. When omitted, the second entry of the
        notebook-level ``model`` tuple is saved, which is what this function
        always did before the parameter existed.
    path : str or path-like, optional
        Destination file. Defaults to ``"DiseasePrediction(Rand)"`` in the
        current working directory.
    """
    import pickle

    if estimator is None:
        # Backward-compatible default: fall back to the global ``model`` tuple.
        estimator = model[1]

    with open(path, "wb") as f:
        pickle.dump(estimator, f)

In [53]:
# Called without arguments this pickles model[1] to the file
# "DiseasePrediction(Rand)" in the current working directory.
saveModel()

# Testing the model against incoming input

In [54]:
# Symptom names understood by the input cell, written as one
# whitespace-separated block and split into a list.
# NOTE(review): presumably in the same order as the feature columns of the
# training CSV - verify against df.columns[1:].
header = """
    bloody_stools fecal_leakage swelling dizziness confusion fatigue itching vomiting arm_pain
    cough muscle_pain depression fever painful_bowel_moments urine_blood sweating nausea
    stiff_neck decreased_appetite weak wheezing bleeding hives bleed headache dry_mouth sweat
    stomach_pain stool_pressure anxiety shoulder_pain anus_itching vision_problem abdominal_pain
    chest_pain weight_loss diarrhea breath_problems thirsty anus_swelling blood_o_tissue constipation
    neck_pain low_heartbeat more_urine low_breath muscle_cramps muscle_spasm yawning rash back_pain
    anal_bleeding lump_anus cold skin_rash neck_stiff
""".split()
len(header)

56

In [70]:
# Read a whitespace-separated list of symptom names from the user.
# str.split() with no argument already discards surrounding whitespace.
inputs = input("Enter Symptoms : ").split()

print(inputs)

# Report names that do not match any known symptom (for example typos);
# previously they were dropped silently, so the prediction could be based on
# fewer symptoms than the user believed they had entered.
known_symptoms = set(header)
unrecognised = [name for name in inputs if name not in known_symptoms]
if unrecognised:
    print("Ignoring unrecognised symptoms :", unrecognised)

# One 0/1 flag per entry of ``header``, in the same order: 1 when the symptom
# was entered, 0 otherwise.
entered = set(inputs)
model_inputs = [1 if name in entered else 0 for name in header]


Enter Symptoms : fever vomiting haedache bleeding skin_rash fatigue muscle_pain
['fever', 'vomiting', 'haedache', 'bleeding', 'skin_rash', 'fatigue', 'muscle_pain']


In [71]:
# Load a previously pickled decision-tree model and classify the encoded symptoms.
# NOTE(review): nothing in this notebook writes "Models/DiseasePrediction(Dec)";
# the save cell only produces "DiseasePrediction(Rand)" in the working
# directory, so this file must already exist - confirm where it comes from.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
# Building the path with pathlib avoids the invalid "\D" escape sequence of the
# original literal and works on every operating system.
with open(Path("Models") / "DiseasePrediction(Dec)","rb") as f:
    decisionTreeModel = pickle.load(f)

# predict() expects a 2-D input, hence the single-row wrapper list.
prediction = decisionTreeModel.predict([model_inputs])
prediction

array([2])

In [72]:
# Map the predicted class code back to a disease name via the sorted
# ``disease`` list built when the CSV was loaded.
# NOTE(review): this assumes the label encoder numbered the classes in that
# same sorted order - labelencoder_Y.inverse_transform(prediction) would make
# the mapping explicit; confirm before relying on it.
disease[prediction[0]]

'Dengue'