In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Raw disease/symptom records (a Disease column followed by Symptom_N columns).
symptoms = pd.read_csv(filepath_or_buffer='../data/dataset.csv')
# Per-symptom severity table; loaded here for later use.
symptom_weights = pd.read_csv(filepath_or_buffer='../data/Symptom-severity.csv')

In [3]:
# Peek at the first five records to check the column layout.
symptoms.head(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
# Peek at the last five records as well.
symptoms.tail(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,
4919,Impetigo,skin_rash,high_fever,blister,red_sore_around_nose,yellow_crust_ooze,,,,,,,,,,,,


In [5]:
def remove_whitespace(x):
    """Convert ``x`` to its string form and trim surrounding whitespace.

    Any value is accepted because it is passed through ``str`` first; as a
    consequence a missing value (float NaN) comes back as the text ``'nan'``.
    Whitespace inside the string is left untouched.
    """
    as_text = str(x)
    return as_text.strip()

# Clean every column in place: each cell becomes a stripped string, which
# also turns missing entries into the literal text 'nan'.
for col_name in symptoms.columns:
    stripped_values = symptoms[col_name].apply(remove_whitespace)
    symptoms[col_name] = stripped_values

In [6]:
# Collect every distinct value found in the symptom columns (everything
# except the 'Disease' label column).
# NOTE(review): because the cleaning step stringifies missing cells, the
# placeholder text 'nan' is collected here too and ends up as a feature
# column - confirm whether that is intended.
symptoms_lst = set()
symptom_columns = symptoms.drop(columns=['Disease'])
for col_name in symptom_columns.columns:
    distinct_values = symptom_columns[col_name].unique()
    symptoms_lst.update(distinct_values)

# One zero-filled column per collected value, as long as the record table.
n_records = symptoms.shape[0]
symptoms_dict = {}
for symptom in symptoms_lst:
    symptoms_dict[symptom] = np.zeros(n_records)

In [7]:
# Start the indicator matrix as an all-zero frame, one column per collected value.
diseases = pd.DataFrame(data=symptoms_dict)

In [8]:
def one_hot_encode(row):
    """Build a 0/1 indicator mapping of the symptoms present in one record.

    Parameters
    ----------
    row : pandas.Series
        One record of the symptom table. The first entry is skipped
        (presumably the disease label); the remaining entries are symptom
        names, with unused slots holding the ``'nan'`` placeholder text or a
        real missing value.

    Returns
    -------
    dict
        One key per entry of the global ``symptoms_lst``, mapped to 1 when
        that symptom occurs in ``row`` and 0 otherwise.
    """
    # Drop placeholders explicitly. Slicing off the last unique value is only
    # correct when the row has an unused slot; a row with every slot filled
    # would silently lose its final real symptom.
    row_symptoms = [
        symptom
        for symptom in row.iloc[1:].unique()
        if not pd.isna(symptom) and symptom != 'nan'
    ]

    temp = {symptom: 0 for symptom in symptoms_lst}
    for symptom in row_symptoms:
        temp[symptom] = 1

    return temp

# Fill the indicator matrix one record at a time; the index label yielded by
# ``iterrows`` is used as the row position in ``diseases``.
for row_idx, record in symptoms.iterrows():
    diseases.iloc[row_idx] = one_hot_encode(record)

In [9]:
# Append the label column (taken from the cleaned symptom table) to the
# indicator matrix, then display the result.
disease_labels = symptoms['Disease']
diseases = diseases.assign(disease=disease_labels)
diseases

Unnamed: 0,spotting_ urination,nausea,mild_fever,blurred_and_distorted_vision,rusty_sputum,polyuria,belly_pain,scurring,acidity,continuous_feel_of_urine,...,irritability,loss_of_smell,internal_itching,dehydration,sunken_eyes,acute_liver_failure,neck_pain,red_sore_around_nose,headache,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,(vertigo) Paroymsal Positional Vertigo
4916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acne
4917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Urinary tract infection
4918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Psoriasis


In [10]:
# Size of the learning problem: number of feature columns and of distinct labels.
num_predictors = len(symptoms_lst)
num_classes = diseases['disease'].nunique(dropna=False)

print(f"Number of predictors: {num_predictors}")
print(f"Number of classes: {num_classes}")

Number of predictors: 132
Number of classes: 41


In [11]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

In [12]:
# Features are every indicator column; the target is the disease label.
X = diseases.drop(columns=['disease'])
y = diseases['disease']
# Hold out 20% of the records for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn randomly permutes candidate features at
# each split, so without a fixed random_state equally good splits can be
# resolved differently from run to run and the saved model is not reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics for the decision tree.
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [23]:
# Features are every indicator column; the target is the disease label.
X = diseases.drop(columns=['disease'])
y = diseases['disease']

# Same 80/20 split and seed as used for the decision tree.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# L1-penalised logistic regression. The liblinear solver shuffles the data
# internally, so random_state is fixed to make the fit reproducible.
logreg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)

# Hold-out metrics for the logistic regression.
f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Columns of y_prob follow the order of logreg.classes_.
class_labels = logreg.classes_

TOP_K = 3       # how many candidate diseases to list per test record
N_PREVIEW = 5   # how many test records to print

# Column indices of the TOP_K most probable classes per record, best first.
top_preds = np.argsort(y_prob, axis=1)[:, -TOP_K:][:, ::-1]

# Bound both loops by what is actually available so a small test set or a
# problem with fewer than TOP_K classes does not raise IndexError.
for i in range(min(N_PREVIEW, len(top_preds))):
    idx = top_preds[i]
    print(f"Prediction {i + 1}:")
    for j in range(len(idx)):
        print(f"Disease: {class_labels[idx[j]]}, Probability: {y_prob[i, idx[j]]*100:.2f}%")
    print('\n')



Prediction 1:
Disease: Acne, Probability: 97.97%
Disease: Fungal infection, Probability: 0.13%
Disease: Impetigo, Probability: 0.09%


Prediction 2:
Disease: Acne, Probability: 98.06%
Disease: Fungal infection, Probability: 0.12%
Disease: Impetigo, Probability: 0.08%


Prediction 3:
Disease: Hyperthyroidism, Probability: 97.04%
Disease: Arthritis, Probability: 0.48%
Disease: Jaundice, Probability: 0.40%


Prediction 4:
Disease: AIDS, Probability: 98.24%
Disease: Malaria, Probability: 0.10%
Disease: Paralysis (brain hemorrhage), Probability: 0.07%


Prediction 5:
Disease: Chronic cholestasis, Probability: 97.37%
Disease: Hepatitis D, Probability: 0.30%
Disease: Fungal infection, Probability: 0.18%




array([[18,  0,  0, ...,  0,  0,  0],
       [ 0, 30,  0, ...,  0,  0,  0],
       [ 0,  0, 24, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 26,  0,  0],
       [ 0,  0,  0, ...,  0, 22,  0],
       [ 0,  0,  0, ...,  0,  0, 34]])

In [24]:
# Persist both fitted estimators; the last call's return value (the list of
# written file names) is the cell output.
joblib.dump(value=model, filename='../models/DecisionTreeModel.pkl')
joblib.dump(value=logreg, filename='../models/LogisticRegressionModel.pkl')

['../models/LogisticRegressionModel.pkl']