In [13]:
import numpy as np 
import pandas as pd 

In [14]:
symptoms = pd.read_csv('dataset.csv')
symptom_weights = pd.read_csv('Symptom-severity.csv')

In [15]:
symptoms.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [16]:
symptoms.tail()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,
4919,Impetigo,skin_rash,high_fever,blister,red_sore_around_nose,yellow_crust_ooze,,,,,,,,,,,,


In [17]:
def remove_whitespace(x):
    return str(x).strip()

for column in symptoms.columns:
    symptoms[column] = symptoms[column].apply(remove_whitespace)

In [18]:
symptoms_lst = set()

for column in symptoms.drop(columns=['Disease']):
    symptoms_lst.update(symptoms[column].unique())

symptoms_dict = {symptom:np.zeros(symptoms.shape[0]) for symptom in symptoms_lst}

In [19]:
diseases = pd.DataFrame(symptoms_dict)

In [20]:
def one_hot_encode(row):
    row_symptoms = row[1:].unique()[:-1]
    
    temp = {symptom:0 for symptom in symptoms_lst}
    for symptom in row_symptoms:
        temp[symptom] = 1
    
    return temp

for i, row in symptoms.iterrows():
    diseases.iloc[i] = one_hot_encode(row)

In [21]:
diseases = diseases.assign(disease=symptoms['Disease'])
diseases

Unnamed: 0,patches_in_throat,history_of_alcohol_consumption,swelled_lymph_nodes,small_dents_in_nails,abdominal_pain,silver_like_dusting,pain_during_bowel_movements,headache,lack_of_concentration,watering_from_eyes,...,constipation,loss_of_appetite,dehydration,runny_nose,swelling_joints,swollen_blood_vessels,muscle_pain,pain_behind_the_eyes,acidity,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,(vertigo) Paroymsal Positional Vertigo
4916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acne
4917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Urinary tract infection
4918,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Psoriasis


In [22]:
num_classes = len(diseases['disease'].unique())
num_predictors = len(symptoms_lst)

print(f"Number of predictors: {num_predictors}")
print(f"Number of classes: {num_classes}")

Number of predictors: 132
Number of classes: 41


In [25]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import joblib

In [27]:
X = diseases.drop(columns=['disease'])
y = diseases['disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='macro')
acc = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [28]:
joblib.dump(model, 'model.pkl')

['model.pkl']