In [1]:
import numpy as np
import pandas as pd

In [2]:
# main dataset
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,Disease,Symptom_0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16
0,AIDS,muscle_wasting,patches_in_throat,high_fever,extra_marital_contacts,,,,,,,,,,,,,
1,AIDS,patches_in_throat,high_fever,extra_marital_contacts,,,,,,,,,,,,,,
2,AIDS,muscle_wasting,high_fever,extra_marital_contacts,,,,,,,,,,,,,,
3,AIDS,muscle_wasting,patches_in_throat,extra_marital_contacts,,,,,,,,,,,,,,
4,AIDS,muscle_wasting,patches_in_throat,high_fever,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,Varicose veins,cramps,bruising,obesity,swollen_legs,swollen_blood_vessels,prominent_veins_on_calf,,,,,,,,,,,
309,Varicose veins,fatigue,cramps,bruising,obesity,swollen_legs,swollen_blood_vessels,prominent_veins_on_calf,,,,,,,,,,
310,Varicose veins,fatigue,cramps,bruising,obesity,swollen_legs,prominent_veins_on_calf,,,,,,,,,,,
311,Varicose veins,fatigue,cramps,bruising,swollen_legs,swollen_blood_vessels,prominent_veins_on_calf,,,,,,,,,,,


In [3]:
# Other important datasets
df_precaution = pd.read_csv('disease_precaution.csv')
df_description = pd.read_csv('disease_description.csv')
df_specialist = pd.read_csv('Doctor_Versus_Disease.csv',encoding='latin1', names=['Disease','Specialist'])

In [4]:
df_specialist

Unnamed: 0,Disease,Specialist
0,Drug Reaction,Allergist
1,Allergy,Allergist
2,Hypertension,Cardiologist
3,Heart attack,Cardiologist
4,Psoriasis,Dermatologist
5,Chicken pox,Dermatologist
6,Acne,Dermatologist
7,Impetigo,Dermatologist
8,Fungal infection,Dermatologist
9,Hypothyroidism,Endocrinologist


In [5]:
df_precaution.head()

Unnamed: 0,Disease,Symptom_precaution_0,Symptom_precaution_1,Symptom_precaution_2,Symptom_precaution_3
0,AIDS,avoid open cuts,wear ppe if possible,consult doctor,follow up
1,Acne,bath twice,avoid fatty spicy food,drink plenty of water,avoid too many products
2,Alcoholic hepatitis,stop alcohol consumption,consult doctor,medication,follow up
3,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
4,Arthritis,exercise,use hot and cold therapy,try acupuncture,massage


1. Here we are dropping the disease column and then joining the tuples of all symptoms_precaution as Precaution 
2. Next we form a new dataset, by dropping the columns of df_try and and concatening it with the disease column
3. This forms a new dataset named df_precaution

In [6]:
df_try = df_precaution.drop(columns ='Disease')
df_try['Precaution'] = df_try.apply(lambda row: " ".join(row.values.astype(str)),axis =1)

In [7]:
df_try = df_try.drop(['Symptom_precaution_0','Symptom_precaution_1','Symptom_precaution_2','Symptom_precaution_3'],axis =1)

In [8]:
df_try = pd.concat([df_precaution['Disease'],df_try],axis =1)
df_try.head()

Unnamed: 0,Disease,Precaution
0,AIDS,avoid open cuts wear ppe if possible consult d...
1,Acne,bath twice avoid fatty spicy food drink plenty...
2,Alcoholic hepatitis,stop alcohol consumption consult doctor medica...
3,Allergy,apply calamine cover area with bandage nan use...
4,Arthritis,exercise use hot and cold therapy try acupunct...


In [9]:
df_precaution = df_try
df_precaution.head()

Unnamed: 0,Disease,Precaution
0,AIDS,avoid open cuts wear ppe if possible consult d...
1,Acne,bath twice avoid fatty spicy food drink plenty...
2,Alcoholic hepatitis,stop alcohol consumption consult doctor medica...
3,Allergy,apply calamine cover area with bandage nan use...
4,Arthritis,exercise use hot and cold therapy try acupunct...


1. Here we are dropping the disease column and then joining the tuples of all symptoms making a new column Symptoms 
2. Next we form a new dataset, by dropping the columns of df_main and and concatening it with the disease column
3. This forms a new dataset named df.. making this our main dataset of disease and their symptoms.

In [10]:
df.fillna('',inplace =True)

In [11]:
df_main = df.drop(columns='Disease')

In [12]:
df_main['Symptoms'] = df_main.apply(lambda row: " ".join(row.values.astype(str)),axis =1)

In [13]:
df_main = df_main.drop(['Symptom_0','Symptom_1','Symptom_2','Symptom_3','Symptom_4','Symptom_5','Symptom_6','Symptom_7','Symptom_8','Symptom_9','Symptom_10','Symptom_11','Symptom_12','Symptom_13','Symptom_14','Symptom_15','Symptom_16',],axis =1)


In [14]:
df_main['Symptoms'] = df_main['Symptoms'].str.replace('_'," ")

In [15]:
df_main = pd.concat([df['Disease'],df_main],axis =1)

In [16]:
df = df_main
df.head()

Unnamed: 0,Disease,Symptoms
0,AIDS,muscle wasting patches in throat high fever...
1,AIDS,patches in throat high fever extra marital ...
2,AIDS,muscle wasting high fever extra marital con...
3,AIDS,muscle wasting patches in throat extra mari...
4,AIDS,muscle wasting patches in throat high fever...



We will use TFIDF Vectorizer to extract features from the text

In [17]:
# using NLP from here
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vec = TfidfVectorizer(max_features=5000)
x = vec.fit_transform(df['Symptoms']).toarray()
y = df['Disease']

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

Splitting the Dataset

In [20]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3)

In [21]:
model = MultinomialNB()
model.fit(x_train,y_train)

In [22]:
y_pred =model.predict(x_test)
y_pred

array(['Drug Reaction', 'Hepatitis E', 'Peptic ulcer disease', 'Jaundice',
       'Gastroenteritis', 'Typhoid', 'Varicose veins', 'Bronchial Asthma',
       'Varicose veins', 'Chronic cholestasis', 'Hepatitis D', 'Dengue',
       'Impetigo', 'Hypoglycemia', 'Hypertension ', 'GERD', 'Diabetes ',
       'Chicken pox', 'Paroxysmal Positional Vertigo', 'Malaria',
       'Malaria', 'Diabetes ', 'Covid', 'Varicose veins', 'Arthritis',
       'Pneumonia', 'Dimorphic hemorrhoids(piles)', 'Common Cold',
       'Jaundice', 'Alcoholic hepatitis', 'Malaria', 'Common Cold',
       'Dengue', 'Migraine', 'Migraine', 'Chronic cholestasis', 'GERD',
       'Typhoid', 'Malaria', 'Pneumonia', 'Hepatitis D', 'Psoriasis',
       'Dengue', 'Dimorphic hemorrhoids(piles)', 'Chicken pox',
       'Hepatitis C', 'Malaria', 'Hepatitis B', 'Urinary tract infection',
       'Hypoglycemia', 'Urinary tract infection', 'Osteoarthritis',
       'Chicken pox', 'Covid', 'Jaundice', 'Dengue', 'Psoriasis',
       'Hepatitis

In [23]:
print("Accuracy Score: ",accuracy_score(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))

Accuracy Score:  0.8085106382978723
Classification Report:
                                precision    recall  f1-score   support

                         AIDS       0.00      0.00      0.00         3
                         Acne       0.00      0.00      0.00         3
          Alcoholic hepatitis       1.00      1.00      1.00         2
                    Arthritis       1.00      0.33      0.50         3
             Bronchial Asthma       1.00      1.00      1.00         2
         Cervical spondylosis       0.00      0.00      0.00         0
                  Chicken pox       1.00      1.00      1.00         4
          Chronic cholestasis       0.00      0.00      0.00         0
                  Common Cold       1.00      1.00      1.00         2
                        Covid       1.00      1.00      1.00         2
                       Dengue       1.00      1.00      1.00         4
                    Diabetes        1.00      1.00      1.00         3
 Dimorphic hemor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
def pred_prognosis():
    user_symptoms = input("Enter ur symptoms")
    user_symptoms = user_symptoms.lower().replace('[^\w\s]','')
    symps_vec = vec.transform([user_symptoms]).toarray()
    prognosis = model.predict(symps_vec)
    result = pd.DataFrame({'Disease':prognosis}) 
    result = result.merge(df_specialist, on="Disease",how="left")
    result = result.merge(df_precaution, on="Disease", how = "left")
    result = result.merge(df_description, on="Disease",how="left")

    return result

In [26]:
pred_prognosis()

Unnamed: 0,Disease,Specialist,Precaution,Symptom_Description
0,Common Cold,Otolaryngologist,drink vitamin c rich drinks take vapour avoid ...,The common cold is a viral infection of your n...
