In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import re
from googlesearch import search
import warnings
warnings.filterwarnings("ignore")
import requests
from bs4 import BeautifulSoup

# Take input a disease and return the content of wikipedia's infobox for that specific disease

def diseaseDetail(term):
    diseases=[term]
    ret=term+"\n"
    for dis in diseases:
        # search "disease wilipedia" on google
        query = dis+' wikipedia'
        for sr in search(query,tld="co.in",stop=10,pause=0.5):
            # open wikipedia link
            match=re.search(r'wikipedia',sr)
            filled = 0
            if match:
                wiki = requests.get(sr,verify=False)
                soup = BeautifulSoup(wiki.content, 'html5lib')
                # Fetch HTML code for 'infobox'
                info_table = soup.find("table", {"class":"infobox"})
                if info_table is not None:
                    # Preprocess contents of infobox
                    for row in info_table.find_all("tr"):
                        data=row.find("th",{"scope":"row"})
                        if data is not None:
                            symptom=str(row.find("td"))
                            symptom = symptom.replace('.','')
                            symptom = symptom.replace(';',',')
                            symptom = symptom.replace('<b>','<b> \n')
                            symptom=re.sub(r'<a.*?>','',symptom) # Remove hyperlink
                            symptom=re.sub(r'</a>','',symptom) # Remove hyperlink
                            symptom=re.sub(r'<[^<]+?>',' ',symptom) # All the tags
                            symptom=re.sub(r'\[.*\]','',symptom) # Remove citation text
                            symptom=symptom.replace("&gt",">")
                            ret+=data.get_text()+" - "+symptom+"\n"
#                            print(data.get_text(),"-",symptom)
                            filled = 1
                if filled:
                    break
    return ret

***Run all the cells***

# **Disease Detection using Symptoms and Doctor recommendation**

This notebook contains code to detect disease using the symptoms entered and selected by the user and recommends the appropriate doctors.


In [10]:
# Predicts diseases based on the symptoms entered and selected by the user.
# importing all necessary libraries
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from nltk.corpus import wordnet
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from time import time
from collections import Counter
import operator
from xgboost import XGBClassifier
import math
from sklearn.linear_model import LogisticRegression

warnings.simplefilter("ignore")

Download resources required for NLTK pre-processing

In [11]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

**synonyms function** finds the synonymous terms of a symptom entered by the user.

This is necessary as the user may use a term for a symptom which may be different from the one present in dataset.
This improves the accuracy by reducing the wrong predictions even when symptoms for a disease are entered slightly different than the ones on which model is trained.

*Synonyms are searched on Thesaurus.com and NLTK Wordnet*

In [12]:
# returns the list of synonyms of the input word from thesaurus.com (https://www.thesaurus.com/) and wordnet (https://www.nltk.org/howto/wordnet.html)
def synonyms(term):
    synonyms = []
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.content,  "html.parser")
    try:
        container=soup.find('section', {'class': 'MainContentContainer'})
        row=container.find('div',{'class':'css-191l5o0-ClassicContentCard'})
        row = row.find_all('li')
        for x in row:
            synonyms.append(x.get_text())
    except:
        None
    for syn in wordnet.synsets(term):
        synonyms+=syn.lemma_names()
    return set(synonyms)

In [13]:
# utlities for pre-processing
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

Disease Combination dataset contains the combinations for each of the disease present in dataset as practically it is often observed that it is not necessary for a person to have a disease when all the symptoms are faced by the patient or the user.

*To tackle this problem, combinations are made with the symptoms for each disease.*

 **This increases the size of the data exponentially and helps the model to predict the disease with much better accuracy.**

*df_comb -> Dataframe consisting of dataset generated by combining symptoms for each disease.*

*df_norm -> Dataframe consisting of dataset which contains a single row for each diseases with all the symptoms for that corresponding disease.*

**Dataset contains 261 diseases and their symptoms**

In [14]:
df_comb = pd.read_csv("/content/drive/MyDrive/Medical-Assistant-main/Dataset/dis_sym_dataset_comb.csv") # Disease combination
df_norm = pd.read_csv("/content/drive/MyDrive/Medical-Assistant-main/Dataset/dis_sym_dataset_norm.csv") # Individual Disease

X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

Using **Logistic Regression (LR) Classifier** as it gives better accuracy compared to other classification models as observed in the comparison of model accuracies in Model_latest.py

Cross validation is done on dataset with cv = 5

In [15]:
lr = LogisticRegression()
lr = lr.fit(X, Y)
scores = cross_val_score(lr, X, Y, cv=5)

In [16]:
X = df_norm.iloc[:, 1:]
Y = df_norm.iloc[:, 0:1]

In [17]:
# List of symptoms
dataset_symptoms = list(X.columns)

# Symptoms initially taken from user.

In [18]:
# Taking symptoms from user as input
user_symptoms = str(input("Please enter symptoms separated by comma(,):\n")).lower().split(',')
# Preprocessing the input symptoms
processed_user_symptoms=[]
for sym in user_symptoms:
    sym=sym.strip()
    sym=sym.replace('-',' ')
    sym=sym.replace("'",'')
    sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
    processed_user_symptoms.append(sym)

Please enter symptoms separated by comma(,):
cough, cold


In [19]:
user_symptoms

['cough', ' cold']

Pre-processing on symptoms entered by user is done.

In [20]:
# Taking each user symptom and finding all its synonyms and appending it to the pre-processed symptom string
user_symptoms = []
for user_sym in processed_user_symptoms:
    user_sym = user_sym.split()
    str_sym = set()
    for comb in range(1, len(user_sym)+1):
        for subset in combinations(user_sym, comb):
            subset=' '.join(subset)
            subset = synonyms(subset)
            str_sym.update(subset)
    str_sym.add(' '.join(user_sym))
    user_symptoms.append(' '.join(str_sym).replace('_',' '))
# query expansion performed by joining synonyms found for each symptoms initially entered
print("After query expansion done by using the symptoms entered")
print(user_symptoms)

After query expansion done by using the symptoms entered
['cough coughing', 'stale cold-blooded frigidity common cold coldness cold dusty low temperature moth-eaten frigid frigidness inhuman insensate']


The below procedure is performed in order to show the symptom synonmys found for the symptoms entered by the user.

The symptom synonyms and user symptoms are matched with the symptoms present in dataset. Only the symptoms which matches the symptoms present in dataset are shown back to the user.

In [21]:
# Loop over all the symptoms in dataset and check its similarity score to the synonym string of the user-input
# symptoms. If similarity>0.5, add the symptom to the final list
found_symptoms = set()
for idx, data_sym in enumerate(dataset_symptoms):
    data_sym_split=data_sym.split()
    for user_sym in user_symptoms:
        count=0
        for symp in data_sym_split:
            if symp in user_sym.split():
                count+=1
        if count/len(data_sym_split)>0.5:
            found_symptoms.add(data_sym)
found_symptoms = list(found_symptoms)

## **Prompt the user to select the relevant symptoms by entering the corresponding indices.**

In [22]:
# Print all found symptoms
print("Top matching symptoms from your search!")
for idx, symp in enumerate(found_symptoms):
    print(idx,":",symp)



Top matching symptoms from your search!
0 : coughing


In [23]:
# Show the related symptoms found in the dataset and ask user to select among them
select_list = input("\nPlease select the relevant symptoms. Enter indices (separated-space):\n").split()


Please select the relevant symptoms. Enter indices (separated-space):
0


In [24]:
# Find other relevant symptoms from the dataset based on user symptoms based on the highest co-occurance with the
# ones that is input by the user
dis_list = set()
final_symp = []
counter_list = []
for idx in select_list:
    symp=found_symptoms[int(idx)]
    final_symp.append(symp)
    dis_list.update(set(df_norm[df_norm[symp]==1]['label_dis']))

In [25]:
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)
    for idx,val in enumerate(row[0]):
        if val!=0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])

## To find symptoms which generally co-occur, for example with symptoms like cough, headache generally happens hence they co-occur.

In [26]:
# Symptoms that co-occur with the ones selected by user
dict_symp = dict(Counter(counter_list))
dict_symp_tup = sorted(dict_symp.items(), key=operator.itemgetter(1),reverse=True)

In [27]:
dict_symp = dict(Counter(counter_list))
# Sorting the list of tuples based on the count in descending order
dict_symp_tup = sorted(dict_symp.items(), key=operator.itemgetter(1), reverse=True)

# Printing only the top 5 symptoms with their index numbers
for index, (symptom, count) in enumerate(dict_symp_tup[:5]):
    print(f"{index + 1}) {symptom}")


1) feeling tired
2) fever
3) headache
4) muscle joint pain
5) runny nose


## User is presented with a list of co-occuring symptoms to select from and is performed iteratively to recommend more possible symptoms based on the similarity to the previously entered symptoms.

As the co-occuring symptoms can be in overwhelming numbers, only the top 5 are recommended to the user from which user can select the symptoms.

If user does not have any of those 5 symptoms and wants to see the next 5, he can do so by giving input as -1.

To stop the recommendation, user needs to give input as "No".

In [28]:
# Iteratively, suggest top co-occuring symptoms to the user and ask to select the ones applicable
found_symptoms=[]
count=0

for tup in dict_symp_tup:
    count+=1
    found_symptoms.append(tup[0])
    if count%5==0 or count==len(dict_symp_tup):
        print("\nCommon co-occuring symptoms:")
        for idx,ele in enumerate(found_symptoms):
            print(idx,":",ele)
        select_list = input("Do you have have of the symptoms from the above? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:\n").lower().split();
        if select_list[0]=='no':
            break
        if select_list[0]=='-1':
            found_symptoms = []
            continue
        for idx in select_list:
            final_symp.append(found_symptoms[int(idx)])
        found_symptoms = []


Common co-occuring symptoms:
0 : feeling tired
1 : fever
2 : headache
3 : muscle joint pain
4 : runny nose
Do you have have of the symptoms from the above? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
0

Common co-occuring symptoms:
0 : sore throat
1 : chest tightness
2 : recurring episode wheezing
3 : shortness breath
Do you have have of the symptoms from the above? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
0


Final Symptom list

In [29]:
# Create query vector based on symptoms selected by the user
print("\nFinal list of Symptoms that will be used for prediction:")
sample_x = [0 for x in range(0,len(dataset_symptoms))]
for val in final_symp:
    print(val)
    sample_x[dataset_symptoms.index(val)]=1


Final list of Symptoms that will be used for prediction:
coughing
feeling tired
sore throat


Prediction of disease is done

In [30]:
# Predict disease
lr = LogisticRegression()
lr = lr.fit(X, Y)
prediction = lr.predict_proba([sample_x])

Show top k diseases and their probabilities to the user.

K in this case is 10

In [31]:
k = 10
diseases = list(set(Y['label_dis']))
diseases.sort()
topk = prediction[0].argsort()[-k:][::-1]

In [32]:
# Assuming 'prediction' is the output of your model, giving a probability for each disease
topk = prediction[0].argsort()[-k:][::-1]

most_probable_diseases=[]

# Printing diseases corresponding to the indices in topk
print("Top Diseases Predicted:")
for idx in topk:
    most_probable_diseases.append(diseases[idx])
    print(diseases[idx])


Top Diseases Predicted:
Influenza
Rubella
Brucellosis
Hepatitis D
Fibromyalgia
Asthma
Congestive heart disease
Botulism
Strep throat
Anaemia


***Suggesting doctors based on most probable diseases***

In [34]:
doctors=pd.read_csv('/content/drive/MyDrive/Medical-Assistant-main/doctors.csv')

In [35]:
# Split the 'Specialization' entries and expand them into a list
specializations = doctors['Specialization'].str.split(',').explode()

# Split the 'Specialization' entries and expand them into a list
specializations = doctors['Specialization'].str.split(',').explode()

# Count occurrences of each specialization
specialization_counts = specializations.value_counts()

# Disease to Sepecialiation mapping using GPT3.5 Turbo

In [36]:
# Mapping diseases to medical specializations
disease_to_specialization = {
    'Abscess': 'General Surgeon',
    'Acquired Capillary Haemangioma of Eyelid': 'Dermatologist',
    'Acquired Immuno Deficiency Syndrome': 'Infectious Diseases',
    'Acute encephalitis syndrome': 'Neurologist',
    'Adult Inclusion Conjunctivitis': 'Eye Specialist',
    'Alcohol Abuse and Alcoholism': 'Psychiatrist',
    'Alopecia (hair loss)': 'Dermatologist',
    'Alzheimer': 'Neurologist',
    'Amaurosis Fugax': 'Ophthalmologist',
    'Amblyopia': 'Ophthalmologist',
    'Amoebiasis': 'Gastroenterologist',
    'Anaemia': 'Hematologist',
    'Aniseikonia': 'Ophthalmologist',
    'Anisometropia': 'Ophthalmologist',
    'Antepartum hemorrhage (Bleeding in late pregnancy)': 'Gynecologist',
    'Anthrax': 'Infectious Diseases',
    'Anxiety': 'Psychiatrist',
    'Appendicitis': 'General Surgeon',
    'Arthritis': 'Rheumatologist',
    'Asbestos-related diseases': 'Pulmonologist / Lung Specialist',
    'Aseptic meningitis': 'Neurologist',
    'Asthma': 'Pulmonologist / Lung Specialist',
    'Astigmatism': 'Ophthalmologist',
    'Atrophy': 'Neurologist',
    'Autism': 'Pediatrician',
    'Bad Breath (Halitosis)': 'General Physician',
    "Bell's Palsy": 'Neurologist',
    'Beriberi': 'Nutritionist',
    'Black Death': 'Infectious Diseases',
    'Bleeding Gums': 'Dentist',
    'Blindness': 'Ophthalmologist',
    'Botulism': 'Infectious Diseases',
    'Brain Tumour': 'Neuro Surgeon',
    'Breast Cancer / Carcinoma': 'Oncologist',
    'Bronchitis': 'Pulmonologist / Lung Specialist',
    'Brucellosis': 'Infectious Diseases',
    'Bubonic plague': 'Infectious Diseases',
    'Bunion': 'Orthopedic Surgeon',
    'Burns': 'General Surgeon',
    'Calculi': 'Urologist',
    'Campylobacter infection': 'Gastroenterologist',
    'Cancer': 'Oncologist',
    'Candidiasis': 'Infectious Diseases',
    'Carbon monoxide poisoning': 'Emergency Medicine',
    'Carpal Tunnel Syndrome': 'Orthopedic Surgeon',
    'Cavities': 'Dentist',
    'Celiacs disease': 'Gastroenterologist',
    'Cerebral palsy': 'Pediatric Neurologist',
    'Chagas disease': 'Infectious Diseases',
    'Chalazion': 'Ophthalmologist',
    'Chickenpox': 'Pediatrician',
    'Chikungunya Fever': 'Infectious Diseases',
    'Childhood Exotropia': 'Ophthalmologist',
    'Chlamydia': 'Sexologist',
    'Cholera': 'Infectious Diseases',
    'Chorea': 'Neurologist',
    'Chronic fatigue syndrome': 'Internal Medicine Specialist',
    'Chronic obstructive pulmonary disease (COPD)': 'Pulmonologist / Lung Specialist',
    'Cleft Lip and Cleft Palate': 'Plastic Surgeon',
    'Colitis': 'Gastroenterologist',
    'Colorectal Cancer': 'Oncologist',
    'Common cold': 'General Physician',
    'Condyloma': 'Dermatologist',
    'Congenital anomalies (birth defects)': 'Geneticist',
    'Congestive heart disease': 'Cardiologist',
    'Corneal Abrasion': 'Ophthalmologist',
    'Coronary Heart Disease': 'Cardiologist',
    'Coronavirus disease 2019 (COVID-19)': 'Infectious Diseases',
    'Cough': 'General Physician',
    'Crimean Congo haemorrhagic fever (CCHF)': 'Infectious Diseases',
    'Dehydration': 'General Physician',
    'Dementia': 'Neurologist',
    'Dengue': 'Infectious Diseases',
    'Diabetes Mellitus': 'Endocrinologist',
    'Diabetic Retinopathy': 'Ophthalmologist',
    'Diarrhea': 'Gastroenterologist',
    'Diphtheria': 'Infectious Diseases',
    "Down's Syndrome": 'Geneticist',
    'Dracunculiasis (guinea-worm disease)': 'Infectious Diseases',
    'Dysentery': 'Gastroenterologist',
    'Ear infection': 'Ent Specialist',
    'Early pregnancy loss': 'Gynecologist',
    'Ebola': 'Infectious Diseases',
    'Eclampsia': 'Gynecologist',
    'Ectopic pregnancy': 'Gynecologist',
    'Eczema': 'Dermatologist',
    'Endometriosis': 'Gynecologist',
    'Epilepsy': 'Neurologist',
    'Fibroids': 'Gynecologist',
    'Fibromyalgia': 'Rheumatologist',
    'Food Poisoning': 'Gastroenterologist',
    'Frost Bite': 'General Surgeon',
    'GERD': 'Gastroenterologist',
    'Gaming disorder': 'Psychiatrist',
    'Gangrene': 'General Surgeon',
    'Gastroenteritis': 'Gastroenterologist',
    'Genital herpes': 'Dermatologist',
    'Glaucoma': 'Ophthalmologist',
    'Goitre': 'Endocrinologist',
    'Gonorrhea': 'Sexologist',
    'Guillain-Barré syndrome': 'Neurologist',
    'Haemophilia': 'Hematologist',
    'Hand, Foot and Mouth Disease': 'Pediatrician',
    'Heat-Related Illnesses and Heat waves': 'Emergency Medicine',
    'Hepatitis': 'Hepatologist',
    'Hepatitis A': 'Hepatologist',
    'Hepatitis B': 'Hepatologist',
    'Hepatitis C': 'Hepatologist',
    'Hepatitis D': 'Hepatologist',
    'Hepatitis E': 'Hepatologist',
    'Herpes Simplex': 'Dermatologist',
    'High risk pregnancy': 'Gynecologist',
    'Human papillomavirus': 'Dermatologist',
    'Hypermetropia': 'Ophthalmologist',
    'Hyperthyroidism': 'Endocrinologist',
    'Hypothyroid': 'Endocrinologist',
    'Hypotonia': 'Pediatrician',
    'Impetigo': 'Dermatologist',
    'Inflammatory Bowel Disease': 'Gastroenterologist',
    'Influenza': 'General Physician',
    'Insomnia': 'Psychiatrist',
    'Interstitial cystitis': 'Urologist',
    'Iritis': 'Ophthalmologist',
    'Iron Deficiency Anemia': 'Hematologist',
    'Irritable bowel syndrome': 'Gastroenterologist',
    'Japanese Encephalitis': 'Infectious Diseases',
    'Jaundice': 'Hepatologist',
    'Kala-azar/ Leishmaniasis': 'Infectious Diseases',
    'Kaposi’s Sarcoma': 'Oncologist',
    'Keratoconjunctivitis Sicca (Dry eye syndrome)': 'Ophthalmologist',
    'Keratoconus': 'Ophthalmologist',
    'Kuru': 'Neurologist',
    'Laryngitis': 'Ent Specialist',
    'Lead poisoning': 'Toxicologist',
    'Legionellosis': 'Infectious Diseases',
    'Leprosy': 'Dermatologist',
    'Leptospirosis': 'Infectious Diseases',
    'Leukemia': 'Hematologist',
    'Lice': 'Dermatologist',
    'Lung cancer': 'Oncologist',
    'Lupus erythematosus': 'Rheumatologist',
    'Lyme disease': 'Infectious Diseases',
    'Lymphoma': 'Oncologist',
    'Mad cow disease': 'Neurologist',
    'Malaria': 'Infectious Diseases',
    'Marburg fever': 'Infectious Diseases',
    'Mastitis': 'Gynecologist',
    'Measles': 'Pediatrician',
    'Melanoma': 'Oncologist',
    'Middle East respiratory syndrome coronavirus (MERS‐CoV)': 'Infectious Diseases',
    'Migraine': 'Neurologist',
    'Mononucleosis': 'Infectious Diseases',
    'Mouth Breathing': 'Ent Specialist',
    'Multiple myeloma': 'Oncologist',
    'Multiple sclerosis': 'Neurologist',
    'Mumps': 'Pediatrician',
    'Muscular dystrophy': 'Neurologist',
    'Myasthenia gravis': 'Neurologist',
    'Myelitis': 'Neurologist',
    'Myocardial Infarction (Heart Attack)': 'Cardiologist',
    'Myopia': 'Ophthalmologist',
    'Narcolepsy': 'Neurologist',
    'Nasal Polyps': 'Ent Specialist',
    'Nausea and Vomiting of Pregnancy and  Hyperemesis gravidarum': 'Gynecologist',
    'Necrotizing Fasciitis': 'General Surgeon',
    'Neonatal Respiratory Disease Syndrome(NRDS)': 'Pediatrician',
    'Neoplasm': 'Oncologist',
    'Neuralgia': 'Neurologist',
    'Nipah virus infection': 'Infectious Diseases',
    'Obesity': 'Nutritionist',
    'Obsessive Compulsive Disorder': 'Psychiatrist',
    'Oral Cancer': 'Oncologist',
    'Orbital Dermoid': 'Ophthalmologist',
    'Osteoarthritis': 'Orthopedic Surgeon',
    'Osteomyelitis': 'Orthopedic Surgeon',
    'Osteoporosis': 'Orthopedic Surgeon',
    'Paratyphoid fever': 'Infectious Diseases',
    "Parkinson's Disease": 'Neurologist',
    'Pelvic inflammatory disease': 'Gynecologist',
    'Perennial Allergic Conjunctivitis': 'Allergy Specialist',
    'Pericarditis': 'Cardiologist',
    'Peritonitis': 'General Surgeon',
    'Pinguecula': 'Ophthalmologist',
    'Pneumonia': 'Pulmonologist / Lung Specialist',
    'Poliomyelitis': 'Pediatrician',
    'Polycystic ovary syndrome (PCOS)': 'Endocrinologist',
    'Porphyria': 'Dermatologist',
    'Post Menopausal Bleeding': 'Gynecologist',
    'Post-herpetic neuralgia': 'Pain Specialist',
    'Postpartum depression/ Perinatal depression': 'Psychiatrist',
    'Preeclampsia': 'Gynecologist',
    'Premenstrual syndrome': 'Gynecologist',
    'Presbyopia': 'Ophthalmologist',
    'Preterm birth': 'Gynecologist',
    'Progeria': 'Geneticist',
    'Psoriasis': 'Dermatologist',
    'Puerperal sepsis': 'Gynecologist',
    'Pulmonary embolism': 'Pulmonologist / Lung Specialist',
    'Ques fever': 'Infectious Diseases',
    'Quinsy': 'Ent Specialist',
    'Rabies': 'Infectious Diseases',
    "Raynaud's Phenomenon": 'Rheumatologist',
    'Repetitive strain injury': 'Orthopedic Surgeon',
    'Rheumatic fever': 'Cardiologist',
    'Rheumatism': 'Rheumatologist',
    'Rickets': 'Pediatrician',
    'Rift Valley fever': 'Infectious Diseases',
    'Rocky Mountain spotted fever': 'Infectious Diseases',
    'Rubella': 'Pediatrician',
    'SARS': 'Infectious Diseases',
    'SIDS': 'Pediatrician',
    'Sarcoidosis': 'Pulmonologist / Lung Specialist',
    'Sarcoma': 'Oncologist',
    'Scabies': 'Dermatologist',
    'Scarlet fever': 'Infectious Diseases',
    'Schizophrenia': 'Psychiatrist',
    'Sciatica': 'Orthopedic Surgeon',
    'Scrapie': 'Neurologist',
    'Scrub Typhus': 'Infectious Diseases',
    'Scurvy': 'Nutritionist',
    'Sepsis': 'Infectious Diseases',
    'Sexually transmitted infections (STIs)': 'Sexologist',
    'Shaken Baby Syndrome': 'Pediatrician',
    'Shigellosis': 'Infectious Diseases',
    'Shin splints': 'Orthopedic Surgeon',
    'Shingles': 'Dermatologist',
    'Sickle-cell anemia': 'Hematologist',
    'Smallpox': 'Infectious Diseases',
    'Stevens-Johnson syndrome': 'Dermatologist',
    'Stomach ulcers': 'Gastroenterologist',
    'Strep throat': 'Ent Specialist',
    'Stroke': 'Neurologist',
    'Sub-conjunctival Haemorrhage': 'Ophthalmologist',
    'Syphilis': 'Sexologist',
    'Taeniasis': 'Infectious Diseases',
    'Taeniasis/cysticercosis': 'Infectious Diseases',
    'Tay-Sachs disease': 'Geneticist',
    'Tennis elbow': 'Orthopedic Surgeon',
    'Tetanus': 'Infectious Diseases',
    'Thalassaemia': 'Hematologist',
    'Tinnitus': 'Ent Specialist',
    'Tonsillitis': 'Ent Specialist',
    'Toxic shock syndrome': 'Infectious Diseases',
    'Trachoma': 'Ophthalmologist',
    'Trichinosis': 'Infectious Diseases',
    'Trichomoniasis': 'Sexologist',
    'Tuberculosis': 'Pulmonologist / Lung Specialist',
    'Tularemia': 'Infectious Diseases',
    'Turners Syndrome': 'Geneticist',
    'Urticaria': 'Dermatologist',
    'Varicose Veins': 'Vascular Surgeon',
    'Vasovagal syncope': 'Cardiologist',
    'Vitamin B12 Deficiency': 'Nutritionist',
    'Vitiligo': 'Dermatologist',
    'Warkany syndrome': 'Geneticist',
    'Warts': 'Dermatologist',
    'Yaws': 'Dermatologist',
    'Yellow Fever': 'Infectious Diseases',
    'Zika virus disease': 'Infectious Diseases',
    'lactose intolerance': 'Gastroenterologist',
    'papilloedema': 'Ophthalmologist'
}

# Function to retrieve the specialization based on the disease name
def get_specialization(disease_name):
    return disease_to_specialization.get(disease_name, "Specialization not found")

# Suggesting doctors

In [37]:
doctors_df=doctors

In [38]:
doctors_df['Specialization'] = doctors_df['Specialization'].apply(lambda x: [spec.strip() for spec in x.split(',')])
# Calculate the normalized satisfaction score
doctors_df['Normalized Satisfaction Score'] = doctors_df['Patient Satisfaction Rate(%age)'] * doctors_df['Total_Reviews']


In [41]:
# Create query vector based on symptoms selected by the user
print("\nFinal list of Symptoms used for prediction:")
print("\n-------------------------------------------------------------------")
sample_x = [0 for x in range(0,len(dataset_symptoms))]
for val in final_symp:
    print(val)
    sample_x[dataset_symptoms.index(val)]=1


print("\n")
top_three_diseases = most_probable_diseases[:3]  # Take the top 3 predicted diseases


print(f"\nTop {k} diseases predicted based on symptoms")
topk_dict = {}
# Show top 10 highly probable disease to the user.
for idx,t in  enumerate(topk):
    match_sym=set()
    row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
    row[0].pop(0)

    for idx,val in enumerate(row[0]):
        if val!=0:
            match_sym.add(dataset_symptoms[idx])
    prob = (len(match_sym.intersection(set(final_symp)))+1)/(len(set(final_symp))+1)
    prob *= mean(scores)
    topk_dict[t] = prob
j = 0
topk_index_mapping = {}
topk_sorted = dict(sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True))
for key in topk_sorted:
  prob = topk_sorted[key]*100
  print(str(j) + " Disease name:",diseases[key], "\tProbability:",str(round(prob, 2))+"%")
  topk_index_mapping[j] = key
  j += 1

select = input("\nMore details about the disease? Enter index of disease or '-1' to discontinue and close the system:\n")
if select!='-1':
    dis=diseases[topk_index_mapping[int(select)]]
    print()
    print(diseaseDetail(dis))

# Print recommendations with proper formatting
print("Top Diseases Predicted and Doctor Recommendations:")
for i, disease in enumerate(top_three_diseases):
    specialization_needed = disease_to_specialization[disease]
    print("\n-------------------------------------------------------------------")
    print(f"Most Probable Disease {i + 1}: {disease}")
    print(f"Required Specialization: {specialization_needed}")

    # Filtering doctors based on the required specialization
    filtered_doctors = doctors_df[doctors_df['Specialization'].apply(lambda x: specialization_needed in x)]
    # Sort and pick the top doctor(s) based on normalized satisfaction score
    top_doctor = filtered_doctors.sort_values(by='Normalized Satisfaction Score', ascending=False).head(1)

    if not top_doctor.empty:
        doctor = top_doctor.iloc[0]
        print(f"Recommended Doctor: {doctor['Doctor Name']}")
        print(f"City: {doctor['City']}")
        print(f"Specialization: {doctor['Specialization']}")
        print(f"Qualification: {doctor['Doctor Qualification']}")
        print(f"Experience: {doctor['Experience(Years)']} years")
        print(f"Reviews: {doctor['Total_Reviews']}")
        print(f"Satisfaction Rate: {doctor['Patient Satisfaction Rate(%age)']}%")
        print(f"Average Time to Patients: {doctor['Avg Time to Patients(mins)']} mins")
        print(f"Wait Time: {doctor['Wait Time(mins)']} mins")
        print(f"Fee: PKR {doctor['Fee(PKR)']}")
        print(f"Hospital Address: {doctor['Hospital Address']}")
        print(f"Profile Link: {doctor['Doctors Link']}")
    else:
        print("No available doctors for this specialization.")


Final list of Symptoms used for prediction:

-------------------------------------------------------------------
coughing
feeling tired
sore throat



Top 10 diseases predicted based on symptoms
0 Disease name: Influenza 	Probability: 89.19%
1 Disease name: Rubella 	Probability: 66.89%
2 Disease name: Brucellosis 	Probability: 44.6%
3 Disease name: Hepatitis D 	Probability: 44.6%
4 Disease name: Fibromyalgia 	Probability: 44.6%
5 Disease name: Asthma 	Probability: 44.6%
6 Disease name: Congestive heart disease 	Probability: 44.6%
7 Disease name: Botulism 	Probability: 44.6%
8 Disease name: Strep throat 	Probability: 44.6%
9 Disease name: Anaemia 	Probability: 44.6%

More details about the disease? Enter index of disease or '-1' to discontinue and close the system:
0

Influenza
Other names -  flu, the flu, grippe (French for flu) 
Specialty -  Infectious disease 
Symptoms -  Fever, runny nose, sore throat, muscle pain, headache, coughing, fatigue 
Usual onset -  1–4 days after exposure