### Import necessary files


In [3]:
import regex as re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MultiLabelBinarizer 
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Load and view data


In [4]:
dataset = pd.read_csv('./dataset.csv')

In [5]:
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [6]:
dataset.shape

(4920, 18)

### Checking for balance of dataset


In [7]:
dataset.Disease.value_counts()

Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
Aller

In [8]:
dataset.Disease.unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [9]:
df = dataset.copy()

In [10]:
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1) 
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))
stay_cols = ['Disease', 'All Symptoms']
df = df[stay_cols]
df.head()

Unnamed: 0,Disease,All Symptoms
0,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
1,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
2,Fungal infection,"dischromic _patches, nodal_skin_eruptions,Fun..."
3,Fungal infection,"dischromic _patches, skin_rash,Fungal infecti..."
4,Fungal infection,"nodal_skin_eruptions, skin_rash,Fungal infect..."


In [11]:
df['All Symptoms'][0]

' dischromic _patches, nodal_skin_eruptions, skin_rash,Fungal infection,itching'

### Removing the '\_'


In [12]:
def strip_to_tokens(text):
    text = re.sub(r'[_\s]+', ' ', text)
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))
df = df.drop(['All Symptoms'], axis=1)
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
1,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
2,Fungal infection,"dischromic patches, nodal skin eruptions, fung..."
3,Fungal infection,"dischromic patches, skin rash, fungal infectio..."
4,Fungal infection,"nodal skin eruptions, skin rash, fungal infect..."


In [13]:
df['Basic Tokens'][0]

'dischromic patches, nodal skin eruptions, skin rash, fungal infection, itching'

### Using MultiLabelBinarizer to convert all the symptoms to binary


In [14]:
dfE = df.copy()
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()

one_hot_encoded_df = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

df_encoded = pd.concat([dfE, one_hot_encoded_df], axis=1)

df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,(vertigo) paroymsal positional vertigo,abdominal pain,abnormal menstruation,acidity,acne,acute liver failure,aids,alcoholic hepatitis,allergy,altered sensorium,anxiety,arthritis,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bronchial asthma,bruising,burning micturition,cervical spondylosis,chest pain,chicken pox,chills,chronic cholestasis,cold hands and feets,coma,common cold,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,dengue,depression,diabetes,diarrhoea,dimorphic hemmorhoids(piles),dischromic patches,distention of abdomen,dizziness,drug reaction,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,fungal infection,gastroenteritis,gerd,headache,heart attack,hepatitis a,hepatitis b,hepatitis c,hepatitis d,hepatitis e,high fever,hip joint pain,history of alcohol consumption,hypertension,hyperthyroidism,hypoglycemia,hypothyroidism,impetigo,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,jaundice,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,malaria,migraine,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,osteoarthristis,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,paralysis (brain hemorrhage),passage of gases,patches in throat,peptic ulcer diseae,phlegm,pneumonia,polyuria,prominent veins on calf,psoriasis,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),tuberculosis,typhoid,ulcers on tongue,unsteadiness,urinary tract infection,varicose veins,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
symptoms = df_encoded.columns[1:]
print('Number of unique symptoms:', len(symptoms))
print(symptoms)

Number of unique symptoms: 172
Index(['(vertigo) paroymsal positional vertigo', 'abdominal pain',
       'abnormal menstruation', 'acidity', 'acne', 'acute liver failure',
       'aids', 'alcoholic hepatitis', 'allergy', 'altered sensorium',
       ...
       'vomiting', 'watering from eyes', 'weakness in limbs',
       'weakness of one body side', 'weight gain', 'weight loss',
       'yellow crust ooze', 'yellow urine', 'yellowing of eyes',
       'yellowish skin'],
      dtype='object', length=172)


### Using LabelEncoding to encode the disease names


In [16]:
encoded_df = df_encoded.copy()

class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start=0):
        self.start = start
        super().__init__()

    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

In [17]:
flattened_series = encoded_df['Disease'].astype(str)
encoder = CustomLabelEncoder(start=200)

In [18]:
encoded_values = encoder.fit_transform(flattened_series)
encoded_df['Disease'] = encoded_values

mapping_data = {'label_encoder': encoder}

label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

encoded_df.head()

Unnamed: 0,Disease,(vertigo) paroymsal positional vertigo,abdominal pain,abnormal menstruation,acidity,acne,acute liver failure,aids,alcoholic hepatitis,allergy,altered sensorium,anxiety,arthritis,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bronchial asthma,bruising,burning micturition,cervical spondylosis,chest pain,chicken pox,chills,chronic cholestasis,cold hands and feets,coma,common cold,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,dengue,depression,diabetes,diarrhoea,dimorphic hemmorhoids(piles),dischromic patches,distention of abdomen,dizziness,drug reaction,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,fungal infection,gastroenteritis,gerd,headache,heart attack,hepatitis a,hepatitis b,hepatitis c,hepatitis d,hepatitis e,high fever,hip joint pain,history of alcohol consumption,hypertension,hyperthyroidism,hypoglycemia,hypothyroidism,impetigo,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,jaundice,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,malaria,migraine,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,osteoarthristis,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,paralysis (brain hemorrhage),passage of gases,patches in throat,peptic ulcer diseae,phlegm,pneumonia,polyuria,prominent veins on calf,psoriasis,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),tuberculosis,typhoid,ulcers on tongue,unsteadiness,urinary tract infection,varicose veins,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
label_mapping

{'(vertigo) Paroymsal  Positional Vertigo': 200,
 'AIDS': 201,
 'Acne': 202,
 'Alcoholic hepatitis': 203,
 'Allergy': 204,
 'Arthritis': 205,
 'Bronchial Asthma': 206,
 'Cervical spondylosis': 207,
 'Chicken pox': 208,
 'Chronic cholestasis': 209,
 'Common Cold': 210,
 'Dengue': 211,
 'Diabetes ': 212,
 'Dimorphic hemmorhoids(piles)': 213,
 'Drug Reaction': 214,
 'Fungal infection': 215,
 'GERD': 216,
 'Gastroenteritis': 217,
 'Heart attack': 218,
 'Hepatitis B': 219,
 'Hepatitis C': 220,
 'Hepatitis D': 221,
 'Hepatitis E': 222,
 'Hypertension ': 223,
 'Hyperthyroidism': 224,
 'Hypoglycemia': 225,
 'Hypothyroidism': 226,
 'Impetigo': 227,
 'Jaundice': 228,
 'Malaria': 229,
 'Migraine': 230,
 'Osteoarthristis': 231,
 'Paralysis (brain hemorrhage)': 232,
 'Peptic ulcer diseae': 233,
 'Pneumonia': 234,
 'Psoriasis': 235,
 'Tuberculosis': 236,
 'Typhoid': 237,
 'Urinary tract infection': 238,
 'Varicose veins': 239,
 'hepatitis A': 240}

In [20]:
encoded_df.shape

(4920, 173)

In [21]:
diseases_names = [key for key in label_mapping.keys()]
diseases = [strip_to_tokens(disease) for disease in diseases_names]
diseases_cleaned = [item[0] if isinstance(item, list) else item for item in diseases]
df_encoded = df_encoded.drop(diseases_cleaned, axis=1)
df_encoded.shape

(4920, 132)

In [22]:
df_encoded.head()

Unnamed: 0,Disease,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Create and compile the model


In [23]:
model_features = df_encoded.columns.tolist()
model_features.remove('Disease')
X = df_encoded[model_features]
Y = df_encoded['Disease']

In [24]:
y_encoded = pd.get_dummies(Y)
y_encoded.shape

(4920, 41)

In [25]:
y_encoded.head()

Unnamed: 0,(vertigo) Paroymsal Positional Vertigo,AIDS,Acne,Alcoholic hepatitis,Allergy,Arthritis,Bronchial Asthma,Cervical spondylosis,Chicken pox,Chronic cholestasis,Common Cold,Dengue,Diabetes,Dimorphic hemmorhoids(piles),Drug Reaction,Fungal infection,GERD,Gastroenteritis,Heart attack,Hepatitis B,Hepatitis C,Hepatitis D,Hepatitis E,Hypertension,Hyperthyroidism,Hypoglycemia,Hypothyroidism,Impetigo,Jaundice,Malaria,Migraine,Osteoarthristis,Paralysis (brain hemorrhage),Peptic ulcer diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary tract infection,Varicose veins,hepatitis A
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [27]:
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
X_eval_tensor = tf.convert_to_tensor(X_eval.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)
y_eval_tensor = tf.convert_to_tensor(y_eval.values, dtype=tf.float32)

In [28]:
X_train_tensor[0].shape

TensorShape([131])

In [29]:
model = keras.Sequential([
    layers.Input(shape=(X_train_tensor.shape[1],)),
    layers.Dense(160, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(200, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(240, activation='tanh'),
    layers.BatchNormalization(),
    layers.Dense(240, activation='tanh'),
    layers.Dropout(0.2),
    layers.Dense(200, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(160, activation='relu'),
    layers.Dense(y_train_tensor.shape[1], activation='softmax')
])

In [30]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, mode='max')
history = model.fit(X_train_tensor, y_train_tensor, epochs=500, callbacks=[early_stopping],
                    batch_size=16, validation_data=(X_eval_tensor, y_eval_tensor))

Epoch 1/500


[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6971 - loss: 1.3493 - val_accuracy: 1.0000 - val_loss: 0.0299
Epoch 2/500
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9962 - loss: 0.0176 - val_accuracy: 1.0000 - val_loss: 1.2554e-04
Epoch 3/500
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9971 - loss: 0.0130 - val_accuracy: 0.9919 - val_loss: 0.0105
Epoch 4/500
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9750 - loss: 0.0706 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 5/500
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9933 - loss: 0.0264 - val_accuracy: 1.0000 - val_loss: 5.3682e-05


In [31]:
model.evaluate(X_test_tensor, y_test_tensor)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 917us/step - accuracy: 1.0000 - loss: 5.4793e-05


[5.134143430041149e-05, 1.0]

### Manual testing of the model


In [32]:
user_input = ['muscle_wasting', 'patches_in_throat', 'high_fever']
original_data = df_encoded.copy()

def strip_to_token(symptoms):
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]

user_input_stripped = strip_to_token(user_input)
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)

final_user_input = final_user_input.drop('Disease', axis=1)

final_user_input.head()


Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(131,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [34]:
target_index = y_encoded.columns.tolist()
predict_proba = model.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
try:
    prediction_encode = target_index[predicted_class_index]
    inverse_label_encoding = {v: k for k, v in label_mapping.items()}
    prediction = inverse_label_encoding[prediction_encode]
except KeyError as e:
    res = str(e).replace("'", "")

res

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step


'AIDS'