In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

In [3]:
# Load the symptom/disease table; the path is relative to the working directory.
df = pd.read_csv('disease_dataset.csv')

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(10000, 8)

In [5]:
# Preview the first rows to eyeball the columns and value formats.
df.head()

Unnamed: 0,ID,Disease,Sub-Disease,Symptoms,Severity,Duration (days),Age Group,Gender
0,1,Cough,Chronic Cough,"hoarseness, shortness of breath, long-term cough",Moderate,29,61+,Male
1,2,Fungal Infection,Nail Fungus,"thickened nails, brittle nails, discolored nails",Severe,17,61+,Female
2,3,Stomach Problem,Gastritis,"bloating, abdominal pain, nausea",Mild,20,41-50,Female
3,4,Stomach Problem,Gastritis,"bloating, nausea, abdominal pain, vomiting",Mild,17,0-10,Other
4,5,Fungal Infection,Jock Itch,"flaky skin, red rash, itchy groin",Moderate,18,41-50,Male


In [6]:
# Count of missing values in each column.
df.isna().sum()

ID                 0
Disease            0
Sub-Disease        0
Symptoms           0
Severity           0
Duration (days)    0
Age Group          0
Gender             0
dtype: int64

In [7]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

0

In [8]:
# Row count per sub-disease (the column later used as the prediction target), to check class balance.
df['Sub-Disease'].value_counts()

Sub-Disease
Migraine            358
Dry Eye             358
Acid Reflux         358
Constipation        354
Split Ends          349
Anemia              344
Food Poisoning      344
Jock Itch           343
Hair Thinning       343
Wet Cough           342
Dandruff            340
Dry Cough           337
Glaucoma            335
Athlete's Foot      333
Yeast Infection     332
Gastritis           332
Alopecia            331
Chronic Cough       329
Hypertension        328
Allergic Cough      326
Cataract            325
Nail Fungus         325
Ringworm            324
Conjunctivitis      324
Eye Strain          322
Diabetes            320
Asthma              319
IBS                 311
Whooping Cough      309
Lice Infestation    305
Name: count, dtype: int64

In [9]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               10000 non-null  int64 
 1   Disease          10000 non-null  object
 2   Sub-Disease      10000 non-null  object
 3   Symptoms         10000 non-null  object
 4   Severity         10000 non-null  object
 5   Duration (days)  10000 non-null  int64 
 6   Age Group        10000 non-null  object
 7   Gender           10000 non-null  object
dtypes: int64(2), object(6)
memory usage: 625.1+ KB


In [10]:
# Distinct raw symptom strings; each entry is a comma-separated list of symptom phrases.
df['Symptoms'].unique()

array(['hoarseness, shortness of breath, long-term cough',
       'thickened nails, brittle nails, discolored nails',
       'bloating, abdominal pain, nausea',
       'bloating, nausea, abdominal pain, vomiting',
       'flaky skin, red rash, itchy groin', 'runny nose, itchy eyes',
       'discharge, watery eyes, itching',
       'slow growth, brittle hair, receding hairline',
       'shortness of breath, coughing, wheezing', 'itching, hair loss',
       'wheezing, coughing, shortness of breath',
       'red bumps, crawling sensation', 'coughing, wheezing',
       'itching, burning sensation, discharge',
       'itching, scalp redness, dry skin, flaky scalp',
       'nausea, vomiting', 'redness, itching, ring-shaped rash',
       'itching, thinning hair', 'cough with mucus, wheezing',
       'blurred vision, dizziness, headache',
       'brittle nails, discolored nails, thickened nails',
       'throat irritation, persistent cough', 'redness, ring-shaped rash',
       'no phlegm, pers

In [11]:
# The raw text column that is used as the model input further down.
df['Symptoms']

0       hoarseness, shortness of breath, long-term cough
1       thickened nails, brittle nails, discolored nails
2                       bloating, abdominal pain, nausea
3             bloating, nausea, abdominal pain, vomiting
4                      flaky skin, red rash, itchy groin
                              ...                       
9995          blurred vision, nausea, eye pain, headache
9996                 blurred vision, dizziness, headache
9997                                  redness, discharge
9998                       wheezing, shortness of breath
9999                                itchy feet, blisters
Name: Symptoms, Length: 10000, dtype: object

In [12]:
# Map each sub-disease name to an integer class id, overwriting the column in place.
# The guard makes this cell safe to re-run: without it, a second execution would refit
# the encoder on the already-encoded integers, and inverse_transform would then return
# numbers instead of disease names.
if not pd.api.types.is_integer_dtype(df['Sub-Disease']):
    label_encoder = LabelEncoder()
    df['Sub-Disease'] = label_encoder.fit_transform(df['Sub-Disease'])


In [13]:
# Target: one-hot matrix built from the integer class ids; input: the raw symptom text.
y = to_categorical(df['Sub-Disease'])
X = df['Symptoms']

In [14]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Vocabulary is learned from the training texts only; unseen words map to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(X_train)

In [16]:
# Convert both splits from text to lists of integer token ids with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [17]:
# The longest training sequence sets the fixed input width; both splits are padded
# at the end ('post') to that width.
max_length = max(map(len, X_train_seq))
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [18]:
# Sequence classifier: token ids -> embeddings -> LSTM -> dense head -> class probabilities.
model = Sequential([
    # input_dim follows the tokenizer's num_words cap so the two cannot drift apart.
    # `input_length` is intentionally not passed: Keras 3 deprecates the argument and
    # emits a warning; the sequence length is taken from the data at fit time.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax')  # For multi-class classification
])




In [19]:
# Categorical cross-entropy matches the one-hot targets and the softmax output layer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Train for 10 epochs; the held-out split is scored after each epoch for monitoring.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.2318 - loss: 2.7036 - val_accuracy: 0.9705 - val_loss: 0.2799
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8336 - loss: 0.5905 - val_accuracy: 0.9795 - val_loss: 0.0736
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9277 - loss: 0.2780 - val_accuracy: 0.9860 - val_loss: 0.0511
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9490 - loss: 0.1962 - val_accuracy: 0.9865 - val_loss: 0.0463
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9613 - loss: 0.1412 - val_accuracy: 0.9850 - val_loss: 0.0304
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9705 - loss: 0.1065 - val_accuracy: 0.9840 - val_loss: 0.0298
Epoch 7/10
[1m250/250[0m 

In [21]:
# Score the trained model on the held-out split. These are the same arrays that were
# passed as validation_data to model.fit, so this is not an independent test estimate.
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9863 - loss: 0.0286
Validation Accuracy: 98.55%


In [22]:
def predict_disease(symptoms):
    """Predict the sub-disease name for one free-text symptom description.

    The text goes through the same steps as the training data: the notebook's
    fitted ``tokenizer`` converts it to token ids, the ids are post-padded to
    ``max_length``, the trained ``model`` scores the classes, and the
    highest-scoring class id is mapped back to its name by ``label_encoder``.

    Args:
        symptoms: A single string of symptoms, e.g. "bloating, nausea".

    Returns:
        The predicted sub-disease label.
    """
    padded_input = pad_sequences(
        tokenizer.texts_to_sequences([symptoms]),
        maxlen=max_length,
        padding='post',
    )
    class_scores = model.predict(padded_input)
    # Only one sample is scored, so the flat argmax is the winning class id.
    best_class = np.argmax(class_scores)
    return label_encoder.inverse_transform([best_class])[0]

In [25]:
# Smoke-test the helper on free text.
# NOTE(review): "blured" looks like a misspelling; presumably it is outside the fitted
# vocabulary and maps to the '<OOV>' token — confirm whether that is intentional.
new_symptoms = "blured vision, blurry vision"
print(f'Predicted Disease: {predict_disease(new_symptoms)}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step
Predicted Disease: Cataract


In [24]:
# Save the model
model.save('model.h5')

# Save the tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save the label encoder as well: the model outputs class ids, and only the fitted
# encoder can map an id back to its sub-disease name at inference time.
# Inference must also pad to the same max_length that was used for training.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)


