In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras import backend as K
from keras.layers import Layer
from keras.saving import register_keras_serializable


2025-06-18 23:37:43.747204: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-18 23:37:43.753504: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-18 23:37:43.769037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750271863.794834   10122 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750271863.802861   10122 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750271863.823890   10122 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [2]:
df = pd.read_csv("../disease_prediction/Symptom2Disease.csv")

# Clean text: remove punctuation and extra spaces
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)     # Remove extra spaces
    return text.strip()

df['text'] = df['text'].apply(clean_text)


In [3]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Save for later decoding
label_map = dict(zip(le.classes_, le.transform(le.classes_)))


In [4]:
X = df['text']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Convert to arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

In [5]:


@register_keras_serializable(package="Custom")
class AttentionLayer(Layer):
    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super().build(input_shape)

    def call(self, inputs):
        e = K.tanh(K.dot(inputs, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = inputs * a
        return K.sum(output, axis=1)


In [6]:
input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=10000, output_dim=128)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
attention = AttentionLayer()(bilstm)
dense1 = Dense(64, activation='relu')(attention)
dropout = Dropout(0.3)(dense1)
output_layer = Dense(len(le.classes_), activation='softmax')(dropout)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


2025-06-18 23:37:49.919832: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [7]:
history = model.fit(X_train_pad, y_train, epochs=15, batch_size=32, validation_split=0.1)


Epoch 1/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 156ms/step - accuracy: 0.0461 - loss: 3.1795 - val_accuracy: 0.0625 - val_loss: 3.1695
Epoch 2/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 128ms/step - accuracy: 0.0616 - loss: 3.1691 - val_accuracy: 0.0521 - val_loss: 3.1521
Epoch 3/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 121ms/step - accuracy: 0.0745 - loss: 3.0945 - val_accuracy: 0.1458 - val_loss: 2.7655
Epoch 4/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 122ms/step - accuracy: 0.1904 - loss: 2.6268 - val_accuracy: 0.3125 - val_loss: 2.2296
Epoch 5/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 124ms/step - accuracy: 0.3086 - loss: 2.1645 - val_accuracy: 0.4271 - val_loss: 1.8817
Epoch 6/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 114ms/step - accuracy: 0.3894 - loss: 1.8932 - val_accuracy: 0.5729 - val_loss: 1.5926
Epoch 7/15
[1m27/27[0m [3

In [8]:
model.save("disease_prediction_model.keras")

import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [9]:
def predict_disease(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=maxlen, padding='post')
    pred = model.predict(padded)
    pred_class = np.argmax(pred, axis=1)[0]
    return le.inverse_transform([pred_class])[0]

# Example
user_input = "I am feeling itchy and my skin is scaling on my elbows"
print("Predicted Disease:", predict_disease(user_input))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 697ms/step
Predicted Disease: gastroesophageal reflux disease
