In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import nltk
pd.set_option('display.max_colwidth', None)

## Let's first begin with basic data exploration

In [2]:
data = pd.read_csv('/kaggle/input/symptom2disease/Symptom2Disease.csv')
data.drop('Unnamed: 0', axis = 1,  inplace = True)
data.head()

Unnamed: 0,label,text
0,Psoriasis,"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches."
1,Psoriasis,"My skin has been peeling, especially on my knees, elbows, and scalp. This peeling is often accompanied by a burning or stinging sensation."
2,Psoriasis,"I have been experiencing joint pain in my fingers, wrists, and knees. The pain is often achy and throbbing, and it gets worse when I move my joints."
3,Psoriasis,"There is a silver like dusting on my skin, especially on my lower back and scalp. This dusting is made up of small scales that flake off easily when I scratch them."
4,Psoriasis,"My nails have small dents or pits in them, and they often feel inflammatory and tender to the touch. Even there are minor rashes on my arms."


In [3]:
# We can see there are 1200 instances
data.shape

(1200, 2)

In [4]:
# split data into text and labels
X, y = data['text'].copy(), data['label'].copy()

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=42)

#### We need to convert our text and labels into numerical values using tokenization. First, let's perform some text preprocessing, such as removing stop-words and punctuation.

In [13]:
# let's look at one example
print('The text at X_train[0] is:', X_train[0], '\n')
print('The label at y_train[0] is:', y_train[0])

The text at X_train[0] is: I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches. 

The label at y_train[0] is: Psoriasis


In [14]:
# Let's first perform one-hot encoding on our labels, since this is a relatively simple task
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(y_train.values.reshape(-1,1))
y_train_encoded = onehot_encoder.transform(y_train.values.reshape(-1,1))
y_train_encoded = y_train_encoded.toarray()

In [15]:
# Now apply our OneHotEncoder to y_test as well
y_test_encoded = onehot_encoder.transform(y_test.values.reshape(-1,1))
y_test_encoded = y_test_encoded.toarray()

In [16]:
train_texts = X_train.values
test_texts = X_test.values

In [17]:
# Check to see that data is loaded in as expected (an array of strings)
train_texts[0]

"I've been experiencing muscular weakness and stiffness in my neck recently. My joints have enlarged, making it difficult for me to move. Walking has also been difficult."

#### Create a function which performs some preprocessing on our text

In [18]:
import re
from nltk.corpus import stopwords
nltk.download('stopwords')  # Download stop words if not already present

def preprocess_text(sentences):
    # download list of stop words
    stop_words = set(stopwords.words('english'))
    cleaned_sentences = []

    # iterate through all sentences in list of sentences
    for sentence in sentences:
        # get rid of puntuation and split words where puntuation is used
        sentence = ' '.join(re.split(r'[ ,.\']+', sentence))
        # lowercase sentences for standardization
        sentence = sentence.lower()
        words = sentence.split()
        # then remove words that are stop words
        filtered_words = [word for word in words if word not in stop_words]
        # join words back into a single sentence
        cleaned_sentence = ' '.join(filtered_words)
        # add processed sentence into list of processed sentences
        cleaned_sentences.append(cleaned_sentence)

    return cleaned_sentences

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# take a look at the first three sentences to ensure text is being processed as expected
preprocess_text(train_texts)[:3]

['experiencing muscular weakness stiffness neck recently joints enlarged making difficult move walking also difficult',
 'getting blood pee sometimes get nauseous peeing often almost coincides high temperature',
 'persistent sour taste mouth even eaten anything acidic get frequent hiccups feeling lump throat']

#### Then create a function that tokenizes and pads our text

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# create a toke for OOV (out-of-vocabulary) words
tokenizer = Tokenizer(oov_token='<OOV>')
# then fit tokenizer to training data
tokenizer.fit_on_texts(preprocess_text(train_texts))

def tokenize_and_pad(words, tokenizer, max_len=50):
    # convert sentences to sequences of integers, where each words maps to a unique integer
    sequences = tokenizer.texts_to_sequences(words)
    # then pad our tokenized sequences to ensure uniform length of sequences
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences
    
encoded_train = tokenize_and_pad(preprocess_text(train_texts), tokenizer)
encoded_test = tokenize_and_pad(preprocess_text(test_texts), tokenizer)

In [21]:
# verify results are what we expect
encoded_train[0]

array([  9, 303,  78, 185,  12,  38,  57, 515, 106,  31, 135, 102,   2,
        31,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [22]:
# verify results are what we expect
encoded_test[0]

array([677,  93, 150,   1, 112, 837,  35, 627, 268, 756, 971, 268,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

#### We've prepared our testing and training data, encoding our text information as well as the corresponding labels. Let's now experiment with some models.

In [26]:
# Compute the unique tokens in the training data
unique_tokens = np.unique(encoded_train)
# Compute the vocabulary size
vocab_size = len(unique_tokens)
print(vocab_size)

1402


#### Start off with a basic embedding model

In [42]:
from tensorflow.keras.models import Sequential

model_embed = tf.keras.Sequential([
    tf.keras.layers.Embedding(1402, 128, input_length=50),
    # get average feature values over all 50 time steps
    tf.keras.layers.GlobalAveragePooling1D(),
    # incorporate a dense relu layer to capture non-linearities in data
    tf.keras.layers.Dense(32, activation='relu'),
    # softmax over all 24 possible classifications
    tf.keras.layers.Dense(24, activation='softmax')])

# Compile the model
model_embed.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model_embed.fit(encoded_train, y_train_encoded, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Overall, we seem to achieve good performance in 20 epochs. However, it is possible there is overfitting occurring. Our embedding model may require fewer epochs to achieve optimal performance since the model is relatively low in complexity compared to the LSTM models we will implemenet next.

In [46]:
model_embed.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 128)           179456    
                                                                 
 global_average_pooling1d_2   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_8 (Dense)             (None, 32)                4128      
                                                                 
 dense_9 (Dense)             (None, 24)                792       
                                                                 
Total params: 184,376
Trainable params: 184,376
Non-trainable params: 0
_________________________________________________________________


#### Let's now create an uni-directional LSTM model

In [29]:
model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(1403, 128, input_length=50),
    tf.keras.layers.LSTM(units=128),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(24, activation='softmax')])

# Compile the model
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model_lstm.fit(encoded_train, y_train_encoded, epochs=15, batch_size=32, validation_split=0.1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


#### In 15 epochs, we get a very low accuracy. Let's incorporate bi-directionality into our LSTMs to see if this improves performance. We will include 2 dropout layers to regulate overfitting.

In [30]:
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 128)           179584    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 24)                792       
                                                                 
Total params: 316,088
Trainable params: 316,088
Non-trainable params: 0
_________________________________________________________________


In [31]:
model_bilstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(1403, 128, input_length=50),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(24, activation='softmax')])

# Compile the model
model_bilstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model_bilstm.fit(encoded_train, y_train_encoded, epochs=30, batch_size=32, validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


#### We can see there is some overfitting occurring. Our model performs very well on the training set, but sees a dip in performance on the validation accuracy.

In [32]:
model_bilstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 128)           179584    
                                                                 
 dropout_1 (Dropout)         (None, 50, 128)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dense_5 (Dense)             (None, 24)                792       
                                                      

#### In terms of validation accuracy, our embedding model actually performs the best compared to our LSTM models. Additionally, it should be noted that we achieve this performance on just 15 epochs of training, compared to the 30 for our LSTM models. Therefore, we will use our embedding model to make predictions on our test set.

In [43]:
embed_pred = model_embed.predict(encoded_test)



In [47]:
# get indices where the highest probability occurs for each instance of text
max_prob_idx = np.argmax(embed_pred, axis=1)
# Create a new matrix of shape (120, 24) with 1s at the indices of the max probabilities and 0s elsewhere
embed_final = np.zeros_like(embed_pred)
embed_final[np.arange(120), max_prob_idx] = 1

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, embed_final)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test_encoded, embed_final, average='weighted')
recall = recall_score(y_test_encoded, embed_final, average='weighted')
f1 = f1_score(y_test_encoded, embed_final, average='weighted')

# Print the metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)

Accuracy: 0.9416666666666667
Precision: 0.9533035714285715
Recall: 0.9416666666666667
F1 score: 0.9406663978722801


#### Let's try to see which instances were misclassified. We will view the indices where y_test_encoded (our true values) and embed_final (our predicted values) are not equal.

In [49]:
indices = np.where(~np.all(y_test_encoded == embed_final, axis=1))[0]
# view true values
y_test.iloc[indices]

1065              drug reaction
355                 Common Cold
354                 Common Cold
86               Varicose Veins
184                 Chicken pox
942     urinary tract infection
292                      Dengue
Name: label, dtype: object

In [51]:
# view predicted values
pd.Series(list(onehot_encoder.inverse_transform(embed_final).flatten())).iloc[indices]

40    peptic ulcer disease
43                 allergy
46                 allergy
63               Psoriasis
76                Impetigo
78                diabetes
98                 Typhoid
dtype: object

#### For indices 355 and 354 in our original dataframe, 'data', we can see that our model predicted 'allergy' instead of 'Common Cold'. Since both are often difficult to distinguish from each other, and have similar symptoms, it's not suprising to see this result.

#### For the remaining misclassified results, let's take a look at the original text description, and compare this to a text descriptions of the true disease, and the text descriptions of the disease it was misclassified as.

In [53]:
data.iloc[1065]

label                                                                                                                           drug reaction
text     I have severe nausea and chest discomfort. I have been having chest discomfort lately. I often shiver a lot and feel really nervous.
Name: 1065, dtype: object

In [58]:
# disease instance 1065 was misclassified as
data[data['label'] == 'peptic ulcer disease'].head(10)

Unnamed: 0,label,text
1100,peptic ulcer disease,"I have a burning sensation in my upper abdomen, ofetn between or at night. I have heartburn and indigestion and often feel very nauseous"
1101,peptic ulcer disease,I have bloating and a feeling of uneasiness. I have been experiencing weight loss and a loss of appetite. Sometimes I have dark and tarry stools and blood in my vomit
1102,peptic ulcer disease,I have difficulty swallowing food and often get a sensation of the food getting stuck in my throat. I have constant belching and bloating. There is a persitent sour taste in my mouth
1103,peptic ulcer disease,I have been having bloody stools which has resulted in bloos loss and loss of iron. Thiis has caused me anemia and I feel very weak in general
1104,peptic ulcer disease,"I have persistant, gnawing hunger and apetite. Sometimes I get abdominal cramps and spasms. There is bloating and gas after eating which causes me a great deal of uneasiness"
1105,peptic ulcer disease,"I have changes in my bowel movements, such as constipation and diarrhea. I have a loss of appetite and energy and often feel very fatigued"
1106,peptic ulcer disease,I have abdominal pain and it gets only worse if I bend over or lie down. I feel discomfort when I eat certain foods such as spicy or acidic food. Sometimes I get loose stools because of this
1107,peptic ulcer disease,I have difficulty sleeping due to abdominal pain or discomfort. I have a loss of appetite and feel fatigued after going to the bathroom. I feel very tited all the time
1108,peptic ulcer disease,I have a sour acidic taste in my mouth. I have frequent belching and burping. I have a feeling of pressure or fullness in my upper abdomen that last for a few hours
1109,peptic ulcer disease,I have unintended weight loss and difficulty gaining weight. I have pain and discomfort that is relieved by taking antacids. My mouth tastes very bad


In [59]:
# true disease of instance 1065
data[data['label'] == 'drug reaction'].head(10)

Unnamed: 0,label,text
1050,drug reaction,"I have a metallic taste in my mouth, and also have a sense of change of taste and smell. Sometimes get very unbearable joint pain and muscle pain"
1051,drug reaction,"I have headaches and migraines, have been having difficulty sleeping. I have been having muscle twitching and tremors. Sometimes I get lightheaded"
1052,drug reaction,I have fever and feel very dizzy and lightheaded. My heart is beating very fast and I feel very confused. I am not able to think very clearing and everything feels very foggy
1053,drug reaction,I have rashes on my skin and these flake off from time to time. This leaves me prone to infection. My fingers start twitching and sometimes I experience tremors.
1054,drug reaction,I feel very nauseous and have chest pain. Recently I have been experiencing chest pain. I feel very uneasy and often sweat profusely
1055,drug reaction,"I have itching all over my body, and rashes in my chest and back. I get flaky skin from time to time and often this leaves marks on my body."
1056,drug reaction,I have hair loss and there is a significant change in the texture of my hair. I have dry and itchy sclap and increased dandruff. My skin is also getting very dry
1057,drug reaction,I am experiencing a decrease in my sex drive and difficulty to fucntion sexually. I feel very light headed and confused and often experince brain fog
1058,drug reaction,I am experiencing changes in my menstrual cycle and unexpected vaginal discharge. I often get mood swings and feel agitated from time to time
1059,drug reaction,I have experinenced significant weight gain and become very obese. I have changes in my appetite and cravings for different foods


#### It appears the word 'discomfort' plays a significant role in our model. Specifically, 'discomfort' is a word often used to describe the symptoms of peptic ulcer disease', but less so when describing the symptoms of a drug reaction. Thus, it's possible our model misclassified due to this.

In [60]:
data.iloc[86]

label                                                                                                                                              Varicose Veins
text     The rash on my legs is spreading and becoming more severe. It is red, inflamed, and itchy, causing a lot of discomfort and difficulty sleeping at night.
Name: 86, dtype: object

In [62]:
data[data['label']=='Psoriasis'].head(10)

Unnamed: 0,label,text
0,Psoriasis,"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches."
1,Psoriasis,"My skin has been peeling, especially on my knees, elbows, and scalp. This peeling is often accompanied by a burning or stinging sensation."
2,Psoriasis,"I have been experiencing joint pain in my fingers, wrists, and knees. The pain is often achy and throbbing, and it gets worse when I move my joints."
3,Psoriasis,"There is a silver like dusting on my skin, especially on my lower back and scalp. This dusting is made up of small scales that flake off easily when I scratch them."
4,Psoriasis,"My nails have small dents or pits in them, and they often feel inflammatory and tender to the touch. Even there are minor rashes on my arms."
5,Psoriasis,The skin on my palms and soles is thickened and has deep cracks. These cracks are painful and bleed easily.
6,Psoriasis,"The skin around my mouth, nose, and eyes is red and inflamed. It is often itchy and uncomfortable. There is a noticeable inflammation in my nails."
7,Psoriasis,My skin is very sensitive and reacts easily to changes in temperature or humidity. I often have to be careful about what products I use on my skin.
8,Psoriasis,"I have noticed a sudden peeling of skin at different parts of my body, mainly arms, legs and back. Also, I face severe joint pain and skin rashes."
9,Psoriasis,"The skin on my genitals is red and inflamed. It is often itchy, burning, and uncomfortable. There are rashes on different parts of the body too."


In [63]:
data[data['label']=='Varicose Veins'].head(10)

Unnamed: 0,label,text
50,Varicose Veins,"I have a rash on my legs that is causing a lot of discomforts. It seems there is a cramp and I can see prominent veins on the calf. Also, I have been feeling very tired and fatigued in the past couple of days."
51,Varicose Veins,"My calves have been cramping up when I walk or stand for long periods of time. There are bruise marks on my calves, which is making me worried. I feel tired very soon."
52,Varicose Veins,"There is bruising on my legs that I cannot explain. I can see strange blood vessels below the skin. Also, I am slightly obese and I am really worried."
53,Varicose Veins,I am overweight and have noticed that my legs are swollen and the blood vessels are visible. My legs have swollen and I can see a stream of swollen veins on my calves.
54,Varicose Veins,"The veins on my calves have become very prominent and causing discomfort. I can't stand for long periods of time, as it causes pain in my legs, similar to cramps."
55,Varicose Veins,The skin around the veins on my legs is red and inflamed. I believe I can see some of the swollen blood vessels. I am really worried about it.
56,Varicose Veins,Standing or walking for long periods of time causes a lot of pain in my legs. I get cramps upon doing physical activities. There are bruise marks on my legs too.
57,Varicose Veins,The cramps in my calves are making it difficult for me to walk. I feel fatigued after working for some time. I believe obesity is the reason behind this.
58,Varicose Veins,"The swelling in my legs has gotten worse over the past few weeks. Now, a large number of veins are noticeable on my calves, which is making me worried."
59,Varicose Veins,The veins on my legs are very noticeable and cause me discomfort. It seems like there is a major bruise and I get cramps when I run.


#### In this case, it appears symptoms of 'rash' and inflammation played a significant role in this prediction. Since these symptoms, along with other skin-related symptoms, are more often associated with Psoriasis than with Varicose Veins, it's possible this is the reason our model misclassified.

## Final Notes

#### Overall, it seems an embedding model is well-suited for this NLP task. It's possible that with fine-tuning and regularization, our embedding and LSTM models could achieve better performance. The main limitation appears to be when a text description of a disease has significant overlap with text descriptions of other diseases. It's possible more training data is needed to help capture these nuanced differences when they appear.