<a href="https://colab.research.google.com/github/AshimR/web3j-workshop/blob/master/medical_diagnosis_with_text_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas --quiet

In [2]:
import pandas as pd

In [6]:
from google.colab import files
file = files.upload()

Saving symbipredict_2022.csv to symbipredict_2022.csv


In [8]:
# Load the dataset to preprocess it for text embedding and semantic search usage
data = pd.read_csv('/content/symbipredict_2022.csv')

# Creating a text description for each row in the dataset by concatenating symptom names if present (1)
def create_symptom_description(row):
    symptoms = [col.replace('_', ' ') for col in row.index if row[col] == 1]
    return ', '.join(symptoms)

# Apply the function to each row in the DataFrame
data['symptom_description'] = data.apply(create_symptom_description, axis=1)

# Check the modified DataFrame and export it to a CSV file for verification
data.to_csv('/content/processed_symbipredict_2022.csv', index=False)
data.head(), data['symptom_description'].head()


(   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
 0        1          1                     1                    0          0   
 1        0          1                     1                    0          0   
 2        1          0                     1                    0          0   
 3        1          1                     0                    0          0   
 4        1          1                     1                    0          0   
 
    chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
 0       0           0             0        0                 0  ...         0   
 1       0           0             0        0                 0  ...         0   
 2       0           0             0        0                 0  ...         0   
 3       0           0             0        0                 0  ...         0   
 4       0           0             0        0                 0  ...         0   
 
    skin_peeling  silver

In [12]:
data_frame = data[['symptom_description','prognosis']]

Unnamed: 0,symptom_description,prognosis
0,"itching, skin rash, nodal skin eruptions, disc...",Fungal Infection
1,"skin rash, nodal skin eruptions, dischromic p...",Fungal Infection
2,"itching, nodal skin eruptions, dischromic pat...",Fungal Infection
3,"itching, skin rash, dischromic patches",Fungal Infection
4,"itching, skin rash, nodal skin eruptions",Fungal Infection


In [14]:
!pip install transformers --quiet

In [15]:
from transformers import AutoTokenizer, AutoModel
import torch

In [17]:

# Initialize tokenizer and model from Hugging Face's Transformers
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Function to generate embeddings for a given text
def generate_embeddings(text):
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_input)
    embeddings = output.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

# Apply the function to generate embeddings for the 'symptom_description' column
data_frame['embeddings'] = data_frame['symptom_description'].apply(generate_embeddings)

# Optionally save the new dataframe with embeddings to a new CSV for further use
data_frame.to_csv('sym_w_embeddings.csv', index=False)

# Print some of the generated embeddings to confirm they are correct
print(data_frame[['symptom_description', 'embeddings']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame['embeddings'] = data_frame['symptom_description'].apply(generate_embeddings)


                                 symptom_description  \
0  itching, skin rash, nodal skin eruptions, disc...   
1  skin rash, nodal skin eruptions, dischromic  p...   
2  itching, nodal skin eruptions, dischromic  pat...   
3            itching, skin rash, dischromic  patches   
4           itching, skin rash, nodal skin eruptions   

                                          embeddings  
0  [0.300389, 0.010864246, -0.40300804, 0.2143276...  
1  [0.39868072, -0.112185195, -0.33901146, 0.1459...  
2  [0.31049445, 0.011516023, -0.41578156, 0.19552...  
3  [0.35473484, -0.046607442, -0.38146266, 0.2751...  
4  [0.37643713, 0.08506728, -0.40587324, 0.328377...  


In [19]:
data_frame.to_csv('/content/embd_symbipredict_2022.csv')

In [20]:
embedded_data = data_frame

In [28]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
label_encoder = LabelEncoder()
scaler = StandardScaler()

In [29]:
# Assuming 'embeddings' column contains the embeddings and is in an appropriate format
X = list(embedded_data['embeddings'])
y = embedded_data['prognosis']
X_scaled = scaler.fit_transform(X)
y_encoded = label_encoder.fit_transform(y)

In [22]:
import torch

# Check if a GPU is available and select appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')


Using device: cpu


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Assuming X and y are your features and labels respectively and are already loaded into your environment
X_tensor = torch.FloatTensor(X_scaled)  # Convert features to Float Tensor
y_tensor = torch.LongTensor(y_encoded)    # Convert labels to Long Tensor if they are not one-hot encoded

# Create dataset and dataloader for batch processing
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Define the neural network architecture
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(in_features=768, out_features=100)  # Adjust the in_features according to your embedding size
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5) # Dropout layer
        self.fc2 = nn.Linear(100, len(set(y)))  # The output size should match the number of classes
        self.softmax = nn.Softmax(dim=1)
        self.output = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Initialize the network, loss function, and optimizer
model = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
for epoch in range(100):  # Runs 100 training epochs
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 2.8881518840789795
Epoch 2, Loss: 2.762744665145874
Epoch 3, Loss: 2.7641372680664062
Epoch 4, Loss: 2.755727529525757
Epoch 5, Loss: 2.7551486492156982
Epoch 6, Loss: 2.7552990913391113
Epoch 7, Loss: 2.7549469470977783
Epoch 8, Loss: 2.754746913909912
Epoch 9, Loss: 2.7548279762268066
Epoch 10, Loss: 2.755131959915161
Epoch 11, Loss: 2.7551119327545166
Epoch 12, Loss: 2.7547550201416016
Epoch 13, Loss: 2.7547607421875
Epoch 14, Loss: 2.7551050186157227
Epoch 15, Loss: 2.7546958923339844
Epoch 16, Loss: 2.754680871963501
Epoch 17, Loss: 2.7546780109405518
Epoch 18, Loss: 2.7547292709350586
Epoch 19, Loss: 2.754667043685913
Epoch 20, Loss: 2.7546443939208984
Epoch 21, Loss: 2.7546401023864746
Epoch 22, Loss: 2.754631280899048
Epoch 23, Loss: 2.7546515464782715
Epoch 24, Loss: 2.754638433456421
Epoch 25, Loss: 2.754648447036743
Epoch 26, Loss: 2.754639148712158
Epoch 27, Loss: 2.7546274662017822
Epoch 28, Loss: 2.754631757736206
Epoch 29, Loss: 2.754629373550415
Epoch 30,