In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the symptom/disease CSV and keep only the two columns used below.
csv_path = '/content/Symptom2Disease.csv'
data = pd.read_csv(csv_path).loc[:, ['label', 'text']]
data.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Map each disease name to an integer class id. The fitted encoder is kept
# because a later cell reads `label_encoder.classes_` to count the classes.
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
data['label_encoded'] = label_encoder.transform(data['label'])

# Text cleaning function
def clean_text(text):
    """Normalize a free-text symptom description.

    Steps, in order: lowercase, remove digit runs, remove punctuation
    (every character that is neither a word character nor whitespace),
    collapse whitespace runs to a single space, and trim both ends.

    Whitespace is collapsed *last* because deleting digits or punctuation
    that sat between two spaces would otherwise leave a double space behind
    (e.g. "a , b" -> "a  b").

    Args:
        text: The raw description string.

    Returns:
        The cleaned string. Underscores are kept because the regex treats
        them as word characters.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)      # drop numbers
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation
    text = re.sub(r'\s+', ' ', text)     # collapse whitespace left by the removals
    return text.strip()

data['text_cleaned'] = data['text'].apply(clean_text)

# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps every disease represented in the same proportion in both splits, so
# the evaluation metrics are not skewed by an unlucky random draw.
x_train, x_test, y_train, y_test = train_test_split(
    data['text_cleaned'],
    data['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=data['label_encoded'],
)



In [None]:
!pip install transformers datasets torch gensim

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
from datasets import Dataset, DatasetDict
# x_*/y_* still carry the shuffled row index of the original frame. Without
# preserve_index=False that index would be stored in each Dataset as an extra
# `__index_level_0__` column.
train_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': x_train, 'label': y_train}),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': x_test, 'label': y_test}),
    preserve_index=False,
)

# Bundle both splits so they can be tokenized with a single map() call.
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [None]:
# Load the pre-trained tokenizer and model
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder must come from the same checkpoint so the vocabulary matches.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length,
    and the encoding is requested as PyTorch tensors.

    Args:
        examples: A batch mapping that contains a 'text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        return_tensors="pt",
    )
    return encoded

# Apply the tokenizer to both splits, one batch of examples at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
# Load additional embeddings (e.g., GloVe)
import gensim.downloader as api
# Fetch the pre-trained vectors through gensim's downloader.
glove_name = "glove-wiki-gigaword-100"
word_vectors = api.load(glove_name)

In [None]:
# Create combined embeddings
import torch

def get_combined_embeddings(text):
    """Embed one text as the concatenation of a BERT vector and a GloVe vector.

    The BERT part is the mean of the last hidden states over the *real*
    tokens only: positions added by padding are masked out so they do not
    dilute the average. The GloVe part is the mean of the vectors of the
    whitespace-separated words; out-of-vocabulary words count as zero
    vectors, and a text with no words at all yields a zero vector.

    The encoder is run under ``torch.no_grad()`` because this helper is a
    feature extractor; keeping the autograd graph for every call would only
    waste memory.

    Args:
        text: A single input string.

    Returns:
        A tensor of shape (1, bert_hidden_size + glove_vector_size).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=512)
    # The encoder may have been moved to an accelerator; keep inputs on the same device.
    inputs = inputs.to(bert_model.device)

    with torch.no_grad():
        hidden_states = bert_model(**inputs).last_hidden_state

    # Attention-mask-weighted mean over the sequence dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    bert_embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    glove_dim = word_vectors.vector_size
    word_embeddings = [
        word_vectors[word] if word in word_vectors else np.zeros(glove_dim, dtype=np.float32)
        for word in text.split()
    ]
    if word_embeddings:
        glove_embeddings = np.mean(word_embeddings, axis=0)
    else:
        # np.mean of an empty list is a NaN scalar, which cannot be concatenated.
        glove_embeddings = np.zeros(glove_dim, dtype=np.float32)
    glove_embeddings = torch.tensor(glove_embeddings).float().unsqueeze(0).to(bert_embeddings.device)

    combined_embeddings = torch.cat((bert_embeddings, glove_embeddings), dim=1)
    return combined_embeddings

# Smoke test: embed two toy sentences and stack the rows into one batch tensor.
batch_texts = ["example text 1", "example text 2"]
batch_embeddings = torch.cat(list(map(get_combined_embeddings, batch_texts)), dim=0)
print(batch_embeddings.shape)


torch.Size([2, 868])


In [None]:
# Define a new model with combined embeddings
import torch.nn as nn
class CombinedBERTModel(nn.Module):
    """Classifier head on top of concatenated BERT and GloVe sentence vectors.

    For every sequence the model builds
      * a BERT vector: attention-mask-weighted mean of the encoder's last
        hidden states (padding positions are excluded), and
      * a GloVe vector: mean of the word vectors of the decoded words
        (out-of-vocabulary words count as zero vectors),
    concatenates them and feeds the result to a single linear layer.

    The GloVe lookup relies on the notebook-level ``tokenizer`` and
    ``word_vectors`` objects.

    Args:
        bert_model: The encoder. Its ``config.hidden_size`` determines the
            BERT feature width (768 is assumed if it has no config).
        num_labels: Number of output classes.
        glove_dim: Width of the vectors in ``word_vectors``; must match the
            loaded embedding table.
    """

    def __init__(self, bert_model, num_labels, glove_dim=100):
        super().__init__()
        self.bert = bert_model
        self.glove_dim = glove_dim
        bert_dim = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.fc = nn.Linear(bert_dim + glove_dim, num_labels)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average hidden states over the sequence axis, ignoring masked-out positions."""
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against division by zero for an all-zero mask row
        return (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    def _glove_mean(self, token_ids):
        """Return the mean GloVe vector of the words encoded by one row of token ids."""
        tokens = tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)
        # Merge WordPiece fragments ("##...") back into whole words: fragments
        # are never in the GloVe vocabulary and would only add zero vectors.
        words = tokenizer.convert_tokens_to_string(tokens).split()
        if not words:
            # np.mean of an empty list would be a NaN scalar
            return np.zeros(self.glove_dim, dtype=np.float32)
        vectors = [
            word_vectors[word] if word in word_vectors else np.zeros(self.glove_dim, dtype=np.float32)
            for word in words
        ]
        return np.mean(vectors, axis=0).astype(np.float32)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits (and the cross-entropy loss when labels are given).

        Args:
            input_ids: Token id tensor, one row per sequence.
            attention_mask: Mask tensor aligned with ``input_ids``; non-zero
                entries mark real tokens.
            labels: Optional class ids, one per sequence.

        Returns:
            ``{'loss': ..., 'logits': ...}`` when ``labels`` is given,
            otherwise ``{'logits': ...}``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        bert_embeddings = self._masked_mean(outputs.last_hidden_state, attention_mask)

        # One host transfer for the whole batch instead of one per token id.
        glove_embeddings = np.stack([self._glove_mean(row) for row in input_ids.tolist()])
        glove_embeddings = torch.from_numpy(glove_embeddings).to(
            device=bert_embeddings.device, dtype=bert_embeddings.dtype
        )
        combined_embeddings = torch.cat((bert_embeddings, glove_embeddings), dim=1)

        logits = self.fc(combined_embeddings)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.fc.out_features), labels.view(-1))

        return {'loss': loss, 'logits': logits} if loss is not None else {'logits': logits}


# One output unit per distinct disease label seen by the encoder.
num_labels = len(label_encoder.classes_)
model = CombinedBERTModel(bert_model=bert_model, num_labels=num_labels)



In [None]:
pip install transformers[torch]



In [None]:
# Training arguments
from transformers import Trainer, TrainingArguments

# With 960 training examples, batch size 8 and 2 accumulation steps, one epoch
# is 60 optimizer steps, so 5 epochs are about 300 steps in total. A fixed
# warm-up of 500 steps would outlast the entire run and the learning rate would
# never reach its peak, so the warm-up is expressed as a fraction of the run.
training_args = TrainingArguments(
    output_dir='./results',                # output directory
    num_train_epochs=5,                    # number of training epochs
    per_device_train_batch_size=8,         # batch size for training (reduced)
    per_device_eval_batch_size=8,          # batch size for evaluation (reduced)
    warmup_ratio=0.1,                      # warm up over the first 10% of the optimizer steps
    weight_decay=0.01,                     # strength of weight decay
    logging_dir='./logs',                  # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",           # evaluate every epoch
    save_strategy="epoch",                 # save every epoch to match evaluation strategy
    save_total_limit=3,                    # only last 3 models are saved
    load_best_model_at_end=True,           # load the best model at the end of training
    fp16=torch.cuda.is_available(),        # mixed precision needs a CUDA device; fall back to fp32 otherwise
    gradient_accumulation_steps=2,         # accumulate gradients to simulate larger batch size
)

# Define metrics function
def compute_metrics(p):
    """Compute accuracy and support-weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (per-class scores, one
            row per example) and ``label_ids`` (true class ids).

    Returns:
        A dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    preds = np.argmax(p.predictions, axis=1)
    # zero_division=0 keeps the same score (0) for classes that were never
    # predicted, but without emitting an UndefinedMetricWarning on every
    # evaluation, which is common in the first epochs.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Create Trainer instance
# Wire the model, hyperparameters, both tokenized splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)



In [None]:
# Run fine-tuning with the Trainer instance built in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss
