### Tutorial

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, DataCollatorWithPadding, Trainer, TrainingArguments, BertForSequenceClassification, pipeline
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model
import torch
import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("gretelai/symptom_to_diagnosis", data_files=data_files)
dataset = dataset.rename_column("output_text", "label")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_text'],
        num_rows: 853
    })
    test: Dataset({
        features: ['label', 'input_text'],
        num_rows: 212
    })
})


In [3]:
for entry in dataset['train'].select(range(5)):
    print('INPUT: {} \nOUTPUT: {}\n'.format(entry['input_text'], entry['label']))

INPUT: I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak. 
OUTPUT: cervical spondylosis

INPUT: I have a rash on my face that is getting worse. It is red, inflamed, and has blisters that are bleeding clear pus. It is really painful. 
OUTPUT: impetigo

INPUT: I have been urinating blood. I sometimes feel sick to my stomach when I urinate. I often feel like I have a fever. 
OUTPUT: urinary tract infection

INPUT: I have been having trouble with my muscles and joints. My neck is really tight and my muscles feel weak. I have swollen joints and it is hard to move around without becoming stiff. It is also really uncomfortable to walk. 
OUTPUT: arthritis

INPUT: I have been feeling really sick. My body hurts a lot and I have no appetite. I have also developed rashes on my arms and face. The back of my eyes hurt a lot. 
OUTPUT: dengue



In [4]:
train_counts = pd.DataFrame({'Diagnosis': dataset['train']['label']})
train_counts = train_counts.groupby('Diagnosis').size().reset_index(name='train_set')

test_counts = pd.DataFrame({'Diagnosis': dataset['test']['label']})
test_counts = test_counts.groupby('Diagnosis').size().reset_index(name='test_set')

train_counts.merge(test_counts, on='Diagnosis')

Unnamed: 0,Diagnosis,train_set,test_set
0,allergy,40,10
1,arthritis,40,10
2,bronchial asthma,40,10
3,cervical spondylosis,40,10
4,chicken pox,40,10
5,common cold,39,10
6,dengue,40,10
7,diabetes,40,10
8,drug reaction,40,8
9,fungal infection,39,9


#### Base BERT

In [5]:
sorted_labels = sorted(set(dataset['train']['label']))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
foundation_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label
)

classifier = pipeline("text-classification", model=foundation_model, tokenizer=tokenizer)
predicted_labels = classifier(dataset['test']['input_text'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
test_array = np.asarray(dataset['test']['label'])
pred_array = np.asarray([item['label'] for item in predicted_labels])
foundation_accuracy = round(sum(test_array == pred_array)*100/len(test_array), 2)
print(f"Foundation Model Accuracy: {foundation_accuracy}%")

Foundation Model Accuracy: 4.72%


#### Adding Lora

In [7]:
# adding lora layers
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(foundation_model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_A): P

In [8]:
peft_model.print_trainable_parameters()

trainable params: 2,376,214 || all params: 111,875,372 || trainable%: 2.1240


#### Preprocessing

In [9]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["input_text"], padding="max_length", truncation=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

{'train': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 853
}), 'test': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 212
})}


#### Training peft-BERT

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()*100}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-lora",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()



Starting to train...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.475024,19.811321
2,No log,1.104839,61.320755
3,2.226400,1.259054,54.716981
4,2.226400,0.583535,80.660377
5,0.621400,0.468679,84.90566
6,0.621400,0.336716,90.09434
7,0.621400,0.321819,88.679245
8,0.221200,0.232672,93.867925
9,0.221200,0.247691,93.396226
10,0.079400,0.287866,91.509434


TrainOutput(global_step=3210, training_loss=0.4992301059289142, metrics={'train_runtime': 686.7065, 'train_samples_per_second': 18.632, 'train_steps_per_second': 4.674, 'total_flos': 3460510521077760.0, 'train_loss': 0.4992301059289142, 'epoch': 15.0})

In [11]:
peft_bert_model_path = "fine-tuned-peft-model-weights/"
peft_model.save_pretrained(peft_bert_model_path)

# check the size of the saved model
for file_name in os.listdir(peft_bert_model_path):
    file_size = os.path.getsize(peft_bert_model_path + file_name)
    print(f"File Name: {file_name}; File Size: {file_size / 1024:.2f}KB")

File Name: adapter_config.json; File Size: 0.65KB
File Name: README.md; File Size: 4.97KB
File Name: adapter_model.safetensors; File Size: 9288.92KB


#### Test

In [32]:
# when loading peft from file 
config = PeftConfig.from_pretrained(peft_bert_model_path)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=len(label2id),
)
model = PeftModel.from_pretrained(model, peft_bert_model_path)

trainer = Trainer(
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)
trainer.args.report_to = "none"

test_predictions = trainer.predict(tokenized_ds['test'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# accuracy
pred = np.argmax(test_predictions.predictions, axis=1)
test_labels = tokenized_ds['test']['label']
test_labels = np.array(test_labels)
accuracy = np.mean(pred == test_labels)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 94.81%
