## Installing libraries

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
# %%capture hides the installer's output.
%pip install datasets

In [None]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
# -U upgrades accelerate if an older version is already present.
%pip install accelerate -U

In [None]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers

In [None]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install evaluate

## Import the Libraries

In [None]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, \
                         DistilBertTokenizerFast, DataCollatorWithPadding, pipeline
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd
import os
os.environ["WANDB_DISABLED"] = "true"


## Import Dataset

In [None]:
# Load the transcripts. The first CSV column is the saved row index, so read it
# as the index instead of letting pandas expose it as an "Unnamed: 0" data column.
medicalTranscript_df = pd.read_csv("med_transcripts.csv", index_col=0)
medicalTranscript_df.head()

Unnamed: 0,medical_specialty,transcription
0,Dentistry,"PREOPERATIVE DIAGNOSIS:, Completely bony impa..."
1,Dentistry,"CHIEF COMPLAINT:, Toothache.,HISTORY OF PRESE..."
2,Dentistry,"PREOPERATIVE DIAGNOSES: , Carious teeth #2 and..."
3,Dentistry,"PREOPERATIVE DIAGNOSES,1. Carious teeth #2, #..."
4,Dentistry,"PREOPERATIVE DIAGNOSES,1. Basal cell nevus sy..."


In [None]:
# Report how many rows (transcripts) were loaded.
print("number of medical transcripts :", medicalTranscript_df.shape[0])

number of medical transcripts : 250


## Create Dataset Object

In [None]:
# Collect the distinct specialty labels in order of first appearance.
# The position of each label in this list is later used as its numeric class id.
unique_medical_specialities = medicalTranscript_df['medical_specialty'].unique().tolist()
print(unique_medical_specialities)


[' Dentistry', ' Dermatology', ' Psychiatry / Psychology', ' Podiatry', ' Neurosurgery']


In [None]:
# Pull the two columns out of the DataFrame as plain Python lists.
medical_specialities = medicalTranscript_df['medical_specialty'].tolist()
transcriptions = medicalTranscript_df['transcription'].tolist()

# Encode each specialty name as its position in `unique_medical_specialities`.
medical_specialities = list(map(unique_medical_specialities.index, medical_specialities))
print(medical_specialities)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [None]:
# Wrap the parallel lists in a Hugging Face Dataset with two columns:
# "text" (the transcript) and "label" (its numeric class id).
dataset_columns = {
    "text": transcriptions,
    "label": medical_specialities,
}
med_transcripts_dataset = Dataset.from_dict(dataset_columns)
print(med_transcripts_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})


## Split Dataset

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore the train/test membership and any later `['test'][i]` lookup,
# reproducible across kernel restarts.
# NOTE: this rebinds `med_transcripts_dataset` from a Dataset to a DatasetDict
# with "train" and "test" splits.
med_transcripts_dataset = med_transcripts_dataset.train_test_split(test_size=0.2, seed=42)
print(med_transcripts_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})


In [None]:
# Inspect the first training example (a dict with 'text' and 'label').
med_transcripts_dataset['train'][0]

{'text': 'PREOPERATIVE DIAGNOSIS: , Cervical spondylosis at C3-C4 with cervical radiculopathy and spinal cord compression.,POSTOPERATIVE DIAGNOSIS:,  Cervical spondylosis at C3-C4 with cervical radiculopathy and spinal cord compression.,OPERATION PERFORMED,1.  Anterior cervical discectomy of C3-C4.,2.  Removal of herniated disc and osteophytes.,3.  Bilateral C4 nerve root decompression.,4.  Harvesting of bone for autologous vertebral bodies for creation of arthrodesis.,5.  Grafting of fibular allograft bone for creation of arthrodesis.,6.  Creation of arthrodesis via an anterior technique with fibular allograft bone and autologous bone from the vertebral bodies.,7.  Placement of anterior spinal instrumentation using the operating microscope and microdissection technique.,INDICATIONS FOR PROCEDURE: , This 62-year-old man has progressive and intractable right C4 radiculopathy with neck and shoulder pain.  Conservative therapy has failed to improve the problem.  Imaging studies showed sev

## Loading the Tokenizer

In [None]:
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    Truncation is enabled, so over-long inputs are cut to the tokenizer's
    maximum length. No padding is requested here.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of strings when
            used with ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for the given text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_dataset = med_transcripts_dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Inspect the first training example after tokenization.
tokenized_dataset['train'][0]

{'text': 'PREOPERATIVE DIAGNOSIS: , Cervical spondylosis at C3-C4 with cervical radiculopathy and spinal cord compression.,POSTOPERATIVE DIAGNOSIS:,  Cervical spondylosis at C3-C4 with cervical radiculopathy and spinal cord compression.,OPERATION PERFORMED,1.  Anterior cervical discectomy of C3-C4.,2.  Removal of herniated disc and osteophytes.,3.  Bilateral C4 nerve root decompression.,4.  Harvesting of bone for autologous vertebral bodies for creation of arthrodesis.,5.  Grafting of fibular allograft bone for creation of arthrodesis.,6.  Creation of arthrodesis via an anterior technique with fibular allograft bone and autologous bone from the vertebral bodies.,7.  Placement of anterior spinal instrumentation using the operating microscope and microdissection technique.,INDICATIONS FOR PROCEDURE: , This 62-year-old man has progressive and intractable right C4 radiculopathy with neck and shoulder pain.  Conservative therapy has failed to improve the problem.  Imaging studies showed sev

## Creating the Data Collator

In [None]:
# Collator that pads each batch at collation time, using the tokenizer's padding settings.
# The tokenization step itself does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Loading the Pre-trained Model

In [None]:
# Load pre-trained DistilBERT with a sequence-classification head that has one
# output per distinct specialty label.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(unique_medical_specialities),
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-tuning the Pre-trained Model

In [None]:
# Map the model's output predictions (integer indices) back to their label names.
model.config.id2label = dict(enumerate(unique_medical_specialities))
# Keep the reverse mapping in sync: assigning id2label alone leaves label2id with
# the auto-generated "LABEL_n" names, so the saved config would be inconsistent.
model.config.label2id = {label: idx for idx, label in model.config.id2label.items()}
print(model.config.id2label)

{0: ' Dentistry', 1: ' Dermatology', 2: ' Psychiatry / Psychology', 3: ' Podiatry', 4: ' Neurosurgery'}


In [None]:
# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(evaluate_predictions):
    """Compute accuracy for one evaluation pass.

    Args:
        evaluate_predictions: Two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    raw_scores, references = evaluate_predictions
    predicted_ids = np.argmax(raw_scores, axis=-1)
    result = metric.compute(predictions=predicted_ids, references=references)
    return result

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
epochs = 8

training_args = TrainingArguments(
    output_dir="./results",              # checkpoints and the final model are written here
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,         # evaluation and save strategies below are both "epoch"

    # NOTE(review): this warmup length is derived from the number of training
    # EXAMPLES (examples // 5), not from the number of optimizer steps. With 200
    # examples it is 40 warmup steps out of 56 total steps. Confirm this is intended.
    warmup_steps=len(tokenized_dataset["train"]) // 5,
    weight_decay=0.05,

    logging_steps=1,
    log_level="info",
    eval_strategy="epoch",               # replaces the deprecated `evaluation_strategy` spelling
    save_strategy="epoch",
    report_to="none",                    # disable logging integrations without the deprecated WANDB_DISABLED env var
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Assemble the Trainer: model, hyper-parameters, data splits, batch collation and metric.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    # The "test" split doubles as the evaluation set during training.
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Baseline: evaluate on the eval split before any fine-tuning has been run,
# to get the initial metrics for comparison.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 50
  Batch size = 32


{'eval_loss': 1.5935946702957153,
 'eval_model_preparation_time': 0.0023,
 'eval_accuracy': 0.32,
 'eval_runtime': 1.6399,
 'eval_samples_per_second': 30.49,
 'eval_steps_per_second': 1.22}

In [None]:
# Fine-tune the model with the Trainer configured above.
# Evaluation and checkpointing happen once per epoch, per training_args.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 200
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 56
  Number of trainable parameters = 66,957,317


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,1.5649,1.566865,0.0023,0.42
2,1.5045,1.501219,0.0023,0.32
3,1.0269,1.403615,0.0023,0.46
4,0.9772,1.154693,0.0023,0.54
5,0.8228,0.797802,0.0023,0.82
6,0.4744,0.425612,0.0023,0.96
7,0.1575,0.260267,0.0023,0.98
8,0.2105,0.18328,0.0023,1.0


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 50
  Batch size = 32


Saving model checkpoint to ./results/checkpoint-7
Configuration saved in ./results/checkpoint-7/config.json
Model weights saved in ./results/checkpoint-7/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./results/checkpoint-7/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 50
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-14
Configuration saved in ./results/checkpoint-14/config.json
Model weights saved in ./results/checkpoint-14/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_c

TrainOutput(global_step=56, training_loss=0.9532584547996521, metrics={'train_runtime': 108.1492, 'train_samples_per_second': 14.794, 'train_steps_per_second': 0.518, 'total_flos': 211959177216000.0, 'train_loss': 0.9532584547996521, 'epoch': 8.0})

## Prediction and Evaluation

In [None]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset.
# NOTE(review): this is the same "test" split that was evaluated after every
# epoch during training, so it is not an independent hold-out.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 50
  Batch size = 32


{'eval_loss': 0.1832798421382904,
 'eval_model_preparation_time': 0.0023,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.8023,
 'eval_samples_per_second': 62.325,
 'eval_steps_per_second': 2.493,
 'epoch': 8.0}

In [None]:
# Inspect one example from the test split (a dict with 'text' and 'label').
med_transcripts_dataset['test'][8]

{'text': "PROCEDURE:,  Subcutaneous ulnar nerve transposition.,PROCEDURE IN DETAIL: , After administering appropriate antibiotics and MAC anesthesia, the upper extremity was prepped and draped in the usual standard fashion.  The arm was exsanguinated with Esmarch, and the tourniquet inflated to 250 mmHg.,A curvilinear incision was made over the medial elbow, starting proximally at the medial intermuscular septum, curving posterior to the medial epicondyle, then curving anteriorly along the path of the ulnar nerve.  Dissection was carried down to the ulnar nerve.  Branches of the medial antebrachial and the medial brachial cutaneous nerves were identified and protected.,Osborne's fascia was released, an ulnar neurolysis performed, and the ulnar nerve was mobilized.  Six cm of the medial intermuscular septum was excised, and the deep periosteal origin of the flexor carpi ulnaris was released to avoid kinking of the nerve as it was moved anteriorly.,The subcutaneous plane just superficial

In [None]:
# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Classify the test sample straight from the dataset rather than from a pasted
# copy of its text, so the input always matches the current split.
# truncation=True cuts transcripts that exceed the model's maximum input length,
# matching how the training data was tokenized.
sample_text = med_transcripts_dataset["test"][8]["text"]
pipe(sample_text, truncation=True)

Device set to use cuda:0


[{'label': ' Neurosurgery', 'score': 0.9181930422782898}]

## Saving the Fine-tuned Model

In [None]:
# Persist the fine-tuned model. With no argument, it is written to the
# Trainer's output_dir ("./results" in training_args).
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json


In [None]:
# Rebuild the text-classification pipeline from the in-memory model and tokenizer.
# NOTE(review): this does not load the artifact written by trainer.save_model();
# confirm whether loading from the saved directory was intended here.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)