# Fine Tune Bert Model

Trying to fine tune BERT model to classify test

Created By [Anshul Chaudhary](https://www.linkedin.com/in/chaudharyanshul/)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

### Load Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_json('/content/clean_data.json', orient="records")

In [None]:
# test and train split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)

# reset the index for test and train data
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
train_df.head()

Unnamed: 0,text,label_encoded
0,◦| Adjusted Operating Income margin of 11.8%; ...,2
1,"| For the Three Months Ended March 31,| For th...",0
2,¨| Pre-commencement communications pursuant to...,1
3,"Board Retainer ………………………………………………………….| $105,0...",0
4,NINETEENTH SUPPLEMENTAL INDENTURE (this Ninet...,1


In [None]:
train_df['label_encoded'].value_counts()

label_encoded
1    132
2    116
0     57
Name: count, dtype: int64

In [None]:
test_df['label_encoded'].value_counts()

label_encoded
1    38
2    24
0    15
Name: count, dtype: int64

**Observations:**

* The split of data in labels is not uniform, so there will be some need for resampling on train data

### Tokenizing the text

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# model to be used from hugging face
model_name = "bert-base-uncased"

In [None]:
# Load pre-trained BERT model and tokenizer
tokenizer_bert = BertTokenizer.from_pretrained(model_name)
model_bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Skeleton Code for Training Model

##### Set Device for Training

In [None]:
import torch

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


##### k-Fold Cross-Validation Training with Resampling

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Number of splits
num_splits = 5

In [None]:
# Initialize k-fold cross-validator
kfold = KFold(n_splits=num_splits, shuffle=True, random_state=1)

##### Data Tokenization

In [None]:
from datasets import Dataset

In [None]:
def tokenizeData(tokenizer, train_df, test_df):
  '''
    Tokenizes the text data using the provided tokenizer and returns Dataset objects.

    Args:
      tokenizer: The tokenizer object to tokenize the text data.
      train_df: Pandas DataFrame containing the training data.
      test_df: Pandas DataFrame containing the testing data.

    Returns:
      train_dataset: Tokenized training dataset.
      test_dataset: Tokenized testing dataset.
  '''


  # converting pandas.core.frame.DataFrame to datasets.arrow_dataset.Dataset
  train_dataset = Dataset.from_pandas(train_df)
  test_dataset = Dataset.from_pandas(test_df)

  # Tokenize text column in test and train
  print("Tokenize Train Data:")
  train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)
  print("Tokenize Test Data:")
  test_dataset  = test_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)

  # drop unwanted column
  train_dataset = train_dataset.remove_columns(['text'])
  test_dataset = test_dataset.remove_columns(['text'])

  # rename label_encoded column to label
  train_dataset = train_dataset.rename_column("label_encoded", "label")
  test_dataset = test_dataset.rename_column("label_encoded", "label")

  return train_dataset, test_dataset

##### Temp

In [None]:
train_dataset, test_dataset = tokenizeData(tokenizer_bert, train_df, test_df)

Tokenize Train Data:


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenize Test Data:


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
from imblearn.over_sampling import RandomOverSampler

In [None]:
# number of epochs
num_epochs = 10

In [None]:
# batch size
batch_size = 10

In [None]:
model_bert.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_dataset)):
  print("Training fold {}".format(fold + 1))

  # Extract the training and validation sets
  train_subset = train_dataset.select(train_indices)
  val_subset = train_dataset.select(val_indices)

  train_df = pd.DataFrame(train_subset)

  # Resample the training set
  ros = RandomOverSampler(random_state=1)
  X_resampled, y_resampled = ros.fit_resample(train_df[['input_ids', 'attention_mask', 'token_type_ids']], train_df['label'])

  # Create dataset
  resampled_df = pd.DataFrame({
    'input_ids': list(X_resampled['input_ids']),
    'attention_mask': list(X_resampled['attention_mask']),
    'token_type_ids': list(X_resampled['token_type_ids']),
    'label': y_resampled
  })

  train = Dataset.from_pandas(resampled_df)
  val = Dataset.from_pandas(pd.DataFrame(val_subset))

  train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
  val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])


  # use torch dataloader to pass subset of data based on the fold index
  train_dataloader = DataLoader(train, shuffle=True, batch_size=batch_size)
  val_dataloader = DataLoader(val, batch_size=batch_size)

  # optimizer
  optimizer = AdamW(model_bert.parameters(), lr=10**-7)
  num_training_steps = num_epochs * len(train_dataloader)

  # schedule to decrease the LR till 0 in number of steps
  lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
  )

  # progress bar for training steps
  progress_bar = tqdm(range(num_training_steps))

  # set model in train model
  model_bert.train()

  # run for the set number of epoch
  for epoch in range(num_epochs):
    # for all batch in the train_dataloader
    for batch in train_dataloader:
      # separating tokens and labels
      inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
      labels = batch['label'].to(device)

      # Forward pass: Pass inputs and labels through the model
      outputs = model_bert(**inputs, labels=labels)
      loss = outputs.loss

      # Backward pass: Compute gradients
      loss.backward()

      # Optimize: Update model parameters
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

    # Validation after each epoch
    model_bert.eval()
    val_loss = 0
    val_steps = 0

    for batch in val_dataloader:
      # separating tokens and labels
      inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
      labels = batch['label'].to(device)

      # No gradient calculation during validation
      with torch.no_grad():
          outputs = model_bert(**inputs, labels=labels)
      val_loss += outputs.loss.item()
      val_steps += 1
    print(f"Validation loss after epoch {epoch + 1}: {val_loss / val_steps}")
    model_bert.train()

  print(f"Completed fold {fold + 1}")

Training fold 1




  0%|          | 0/330 [00:00<?, ?it/s]

Validation loss after epoch 1: 1.098869034222194
Validation loss after epoch 2: 1.0988526344299316
Validation loss after epoch 3: 1.0988813468388148
Validation loss after epoch 4: 1.0988560574395316
Validation loss after epoch 5: 1.0988585608346122
Validation loss after epoch 6: 1.098871418407985
Validation loss after epoch 7: 1.0988711629595076
Validation loss after epoch 8: 1.0988680635179793
Validation loss after epoch 9: 1.0988705498831612
Validation loss after epoch 10: 1.098874466759818
Completed fold 1
Training fold 2




  0%|          | 0/320 [00:00<?, ?it/s]

Validation loss after epoch 1: 1.0986269201551164
Validation loss after epoch 2: 1.0986395903996058
Validation loss after epoch 3: 1.0986404589244299
Validation loss after epoch 4: 1.0986477817807878
Validation loss after epoch 5: 1.098644529070173
Validation loss after epoch 6: 1.0986427068710327
Validation loss after epoch 7: 1.098648990903582
Validation loss after epoch 8: 1.0986448696681432
Validation loss after epoch 9: 1.0986459936414446
Validation loss after epoch 10: 1.0986449207578386
Completed fold 2
Training fold 3




  0%|          | 0/310 [00:00<?, ?it/s]

Validation loss after epoch 1: 1.0975759710584367
Validation loss after epoch 2: 1.0975667067936488
Validation loss after epoch 3: 1.0976496594292777
Validation loss after epoch 4: 1.0976128578186035
Validation loss after epoch 5: 1.0976427623203822
Validation loss after epoch 6: 1.0976192780903407
Validation loss after epoch 7: 1.0976180008479528
Validation loss after epoch 8: 1.0976132324763708
Validation loss after epoch 9: 1.0976211002894811
Validation loss after epoch 10: 1.0976152590342931
Completed fold 3
Training fold 4




  0%|          | 0/320 [00:00<?, ?it/s]

Validation loss after epoch 1: 1.0987379380634852
Validation loss after epoch 2: 1.0988109622682845
Validation loss after epoch 3: 1.0987577268055506
Validation loss after epoch 4: 1.0987859964370728
Validation loss after epoch 5: 1.09879241670881
Validation loss after epoch 6: 1.098776204245431
Validation loss after epoch 7: 1.0987872396196638
Validation loss after epoch 8: 1.0987954480307442
Validation loss after epoch 9: 1.0987976959773473
Validation loss after epoch 10: 1.0987859283174788
Completed fold 4
Training fold 5




  0%|          | 0/330 [00:00<?, ?it/s]

Validation loss after epoch 1: 1.0985187802995955
Validation loss after epoch 2: 1.0985177074159895
Validation loss after epoch 3: 1.0985205003193446
Validation loss after epoch 4: 1.098519001688276
Validation loss after epoch 5: 1.0985184907913208
Validation loss after epoch 6: 1.0985185248511178
Validation loss after epoch 7: 1.0985166345323836
Validation loss after epoch 8: 1.098518661090306
Validation loss after epoch 9: 1.0985189846583776
Validation loss after epoch 10: 1.0985196317945207
Completed fold 5


In [None]:
model_bert.eval()
all_labels = []
all_preds = []


In [None]:
test_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 77
})

In [None]:
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

In [None]:
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [None]:
for batch in test_dataloader:
    # Move tensors to the specified device
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
    labels = batch['label'].to(device)

    with torch.no_grad():
        outputs = model_bert(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_labels.extend(labels.cpu().numpy())
    all_preds.extend(predictions.cpu().numpy())

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy}")
print(classification_report(all_labels, all_preds))

Test Accuracy: 0.4935064935064935
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.49      1.00      0.66        38
           2       0.00      0.00      0.00        24

    accuracy                           0.49        77
   macro avg       0.16      0.33      0.22        77
weighted avg       0.24      0.49      0.33        77



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
