# BERT based Transformer Model Training 

Trying to fine tune different BERT model to classify test

Created By [Anshul Chaudhary](https://www.linkedin.com/in/chaudharyanshul/)

### Load Data

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_json('/kaggle/input/clean-data/clean_data.json', orient="records")

In [7]:
# test and train split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)

# reset the index for test and train data
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [8]:
train_df.head()

Unnamed: 0,text,label_encoded
0,◦| Adjusted Operating Income margin of 11.8%; ...,2
1,"| For the Three Months Ended March 31,| For th...",0
2,¨| Pre-commencement communications pursuant to...,1
3,"Board Retainer ………………………………………………………….| $105,0...",0
4,NINETEENTH SUPPLEMENTAL INDENTURE (this Ninet...,1


In [9]:
train_df['label_encoded'].value_counts()

label_encoded
1    132
2    116
0     57
Name: count, dtype: int64

In [10]:
test_df['label_encoded'].value_counts()

label_encoded
1    38
2    24
0    15
Name: count, dtype: int64

**Observations:**

* The split of data in labels is not uniform, so there will be some need for resampling on train data

### Tokenizing the text

### Skeleton Code for Training Model

##### Set Device for Training

In [11]:
import torch

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


##### k-Fold Cross-Validation Training with Resampling

In [13]:
from sklearn.model_selection import KFold

In [14]:
def get_kfold(num_splits = 3):
  '''
    Initialize cross-validation 
    
    Args:
      num_splits: number of split on data
      
    Returns:
      KFold object
  '''
  kfold = KFold(n_splits=num_splits, shuffle=True, random_state=1)
  
  return kfold

##### Data Tokenization

In [15]:
from datasets import Dataset

In [16]:
def get_tokenizeData(tokenizer, train_df, test_df):
  '''
    Tokenize the text data using the provided tokenizer and returns Dataset objects.

    Args:
      tokenizer: The tokenizer object to tokenize the text data.
      train_df: Pandas DataFrame containing the training data.
      test_df: Pandas DataFrame containing the testing data.

    Returns:
      train_dataset: Tokenized training dataset.
      test_dataset: Tokenized testing dataset.
  '''


  # converting pandas.core.frame.DataFrame to datasets.arrow_dataset.Dataset
  train_dataset = Dataset.from_pandas(train_df)
  test_dataset = Dataset.from_pandas(test_df)

  # Tokenize text column in test and train
  print("Tokenize Train Data:")
  train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)
  print("Tokenize Test Data:")
  test_dataset  = test_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)

  # drop unwanted column
  train_dataset = train_dataset.remove_columns(['text'])
  test_dataset = test_dataset.remove_columns(['text'])

  # rename label_encoded column to label
  train_dataset = train_dataset.rename_column("label_encoded", "label")
  test_dataset = test_dataset.rename_column("label_encoded", "label")

  return train_dataset, test_dataset

##### Train Model

use the train data to train

In [17]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

2024-05-29 21:12:18.313535: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 21:12:18.313656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 21:12:18.421570: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [18]:
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = torch.argmax(torch.tensor(logits), dim=-1)  # Convert logits to tensor
  labels = torch.tensor(labels)  # Convert labels to tensor
  accuracy = accuracy_score(labels, predictions)
  report = classification_report(labels, predictions, output_dict=True)
  return {
    'accuracy': accuracy,
    'precision': report['weighted avg']['precision'],
    'recall': report['weighted avg']['recall'],
    'f1': report['weighted avg']['f1-score']
  }

In [19]:
def trainModel(model, train_dataset, kfold, device, training_args):
  '''
    Function to train model by passing arguments
    
    Args:
      model (torch.nn.Module): The model to be trained.
      train_dataset (datasets.arrow_dataset.Dataset): The training dataset.
      kfold (sklearn.model_selection._split.KFold): The KFold cross-validator object.
      num_epochs (int): Number of training epochs.
      batch_size (int): Batch size for training.
      device (torch.device): Device to train the model on.
      model_name (str): name of the model.
      
    Returns:
      model: custom trained model
  '''
  
  # model to device
  model.to(device)
  
  # Train loop for k folds
  for fold, (train_indices, val_indices) in enumerate(kfold.split(train_dataset)):

    print("Training fold {}".format(fold + 1))

    # Extract the training and validation sets
    train_subset = train_dataset.select(train_indices)
    val_subset = train_dataset.select(val_indices)

    train_df = pd.DataFrame(train_subset)
    val_df = pd.DataFrame(val_subset)
    
    has_token_type_ids = 'token_type_ids' in train_df.columns.tolist()

    # Resample the training set
    ros = RandomOverSampler(random_state=1)
    if has_token_type_ids:
        X_resampled, y_resampled = ros.fit_resample(train_df[['input_ids', 'attention_mask', 'token_type_ids']], train_df['label'])
        # Create dataset
        resampled_df = pd.DataFrame({
          'input_ids': list(X_resampled['input_ids']),
          'attention_mask': list(X_resampled['attention_mask']),
          'token_type_ids': list(X_resampled['token_type_ids']),
          'label': y_resampled
        })
    else:
        X_resampled, y_resampled = ros.fit_resample(train_df[['input_ids', 'attention_mask']], train_df['label'])
        # Create dataset
        resampled_df = pd.DataFrame({
          'input_ids': list(X_resampled['input_ids']),
          'attention_mask': list(X_resampled['attention_mask']),
          'label': y_resampled
        })
    
    # train and val data
    train = Dataset.from_pandas(resampled_df)
    val = Dataset.from_pandas(val_df)
    
    if has_token_type_ids:
        train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
        val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
    else:
        train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    
    # assign model and data to the Trainer   
    trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=train,
          eval_dataset=val,
          compute_metrics=compute_metrics
      )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Validation results for fold {fold + 1}: {eval_results}")
  
  return model

##### Evaluate Model

use test data to evaluate

In [20]:
def evaluateModel(model, test_dataset, training_args):
  '''
  
  '''
  trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
  )

  test_results = trainer.evaluate()
  print(f"Test results: {test_results}")

##### Export Models

In [21]:
import os

In [32]:
def exportModels(model, tokenizer, path):
    '''
    Export the fine-tuned model and tokenizer to the specified directory.
    
    Args:
    model (transformers.PreTrainedModel): The fine-tuned model to be exported.
    tokenizer (transformers.PreTrainedTokenizer): The tokenizer used during model training.
    path (str): The directory path where the model and tokenizer will be saved.
    '''
    
    # Define the base path where the model and tokenizer will be saved
    base_path = '/kaggle/working/models/' + path
    
    # Save the model to the specified path
    model.save_pretrained(base_path)
    
    # Save the tokenizer to the specified path
    tokenizer.save_pretrained(base_path)

### Main Function for Training

In [23]:
def main(model, model_name, tokenizer, train_df, test_df, num_splits=3, batch_size=8, num_epochs=5):
  '''
    Function to perform the main training and evaluation pipeline.

    Args:
      model (torch.nn.Module): The model to be trained.
      model_name (str): Name of the model.
      tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for tokenization.
      train_df (pandas.DataFrame): DataFrame containing the training data.
      test_df (pandas.DataFrame): DataFrame containing the test data.
      num_splits (int): Number of splits for k-fold cross-validation.
      batch_size (int): Batch size for training and evaluation.
      num_epochs (int): Number of training epochs.

    Returns:
      None
  '''
  # get kfold
  kfold = get_kfold(num_splits)
  
  # Train and Test dataset
  train_dataset, test_dataset = get_tokenizeData(tokenizer, train_df, test_df)
  
  # Training arguments
  training_args = TrainingArguments(
    output_dir='/kaggle/working/result/'+model_name,    
    evaluation_strategy="epoch",
    learning_rate=10**-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs, 
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs'+model_name,
    save_total_limit=1,
  )
  
  # Train the model
  model = trainModel(model, train_dataset, kfold, device, training_args)
  
  # Evaluate the trained model
  evaluateModel(model, test_dataset, training_args)

  # Export model
  exportModels(model, tokenizer, model_name)

### Bert Model Fine Tuning

In [24]:
from transformers import BertTokenizer, BertForSequenceClassification

##### Bert Model 1

Train 1st bert model using the below parameters

parameters: 

* epoch = 5
* folds = 3
* batch size = 8

In [25]:
# model to be used from hugging face
model_name = "bert-base-uncased"

In [26]:
# Load pre-trained BERT model and tokenizer
tokenizer_bert = BertTokenizer.from_pretrained(model_name)
model_bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
main(model_bert, 'bert_custom_1', tokenizer_bert, train_df, test_df)

Tokenize Train Data:


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenize Test Data:


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Training fold 1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.676244,0.852941,0.859451,0.852941,0.850063
2,No log,0.504783,0.862745,0.867969,0.862745,0.859431
3,No log,0.456931,0.852941,0.859451,0.852941,0.850063
4,No log,0.431449,0.852941,0.859451,0.852941,0.850063
5,No log,0.431642,0.852941,0.859451,0.852941,0.850063


Validation results for fold 1: {'eval_loss': 0.43164223432540894, 'eval_accuracy': 0.8529411764705882, 'eval_precision': 0.859451290397582, 'eval_recall': 0.8529411764705882, 'eval_f1': 0.8500632030043795, 'eval_runtime': 1.7612, 'eval_samples_per_second': 57.915, 'eval_steps_per_second': 7.381, 'epoch': 5.0}
Training fold 2


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.088481,0.990196,0.990461,0.990196,0.990213
2,No log,0.088826,0.990196,0.990461,0.990196,0.990213
3,No log,0.059671,0.990196,0.990461,0.990196,0.990213
4,No log,0.063026,0.990196,0.990461,0.990196,0.990213
5,No log,0.062704,0.990196,0.990461,0.990196,0.990213


Validation results for fold 2: {'eval_loss': 0.06270415335893631, 'eval_accuracy': 0.9901960784313726, 'eval_precision': 0.9904610492845788, 'eval_recall': 0.9901960784313726, 'eval_f1': 0.9902126929009141, 'eval_runtime': 1.7784, 'eval_samples_per_second': 57.355, 'eval_steps_per_second': 7.31, 'epoch': 5.0}
Training fold 3


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.071759,0.980198,0.98192,0.980198,0.980407
2,No log,0.043677,0.990099,0.990549,0.990099,0.990154
3,No log,0.020155,0.990099,0.990549,0.990099,0.990154
4,No log,0.026205,0.990099,0.990549,0.990099,0.990154
5,No log,0.031628,0.990099,0.990549,0.990099,0.990154


Validation results for fold 3: {'eval_loss': 0.03162804991006851, 'eval_accuracy': 0.9900990099009901, 'eval_precision': 0.9905490549054906, 'eval_recall': 0.9900990099009901, 'eval_f1': 0.9901544931741703, 'eval_runtime': 1.7532, 'eval_samples_per_second': 57.61, 'eval_steps_per_second': 7.415, 'epoch': 5.0}


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Test results: {'eval_loss': 0.2579662501811981, 'eval_accuracy': 0.935064935064935, 'eval_precision': 0.9416486291486291, 'eval_recall': 0.935064935064935, 'eval_f1': 0.9357412564243682, 'eval_runtime': 1.445, 'eval_samples_per_second': 53.289, 'eval_steps_per_second': 6.921}


**Observations:**

* Looks like the accuracy of model on unseen data was about 94.8% 
* The F1-score is also high at around 94.8%, suggesting a good balance between correctly identifying
* There is a good balance between the Precision and Recall around 95% 

##### Bert Model 2

Train 2nd bert model using the below parameters

parameters: 

* epoch = 8
* folds = 5
* batch size = 10

In [30]:
# Load pre-trained BERT model and tokenizer
tokenizer_bert_2 = BertTokenizer.from_pretrained(model_name)
model_bert_2 = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
main(model_bert_2, 'bert_custom_2', tokenizer_bert_2, train_df, test_df,  num_splits=5, batch_size=10, num_epochs=8)

Tokenize Train Data:


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenize Test Data:


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Training fold 1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.760756,0.770492,0.833477,0.770492,0.752821
2,No log,0.562611,0.819672,0.832847,0.819672,0.813867
3,No log,0.44871,0.885246,0.893202,0.885246,0.882628
4,No log,0.393166,0.918033,0.919704,0.918033,0.917621
5,No log,0.346405,0.901639,0.902646,0.901639,0.900615
6,No log,0.351082,0.918033,0.919704,0.918033,0.917621
7,No log,0.395643,0.836066,0.845307,0.836066,0.831615
8,No log,0.379978,0.852459,0.858365,0.852459,0.849112


Validation results for fold 1: {'eval_loss': 0.3799777626991272, 'eval_accuracy': 0.8524590163934426, 'eval_precision': 0.8583654773384763, 'eval_recall': 0.8524590163934426, 'eval_f1': 0.8491120218579236, 'eval_runtime': 1.1322, 'eval_samples_per_second': 53.878, 'eval_steps_per_second': 6.183, 'epoch': 8.0}
Training fold 2


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.033372,1.0,1.0,1.0,1.0
2,No log,0.022501,1.0,1.0,1.0,1.0
3,No log,0.023115,1.0,1.0,1.0,1.0
4,No log,0.019109,1.0,1.0,1.0,1.0
5,No log,0.031723,0.983607,0.984192,0.983607,0.983595
6,No log,0.017167,1.0,1.0,1.0,1.0
7,No log,0.02469,0.983607,0.984192,0.983607,0.983595
8,No log,0.026336,0.983607,0.984192,0.983607,0.983595


Validation results for fold 2: {'eval_loss': 0.026336295530200005, 'eval_accuracy': 0.9836065573770492, 'eval_precision': 0.984192037470726, 'eval_recall': 0.9836065573770492, 'eval_f1': 0.9835948686479061, 'eval_runtime': 1.1394, 'eval_samples_per_second': 53.535, 'eval_steps_per_second': 6.143, 'epoch': 8.0}
Training fold 3


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.001966,1.0,1.0,1.0,1.0
2,No log,0.001286,1.0,1.0,1.0,1.0
3,No log,0.000949,1.0,1.0,1.0,1.0
4,No log,0.000838,1.0,1.0,1.0,1.0
5,No log,0.000675,1.0,1.0,1.0,1.0
6,No log,0.000603,1.0,1.0,1.0,1.0
7,No log,0.000557,1.0,1.0,1.0,1.0
8,No log,0.000545,1.0,1.0,1.0,1.0


Validation results for fold 3: {'eval_loss': 0.000545431103091687, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.1374, 'eval_samples_per_second': 53.633, 'eval_steps_per_second': 6.155, 'epoch': 8.0}
Training fold 4


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.000248,1.0,1.0,1.0,1.0
2,No log,0.000156,1.0,1.0,1.0,1.0
3,No log,0.000116,1.0,1.0,1.0,1.0
4,No log,9.3e-05,1.0,1.0,1.0,1.0
5,No log,8.1e-05,1.0,1.0,1.0,1.0
6,No log,7.4e-05,1.0,1.0,1.0,1.0
7,No log,6.9e-05,1.0,1.0,1.0,1.0
8,No log,6.8e-05,1.0,1.0,1.0,1.0


Validation results for fold 4: {'eval_loss': 6.799568654969335e-05, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.1292, 'eval_samples_per_second': 54.022, 'eval_steps_per_second': 6.199, 'epoch': 8.0}
Training fold 5


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,3.9e-05,1.0,1.0,1.0,1.0
2,No log,3e-05,1.0,1.0,1.0,1.0
3,No log,2.6e-05,1.0,1.0,1.0,1.0
4,No log,2.2e-05,1.0,1.0,1.0,1.0
5,No log,2e-05,1.0,1.0,1.0,1.0
6,No log,1.9e-05,1.0,1.0,1.0,1.0
7,No log,1.8e-05,1.0,1.0,1.0,1.0
8,No log,1.8e-05,1.0,1.0,1.0,1.0


Validation results for fold 5: {'eval_loss': 1.7656486306805164e-05, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.1382, 'eval_samples_per_second': 53.593, 'eval_steps_per_second': 6.15, 'epoch': 8.0}


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Test results: {'eval_loss': 0.5803845524787903, 'eval_accuracy': 0.948051948051948, 'eval_precision': 0.9554730983302412, 'eval_recall': 0.948051948051948, 'eval_f1': 0.9486069486069486, 'eval_runtime': 1.5237, 'eval_samples_per_second': 50.536, 'eval_steps_per_second': 5.25}


**Observations:**

* The Accuracy and precission seems to be the same as teh previous Model
* If we look at the loss for each epoch and in the final evaluation the loss increase with epochs and the final loss is greater than the previous model
* Some over fitting is there in this Model


##### Bert Model 3

Train 3rd bert model using the below parameters

parameters: 

* epoch = 8
* folds = 3
* batch size = 8

In [33]:
# Load pre-trained BERT model and tokenizer
tokenizer_bert_3 = BertTokenizer.from_pretrained(model_name)
model_bert_3 = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
main(model_bert_3, 'bert_custom_3', tokenizer_bert_3, train_df, test_df,  num_splits=3, batch_size=8, num_epochs=8)

Tokenize Train Data:


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenize Test Data:


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Training fold 1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.790609,0.813725,0.843801,0.813725,0.806374
2,No log,0.565893,0.862745,0.872394,0.862745,0.859785
3,No log,0.44895,0.882353,0.893717,0.882353,0.880051
4,No log,0.38965,0.872549,0.880583,0.872549,0.870285
5,No log,0.378325,0.882353,0.885121,0.882353,0.881641
6,No log,0.399204,0.872549,0.880849,0.872549,0.869036
7,No log,0.375632,0.882353,0.889058,0.882353,0.880709
8,No log,0.379688,0.882353,0.889058,0.882353,0.880709


Validation results for fold 1: {'eval_loss': 0.3796875476837158, 'eval_accuracy': 0.8823529411764706, 'eval_precision': 0.8890577202748107, 'eval_recall': 0.8823529411764706, 'eval_f1': 0.8807089732444703, 'eval_runtime': 1.7775, 'eval_samples_per_second': 57.384, 'eval_steps_per_second': 7.314, 'epoch': 8.0}
Training fold 2


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.051051,0.990196,0.990461,0.990196,0.990213
2,No log,0.054278,0.990196,0.990461,0.990196,0.990213
3,No log,0.050688,0.990196,0.990461,0.990196,0.990213
4,No log,0.056609,0.980392,0.980657,0.980392,0.980311
5,No log,0.051924,0.990196,0.990461,0.990196,0.990213
6,No log,0.053836,0.990196,0.990461,0.990196,0.990213
7,No log,0.054704,0.990196,0.990461,0.990196,0.990213
8,No log,0.055116,0.990196,0.990461,0.990196,0.990213


Validation results for fold 2: {'eval_loss': 0.05511612817645073, 'eval_accuracy': 0.9901960784313726, 'eval_precision': 0.9904610492845788, 'eval_recall': 0.9901960784313726, 'eval_f1': 0.9902126929009141, 'eval_runtime': 1.7673, 'eval_samples_per_second': 57.715, 'eval_steps_per_second': 7.356, 'epoch': 8.0}
Training fold 3


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.023822,0.990099,0.990549,0.990099,0.990154
2,No log,0.001536,1.0,1.0,1.0,1.0
3,No log,0.001148,1.0,1.0,1.0,1.0
4,No log,0.000902,1.0,1.0,1.0,1.0
5,No log,0.000784,1.0,1.0,1.0,1.0
6,No log,0.000709,1.0,1.0,1.0,1.0
7,No log,0.000666,1.0,1.0,1.0,1.0
8,No log,0.000652,1.0,1.0,1.0,1.0


Validation results for fold 3: {'eval_loss': 0.000652303802780807, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.7563, 'eval_samples_per_second': 57.506, 'eval_steps_per_second': 7.402, 'epoch': 8.0}


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Test results: {'eval_loss': 0.3694307208061218, 'eval_accuracy': 0.948051948051948, 'eval_precision': 0.9554730983302412, 'eval_recall': 0.948051948051948, 'eval_f1': 0.9486069486069486, 'eval_runtime': 1.4469, 'eval_samples_per_second': 53.216, 'eval_steps_per_second': 6.911}


**Observations:**

* Model 1 achieved a lower loss of approximately 0.217 compared to Model 3, which had a loss of approximately 0.369. This indicates that Model 1's predictions were slightly more aligned with the actual labels compared to Model 3
* Accuracy, Precision, Recall, and F1-score are more or less the same for both the models

### Distil BERT Model Fine Tuning

In [35]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [36]:
model_name_distilBert = 'distilbert-base-uncased'

##### Distil Bert Model 1

Train 1st Distil bert model using the below parameters

parameters: 

* epoch = 5
* folds = 3
* batch size = 8

In [37]:
# Load pre-trained BERT model and tokenizer
tokenizer_distilBert_1 = DistilBertTokenizer.from_pretrained(model_name_distilBert)
model_distilBert_1 = DistilBertForSequenceClassification.from_pretrained(model_name_distilBert, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
main(model_distilBert_1, 'distilBert_custom_1', tokenizer_distilBert_1, train_df, test_df)

Tokenize Train Data:


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Tokenize Test Data:


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Training fold 1


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.832282,0.833333,0.851441,0.833333,0.829953
2,No log,0.613604,0.843137,0.844673,0.843137,0.840181
3,No log,0.497283,0.872549,0.880849,0.872549,0.869036
4,No log,0.447065,0.872549,0.880849,0.872549,0.869036
5,No log,0.432346,0.872549,0.880849,0.872549,0.869036


Validation results for fold 1: {'eval_loss': 0.43234574794769287, 'eval_accuracy': 0.8725490196078431, 'eval_precision': 0.8808489304812834, 'eval_recall': 0.8725490196078431, 'eval_f1': 0.8690364663517707, 'eval_runtime': 0.9035, 'eval_samples_per_second': 112.894, 'eval_steps_per_second': 14.388, 'epoch': 5.0}
Training fold 2


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.152626,0.990196,0.990461,0.990196,0.990213
2,No log,0.114909,0.980392,0.981424,0.980392,0.980453
3,No log,0.091465,0.990196,0.990461,0.990196,0.990213
4,No log,0.089477,0.990196,0.990461,0.990196,0.990213
5,No log,0.088853,0.980392,0.981424,0.980392,0.980453


Validation results for fold 2: {'eval_loss': 0.08885282278060913, 'eval_accuracy': 0.9803921568627451, 'eval_precision': 0.9814241486068112, 'eval_recall': 0.9803921568627451, 'eval_f1': 0.9804528793499381, 'eval_runtime': 0.8999, 'eval_samples_per_second': 113.349, 'eval_steps_per_second': 14.446, 'epoch': 5.0}
Training fold 3


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.08877,0.980198,0.98192,0.980198,0.980407
2,No log,0.087337,0.980198,0.98192,0.980198,0.980407
3,No log,0.056601,0.980198,0.98192,0.980198,0.980407
4,No log,0.078703,0.980198,0.98192,0.980198,0.980407
5,No log,0.083729,0.980198,0.98192,0.980198,0.980407


Validation results for fold 3: {'eval_loss': 0.08372855186462402, 'eval_accuracy': 0.9801980198019802, 'eval_precision': 0.981919931123547, 'eval_recall': 0.9801980198019802, 'eval_f1': 0.9804065772430901, 'eval_runtime': 0.8987, 'eval_samples_per_second': 112.385, 'eval_steps_per_second': 14.465, 'epoch': 5.0}


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Test results: {'eval_loss': 0.20548087358474731, 'eval_accuracy': 0.948051948051948, 'eval_precision': 0.9554730983302412, 'eval_recall': 0.948051948051948, 'eval_f1': 0.9486069486069486, 'eval_runtime': 0.7525, 'eval_samples_per_second': 102.331, 'eval_steps_per_second': 13.29}


**Observations:**

* The result matrix looks pretty much the same as Model 1 for BERT