# Install and Import Requirements

In [1]:
!pip install -q peft
!pip install -q peft evaluate datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from transformers import BertTokenizer, BertModel, DataCollatorWithPadding, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, EarlyStoppingCallback, Trainer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
from datasets import DatasetDict, Dataset

import evaluate
from peft import LoraConfig, TaskType, get_peft_model

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("/content/final_dataset.csv") # can be downloaded from GitHub repo, contains 9.5k samples

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# create datasets for training, validation, and testing

temp_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
train_df, val_df = train_test_split(temp_df, test_size=0.125, stratify=temp_df["label"], random_state=73) # not sure if this split is right

# create datasets from dataframes
train_dataset = Dataset.from_pandas(train_df[['input', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df[['input', 'label']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df[['input', 'label']].reset_index(drop=True))

def tokenize_func(examples):
  return tokenizer(examples["input"], truncation=True, max_length=128, padding="max_length")

tokenized_train = train_dataset.map(tokenize_func, batched=True, remove_columns="input")
tokenized_val = val_dataset.map(tokenize_func, batched=True, remove_columns="input")
tokenized_test = test_dataset.map(tokenize_func, batched=True, remove_columns="input")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/6674 [00:00<?, ? examples/s]

Map:   0%|          | 0/954 [00:00<?, ? examples/s]

Map:   0%|          | 0/1908 [00:00<?, ? examples/s]

# Evaluation Setup

In [5]:
metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # used Claude to write code to calculate multiple training metrics
    accuracy = metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='binary')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='binary')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='binary')

    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()

    return {
        'accuracy': accuracy['accuracy'],
        'precision': precision['precision'],
        'recall': recall['recall'],
        'f1': f1['f1'],
        # for printed confusion matrix
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'true_negatives': tn,
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# Training Setup

In [8]:
# define LoRA config (from hyperparameter grid search)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.2,
    target_modules=["query", "key", "value"],
    bias="none"
)

# weighted trainer code from Claude to improve model training w/ class imbalance
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Calculate class weights
        n_samples = len(train_df)
        n_positive = train_df['label'].sum()
        n_negative = n_samples - n_positive
        pos_weight = n_negative / n_positive * 1.3

        loss_fct = nn.CrossEntropyLoss(
            weight=torch.tensor([1.0, pos_weight], dtype=torch.float32).to(logits.device)
        )
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [14]:
# function to train model
def train_full_model(number, train_dataset, val_dataset, tokenizer, lora_config):
  # define model
  model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
  model = get_peft_model(model, lora_config)


  training_args = TrainingArguments(
      output_dir=f"./model_train_f_{number}",
      eval_strategy="epoch",
      save_strategy="epoch",
      save_total_limit=None, # currently saves models from every epoch
      num_train_epochs=10,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=32,
      learning_rate=1e-4,

      # Claude code to improve model training by decreasing memorization and stabilizing learning
      weight_decay=0.01,
      warmup_ratio=0.1,

      # Claude code to prevent unecessary logging
      report_to=[],
      logging_strategy="no",

      load_best_model_at_end=True,
      metric_for_best_model="recall",
      greater_is_better=True
  )

  # Claude code to implement early stopping
  early_stopping = EarlyStoppingCallback(
      early_stopping_patience=5
  )


  # create trainer from weighted trainer class
  trainer = WeightedTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      compute_metrics=compute_metrics,
      tokenizer=tokenizer,
      callbacks=[early_stopping],
  )

  # train
  trainer.train()

  # final val metrics
  eval_results = trainer.evaluate()

  model.save_pretrained(f"./final_model_{number}") # for later reference

  return model

# Model Training

In [15]:
train_full_model(2, tokenized_train, tokenized_val, tokenizer, lora_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,True Positives,False Positives,False Negatives,True Negatives
1,No log,0.65628,0.566038,0.304348,0.653333,0.415254,147,336,78,393
2,No log,0.621094,0.613208,0.326087,0.6,0.422535,135,279,90,450
3,No log,0.608619,0.578616,0.324752,0.728889,0.449315,164,341,61,388
4,No log,0.606294,0.631027,0.354691,0.688889,0.468278,155,282,70,447
5,No log,0.605096,0.65304,0.370732,0.675556,0.47874,152,258,73,471
6,No log,0.612556,0.637317,0.358974,0.684444,0.470948,154,275,71,454
7,No log,0.621736,0.627883,0.353604,0.697778,0.469357,157,287,68,442
8,No log,0.622869,0.625786,0.352018,0.697778,0.467958,157,289,68,440


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.2, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default