# Assignment 3.3

This notebook contains the solution and outputs for **Assignment 3.3**.

## Setup and Imports

In [1]:
import sys
sys.path.insert(0, '../src')

from dataclasses import dataclass
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tqdm
from bert_from_scratch import BertForSequenceClassification as MyBertForSequenceClassification
from lora_from_scratch import add_lora_layers, freeze_model, merge_lora_layers, unfreeze_model

In [2]:
def collate_fn(batch):
    """Assemble a list of dataset items into a single batch dictionary.

    Every item is indexed with the keys 'text' and 'label'. The texts are
    collected into a plain list (tokenization happens later, in the trainer),
    while each label is wrapped in a tensor and the tensors are stacked along
    a new leading dimension.

    Returns:
        dict with 'text' (list of the items' texts, in batch order) and
        'label' (stacked label tensor, one entry per item).
    """
    texts = []
    label_tensors = []
    for sample in batch:
        texts.append(sample['text'])
        label_tensors.append(torch.tensor(sample['label']))
    return {'text': texts, 'label': torch.stack(label_tensors)}

# Load the raw sample and derive a binary target plus a single text field.
df = pd.read_csv("../data/train-sample.csv")

# Binary target: 'open' -> 0, every listed closing reason -> 1.
string_to_int = {'open': 0, 'not a real question': 1, 'off topic': 1, 'not constructive': 1, 'too localized': 1}
df['OpenStatusInt'] = df['OpenStatus'].map(string_to_int)

# Series.map leaves NaN for any status missing from string_to_int; stop here
# with a clear message instead of letting NaN labels reach the loss function.
unmapped_statuses = df.loc[df['OpenStatusInt'].isna(), 'OpenStatus'].unique().tolist()
if unmapped_statuses:
    raise ValueError(f"OpenStatus values without a label mapping: {unmapped_statuses}")
df['OpenStatusInt'] = df['OpenStatusInt'].astype('int64')

# Column-wise string concatenation instead of a Python-level call per row.
df['TitleConcatWithBody'] = df['Title'] + " " + df['BodyMarkdown']
# Column-wise concatenation propagates NaN instead of raising, so check explicitly.
if df['TitleConcatWithBody'].isna().any():
    raise ValueError("Found rows with a missing Title or BodyMarkdown")

data_dict = {'text': df.TitleConcatWithBody.tolist(), 'label': df.OpenStatusInt.tolist()}
dataset_stackoverflow = Dataset.from_dict(data_dict)

# 80 / 10 / 10 train / validation / test split taken from one seeded shuffle.
n_samples = len(dataset_stackoverflow)
split_idx1 = int(n_samples * 0.8)
split_idx2 = int(n_samples * 0.9)
shuffled_dataset = dataset_stackoverflow.shuffle(seed=42)
train_dataset = shuffled_dataset.select(range(split_idx1))
val_dataset = shuffled_dataset.select(range(split_idx1, split_idx2))
test_dataset = shuffled_dataset.select(range(split_idx2, n_samples))

# Only the training loader is shuffled. Evaluation loaders keep a fixed order
# so the batch composition, and with it the reported mean batch loss, is
# identical from run to run.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

Map:   0%|          | 0/112217 [00:00<?, ? examples/s]

Map:   0%|          | 0/14027 [00:00<?, ? examples/s]

Map:   0%|          | 0/14028 [00:00<?, ? examples/s]

In [3]:
class BertTrainer:
    """Small training / evaluation loop for a BERT sequence classifier.

    Raw text batches are tokenized on the fly, the model is optimized with
    AdamW on a cross-entropy loss, and after every pass over a data loader the
    accuracy, macro precision / recall / F1 and the mean batch loss are
    printed. Optionally the checkpoint with the lowest evaluation loss is kept
    on disk.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)``; its output is used directly as class logits.
        tokenizer: Callable applied to a list of texts; its result must provide
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` tensors.
        train_dataloader: Iterable with a length that yields batches indexed
            by ``'text'`` and ``'label'``.
        eval_dataloader: Optional loader of the same form used for evaluation.
        epochs: Number of passes over ``train_dataloader`` made by ``train``.
        lr: Learning rate handed to AdamW.
        output_dir: Directory in which checkpoints are written.
        output_filename: Checkpoint file name prefix; ``_epoch_<n>.pt`` is
            appended to it.
        save: When True, an evaluation pass that improves on the best
            evaluation loss seen so far writes a checkpoint.
    """

    def __init__(self, model, tokenizer, train_dataloader, eval_dataloader=None, epochs=1, lr=5e-4, output_dir='./', output_filename='model_state_dict.pt', save=False):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.output_dir = output_dir
        self.output_filename = output_filename
        self.save = save
        # Best (lowest) evaluation loss seen so far and the epoch it came from.
        self.eval_loss = float('inf')
        self.epochs = epochs
        self.epoch_best_model = 0

    def train(self, evaluate=False):
        """Run ``self.epochs`` training passes, each optionally followed by an evaluation pass."""
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)

    def evaluate(self):
        """Run a single evaluation pass over ``self.eval_dataloader``.

        Raises:
            ValueError: If the trainer was created without an eval dataloader.
        """
        if self.eval_dataloader is None:
            raise ValueError("evaluate() requires an eval_dataloader")
        self.iteration(0, self.eval_dataloader, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Make one pass over ``data_loader`` and print the aggregated metrics.

        Args:
            epoch: Epoch index; used in the progress-bar label and in the
                checkpoint file name.
            data_loader: Loader yielding batches indexed by ``'text'`` and ``'label'``.
            train: When True the model is put in training mode and the
                optimizer is stepped after every batch; otherwise the model is
                put in eval mode and no gradients are tracked.
        """
        loss_accumulated = 0.
        correct_accumulated = 0
        samples_accumulated = 0
        preds_all = []
        labels_all = []
        self.model.train(train)
        mode = "train" if train else "eval"
        batch_iter = tqdm.tqdm(data_loader, desc=f"EP ({mode}) {epoch}", total=len(data_loader), bar_format="{l_bar}{r_bar}")
        for batch in batch_iter:
            batch_t = self.tokenizer(batch['text'], padding='max_length', max_length=512, truncation=True, return_tensors='pt')
            batch_t = {key: value.to(self.device) for key, value in batch_t.items()}
            labels = batch["label"].to(self.device)
            # Autograd bookkeeping is only needed when a backward pass follows;
            # turning it off for evaluation saves memory and time.
            with torch.set_grad_enabled(train):
                logits = self.model(input_ids=batch_t["input_ids"], token_type_ids=batch_t["token_type_ids"], attention_mask=batch_t["attention_mask"])
                loss = self.loss_fn(logits, labels)
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            preds = logits.argmax(dim=-1)
            loss_accumulated += loss.item()
            correct_accumulated += preds.eq(labels).sum().item()
            samples_accumulated += len(labels)
            preds_all.append(preds.detach())
            labels_all.append(labels.detach())
        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()
        accuracy = accuracy_score(labels_all, preds_all)
        precision = precision_score(labels_all, preds_all, average='macro')
        recall = recall_score(labels_all, preds_all, average='macro')
        f1 = f1_score(labels_all, preds_all, average='macro')
        # Mean of the per-batch losses (each batch counts equally).
        avg_loss_epoch = loss_accumulated / len(data_loader)
        print(f"samples={samples_accumulated}, correct={correct_accumulated}, acc={round(accuracy, 4)}, recall={round(recall, 4)}, prec={round(precision,4)}, f1={round(f1, 4)}, loss={round(avg_loss_epoch, 4)}")
        if self.save and not train and avg_loss_epoch < self.eval_loss:
            self._save_checkpoint(epoch, avg_loss_epoch)

    def _save_checkpoint(self, epoch, avg_loss):
        """Write model and optimizer state for ``epoch`` and record it as the new best."""
        dir_path = Path(self.output_dir)
        dir_path.mkdir(parents=True, exist_ok=True)
        file_path = dir_path / f"{self.output_filename}_epoch_{epoch}.pt"
        if epoch > 0:
            # Drop the checkpoint of the previous best epoch. Done with pathlib
            # so the class works outside IPython and with any path.
            file_path_best_model = dir_path / f"{self.output_filename}_epoch_{self.epoch_best_model}.pt"
            try:
                file_path_best_model.unlink()
            except FileNotFoundError:
                pass
        torch.save({'model_state_dict': self.model.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict()}, file_path)
        self.eval_loss = avg_loss
        self.epoch_best_model = epoch

In [4]:
# Tokenizer and classifier are both created from the 'bert-base-uncased' checkpoint.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')

# Configuration overrides handed to the from-scratch BERT implementation:
# vocabulary size and number of output classes.
bert_base_config_args = dict(vocab_size=30522, n_classes=2)
bert_base = MyBertForSequenceClassification.from_pretrained(
    model_type='bert-base-uncased',
    config_args=bert_base_config_args,
)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading weights from pretrained model: bert-base-uncased


Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
# Full fine-tuning run: no parameters are frozen, validation happens after
# every epoch, and save=True keeps the checkpoint with the lowest validation loss.
trainer_bert_base = BertTrainer(
    model=bert_base,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=5,
    lr=5e-6,
    output_dir='../models/bert_base_fine_tuned',
    output_filename='bert_base',
    save=True,
)
trainer_bert_base.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [48:24<00:00,  1.21it/s]


samples=112217, correct=85975, acc=0.7661, recall=0.7662, prec=0.7663, f1=0.7661, loss=0.4903


EP (eval) 0: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027, correct=11013, acc=0.7851, recall=0.7853, prec=0.7856, f1=0.7851, loss=0.4643


EP (train) 1: 100%|| 3507/3507 [48:27<00:00,  1.21it/s]


samples=112217, correct=89764, acc=0.7999, recall=0.7999, prec=0.7999, f1=0.7999, loss=0.4392


EP (eval) 1: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027, correct=11034, acc=0.7866, recall=0.7869, prec=0.7877, f1=0.7865, loss=0.462


EP (train) 2: 100%|| 3507/3507 [48:29<00:00,  1.21it/s]


samples=112217, correct=92032, acc=0.8201, recall=0.8201, prec=0.8201, f1=0.8201, loss=0.4056


EP (eval) 2: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027, correct=11081, acc=0.79, recall=0.7901, prec=0.7902, f1=0.79, loss=0.4624


EP (train) 3: 100%|| 3507/3507 [48:31<00:00,  1.20it/s]


samples=112217, correct=94453, acc=0.8417, recall=0.8417, prec=0.8417, f1=0.8417, loss=0.3671


EP (eval) 3: 100%|| 439/439 [02:26<00:00,  3.00it/s]


samples=14027, correct=10886, acc=0.7761, recall=0.7767, prec=0.7816, f1=0.7752, loss=0.4853


EP (train) 4: 100%|| 3507/3507 [48:34<00:00,  1.20it/s]


samples=112217, correct=97186, acc=0.8661, recall=0.8661, prec=0.8661, f1=0.8661, loss=0.3244


EP (eval) 4: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027, correct=10998, acc=0.7841, recall=0.7842, prec=0.7842, f1=0.7841, loss=0.5039


In [6]:
# Restore the epoch-1 checkpoint (lowest validation loss in the run above)
# and score it on the held-out test split.
# map_location='cpu' keeps the checkpoint tensors (model AND optimizer state)
# off the GPU and lets the file load on a CPU-only machine; load_state_dict
# then copies the weights into the model's existing parameters.
state_dict = torch.load('../models/bert_base_fine_tuned/bert_base_epoch_1.pt', map_location='cpu')
bert_base.load_state_dict(state_dict["model_state_dict"])
trainer_bert_base = BertTrainer(
    bert_base,
    tokenizer_base,
    lr=5e-6,
    epochs=5,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,  # evaluate() reads this loader
    output_dir='../models/bert_base_fine_tuned',
    output_filename='bert_base',
    save=False  # evaluation only: never overwrite the stored checkpoint
)
trainer_bert_base.evaluate()

EP (eval) 0: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14028, correct=11022, acc=0.7857, recall=0.7855, prec=0.7862, f1=0.7855, loss=0.4594


In [7]:
# Start again from the pretrained checkpoint so the LoRA run does not build on
# the fully fine-tuned weights from the previous experiment.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBertForSequenceClassification.from_pretrained(
    model_type='bert-base-uncased',
    config_args=dict(vocab_size=30522, n_classes=2),
)

# Order matters: the LoRA layers are inserted first, then freeze_model is applied.
lora_settings = dict(r=8, lora_alpha=16)
add_lora_layers(bert_base, **lora_settings)
freeze_model(bert_base)

Loading weights from pretrained model: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# Tally total vs. trainable parameter counts of the LoRA-augmented model.
param_sizes = [(param.numel(), param.requires_grad) for _, param in bert_base.named_parameters()]
n_params = sum(size for size, _ in param_sizes)
n_trainable_params = sum(size for size, trainable in param_sizes if trainable)

trainable_share = round(n_trainable_params / n_params * 100, 2)
print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {trainable_share}%")

Total parameters: 109778690
Trainable parameters: 296450
Percentage trainable: 0.27%


In [9]:
# LoRA fine-tuning run (r=8): same data and epoch count as the full
# fine-tuning run, but with a larger learning rate and a separate output directory.
trainer_bert_base_lora = BertTrainer(
    model=bert_base,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=5,
    lr=5e-4,
    output_dir='../models/bert_base_fine_tuned_lora_r8',
    output_filename='bert_base_lora_r8',
    save=True,
)
trainer_bert_base_lora.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [38:32<00:00,  1.52it/s]


samples=112217, correct=86105, acc=0.7673, recall=0.7673, prec=0.7674, f1=0.7673, loss=0.4879


EP (eval) 0: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027, correct=10730, acc=0.765, recall=0.764, prec=0.776, f1=0.7621, loss=0.4885


EP (train) 1: 100%|| 3507/3507 [38:33<00:00,  1.52it/s]


samples=112217, correct=89196, acc=0.7949, recall=0.7949, prec=0.7949, f1=0.7949, loss=0.4482


EP (eval) 1: 100%|| 439/439 [02:29<00:00,  2.94it/s]


samples=14027, correct=10908, acc=0.7776, recall=0.7777, prec=0.7777, f1=0.7776, loss=0.468


EP (train) 2: 100%|| 3507/3507 [38:33<00:00,  1.52it/s]


samples=112217, correct=90999, acc=0.8109, recall=0.8109, prec=0.8109, f1=0.8109, loss=0.4214


EP (eval) 2: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027, correct=10986, acc=0.7832, recall=0.7834, prec=0.7839, f1=0.7831, loss=0.4723


EP (train) 3: 100%|| 3507/3507 [38:35<00:00,  1.51it/s]


samples=112217, correct=92469, acc=0.824, recall=0.824, prec=0.824, f1=0.824, loss=0.3965


EP (eval) 3: 100%|| 439/439 [02:29<00:00,  2.94it/s]


samples=14027, correct=10937, acc=0.7797, recall=0.7799, prec=0.7802, f1=0.7797, loss=0.4713


EP (train) 4: 100%|| 3507/3507 [38:34<00:00,  1.52it/s]


samples=112217, correct=94023, acc=0.8379, recall=0.8379, prec=0.8379, f1=0.8379, loss=0.3718


EP (eval) 4: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027, correct=10862, acc=0.7744, recall=0.7745, prec=0.7747, f1=0.7743, loss=0.5011


In [10]:
# Restore the epoch-1 LoRA checkpoint (lowest validation loss in the run above).
# map_location="cpu" keeps the checkpoint tensors (model AND optimizer state)
# off the GPU and lets the file load on a CPU-only machine; load_state_dict
# then copies the weights into the model's existing parameters.
state_dict = torch.load("../models/bert_base_fine_tuned_lora_r8/bert_base_lora_r8_epoch_1.pt", map_location="cpu")
bert_base.load_state_dict(state_dict["model_state_dict"])

# Merge the LoRA layers, lift the freeze, and store the resulting state dict
# in its own "merged" sub-directory.
merge_lora_layers(bert_base)
unfreeze_model(bert_base)
dir_path = Path("../models/bert_base_fine_tuned_lora_r8/merged")
dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / "bert_base_lora_r8_epoch_1_merged.pt"
torch.save({"model_state_dict": bert_base.state_dict()}, file_path)

In [11]:
# Score the merged LoRA model on the held-out test split. Only evaluate() is
# called and save=False, so no training happens and no checkpoint is written.
trainer_bert_base_lora_r8 = BertTrainer(
    model=bert_base,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    epochs=5,
    lr=5e-6,
    output_dir='../models/bert_base_fine_tuned_lora_r8',
    output_filename='bert_base_lora_r8',
    save=False,
)
trainer_bert_base_lora_r8.evaluate()

EP (eval) 0: 100%|| 439/439 [02:22<00:00,  3.07it/s]


samples=14028, correct=10985, acc=0.7831, recall=0.8204, prec=0.7395, f1=0.7778, loss=0.4643
