# Overview

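Notebook for fine-tuning DistilBERT to predict the bias label of a news article from its pre-computed tokens. The label is modelled both as 5-class classification and as single-output regression, each trained either end-to-end ("full") or with the DistilBERT encoder frozen ("head"). A final section measures the inference runtime of each configuration.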

# Setup

In [None]:
# Mount Google Drive at /content/gdrive.
# NOTE: Shared folders are only reachable from the mount after you open Drive and
# choose "Add shortcut to Drive" in the shared folder's options menu.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
# Google Drive Paths to Folder
# Root of the mounted Drive. The working directory is switched to it so that the
# relative output paths used later in the notebook (e.g. "fine_tune/...") land in Drive.
path = "/content/gdrive/MyDrive/"
%cd {path}
# Display the current working directory to confirm the change.
%pwd

In [None]:
# Pip installs
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases on each run — consider pinning once a working set is known.
%pip install scikit-learn transformers datasets evaluate accelerate huggingface_hub

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from tqdm.notebook import tqdm
import os
import json

# Report the installed PyTorch version
print(torch.__version__)
torch.cuda.is_available()

# Pick the compute device as a plain string: 'cuda' when a GPU is usable, else 'cpu'
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load Data

In [None]:
# Load the tokenized articles, drop the columns listed below, rename `bias` to
# `label`, and have rows returned as torch tensors.
tokens = load_dataset("bzhao18/hyperpartisan-news-distilbert-tokens")
tokens = (
    tokens
    .remove_columns(["text", "title", "hyperpartisan", "url", "published_at"])
    .rename_column("bias", "label")
    .with_format("torch")
)
# Hold out 30% of the original train split; `dataset["test"]` is used as the
# validation set during fine-tuning. The seed is fixed so that Restart & Run All
# reproduces the same split instead of drawing a new one each time.
dataset = tokens["train"].train_test_split(test_size=0.3, shuffle=True, seed=42)
print(dataset)

# Fine Tune

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
def log_classification_metrics(log, labels, predictions, loss):
    """Compute classification metrics, append them to ``log`` and return them.

    Args:
        log: Dict holding lists under the keys 'accuracy', 'rmse', 'mae' and
            'loss'; one value is appended to each list.
        labels: True class ids.
        predictions: Predicted class ids, aligned with ``labels``.
        loss: Loss value to record alongside the metrics.

    Returns:
        Tuple ``(accuracy, rmse, mae)``.
    """
    accuracy = accuracy_score(labels, predictions)
    # mean_squared_error yields the MSE; take the square root so the value
    # really is the RMSE that the log key and the return name promise.
    rmse = float(mean_squared_error(labels, predictions)) ** 0.5
    mae = mean_absolute_error(labels, predictions)
    log['accuracy'].append(accuracy)
    log['rmse'].append(rmse)
    log['mae'].append(mae)
    log['loss'].append(loss)
    return accuracy, rmse, mae

def log_regression_metrics(log, labels, predictions, loss):
    """Compute regression metrics, append them to ``log`` and return them.

    Args:
        log: Dict holding lists under the keys 'rmse', 'mae' and 'loss'; one
            value is appended to each list.
        labels: True target values.
        predictions: Predicted values, aligned with ``labels``.
        loss: Loss value to record alongside the metrics.

    Returns:
        Tuple ``(rmse, mae)``.
    """
    # mean_squared_error yields the MSE; take the square root so the value
    # really is the RMSE that the log key and the return name promise.
    rmse = float(mean_squared_error(labels, predictions)) ** 0.5
    mae = mean_absolute_error(labels, predictions)
    log['rmse'].append(rmse)
    log['mae'].append(mae)
    log['loss'].append(loss)
    return rmse, mae

In [None]:
def init_model(task, mode):
    """Load a sequence-classification model for the given task and training mode.

    Args:
        task: "cls" for a 5-label classification head, "reg" for a single-output
            regression head.
        mode: "full" leaves every parameter trainable; "head" freezes the
            parameters under ``model.distilbert`` so only the remaining layers train.

    Returns:
        The model, already moved to the notebook-level ``device``.

    Raises:
        ValueError: If ``task`` or ``mode`` is not one of the supported values.
        NameError: If the notebook-level ``model_name`` has not been assigned.
    """
    num_labels_by_task = {"cls": 5, "reg": 1}
    # Validate both arguments before loading anything, so a typo fails
    # immediately with a clear message instead of yielding None.
    if task not in num_labels_by_task:
        raise ValueError(f"Unknown task {task!r}; expected 'cls' or 'reg'")
    if mode not in ("head", "full"):
        raise ValueError(f"Unknown mode {mode!r}; expected 'head' or 'full'")

    # NOTE(review): `model_name` is read from the notebook namespace but no cell
    # shown here assigns it — confirm it is defined before this function is called.
    try:
        checkpoint = model_name
    except NameError:
        raise NameError(
            "`model_name` is not defined; assign the Hugging Face checkpoint id "
            "to `model_name` before calling init_model"
        ) from None

    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels_by_task[task]
    ).to(device)

    if mode == "head":
        # Freeze the encoder; parameters outside `model.distilbert` stay trainable.
        for param in model.distilbert.parameters():
            param.requires_grad = False
    return model

In [None]:
###############################################################################
# Main
###############################################################################
# Optimisation settings: number of passes over the data and AdamW learning rate
num_epochs = 10
lr = 5e-5
# Data loading: examples per batch, whether training batches are shuffled, and
# the maximum number of batches processed per epoch in each phase
batch_size = 32
shuffle = True
train_limit = 100
valid_limit = 100

def _run_epoch(model, loader, task, limit, batch_log, desc, optimizer=None):
    """Run one pass over ``loader`` (at most ``limit`` batches) and log batch metrics.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``.
        loader: DataLoader yielding dicts with 'input_ids', 'attention_mask', 'label'.
        task: "cls" or "reg"; selects label dtype and how predictions are derived.
        limit: Maximum number of batches to process.
        batch_log: Metric dict that receives one entry per processed batch.
        desc: Progress-bar description.
        optimizer: Optimizer to step after each batch, or None for evaluation.

    Returns:
        Tuple ``(labels, predictions, mean_loss)`` over the processed batches.

    Raises:
        ValueError: If no batch was processed.
    """
    training = optimizer is not None
    model.train(training)
    seen_labels, seen_preds, losses = [], [], []
    # Size the bar by the number of batches that will actually run, which is
    # smaller than `limit` when the loader is short.
    planned = min(limit, len(loader))
    with torch.set_grad_enabled(training):
        for i, batch in tqdm(enumerate(loader), total=planned, desc=desc):
            if i == limit:
                break

            # forward prop
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            batch_labels = batch['label'].tolist()
            if task == "cls":
                labels = batch['label'].to(device)
            else:
                # the regression task passes its targets as floats
                labels = batch['label'].to(device, dtype=torch.float)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # backprop
            if training:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            # batch metrics
            loss_value = loss.item()
            if task == "cls":
                # predicted class = index of the largest logit
                batch_preds = outputs.logits.argmax(dim=1).flatten().cpu().detach().tolist()
                log_classification_metrics(batch_log, batch_labels, batch_preds, loss_value)
            else:
                batch_preds = outputs.logits.flatten().cpu().detach().tolist()
                log_regression_metrics(batch_log, batch_labels, batch_preds, loss_value)
            seen_labels += batch_labels
            seen_preds += batch_preds
            losses.append(loss_value)

    if not losses:
        raise ValueError(f"{desc.strip()}: no batches were processed (empty loader or limit of 0)")
    return seen_labels, seen_preds, sum(losses) / len(losses)


def train(task, mode):
    """Fine-tune a model for ``task``/``mode`` and save per-epoch checkpoints and logs.

    Reads the notebook-level settings ``lr``, ``num_epochs``, ``batch_size``,
    ``shuffle``, ``train_limit`` and ``valid_limit``, the ``dataset`` splits
    ("train" for training, "test" for validation) and ``device``.

    For every epoch this writes, under ``fine_tune/final_{mode}_{task}/``:
      * ``distilbert_chkpt_{epoch}/`` with the saved model and ``chkpt_logs.json``
        (per-batch metrics of that epoch);
      * ``logs.json`` (per-epoch metrics so far, rewritten each epoch).

    Args:
        task: "cls" (classification) or "reg" (regression).
        mode: Training mode forwarded to ``init_model``.

    Raises:
        ValueError: If ``task`` is unsupported or ``init_model`` returns no model.
    """
    if task not in ("cls", "reg"):
        raise ValueError(f"Unknown task {task!r}; expected 'cls' or 'reg'")

    def new_log():
        return {phase: {'accuracy': [], 'rmse': [], 'mae': [], 'loss': []}
                for phase in ('train', 'valid')}

    name = f"final_{mode}_{task}"
    # Build the model before touching the filesystem so that a bad task/mode
    # does not leave an empty output directory behind.
    model = init_model(task, mode)
    if model is None:
        raise ValueError(f"init_model returned no model for task={task!r}, mode={mode!r}")

    root = os.path.join("fine_tune", name)
    os.makedirs(root, exist_ok=True)
    logs = new_log()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    training_set = DataLoader(dataset["train"], batch_size=batch_size, shuffle=shuffle)
    validation_set = DataLoader(dataset["test"], batch_size=batch_size, shuffle=False)

    for epoch in tqdm(range(num_epochs), total=num_epochs, desc="Total progress"):
        epoch_logs = new_log()

        # train
        label_train, predict_train, avg_loss_train = _run_epoch(
            model, training_set, task, train_limit, epoch_logs['train'],
            f"     Training epoch {epoch}", optimizer=optimizer)

        # Validation
        label_valid, predict_valid, avg_loss_valid = _run_epoch(
            model, validation_set, task, valid_limit, epoch_logs['valid'],
            f"     Validating epoch {epoch}")

        # get epoch metrics
        summaries = (
            ("Training", 'train', label_train, predict_train, avg_loss_train),
            ("Validation", 'valid', label_valid, predict_valid, avg_loss_valid),
        )
        for title, phase, y_true, y_pred, avg_loss in summaries:
            if task == "cls":
                accuracy, rmse, mae = log_classification_metrics(logs[phase], y_true, y_pred, avg_loss)
                print(f"\t{name} \t{title} \tAccuracy: {accuracy:.3f} \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")
            else:
                rmse, mae = log_regression_metrics(logs[phase], y_true, y_pred, avg_loss)
                print(f"\t{name} \t{title} \tRMSE: {rmse:.3f} \tMAE: {mae: .3f}")

        # save checkpoints
        checkpoint_folder = os.path.join(root, f"distilbert_chkpt_{epoch}")
        os.makedirs(checkpoint_folder, exist_ok=True)
        model.save_pretrained(checkpoint_folder)
        with open(os.path.join(checkpoint_folder, "chkpt_logs.json"), 'w') as f:
            json.dump(epoch_logs, f)
        with open(os.path.join(root, "logs.json"), 'w') as f:
            json.dump(logs, f)

In [None]:
# Fine-tune every task/mode combination: both classification runs first, then regression
for run_task, run_mode in (("cls", "full"), ("cls", "head"), ("reg", "full"), ("reg", "head")):
    train(run_task, run_mode)

# Runtime

In [None]:
from transformers import AutoTokenizer
from time import time

In [None]:
def get_runtime(tokenizer, model, article):
    """Time one tokenization plus forward pass of ``model`` on ``article``.

    Args:
        tokenizer: Callable that tokenizes ``article`` and returns an object with
            ``.to(device)`` that can be unpacked into the model call.
        model: Model invoked as ``model(**tokens)``.
        article: Raw text to tokenize.

    Returns:
        Elapsed time in seconds, covering tokenization, transfer to the
        notebook-level ``device`` and the forward pass.
    """
    from time import perf_counter  # monotonic, high-resolution clock for intervals

    # CUDA kernels are launched asynchronously, so without synchronising the
    # clock would stop before the GPU has finished the forward pass.
    on_gpu = str(device).startswith("cuda") and torch.cuda.is_available()
    if on_gpu:
        torch.cuda.synchronize()
    start = perf_counter()
    tokens = tokenizer(article, return_tensors='pt', truncation=True, padding="max_length").to(device)
    with torch.no_grad():
        model(**tokens)
    if on_gpu:
        torch.cuda.synchronize()
    return perf_counter() - start

In [None]:
# Reload the dataset with its raw columns to fetch one article as the timing input.
full_dataset = load_dataset("bzhao18/hyperpartisan-news-distilbert-tokens")
# Index the row first: taking `["text"]` and then `[0]` can materialise the whole
# text column (depending on the installed `datasets` version) just to read one item.
article = full_dataset["train"][0]["text"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
tasks = ["cls", "reg"]
modes = ["full", "head"]
n_timed_runs = 100

# Average the single-article inference time for every task/mode combination.
# Models come straight from init_model, i.e. they are loaded from `model_name`,
# not from the checkpoints saved during fine-tuning.
for task in tasks:
    for mode in modes:
        model = init_model(task, mode)  # init_model already moves the model to `device`
        model.eval()
        # One untimed warm-up call so one-off start-up cost on the first forward
        # pass does not inflate the average.
        get_runtime(tokenizer, model, article)
        times = [get_runtime(tokenizer, model, article) for _ in range(n_timed_runs)]
        print(f"Task: {task}, Mode: {mode} \tAverage Runtime: {sum(times)/len(times):.4f}")