In [1]:
MAX_TOKEN_LEN = 128 #@param
EPOCHS = 20 #@param
BATCH_SIZE = 16 #@param
PATIENCE = 3 #@param
BERT_MODEL_NAME = 'bert-base-uncased' #@param
BASE_LEARNING_RATE = 5e-5 #@param
WARMUP_EPOCHS = 7 #@param
NUM_TOP_BERT_LAYERS_FREEZE = 0 #@param
DROPOUT_PROB = 0.00 #@param

In [2]:
# these could just be pip installs but this way you can run all the cells faster

#@title Install dependencies

import importlib
import subprocess

# Define the packages and their versions
packages = {
  'pytorch_lightning': '==2.4.0',
  'transformers': '==4.42.4',
  'torchmetrics': '==1.4.1',
  'datasets': '==2.20.0',
  'plotly': ""
}

def install_package(package_name, version=None):
    try:
        importlib.import_module(package_name)
        print(f"{package_name} is already installed.")
    except ImportError:
        print(f"Installing {package_name}...")
        if version:
            subprocess.check_call(['pip', 'install', f'{package_name}{version}', '--quiet'])
        else:
            subprocess.check_call(['pip', 'install', package_name, '--quiet'])

for package, version in packages.items():
    install_package(package, version)

print("All packages are installed.")

  from .autonotebook import tqdm as notebook_tqdm


pytorch_lightning is already installed.
transformers is already installed.
torchmetrics is already installed.
datasets is already installed.
plotly is already installed.
All packages are installed.


In [3]:

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from torchmetrics.classification import Accuracy, F1Score, AUROC
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

pl.seed_everything(RANDOM_SEED)

Seed set to 42


42

In [4]:
#@title Load BABE Training Dataset
from datasets import load_dataset

dataset = load_dataset("mediabiasgroup/BABE", split="train")
dataset = pd.DataFrame(dataset)
dataset.head()

Unnamed: 0,text,outlet,label,topic,news_link,biased_words,uuid,type,label_opinion
0,NYPD Commissioner Dermot Shea on Monday expres...,Breitbart,0,marriage-equality,http://feedproxy.google.com/~r/breitbart/~3/F5...,[],GtvFWZmmQmybyeMnb8Wbsr,,Entirely factual
1,School systems across the country are adopting...,Federalist,1,islam,https://thefederalist.com/2020/07/08/black-liv...,"['indoctrinating', 'Marxist', 'alarming']",mvoQPtabs6NZbby6LkLbms,,Expresses writer’s opinion
2,"And then along came President Barry Obama, who...",Breitbart,1,marriage-equality,http://feedproxy.google.com/~r/breitbart/~3/ks...,"['what', 'the', 'hell']",RDWPbijx3n2aw6NiMHt7di,,Expresses writer’s opinion
3,"The curfews, which have never before occurred ...",Alternet,1,elections-2020,https://www.alternet.org/2020/06/we-just-got-a...,"['false', 'claims']",2uYKw5KpXasJWH65WCjSu4,left,Entirely factual
4,"Rather than help be a part of the solution, Tr...",Alternet,1,elections-2020,https://www.alternet.org/2020/06/trump-thought...,['racist'],SRGvrzY9PkvtHESdts35Rw,left,Expresses writer’s opinion


In [5]:
#@title Split into Training and Validation Sets

train_df, val_df = train_test_split(dataset, test_size=0.05, random_state=42)

print(f"Shape of train_df: {train_df.shape}")
print(f"Shape of val_df: {val_df.shape}")

Shape of train_df: (2964, 9)
Shape of val_df: (157, 9)


In [6]:

import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SocialBiasDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int = 128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text_str = data_row['text']
        label = data_row['label']

        # tokenize the text
        encoding = self.tokenizer.encode_plus(
            text_str,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return dict(
            text_str=text_str,
            input_ids=encoding["input_ids"].flatten(), # flatten to single dimension
            attention_mask=encoding["attention_mask"].flatten(), # flatten to single dimension
            label=torch.FloatTensor([label]) # turns the int into a tensor w it as the only valuse, still one dimension

          )

In [7]:

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

train_dataset = SocialBiasDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_LEN
)

sample_item = train_dataset[0]
print(sample_item.keys(), "\n")
print(f"text_str: {sample_item['text_str']}")
print(f"input_ids shape: {sample_item['input_ids'].shape}")
print(f"attention_mask shape: {sample_item['attention_mask'].shape}")
print(f"labels shape: {sample_item['label'].shape} -- ex. {sample_item['label']}")

dict_keys(['text_str', 'input_ids', 'attention_mask', 'label']) 

text_str: This wasn’t a moment of anger where Keon called someone a racist or sexist name because he’d lost his temper or felt threatened.
input_ids shape: torch.Size([128])
attention_mask shape: torch.Size([128])
labels shape: torch.Size([1]) -- ex. tensor([1.])


In [8]:

class SocialBiasDataModule(pl.LightningDataModule):
    def __init__(self, train_df, test_df, tokenizer, batch_size=16, max_token_len=128):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        # use the pt datasets defined above
        self.train_dataset = SocialBiasDataset(self.train_df, self.tokenizer, self.max_token_len)
        self.test_dataset = SocialBiasDataset(self.test_df, self.tokenizer, self.max_token_len)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=4
        )

# init the data module
data_module = SocialBiasDataModule(
    train_df,
    val_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_LEN
)

In [9]:

class SocialBiasClassifier(pl.LightningModule):
    def __init__(self, n_training_steps=None, n_warmup_steps=None, num_layers_to_freeze=0, dropout_prob=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_prob),
            nn.Linear(self.bert.config.hidden_size, 1)  # 1 neuron for binary classification
        )
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.loss_fn = nn.BCELoss()  # using binary cross entropy loss

        # freeze layers (wasn't useful in testing)
        self.freeze_bert_layers(num_layers_to_freeze)

        # metrics
        self.train_accuracy = Accuracy(task="binary")
        self.val_accuracy = Accuracy(task="binary")
        self.train_f1 = F1Score(task="binary")
        self.val_f1 = F1Score(task="binary")
        self.train_outputs = [] # used in train_auroc
        self.train_auroc = AUROC(task="binary")
        self.val_auroc = AUROC(task="binary")

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)

        # # mean pooling
        # attention_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size())
        # sum_hidden_states = torch.sum(hidden_states * attention_mask_expanded, dim=1)
        # num_tokens = torch.sum(attention_mask, dim=1).clamp(min=1)  # don't divide by 0
        # mean_pooled_output = sum_hidden_states / num_tokens.unsqueeze(-1)  # reshape num_tokens?

        pooler_output = output.pooler_output
        output = self.classifier(output.pooler_output) # change this to mean_pooled_output if you uncomment the lines above
        output = torch.sigmoid(output).squeeze(-1)
        loss = 0
        if labels is not None:
            loss = self.loss_fn(output, labels)

        return loss, output

    def training_step(self, batch, batch_idx):
        # parse batch
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"].squeeze(-1)  # squeeze labels to match outputs shape

        # run forward pass
        loss, outputs = self(input_ids, attention_mask, labels)

        # metrics
        self.train_accuracy(outputs, labels.int())
        self.train_f1(outputs, labels.int())
        self.train_auroc(outputs, labels.int())
        self.log("train_loss", loss, prog_bar=True, logger=True)
        self.log("train_accuracy", self.train_accuracy, on_step=True, on_epoch=True)
        self.log("train_f1", self.train_f1, on_step=True, on_epoch=True)
        self.log("train_auroc", self.train_auroc, on_step=True, on_epoch=True)
        self.log("batch_size", input_ids.size(0), prog_bar=True, logger=True)
        self.train_outputs.append({"predictions": outputs, "labels": labels})

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"].squeeze(-1)
        loss, outputs = self(input_ids, attention_mask, labels)
        self.val_accuracy(outputs, labels.int())
        self.val_f1(outputs, labels.int())
        self.val_auroc(outputs, labels.int())
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_accuracy", self.val_accuracy, on_step=True, on_epoch=True)
        self.log("val_f1", self.val_f1, on_step=True, on_epoch=True)
        self.log("val_auroc", self.val_auroc, on_step=True, on_epoch=True)
        self.log("batch_size", input_ids.size(0), prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=BASE_LEARNING_RATE)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step"
            }
        }

    def on_train_epoch_end(self):
        labels = torch.cat([output["labels"] for output in self.train_outputs])
        predictions = torch.cat([output["predictions"] for output in self.train_outputs])
        class_roc_auc = self.train_auroc(predictions, labels.int())
        self.log('train_roc_auc', class_roc_auc, prog_bar=True, logger=True)
        self.train_outputs = []  # clear outputs for the next epoch

    # currently freezing top layers but we could try bottom
    def freeze_bert_layers(self, num_layers_to_freeze):
        for param in self.bert.parameters():
            param.requires_grad = True

        # Set requires_grad to False for the specified number of layers
        for i, layer in enumerate(self.bert.encoder.layer):
            if i < num_layers_to_freeze:
                for param in layer.parameters():
                    param.requires_grad = False


In [10]:
steps_per_epoch = len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * EPOCHS
warmup_percentage = WARMUP_EPOCHS / EPOCHS
warmup_steps = int(total_training_steps * warmup_percentage)
print("Warmup Steps:", warmup_steps)
print("Total Training Steps:", total_training_steps)

model = SocialBiasClassifier(
    n_warmup_steps=warmup_steps,
    n_training_steps=total_training_steps,
    num_layers_to_freeze=NUM_TOP_BERT_LAYERS_FREEZE,
    dropout_prob=DROPOUT_PROB
)
print("Model initialized")

Warmup Steps: 1295
Total Training Steps: 3700
Model initialized


In [11]:
from pytorch_lightning import Trainer

# Callbacks for selecting the best checkpoint
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='social-bias-model/',
    filename='best-checkpoint',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    mode='min'
)

# Logging for viewing in TensorBoard
logger = TensorBoardLogger('lightning_logs', name='social-bias-model')

# PL trainer with profiler
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=EPOCHS,
    log_every_n_steps=1,
    profiler="simple"
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
!rm -rf lightning_logs/
!rm -rf checkpoints/

In [13]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/social-bias-model

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.

In [14]:
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/chowder/anaconda3/envs/eshack/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/chowder/Documents/AiLearning/codingChallenges/ethicalSpecticalHackathon/theirNotebook/social-bias-model exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type           | Params | Mode 
----------------------------------------------------------
0 | bert           | BertModel      | 109 M  | eval 
1 | classifier     | Sequential     | 769    | train
2 | loss_fn        | BCELoss        | 0      | train
3 | train_accuracy | BinaryAccuracy | 0      | tr

Epoch 1: 100%|██████████| 186/186 [00:41<00:00,  4.47it/s, v_num=0, train_loss=0.278, batch_size=4.000, val_loss=0.442, train_roc_auc=0.710]



Epoch 4: 100%|██████████| 186/186 [00:42<00:00,  4.43it/s, v_num=0, train_loss=0.111, batch_size=15.80, val_loss=0.497, train_roc_auc=0.993]  


FIT Profiler Report

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Action                                                                                                                                                               	|  Mean duration (s)	|  Num calls      	|  Total time (s) 	|  Percentage %   	|
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Total                                                                                                                                                                	|  -       