In [16]:
import re
import os 
import pickle 

train_root_folder = '/Users/rt853/repos/UoB/bath-persuasion-detection/data/processed/dataloaders/training'
test_root_folder = '/Users/rt853/repos/UoB/bath-persuasion-detection/data/processed/dataloaders/testing'

def init_loader_map():
    loader_map = {
        'dataloaders' : {
            'training_path' : None,
            'testing_path' : None},
        'model' : None,
        'embeddings' : None,
        'sampled' : None,
        'drop_labels' : None
        }
    return loader_map

def update_dic(trainloader_path, test_loader_path, root):
    
    loader_map = init_loader_map()
    loader_map['dataloaders']['training_path'] = trainloader_path
    loader_map['dataloaders']['testing_path'] = test_loader_path
    
    if root.split('_')[0] == 'TODBERT':
        loader_map['embeddings'] = 'TODBERT/TOD-BERT-JNT-V1'
        loader_map['model'] = 'todbert'
    else:
        loader_map['embeddings'] = root.split('_')[0]
        loader_map['model'] = loader_map['embeddings'].split('-')[0]
    loader_map['sampled'] = root.split('_')[2]
    loader_map['drop_labels'] = root.split('_')[5]
    return loader_map

def update_metadata(train_path, test_paths):
    
    for i in test_paths:
        root = train_path.split('_')[1:]
        root = '_'.join(root)[:-4]
    
        if re.search(root, i):
            trainloader_path = f'{train_root_folder}{train_path}'
            test_loader_path = f'{test_root_folder}{i}'
            loader_map = update_dic(trainloader_path, test_loader_path, root)
            break
    
    return loader_map
    
def gen_loader_map(train_root_folder, test_root_folder):
    

    experiment_confs = []
    test_paths = os.listdir(test_root_folder)
    for path in os.listdir(train_root_folder):
        loader_map = update_metadata(path, test_paths)
        experiment_confs.append(loader_map)
        
    return experiment_confs

experiments = gen_loader_map(train_root_folder, test_root_folder)


UnboundLocalError: cannot access local variable 'loader_map' where it is not associated with a value

In [10]:
import torch.nn as nn
import pandas as pd 
import torch
from torch.utils.data import Dataset
import numpy as np 
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModel, AutoTokenizer

# Load pre-trained BERT model and tokenizer
class PersuasionStrategyDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: AutoTokenizer,
        max_token_len
            ):

        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        self.LABEL_COLUMNS = data.columns.tolist()[1:]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):

        data_row = self.data.iloc[index]
        comment_text = data_row.text
        labels = data_row[self.LABEL_COLUMNS]
        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
            )

        return dict(
            comment_text=comment_text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels)
        )

class BertClassifier(nn.Module):

    """
    BERT-based classifier model.

    Args:
        bert_model (str): The pre-trained BERT model to use.
        num_labels (int): The number of output labels.

    Attributes:
        bert (BertModel): The BERT model.
        dropout (nn.Dropout): Dropout layer for regularization.
        classifier (nn.Linear): Linear layer for classification.

    """

    def __init__(self, name, bert_model, num_labels):
        super(BertClassifier, self).__init__()
        self.name = name
        self.bert = AutoModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):

        """
        Forward pass of the BERT classifier.

        Args:
            input_ids (torch.Tensor): The input token IDs.
            attention_mask (torch.Tensor): The attention mask.

        Returns:
            torch.Tensor: The logits for each class.

        """
        if self.name == 'bert':
            outputs = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)
            pooled_output = outputs.pooler_output

        elif self.name == 'distilbert':
            outputs = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)
            pooled_output = outputs.last_hidden_state[:, 0, :]
            
        else:
            raise ValueError("Invalid model name")

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


In [11]:
from tqdm.auto import tqdm

def train_model(
        model, training_dataloader, num_epochs, device, optimizer, criterion):

    avg_loss = 0

    model.train()

    with tqdm(range(num_epochs), desc='Average Epoch Loss: ') as t:
        for e in range(num_epochs):
            epoch_loss = []

            with tqdm(range(len(training_dataloader)), desc='Loss: 0') as t2:
                for b, batch in enumerate(training_dataloader):

                    optimizer.zero_grad()

                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask)

                    loss = criterion(outputs, labels.float())
                    epoch_loss.append(loss.item())
                    loss.backward()
                    optimizer.step()

                    avg_epoch_loss = round(sum(epoch_loss)/len(epoch_loss), 4)
                    description = f'Epoch: {e} | '
                    description += f'Batch {b} | '
                    description += f'Average Loss: {avg_epoch_loss}'

                    t2.set_description(description)
                    t2.update()

            t.set_description(
                f'Average Epoch Loss: \
                    {round(sum(epoch_loss)/len(epoch_loss), 4)}')
            t.update()

    return avg_loss, model


In [12]:
import torch 
import dill as pickle 

def init_model(config, label_columns):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss for multilabel classification

    model = BertClassifier('distilbert', config.get('embeddings'), len(label_columns))
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    return model, device, criterion, optimizer

def fetch_loaders(config):
    with open(config['dataloaders']['training_path'], 'rb') as f:
        train_loader = pickle.load(f)
    
    with open(config['dataloaders']['testing_path'], 'rb') as f:
        test_loader = pickle.load(f)
    
    return train_loader, test_loader


In [13]:
def eval_model(model, eval_loader, device):
    """
    Evaluate the performance of a model on the evaluation data.

    Args:
        model (torch.nn.Module): The model to evaluate.
        eval_loader (torch.utils.data.DataLoader): The data loader for
        the evaluation data.
        device (torch.device): The device to run the evaluation on.

    Returns:
        tuple: A tuple containing the true labels and predicted labels.
    """

    true_labels = []
    predicted_labels = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_loader):

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)

            predicted_probs = torch.sigmoid(outputs)
            predicted_labels.extend(predicted_probs.cpu().numpy() > 0.4)
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predicted_labels

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def gen_stats(true_labels, predicted_labels, labels=None):

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(
        true_labels, predicted_labels, average='weighted', zero_division=True)
    recall = recall_score(
        true_labels, predicted_labels, average='weighted', zero_division=True)
    f1 = f1_score(
        true_labels, predicted_labels, average='weighted', zero_division=True)
    report = classification_report(
        true_labels, predicted_labels, target_names=labels, zero_division=True)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall, 'f1': f1,
        'report': report
    }


In [14]:
import json

def gen_result_path(config, root_folder):
    
    path = f'{root_folder}{config["model"]}'
    path += f'_{config["sampled"]}_{config["drop_labels"]}.json'
    return path
    
def save_results(results, config, root_folder):
    file_path = gen_result_path(config, root_folder)
    with open(file_path, 'w') as f:
        json.dump(results, f)


In [15]:
num_epochs = 10

for e in experiments:
    
    print(f'Model : {e.get("model")} \nEmbeddings : {e.get("embeddings")}\nSampled : {e.get("sampled")}\nDrop Labels : {e.get("drop_labels")}\n')
    print('---' * 20 + '\n')
    if e.get('model') == 'todbert':
        continue 
    training_dataloader, testing_dataloader = fetch_loaders(e)
    model, device, num_epochs, criterion, optimizer = init_model(e, training_dataloader.dataset.LABEL_COLUMNS)
    trained_model = train_model(model, training_dataloader, num_epochs, device, optimizer, criterion)
    true_labels, predicted_labels = eval_model(trained_model, testing_dataloader, device)
    stats = gen_stats(true_labels, predicted_labels, testing_dataloader.dataset.LABEL_COLUMNS)
    save_results(stats, e, './')


NameError: name 'experiments' is not defined