In [2]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd


# Gen Dataset 

In [18]:
# Load the labelled examples for the binary classifier. The path is relative to
# the notebook's working directory. Later cells read the `text` and
# `binary_label` columns of this frame.
bin_df = pd.read_csv('../data/binary_classifier/binary_label_1-RAPPORT.csv')
# NOTE(review): this module-level `label` is not referenced again in the visible
# cells (the later gen_datasets call passes the literal 'rapport') — confirm it
# is still needed.
label = 'Rapport'

In [19]:
import torch
from transformers import AutoTokenizer 
from torch.utils.data import WeightedRandomSampler


def gen_sampler(dataset):
    """
    Build a class-balancing ``WeightedRandomSampler`` for a dataset.

    Every sample is weighted by ``num_samples / count(its own class)``, so
    examples from rarer classes are drawn proportionally more often.

    Args:
        dataset: Object exposing a ``data`` DataFrame that has a
            ``binary_label`` column.

    Returns:
        WeightedRandomSampler: Sampler that draws ``len(dataset.data)``
        samples per pass using the per-sample weights.
    """
    labels = dataset.data.binary_label
    num_samples = len(labels)

    # Look the counts up by label VALUE. ``value_counts()`` is ordered by
    # frequency, not by label, so indexing it positionally with the label
    # would swap the weights whenever label 1 is more common than label 0
    # (and would fail outright for float labels).
    class_counts = labels.value_counts()
    weights = (num_samples / labels.map(class_counts)).tolist()

    sampler = WeightedRandomSampler(
        torch.tensor(weights, dtype=torch.double), num_samples)
    return sampler
    
class binaryPersuasionDataset(Dataset):
    """
    Torch ``Dataset`` that tokenizes rows of a labelled text DataFrame.

    Args:
        data: DataFrame providing ``text`` and ``binary_label`` columns.
        label: Label name; kept on the instance as ``self.label``.
        tokenizer: Identifier handed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, data, label, tokenizer):
        # Rows are shuffled once, at construction time (no seed is set here).
        self.data = data.sample(frac=1)
        self.label = label
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        # Every example is padded/truncated to this many tokens.
        self.max_token_len = 512

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.data.shape[0]

    def __getitem__(self, index: int):
        """
        Tokenize the row at position ``index``.

        Returns:
            dict: ``comment_text`` (first 10 characters of the text),
            ``input_ids`` and ``attention_mask`` (flattened tensors), and
            ``binary_label`` (one-element float tensor).
        """
        row = self.data.iloc[index]
        text = row.text
        target = row.binary_label

        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_token_len,
            "return_token_type_ids": False,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        return {
            "comment_text": text[:10],
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "binary_label": torch.FloatTensor([target]),
        }

In [24]:
def gen_datasets(df, label, tokenizer):
    """
    Split a labelled DataFrame 80/20 and wrap each split in a dataset.

    Args:
        df: DataFrame to split; forwarded to ``train_test_split``.
        label: Label name forwarded to ``binaryPersuasionDataset``.
        tokenizer: Tokenizer identifier forwarded to
            ``binaryPersuasionDataset``.

    Returns:
        tuple: ``(train_ds, test_ds)`` built from the train and test splits.
    """
    # Fixed random_state keeps the split reproducible across runs.
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )

    # Build the training set from the training split only. Using the full
    # frame here would put every test row into the training data.
    train_ds = binaryPersuasionDataset(
        train_df,
        label,
        tokenizer
    )

    test_ds = binaryPersuasionDataset(
        test_df,
        label,
        tokenizer)
    return train_ds, test_ds

In [28]:
def gen_dataloaders(dataset, batch_size, test=None):
    """
    Wrap a dataset in a ``DataLoader``.

    Args:
        dataset: Dataset to load from.
        batch_size (int): Number of samples per batch.
        test: When truthy, no sampler is used; otherwise batches are drawn
            through the sampler returned by ``gen_sampler(dataset)``.

    Returns:
        DataLoader: Loader over ``dataset``.
    """
    # Build the sampler from the dataset that was passed in, not from a
    # notebook-level variable that may be undefined or stale.
    sampler = None if test else gen_sampler(dataset)

    dataloader = DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size
    )
    return dataloader

In [75]:
# Split the loaded frame into train/test datasets tokenized with
# 'bert-base-uncased'.
train_dataset, test_dataset = gen_datasets(bin_df, 'rapport', 'bert-base-uncased')
# Training loader: batches of 4; `test` is left unset, so gen_dataloaders takes
# its gen_sampler branch.
train_dataloader = gen_dataloaders(train_dataset, 4)
# Test loader: one example per batch, with no sampler (test=True).
test_dataloader = gen_dataloaders(test_dataset, 1, test=True)

In [76]:
import torch.nn as nn

# Classifier built on a pre-trained BERT-style encoder (the tokenizer is loaded by the dataset class)

class BertClassifier(nn.Module):

    """
    Linear classification head on top of a pre-trained transformer encoder.

    Args:
        name (str): Selects how the encoder output is pooled in ``forward``;
            must be ``'bert'`` or ``'distilbert'``.
        bert_model (str): Identifier passed to ``AutoModel.from_pretrained``.
        num_labels (int): The number of output labels.

    Attributes:
        name (str): The pooling selector given at construction.
        bert: The encoder returned by ``AutoModel.from_pretrained``.
        dropout (nn.Dropout): Dropout (p=0.1) applied to the pooled vector.
        classifier (nn.Linear): Maps the pooled vector to ``num_labels`` logits.

    """

    def __init__(self, name, bert_model, num_labels):
        super().__init__()
        self.name = name
        self.bert = AutoModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):

        """
        Run the encoder, pool its output and classify it.

        Args:
            input_ids (torch.Tensor): The input token IDs.
            attention_mask (torch.Tensor): The attention mask.

        Returns:
            torch.Tensor: The logits for each class.

        Raises:
            ValueError: If ``self.name`` is neither ``'bert'`` nor
                ``'distilbert'``; raised before the encoder is called.

        """
        if self.name not in ('bert', 'distilbert'):
            raise ValueError("Invalid model name")

        encoded = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        if self.name == 'bert':
            # 'bert' pools via the encoder's own pooler output.
            pooled = encoded.pooler_output
        else:
            # 'distilbert' pools by taking the hidden state at position 0.
            pooled = encoded.last_hidden_state[:, 0, :]

        return self.classifier(self.dropout(pooled))

# Set Hyperparameters

In [77]:
from torch.optim import lr_scheduler
from transformers import AutoModel
from tqdm.auto import tqdm


# Run on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 15
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy computed directly on raw logits

# NOTE(review): the name 'distilbert' only selects the pooling branch in
# BertClassifier.forward (hidden state at position 0); the checkpoint loaded is
# 'bert-base-uncased'. Confirm this pairing is intended.
model = BertClassifier('distilbert', 'bert-base-uncased', 2)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Train

In [100]:


def train_model(
        model, training_dataloader, num_epochs, device, optimizer, criterion):
    """
    Train a model for a fixed number of epochs with tqdm progress bars.

    Args:
        model: Module with a ``name`` attribute of ``'rnn'``, ``'bert'`` or
            ``'distilbert'``; ``'rnn'`` models must also expose ``embd``.
        training_dataloader: Sized iterable of batches providing
            ``'input_ids'``, ``'binary_label'`` and, for BERT-style models,
            ``'attention_mask'``.
        num_epochs (int): Number of passes over ``training_dataloader``.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer, zeroed and stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels.float())``.

    Returns:
        tuple: ``(avg_loss, model)``, where ``avg_loss`` is the mean batch
        loss of the last epoch that produced at least one batch (``0`` if no
        batch was ever processed).

    Raises:
        ValueError: If ``model.name`` is not a supported model name.
    """

    avg_loss = 0

    model.train()

    with tqdm(range(num_epochs), desc='Average Epoch Loss: ') as t:
        for e in range(num_epochs):
            epoch_loss = []

            with tqdm(range(len(training_dataloader)), desc='Loss: 0') as t2:
                for b, batch in enumerate(training_dataloader):

                    optimizer.zero_grad()

                    if model.name == 'rnn':
                        input_ids = batch['input_ids'].to(device)
                        labels = batch['binary_label'].to(device)
                        embeddings = model.embd(input_ids)
                        outputs = model(embeddings)

                    elif model.name in ['bert', 'distilbert']:
                        input_ids = batch['input_ids'].to(device)
                        attention_mask = batch['attention_mask'].to(device)
                        labels = batch['binary_label'].to(device)
                        outputs = model(input_ids, attention_mask)

                    else:
                        # Fail clearly instead of hitting an unbound
                        # `labels`/`outputs` a few lines further down.
                        raise ValueError("Invalid model name")

                    # NOTE(review): one_hot_encode is not defined in the
                    # visible cells; presumably provided elsewhere — confirm.
                    labels = one_hot_encode(labels)
                    loss = criterion(outputs, labels.float())
                    epoch_loss.append(loss.item())
                    loss.backward()
                    optimizer.step()

                    avg_epoch_loss = round(sum(epoch_loss)/len(epoch_loss), 4)
                    description = f'Epoch: {e} | '
                    description += f'Batch {b} | '
                    description += f'Average Loss: {avg_epoch_loss}'

                    t2.set_description(description)
                    t2.update()

            # An empty dataloader yields no losses; skip the average rather
            # than dividing by zero.
            if epoch_loss:
                # Record the epoch mean so the returned avg_loss is no longer
                # stuck at its initial 0.
                avg_loss = sum(epoch_loss)/len(epoch_loss)
                t.set_description(
                    f'Average Epoch Loss: {round(avg_loss,4)}')
            t.update()

    return avg_loss, model

In [102]:
# avg_loss, trained_model = train_model(model, train_dataloader, num_epochs, device, optimizer, criterion)

In [106]:
def eval_model(model, eval_loader, device, threshold=0.4):
    """
    Evaluate the performance of a model on the evaluation data.

    Args:
        model (torch.nn.Module): The model to evaluate. Must expose a
            ``name`` attribute; ``'rnn'`` models must also expose ``embd``.
        eval_loader (torch.utils.data.DataLoader): The data loader for
            the evaluation data. Batches provide ``'input_ids'``,
            ``'binary_label'`` and, for non-RNN models, ``'attention_mask'``.
        device (torch.device): The device to run the evaluation on.
        threshold (float): Sigmoid probability above which a label is
            predicted as positive. Defaults to 0.4.

    Returns:
        tuple: ``(true_labels, predicted_labels)``, two lists with one entry
        per evaluated sample: the one-hot encoded true label and the
        boolean thresholded prediction.
    """

    true_labels = []
    predicted_labels = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_loader):

            if model.name == 'rnn':
                input_ids = batch['input_ids'].to(device)
                labels = batch['binary_label'].to(device)
                embeddings = model.embd(input_ids)
                outputs = model(embeddings)
            else:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['binary_label'].to(device)
                outputs = model(input_ids, attention_mask)

            # Encode the LABELS, as train_model does. Assigning the encoded
            # labels to `outputs` would discard the model's logits and derive
            # the predictions from the ground truth.
            # NOTE(review): one_hot_encode is not defined in the visible
            # cells; presumably provided elsewhere — confirm.
            labels = one_hot_encode(labels)
            predicted_probs = torch.sigmoid(outputs)
            predicted_labels.extend(predicted_probs.cpu().numpy() > threshold)
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predicted_labels

In [108]:
# true_labels, predicted_labels = eval_model(model, test_dataloader, device)

In [109]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score

# Summary metrics over the evaluation results.
# NOTE(review): true_labels / predicted_labels are produced by the eval_model
# call, which is commented out above; that call must run first or this cell
# raises NameError.
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels,average='weighted', zero_division=True) # Assuming multilabel targets; averaged with average='weighted'
recall = recall_score(true_labels, predicted_labels, average='weighted',zero_division=True)  # Assuming multilabel targets; averaged with average='weighted'
f1 = f1_score(true_labels, predicted_labels, average='weighted',zero_division=True)  # Assuming multilabel targets; averaged with average='weighted'

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


NameError: name 'true_labels' is not defined

In [110]:
# Per-class precision/recall/F1 report for the evaluation results.
# NOTE(review): label_columns, true_labels and predicted_labels are not defined
# in the visible cells — they must exist in the kernel before this cell runs.
report = classification_report(
    true_labels, predicted_labels,
    target_names=label_columns,
    zero_division=True)  # label_columns supplies the class names shown in the report

print(report)

NameError: name 'true_labels' is not defined