# RoBERTa

In [38]:
# Optional one-time environment setup: uncomment the lines below to install the
# third-party packages this notebook depends on.
#!pip install tensorflow
#!pip install keras
#!pip install torch
#!pip install transformers

In [26]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk
import string
import re

import warnings
warnings.filterwarnings("ignore")

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hugging Face Transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from keras.preprocessing.sequence import pad_sequences

In [22]:
# Load dataset
# Read the cleaned spreadsheet (first column is the index) and force the text
# column to plain strings so every row can be handed to the tokenizer.
df = pd.read_excel('Data/df_complete_cleansing.xlsx', index_col=0)
df['Content'] = df['Content'].astype(str)

# Model inputs (raw text) and targets, as parallel arrays.
sentences, labels = df['Content'].values, df['NegoOutcomeLabel'].values

In [23]:
# Tokenizer setup
# Load the tokenizer published with the 'roberta-base' checkpoint, i.e. the same
# checkpoint name the classification model is created from later in the notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [24]:
# Tokenize and encode inputs
MAX_LEN = 512  # total sequence length fed to the model, special tokens included

# Tokenize every document exactly once; the token lists are reused below for
# the id conversion instead of running the tokenizer a second time.
tokenized_texts = [tokenizer.tokenize(sentence) for sentence in sentences]

# Keep at most MAX_LEN - 2 tokens so the two special tokens still fit, then
# wrap each sequence as <cls> tokens... <sep>.
input_ids = [
    [tokenizer.cls_token_id]
    + tokenizer.convert_tokens_to_ids(tokens[:MAX_LEN - 2])
    + [tokenizer.sep_token_id]
    for tokens in tokenized_texts
]

# Pad sequences
# NOTE: pad_sequences fills with 0 by default. That is not RoBERTa's <pad> id
# (the model summary reports padding_idx=1), so padded positions can only be
# recognised by position and must be excluded through the attention mask,
# never by comparing token values.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

In [28]:
# Create attention masks
# A mask entry is 1.0 for a real token and 0.0 for padding.
#
# The mask must NOT be derived from `token_id > 0`: for roberta-base the leading
# <s> token has id 0, so that test would hide the very token the classification
# head reads. Every encoded row ends with </s> right before its padding, so the
# real tokens are exactly the positions up to and including the last </s>.
# This holds whatever fill value was used for padding.
ids_matrix = np.asarray(input_ids)
seq_len = ids_matrix.shape[1]

# Index of the last </s> in each row, found by scanning the reversed row.
# A row without any </s> falls back to "attend to everything".
is_sep = ids_matrix == tokenizer.sep_token_id
last_real_pos = seq_len - 1 - np.argmax(is_sep[:, ::-1], axis=1)

# Same container type as before: a list of per-sequence lists of floats.
attention_masks = (
    (np.arange(seq_len)[None, :] <= last_real_pos[:, None]).astype(float).tolist()
)

In [29]:
# Split into train and validation sets
# A single stratified 90/10 split over ids, masks and labels together, so the
# three outputs are partitioned by the same row selection.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)

In [30]:
# Convert to PyTorch tensors
# Each split is rebound to its tensor form, one train/validation pair per line.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [32]:
# DataLoader setup
batch_size = 16

# Bundle (input ids, attention mask, label) triples for each split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

In [33]:
# Device setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [34]:
# Load RoBERTa model
# Pretrained encoder plus a two-way classification head, moved onto the
# selected device. The head is newly initialised and must be trained.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [35]:
# Optimizer
# NOTE(review): `AdamW` is taken from transformers here; upstream recommends
# torch.optim.AdamW instead — confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=5e-5)

# The linear schedule decays the learning rate to zero over
# `num_training_steps`. The step budget therefore has to cover every epoch that
# is actually run; a shorter budget leaves the remaining epochs training with a
# learning rate of 0.
NUM_EPOCHS = 10  # keep equal to the epoch count used by the training loop
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * NUM_EPOCHS,
)

# Loss function with class weights to handle imbalance
# Each class is weighted by the inverse of its frequency in `labels`, so the
# rarer class contributes more per example.
from torch.nn import CrossEntropyLoss
class_counts = np.bincount(labels)
class_weights = torch.tensor([1.0 / count for count in class_counts], dtype=torch.float).to(device)
loss_fn = CrossEntropyLoss(weight=class_weights)

In [36]:
# Accuracy computation
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader`` in percent.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` attribute of per-class scores.
        data_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A 0-dim float tensor on ``device`` holding the accuracy in [0, 100].
        If ``data_loader`` yields no examples the result is NaN, because the
        accuracy is undefined.

    The model is switched to evaluation mode for the duration of the call, so
    dropout cannot perturb the predictions, and its previous train/eval mode is
    restored afterwards.
    """
    was_training = model.training
    model.eval()

    # Tensor accumulator: stays on `device` and keeps the final division valid
    # even when no batch was seen.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    try:
        with torch.no_grad():
            for input_ids, input_mask, labels in data_loader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                labels = labels.to(device)

                logits = model(input_ids, attention_mask=input_mask).logits
                predicted_labels = logits.argmax(dim=1)

                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return correct_pred.float() / num_examples * 100

In [37]:
# Training loop
import time

start_time = time.time()
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):
    model.train()

    for batch_idx, batch in enumerate(train_dataloader):
        # Batch-local names on purpose: rebinding the notebook-level
        # `input_ids` / `labels` arrays here would break re-running the
        # earlier cells that read them (class weights, train/validation split).
        batch_input_ids, input_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        outputs = model(batch_input_ids, attention_mask=input_mask)
        logits = outputs.logits

        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimizer step

        if batch_idx % 250 == 0:
            print(f'Epoch: {epoch + 1:04d}/{NUM_EPOCHS:04d} | Batch {batch_idx:04d}/{len(train_dataloader):04d} | Loss: {loss.item():.4f}')

    # End-of-epoch evaluation with dropout disabled; compute_accuracy already
    # runs under torch.no_grad().
    model.eval()
    print(f'Training Accuracy: {compute_accuracy(model, train_dataloader, device):.2f}%')
    print(f'Validation Accuracy: {compute_accuracy(model, validation_dataloader, device):.2f}%')

print(f'Total Training Time: {(time.time() - start_time) / 60:.2f} min')

Epoch: 0001/0001 | Batch 0000/0132 | Loss: 0.6828


KeyboardInterrupt: 