In [1]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [3]:
# Load the HYPO-L dataset by its identifier
DATASET_ID = "tasksource/HYPO-L"
dataset = load_dataset(DATASET_ID)

# Instantiate the tokenizer that belongs to the uncased base BERT checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [12]:
# Tokenization callback handed to `dataset.map`
def tokenize_function(examples):
    """Encode the ``sentence`` field of ``examples`` with the notebook's tokenizer.

    Each encoding is truncated and padded to exactly 128 tokens, so every row
    ends up with the same length.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(examples['sentence'], **encoding_options)

# Tokenize in batches, drop the raw text now that it is encoded, and expose the
# target column under the `labels` name.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['sentence'])
    .rename_column('label', 'labels')
)
tokenized_datasets.set_format('torch')

# Only the 'train' split is carried forward from here; it is materialised as a
# pandas DataFrame so scikit-learn can split it.
full_dataset = tokenized_datasets['train'].to_pandas()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
train_data, test_data = train_test_split(
    full_dataset,
    test_size=0.2,
    random_state=42,
)

# Convert back to PyTorch datasets
def _frame_to_tensor_dataset(frame):
    """Pack the encoded columns of ``frame`` into a ``TensorDataset``.

    Args:
        frame: DataFrame holding ``input_ids``, ``attention_mask`` and
            ``labels`` columns. The two sequence columns are expected to have
            the same length in every row (the tokenizer pads to a fixed size).

    Returns:
        A ``TensorDataset`` whose items are ``(input_ids, attention_mask,
        labels)`` triples of int64 tensors.
    """
    def column_to_tensor(name):
        # Collapse the column into one contiguous numpy array before handing
        # it to torch. Presumably `to_pandas()` stores each row of a sequence
        # column as a numpy array (TODO confirm); feeding torch.tensor a
        # Python list of arrays is slow and triggers a UserWarning, whereas a
        # single ndarray converts directly.
        return torch.tensor(np.array(frame[name].tolist()), dtype=torch.long)

    return TensorDataset(
        column_to_tensor('input_ids'),
        column_to_tensor('attention_mask'),
        column_to_tensor('labels'),
    )


# Same conversion for both partitions, so they cannot drift apart
train_dataset = _frame_to_tensor_dataset(train_data)
test_dataset = _frame_to_tensor_dataset(test_data)

In [13]:
# Pretrained BERT encoder topped with a two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Set up training parameters
batch_size = 16
epochs = 3

# Seed torch's global RNG so the shuffle order drawn by RandomSampler and the
# dropout masks are repeatable from run to run (the data split is already
# seeded with the same value).
torch.manual_seed(42)

# Pick the best available backend: CUDA GPU, then Apple-silicon MPS, then CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)
model.to(device)

# Create data loaders: shuffled batches for training, fixed order for evaluation
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Set up optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers class used by default
# so the hyperparameters stay aligned with the earlier runs.
# NOTE(review): the import cell still pulls AdamW from transformers; confirm
# whether the installed transformers release still exports it.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



mps


In [15]:
# Fine-tune for `epochs` passes, scoring the held-out split after each pass
for epoch in range(epochs):
    # ---- optimisation pass over the shuffled training batches ----
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # Supplying labels makes the forward pass return the loss as well
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # ---- evaluation pass: collect predicted and reference classes ----
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # The highest-scoring logit is taken as the predicted class
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        predictions,
        average='binary',
    )

    print(f"Epoch {epoch+1}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")

# Write the model as it stands after the final epoch, plus the tokenizer files,
# into the same directory
output_dir = './hyperbole_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Epoch 1
Accuracy: 0.74
Precision: 0.75
Recall: 0.35
F1-score: 0.48
Epoch 2
Accuracy: 0.75
Precision: 0.60
Recall: 0.71
F1-score: 0.65
Epoch 3
Accuracy: 0.76
Precision: 0.70
Recall: 0.52
F1-score: 0.60


('./hyperbole_model/tokenizer_config.json',
 './hyperbole_model/special_tokens_map.json',
 './hyperbole_model/vocab.txt',
 './hyperbole_model/added_tokens.json')