In [16]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# Load the dataset
df = pd.read_csv('gold_data_annotated_hyperbole.tsv', sep='\t')

# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tagged_comment'].tolist(),
    df['hyperbole_label'].tolist(),
    test_size=0.2,
    random_state=42
)
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:


# Create a custom dataset
class HyperboleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create DataLoaders
train_dataset = HyperboleDataset(train_texts, train_labels, tokenizer)
val_dataset = HyperboleDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [24]:
from collections import Counter

# Function to calculate and print label distribution
def print_label_distribution(labels, dataset_name):
    counter = Counter(labels)
    total = len(labels)
    print(f"\n{dataset_name} Dataset Distribution:")
    for label, count in counter.items():
        percentage = (count / total) * 100
        print(f"Label {label}: {count} ({percentage:.2f}%)")

# Print distributions
print_label_distribution(df['hyperbole_label'].tolist(), "Full")
print_label_distribution(train_labels, "Training")
print_label_distribution(val_labels, "Validation")


Full Dataset Distribution:
Label 1: 363 (24.22%)
Label 0: 1136 (75.78%)

Training Dataset Distribution:
Label 0: 898 (74.90%)
Label 1: 301 (25.10%)

Validation Dataset Distribution:
Label 0: 238 (79.33%)
Label 1: 62 (20.67%)


In [32]:
import time
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

def compute_metrics(true_labels, predictions):
    accuracy = accuracy_score(true_labels, predictions)
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    weighted_f1 = f1_score(true_labels, predictions, average='weighted')
    return accuracy, macro_f1, weighted_f1

# Training loop
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

num_epochs = 3
print_every = 100  # Print stats every 100 batches

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    start_time = time.time()

    for batch_idx, batch in enumerate(train_loader, 1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if batch_idx % print_every == 0:
            avg_loss = total_loss / batch_idx
            elapsed = time.time() - start_time
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, "
                  f"Avg Loss: {avg_loss:.4f}, Time: {elapsed:.2f}s")

            # Quick validation check
            model.eval()
            val_predictions = []
            val_true_labels = []

            with torch.no_grad():
                for val_batch in val_loader:
                    val_input_ids = val_batch['input_ids'].to(device)
                    val_attention_mask = val_batch['attention_mask'].to(device)
                    val_labels = val_batch['labels'].to(device)

                    val_outputs = model(val_input_ids, attention_mask=val_attention_mask)
                    _, val_preds = torch.max(val_outputs.logits, dim=1)

                    val_predictions.extend(val_preds.cpu().tolist())
                    val_true_labels.extend(val_labels.cpu().tolist())

            val_accuracy, val_macro_f1, val_weighted_f1 = compute_metrics(val_true_labels, val_predictions)
            print(f"Quick Validation Accuracy: {val_accuracy:.4f}")
            print(f"Quick Validation Macro F1: {val_macro_f1:.4f}")
            print(f"Quick Validation Weighted F1: {val_weighted_f1:.4f}")

            model.train()  # Set the model back to training mode

    # End of epoch
    avg_epoch_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average Loss: {avg_epoch_loss:.4f}")

    # Full validation at the end of each epoch
    model.eval()
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    val_accuracy, val_macro_f1, val_weighted_f1 = compute_metrics(val_true_labels, val_predictions)
    print(f"End of Epoch {epoch+1}/{num_epochs}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Macro F1: {val_macro_f1:.4f}")
    print(f"Validation Weighted F1: {val_weighted_f1:.4f}")

# ... (rest of the code remains the same)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 completed. Average Loss: 0.5392
End of Epoch 1/3
Validation Accuracy: 0.7967
Validation Macro F1: 0.4591
Validation Weighted F1: 0.7098
Epoch 2/3 completed. Average Loss: 0.4236
End of Epoch 2/3
Validation Accuracy: 0.8167
Validation Macro F1: 0.6231
Validation Weighted F1: 0.7815
Epoch 3/3 completed. Average Loss: 0.2331
End of Epoch 3/3
Validation Accuracy: 0.7900
Validation Macro F1: 0.6817
Validation Weighted F1: 0.7906


In [33]:
import json
import os
from transformers import BertTokenizer, BertForSequenceClassification

# ... (previous code remains the same)

# After training is complete

# Define the directory to save the model
save_directory = 'hyperbole_model_nfl'

# Create the directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save additional information
model_info = {
    'model_name': 'BERT for Hyperbole Detection',
    'base_model': 'bert-base-uncased',
    'num_labels': 2,
    'training_params': {
        'num_epochs': num_epochs,
        'batch_size': 16,  # Adjust if you've changed this
        'learning_rate': 2e-5,  # Adjust if you've changed this
    },
    'performance': {
        'final_validation_accuracy': val_accuracy
    }
}

with open(os.path.join(save_directory, 'model_info.json'), 'w') as f:
    json.dump(model_info, f, indent=2)

print(f"Model saved to {save_directory}")

# To load the model later, you can use:
# loaded_model = BertForSequenceClassification.from_pretrained('hyperbole_model_nfl')
# loaded_tokenizer = BertTokenizer.from_pretrained('hyperbole_model_nfl')

Model saved to hyperbole_model_nfl
