Reference: https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import pandas as pd

In [2]:
train_data = pd.read_csv('/kaggle/input/aalto-snlp-2024/aalto-snlp-course-competition-2024/train_2024.csv', quoting=3)
eval_data = pd.read_csv('/kaggle/input/aalto-snlp-2024/aalto-snlp-course-competition-2024/test_2024.csv', quoting=3)
dev_data = pd.read_csv('/kaggle/input/aalto-snlp-2024/aalto-snlp-course-competition-2024/dev_2024.csv', quoting=3)

In [3]:
train_texts = train_data['text'].tolist()
dev_texts = dev_data['text'].tolist()
eval_texts = eval_data['text'].tolist()

train_labels = train_data['label'].tolist()
dev_labels = dev_data['label'].tolist()
eval_labels = eval_data['label'].tolist()

In [4]:
train_texts.extend(dev_texts)
train_labels.extend(dev_labels)

In [5]:
X = train_texts
y = train_labels

In [6]:

len(train_texts), len(train_labels), len(eval_texts), len(eval_labels)

(110000, 110000, 12001, 12001)

In [7]:
import numpy as np
num_words = list(map(len, [texts.split() for texts in train_texts]))
np.quantile(num_words, .9)

98.0

In [8]:
class TextClassificationDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
  def __len__(self):
        return len(self.texts)
  def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [9]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [10]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [11]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return f1_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [12]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 0 else "negative"

In [13]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 174
batch_size = 64
num_epochs = 4
learning_rate = 2e-5

In [14]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
device

device(type='cuda')

In [18]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [19]:
from copy import deepcopy

best_model = None
best_score = 0
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    f1, report = evaluate(model, val_dataloader, device)
    print(f"Validation F1: {f1:.4f}")
    print(report)
    if f1 > best_score:
        best_score = f1
        best_model = deepcopy(model)

Epoch 1/4
Validation F1: 0.9324
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     13961
           1       0.91      0.95      0.93      8039

    accuracy                           0.95     22000
   macro avg       0.94      0.95      0.95     22000
weighted avg       0.95      0.95      0.95     22000

Epoch 2/4
Validation F1: 0.9302
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     13961
           1       0.90      0.96      0.93      8039

    accuracy                           0.95     22000
   macro avg       0.94      0.95      0.94     22000
weighted avg       0.95      0.95      0.95     22000

Epoch 3/4
Validation F1: 0.9320
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     13961
           1       0.92      0.94      0.93      8039

    accuracy                           0.95     22000
   macro avg       0.94      0.9

In [20]:
model = best_model
torch.save(model.state_dict(), "bert_classifier_v2_174.pth")

In [21]:
max_length, batch_size, num_epochs, learning_rate

(174, 64, 4, 2e-05)

In [22]:
scheduler.__dict__

{'optimizer': AdamW (
 Parameter Group 0
     betas: (0.9, 0.999)
     correct_bias: True
     eps: 1e-06
     initial_lr: 2e-05
     lr: 0.0
     weight_decay: 0.0
 ),
 'lr_lambdas': [functools.partial(<function _get_linear_schedule_with_warmup_lr_lambda at 0x7c3399114790>, num_warmup_steps=0, num_training_steps=5500)],
 'base_lrs': [2e-05],
 'last_epoch': 5500,
 'verbose': False,
 '_step_count': 5501,
 '_get_lr_called_within_step': False,
 '_last_lr': [0.0]}

In [23]:
f1, report = evaluate(model, train_dataloader, device)
print(f"Training F1 Score: {f1:.4f}")
print(report)

Training F1 Score: 0.9563
              precision    recall  f1-score   support

           0       0.99      0.96      0.97     55691
           1       0.94      0.98      0.96     32309

    accuracy                           0.97     88000
   macro avg       0.96      0.97      0.97     88000
weighted avg       0.97      0.97      0.97     88000



In [24]:
f1, report = evaluate(model, val_dataloader, device)
print(f"Validation F1 Score: {f1:.4f}")
print(report)

Validation F1 Score: 0.9324
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     13961
           1       0.91      0.95      0.93      8039

    accuracy                           0.95     22000
   macro avg       0.94      0.95      0.95     22000
weighted avg       0.95      0.95      0.95     22000



In [25]:
eval_preds = []
for txt in eval_texts:
  pred = predict_sentiment(txt, model, tokenizer, device)
  if pred == 'positive':
    eval_preds.append(0)
  else:
    eval_preds.append(1)

In [1]:
eval_data_ids = eval_data['id']
eval_set_ids = pd.DataFrame({'id': eval_data_ids})

preds_df = pd.DataFrame({'label': eval_preds})

final_output = pd.concat([eval_set_ids, preds_df], axis=1)
final_output.to_csv('/kaggle/working/submission.csv',index=False)

NameError: name 'eval_data' is not defined