<a href="https://colab.research.google.com/github/541DeepLearning-Group8/baselines/blob/main/ordinal_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers scikit-learn -q

In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from google.colab import drive

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

In [None]:
# Mount Google Drive so the project CSVs under MyDrive are readable.
drive.mount('/content/drive')

DATA_DIR = '/content/drive/MyDrive/541project'


def _load_split(csv_path):
    """Read one split and return its comments, ratings and 0-based labels.

    Args:
        csv_path: Path to a CSV with 'Clean Comments' and 'Rating' columns.

    Returns:
        A new DataFrame with columns 'Clean Comments', 'Rating' and 'label'.
        Rows missing either source column are dropped, and
        ``label = Rating - 1`` is stored as an integer.

    Raises:
        ValueError: If any label falls outside 0..4, i.e. outside the five
            classes the classifier in this notebook is built with.
    """
    split = pd.read_csv(csv_path)[['Clean Comments', 'Rating']].dropna()
    # .assign builds a fresh frame, so the new column is never written into a
    # filtered selection (avoids SettingWithCopyWarning). The integer cast keeps
    # labels integral even when pandas parsed 'Rating' as float.
    split = split.assign(label=(split['Rating'] - 1).astype(int))
    # Fail here with a readable message instead of deep inside the loss.
    if not split['label'].between(0, 4).all():
        raise ValueError(
            f"{csv_path}: 'Rating' must be in 1..5 so that labels fall in 0..4"
        )
    return split


train_df = _load_split(f'{DATA_DIR}/train.csv')
val_df = _load_split(f'{DATA_DIR}/val.csv')
test_df = _load_split(f'{DATA_DIR}/test.csv')

In [None]:
class CommentDataset(Dataset):
    """Dataset that tokenizes every comment once, at construction time.

    Each item is a dict holding one row of every field returned by the
    tokenizer plus a 'labels' entry with that row's class index.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        """Tokenize ``texts`` and store them next to ``labels``.

        Args:
            texts: Comment strings, passed to the tokenizer in a single call.
            labels: One integer class index per text.
            tokenizer: Callable invoked with ``truncation=True``,
                ``padding='max_length'``, ``max_length=max_len`` and
                ``return_tensors='pt'``; must return a mapping whose values
                are indexable by row.
            max_len: Length every sequence is padded or truncated to.

        Raises:
            ValueError: If any tokenizer field has a different number of rows
                than ``labels``.
        """
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)
        # __len__ is driven by the labels, so a mismatch would otherwise show up
        # only as silent truncation or a late IndexError during iteration.
        for key, rows in self.encodings.items():
            if len(rows) != len(self.labels):
                raise ValueError(
                    f"tokenizer field '{key}' has {len(rows)} rows "
                    f"but {len(self.labels)} labels were given"
                )

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoded field, plus its label."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

# Name of the pretrained checkpoint the tokenizer is loaded from.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


def _to_dataset(frame):
    """Wrap one split's comments and labels in a CommentDataset."""
    comments = frame['Clean Comments'].tolist()
    targets = frame['label'].tolist()
    return CommentDataset(comments, targets, tokenizer)


train_dataset, val_dataset, test_dataset = (
    _to_dataset(frame) for frame in (train_df, val_df, test_df)
)

# Only the training batches are shuffled; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=16) for dataset in (val_dataset, test_dataset)
)


In [None]:
# BERT with a 5-way classification head, moved onto the selected device.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# The schedule is sized for 3 passes over train_loader, with no warmup steps.
num_training_steps = 3 * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
def train(model, dataloader=None, optim=None, scheduler=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(**batch)``; its output must expose
            ``.loss``.
        dataloader: Iterable of dict batches whose values support ``.to()``.
            Defaults to the notebook-level ``train_loader``.
        optim: Optimizer stepped once per batch. Defaults to the
            notebook-level ``optimizer``.
        scheduler: LR scheduler stepped once per batch, right after the
            optimizer. Defaults to the notebook-level ``lr_scheduler``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    # Fall back to the notebook-level objects so existing ``train(model)``
    # calls behave exactly as before.
    dataloader = train_loader if dataloader is None else dataloader
    optim = optimizer if optim is None else optim
    scheduler = lr_scheduler if scheduler is None else scheduler

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optim.step()
        scheduler.step()
        # Clear gradients so the next batch does not accumulate onto this one.
        optim.zero_grad()
        total_loss += loss.item()
        num_batches += 1

    # Counting batches here avoids an opaque ZeroDivisionError and does not
    # require the dataloader to implement __len__.
    if num_batches == 0:
        raise ValueError("train() received a dataloader that yielded no batches")
    return total_loss / num_batches

In [None]:
def evaluate(model, dataloader, name="Validation"):
    """Print a classification report for ``model`` over ``dataloader``.

    Args:
        model: Module called as ``model(**batch)``; its output must expose
            ``.logits``.
        dataloader: Iterable of dict batches that include a 'labels' entry.
        name: Split name shown in the progress bar and the report header.

    Returns:
        None. The report is printed.
    """
    model.eval()
    predicted, expected = [], []
    progress = tqdm(dataloader, desc=f"Evaluating {name}")
    # Gradients are not needed for inference.
    with torch.no_grad():
        for raw_batch in progress:
            inputs = {field: tensor.to(device) for field, tensor in raw_batch.items()}
            scores = model(**inputs).logits
            # The predicted class is the index of the largest logit.
            predicted.extend(scores.argmax(dim=-1).tolist())
            expected.extend(inputs['labels'].tolist())
    print(f"{name} Classification Report:")
    print(classification_report(expected, predicted, digits=4))

In [None]:
EPOCHS = 3

# The LR scheduler was built for a fixed number of optimizer steps and is
# stepped once per batch. If EPOCHS no longer matches that horizon, the linear
# decay reaches zero too early or never finishes, so fail loudly here.
planned_steps = EPOCHS * len(train_loader)
if planned_steps != num_training_steps:
    raise ValueError(
        f"EPOCHS={EPOCHS} implies {planned_steps} optimizer steps, but the LR "
        f"scheduler was created for {num_training_steps}; update both together."
    )

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    loss = train(model)
    print(f"Training Loss: {loss:.4f}")
    # Report validation metrics after every epoch.
    evaluate(model, val_loader)

In [None]:
# Report metrics on the test split using the trained model.
evaluate(model, dataloader=test_loader, name="Test")