In [None]:
# Environment setup: upgrade pip first, then install third-party packages via shell escapes.
!pip3 install --upgrade pip
!pip install wordfreq
!pip install nltk
# Modeling stack: Hugging Face transformers/datasets, PyTorch, and accelerate.
!pip install transformers datasets torch accelerate
# pyarrow is imported below (pa / ipc / ds aliases).
!pip install pyarrow

In [None]:
import os
import re
import glob
import random
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc
import pyarrow.dataset as ds
from datasets import load_dataset
from datasets import load_from_disk

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset folder referenced later is reachable.
from google.colab import drive
drive.mount('/content/drive')

Loading in the datasets! 😍


In [None]:
# Folder on the mounted Drive that holds the saved splits.
# Adjust this to your own Drive layout before running.
dataset_folder = '/content/drive/MyDrive/CDS/paper-moderation-SP25'

# Restore the three splits (new_train.arrow, new_val.arrow, new_test.arrow)
# from that folder in a single unpacking expression.
train_dataset, val_dataset, test_dataset = (
    load_from_disk(f"{dataset_folder}/new_{split}.arrow")
    for split in ("train", "val", "test")
)

Sanity Check!! 😤


In [None]:
# Convert each split to a pandas DataFrame for a quick visual sanity check.
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()

# A notebook cell only renders its final expression, so bare `.head()` calls
# in the middle of a cell are silently discarded. Call display() explicitly
# (available by default in IPython/Colab kernels) so all three previews show.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    print(f"{split_name}: {split_df.shape[0]} rows x {split_df.shape[1]} columns")
    display(split_df.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from datasets import Dataset
from transformers import T5Tokenizer, T5Model, TrainingArguments, Trainer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

This is our preprocess function! 💕 💞

In [None]:
# Shared tokenizer for the FLAN-T5 small checkpoint; preprocess_function reads it as a global.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
def preprocess_function(examples):
    """Turn a batch of papers into tokenized prompt/target pairs.

    Args:
        examples: Batch mapping with a "text" column (paper text) and a
            "labels" column (1 for arXiv, 0 for viXra), in the shape that
            `Dataset.map(..., batched=True)` passes in.

    Returns:
        dict with three parallel lists:
          * "input_ids" / "attention_mask": the instruction prompt followed by
            the paper text, truncated/padded to 512 tokens.
          * "labels": token ids of the target string "Label: <0 or 1>",
            truncated/padded to 8 tokens.
    """
    # Instruction prefix shared by every example; the paper text is appended.
    prompt_prefix = (
        "You are a helpful assistant that classifies research papers.\n\n"
        "Given the abstract and body of a research paper, predict whether the paper is from arXiv (label 1) or viXra (label 0).\n\n"
        "Respond in the following format:\n"
        "Label: <0 or 1>\n\n"
        "Paper:\n"
    )
    inputs = [prompt_prefix + text for text in examples["text"]]

    # Hard targets rendered in the same format the prompt asks for.
    targets = [f"Label: {label}" for label in examples["labels"]]

    # Tokenize the prompts.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Tokenize the targets. `text_target=` is the supported replacement for
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    tokenized_labels = tokenizer(
        text_target=targets,
        max_length=8,  # the target is very short ("Label: 1")
        truncation=True,
        padding="max_length",
    )

    # NOTE(review): the label rows are padded with the tokenizer's pad id, so
    # padded positions presumably contribute to the training loss. Masking them
    # with -100 would also require the decoding code to map -100 back to the
    # pad id before `batch_decode` — confirm before changing.
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": tokenized_labels["input_ids"],
    }

In [None]:
# Tokenize the train/validation splits.
# preprocess_function returns a "labels" column of token ids, which replaces
# the original integer "labels" column. Mapping an already-tokenized split a
# second time would therefore build targets from token-id lists. Skip splits
# that already carry "input_ids" so this cell is safe to re-run.
if "input_ids" not in train_dataset.column_names:
    train_dataset = train_dataset.map(preprocess_function, batched=True)
if "input_ids" not in val_dataset.column_names:
    val_dataset = val_dataset.map(preprocess_function, batched=True)

Making sure it's on GPU! 😎

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Testing if we are using a GPU
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))
!nvidia-smi

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained FLAN-T5 small checkpoint and move it to the chosen device.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(device)

# Final expression: render the model summary as the cell output.
model

In [None]:
# Seq2seq batch collator built from the shared tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# NOTE(review): transformers is already imported by an earlier cell, so an
# upgrade performed here presumably only takes effect after the runtime is
# restarted — confirm, or move this install to the first cell.
!pip install --upgrade transformers

In [None]:
# Print the transformers version that is active in this kernel.
import transformers
print(transformers.__version__)

These are our training arguments! 😇

In [None]:
# Training configuration, collected in one mapping so each knob is easy to scan.
_training_kwargs = {
    # Where checkpoints and logs are written.
    "output_dir": "./results",
    "logging_dir": './logs',
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    # Optimisation settings.
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    "predict_with_generate": True,
}
training_args = Seq2SeqTrainingArguments(**_training_kwargs)

This is our compute metrics function!! 🖤

In [None]:
def compute_metrics(eval_pred):
    """Score generated "Label: <0 or 1>" strings against the reference labels.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays, as handed
            to `compute_metrics` by the trainer.

    Returns:
        dict with "accuracy", "precision" and "recall" (scikit-learn defaults,
        `zero_division=0`).

    A prediction that does not contain "Label: 0" or "Label: 1" is counted as
    an error (it is assigned the opposite of the true label) instead of being
    silently treated as class 0.
    """
    predictions, labels = eval_pred
    # Defensive: unwrap if the predictions arrive wrapped in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an "ignore" filler, not a vocabulary id; map it to the pad id so
    # batch_decode never receives an out-of-range token.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode token ids to strings, dropping special tokens on both sides.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def extract_label(text):
        """Return the 0/1 following "Label:", or None when the pattern is absent."""
        match = re.search(r"Label:\s*([01])", text)
        return int(match.group(1)) if match else None

    pred_ints = []
    label_ints = []
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        truth = extract_label(label_text.strip())
        if truth is None:
            truth = 0  # keep the original fallback for an unparseable reference
        guess = extract_label(pred_text)
        if guess is None:
            guess = 1 - truth  # malformed generation: always scored as wrong
        label_ints.append(truth)
        pred_ints.append(guess)

    return {
        "accuracy": accuracy_score(label_ints, pred_ints),
        "precision": precision_score(label_ints, pred_ints, zero_division=0),
        "recall": recall_score(label_ints, pred_ints, zero_division=0),
    }

Training time! ⏳ ⌛

In [None]:
# Assemble the trainer from the model, training arguments, tokenized splits,
# batch collator and metric callback.
# `processing_class` is the current name for what used to be passed as
# `tokenizer=`; the old keyword is deprecated in recent transformers releases.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Tokenize the test split. preprocess_function replaces the integer "labels"
# column with token ids, so mapping twice would build targets from token-id
# lists; skip when "input_ids" is already present so the cell is re-runnable.
if "input_ids" not in test_dataset.column_names:
    test_dataset = test_dataset.map(preprocess_function, batched=True)

In [None]:
# Run the trainer on the tokenized test split; the result's `.predictions` and
# `.label_ids` are decoded in the next cell.
predictions = trainer.predict(test_dataset)

Example Predictions! 😄

In [None]:
# -100 is an "ignore" filler rather than a vocabulary id; replace any
# occurrence with the pad id so batch_decode never sees an out-of-range token.
pred_ids = np.where(
    np.asarray(predictions.predictions) != -100,
    predictions.predictions,
    tokenizer.pad_token_id,
)
label_ids = np.where(
    np.asarray(predictions.label_ids) != -100,
    predictions.label_ids,
    tokenizer.pad_token_id,
)
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

# Show up to 10 random examples. random.sample raises ValueError when asked
# for more items than exist, so cap the sample size at the number of predictions.
sample_size = min(10, len(decoded_preds))
indices = random.sample(range(len(decoded_preds)), sample_size)
for i in indices:
    print(f"Example {i+1}:")
    print("Prediction:")
    print(decoded_preds[i])
    print("Label:")
    print(decoded_labels[i])
    print("-" * 40)

In [None]:
def compute_metrics_test(decoded_preds,decoded_labels):
    """Compute accuracy/precision/recall from decoded prediction and label strings.

    Args:
        decoded_preds: Generated strings, expected to look like "Label: 0" or "Label: 1".
        decoded_labels: Reference strings in the same format.

    Returns:
        dict with "accuracy", "precision" and "recall" (`zero_division=0`).

    Predictions are parsed with the same "Label: <0 or 1>" pattern used by
    compute_metrics. A prediction that does not match is scored as wrong (it is
    assigned the opposite of the true label); previously any text without a
    "0" character, including an empty string, was counted as class 1.
    """

    def extract_label(text):
        """Return the 0/1 following "Label:", or None when the pattern is absent."""
        match = re.search(r"Label:\s*([01])", text)
        return int(match.group(1)) if match else None

    pred_ints = []
    label_ints = []
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        label_text = label_text.strip()
        truth = extract_label(label_text)
        if truth is None:
            # Keep the original heuristic for references that lack the pattern.
            truth = 0 if "0" in label_text else 1
        guess = extract_label(pred_text)
        if guess is None:
            guess = 1 - truth  # malformed generation: always scored as wrong
        label_ints.append(truth)
        pred_ints.append(guess)

    return {
        "accuracy": accuracy_score(label_ints, pred_ints),
        "precision": precision_score(label_ints, pred_ints, zero_division=0),
        "recall": recall_score(label_ints, pred_ints, zero_division=0),
    }

Test-set metrics! 💡

In [None]:
# Score the decoded test predictions against the decoded labels; the metrics dict is the cell output.
compute_metrics_test(decoded_preds,decoded_labels)