In [None]:
!pip install transformers
!pip install sentence_transformers
!pip install wtpsplit
!pip install accelerate
!pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [None]:
# Mount Google Drive at /content/drive; the dataset and output folders used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from transformers import AutoTokenizer, LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from wtpsplit import SaT
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, load_from_disk
import accelerate

# Setup
Initializes the tokenizer, folder paths, and device, and loads the train, validation, and test datasets from disk.

In [None]:
# Locations on the mounted Drive: raw splits are read from dataset_folder,
# everything this notebook produces is written below output_folder.
dataset_folder = '/content/drive/MyDrive/paper-moderation-SP25'
output_folder = "/content/drive/MyDrive/paper-moderation-SP25/longformer"

# Tokenizer matching the Longformer checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

os.makedirs(output_folder, exist_ok=True)

# Load the three pre-built splits (new_train.arrow, new_val.arrow, new_test.arrow).
train_dataset, val_dataset, test_dataset = (
    load_from_disk(f"{dataset_folder}/new_{split}.arrow")
    for split in ("train", "val", "test")
)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model Training

In [None]:
def generate_tokens(data):
    """
    Tokenize the "text" field of the input with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 4096 tokens.

    Args:
        data: Mapping with a "text" entry holding the text(s) to tokenize
    Returns:
        Whatever the tokenizer returns for the given text(s)
    """
    texts = data["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 4096,
    }
    return tokenizer(texts, **encoding_options)

def train_classifier(train_dataset, val_dataset, test_dataset):
    """
    Fine-tune a Longformer sequence classifier and report metrics on the test split.

    Loads the "allenai/longformer-base-4096" checkpoint, trains it for one
    epoch with the Hugging Face Trainer (checkpoints are saved per epoch under
    ``{output_folder}/base_model``), then evaluates on ``test_dataset`` and
    prints the resulting metrics.

    Args:
        train_dataset: Tokenized training dataset
        val_dataset: Tokenized validation dataset, passed to the Trainer as ``eval_dataset``
        test_dataset: Tokenized testing dataset used for the final evaluation

    Returns:
        Trained classifier model
    """
    print("Training classifier model...")

    # Load the classification model
    classifier = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096").to(device)

    # fp16 mixed precision needs a CUDA device; the notebook falls back to CPU
    # when none is available, so only enable it when CUDA is actually present.
    use_fp16 = torch.cuda.is_available()

    # NOTE(review): no evaluation strategy is configured here, so val_dataset does
    # not appear to be scored during training (the logged table only shows training
    # loss) -- confirm whether per-epoch validation is wanted.
    training_args = TrainingArguments(
        output_dir=f"{output_folder}/base_model",
        save_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device
        fp16=use_fp16,
        report_to="none",
    )

    # Define metrics for evaluation
    def compute_metrics(eval_pred):
        """Turn (logits, labels) into accuracy / precision / recall."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # assumes binary labels (sklearn's default averaging) -- TODO confirm.
        # zero_division=0 keeps the score at 0 without emitting a warning when
        # a batch of predictions contains no positive class.
        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision_score(labels, predictions, zero_division=0),
            "recall": recall_score(labels, predictions, zero_division=0),
        }

    # Create trainer
    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Evaluate the model on the held-out test split
    print("Evaluating model...")
    eval_results = trainer.evaluate(test_dataset)
    print(f"Evaluation results: {eval_results}")

    return classifier

# Main
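Run this to execute the entire pipeline: tokenize the datasets, save the tokenized copies to Google Drive, and train and evaluate the classifier.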

In [None]:
def main(max_papers_per_source=None):
    """
    Run the end-to-end pipeline: tokenize, cache to disk, then train.

    The module-level train/val/test datasets are tokenized with
    ``generate_tokens``, saved under ``{output_folder}/base_datasets/<split>``,
    switched to PyTorch tensor format and handed to ``train_classifier``.

    Args:
        max_papers_per_source: Not used by this function.
    """
    print("Starting paper processing pipeline...")

    # Tokenize every split first; nothing is written until all three are done
    print("Starting data tokenization...")
    raw_splits = {"train": train_dataset, "val": val_dataset, "test": test_dataset}
    tokenized = {
        split: dataset.map(generate_tokens, batched=True)
        for split, dataset in raw_splits.items()
    }

    # Cache the tokenized splits so later runs can skip tokenization
    for split, dataset in tokenized.items():
        dataset.save_to_disk(f"{output_folder}/base_datasets/{split}")
    print("Datasets saved to Google Drive.")

    print("Tokenization of train dataset complete.")

    # Expose only the model inputs and labels, as PyTorch tensors
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    for dataset in tokenized.values():
        dataset.set_format(type="torch", columns=list(tensor_columns))

    train_classifier(tokenized["train"], tokenized["val"], tokenized["test"])

    print("Pipeline complete!")

In [None]:
# Full run: tokenizes all three splits, saves them to Drive, then trains and evaluates.
main()

Starting paper processing pipeline...
Starting data tokenization...


Map:   0%|          | 0/27499 [00:00<?, ? examples/s]

Map:   0%|          | 0/9166 [00:00<?, ? examples/s]

Map:   0%|          | 0/9166 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/27499 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/9166 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/9166 [00:00<?, ? examples/s]

Datasets saved to Google Drive.
Tokenization of train dataset complete.
Training classifier model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Initializing global attention on CLS token...


Starting training...


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Step,Training Loss
500,0.3052
1000,0.1992
1500,0.1315
2000,0.1008
2500,0.073


Step,Training Loss
500,0.3052
1000,0.1992
1500,0.1315
2000,0.1008
2500,0.073
3000,0.0648


Evaluating model...


Evaluation results: {'eval_loss': 0.06556926667690277, 'eval_accuracy': 0.9853807549639974, 'eval_precision': 0.9770196635868278, 'eval_recall': 0.9911079067531843, 'eval_runtime': 1738.7991, 'eval_samples_per_second': 5.271, 'eval_steps_per_second': 0.659, 'epoch': 0.9998545454545454}
Pipeline complete!


# Tokenized Data
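Run this once the data has been tokenized and saved to the output folder (i.e. after `main()` has completed at least once). It reloads the saved tokenized datasets and trains the classifier again without re-tokenizing.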

In [None]:
def run_pipeline():
    """
    Train and evaluate the classifier from the tokenized datasets saved on disk.

    Loads the train/val/test splits stored under
    ``{output_folder}/base_datasets``, switches them to PyTorch tensor format,
    and passes them to ``train_classifier``, which trains the model and prints
    its evaluation metrics (including accuracy) for the test split.
    """
    # Load in train, val, test order -- the same order train_classifier expects
    tokenized_splits = [
        load_from_disk(f"{output_folder}/base_datasets/{split}")
        for split in ("train", "val", "test")
    ]

    # Format datasets to return PyTorch tensors
    for dataset in tokenized_splits:
        dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Train (the returned model is not needed here; metrics are printed by train_classifier)
    train_classifier(*tokenized_splits)

In [None]:
# Reuses the tokenized datasets saved by main(); trains again starting from the pretrained base checkpoint.
run_pipeline()

Training classifier model...


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss
500,0.1089
1000,0.0663
1500,0.0304
2000,0.0222
2500,0.0184
