In [1]:
!pip install transformers datasets torch accelerate

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import glob
import pandas as pd

In [7]:
# Define folder paths (update these!)
arxiv_path = "/content/drive/My Drive/CDS/arxiv Automatic Paper Moderation Data (FA24)/arxiv_papers/"
vixra_path = "/content/drive/My Drive/CDS/arxiv Automatic Paper Moderation Data (FA24)/vixra_papers/"

def read_txt_files(folder_path, folder_name, limit=300):
    """Reads the first `limit` .txt files from a folder and returns a list of tuples (filename, content, folder_name)."""
    txt_files = glob.glob(os.path.join(folder_path, "*.txt"))
    txt_files = txt_files[:limit]  # Limit to the first `limit` files

    data = []
    for file in txt_files:
        file_path = os.path.join(folder_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            data.append((content, folder_name))  # Store filename, content, and source folder

    return data

# Read files from both folders
arxiv_data = read_txt_files(arxiv_path, "1")
vixra_data = read_txt_files(vixra_path, "0")

# Combine both datasets into a single DataFrame
df = pd.DataFrame(arxiv_data + vixra_data, columns=["content", "folder"])
print(df.head())

                                             content folder
0  EPJ manuscript No.\n(will be inserted by the e...      1
1  Interchromatidal central ridge and transversal...      1
2  \nRANKIN -- SELBERG PAIRINGS OF NEWFORMS AND O...      1
3  Anomaly detection models for IoT time series d...      1
4  \nFinite groups with two Chermak-Delgado\n\nme...      1


In [None]:
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

In [None]:
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset

In [None]:
df["label_text"] = df["folder"].apply(lambda x: f"This text belongs to {x}")

In [None]:
# train test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["folder"])

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenization function
def preprocess_function(examples):
    inputs = [f"Classify: {text}" for text in examples["content"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["label_text"], max_length=10, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
print(test_dataset)

Dataset({
    features: ['content', 'folder', 'label_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})


In [None]:
train_dataset = train_dataset.remove_columns(["content", "folder", "label_text"])
test_dataset = test_dataset.remove_columns(["content", "folder", "label_text"])

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
    fp16=torch.cuda.is_available()  # Enable mixed precision on CUDA
)

trainer = Trainer(
    model=model.to(device),  # Move model to GPU
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33med547[0m ([33med547-cornell-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,6.672116
2,No log,2.809159
3,No log,2.020551


TrainOutput(global_step=60, training_loss=6.293196105957032, metrics={'train_runtime': 23.2966, 'train_samples_per_second': 10.302, 'train_steps_per_second': 2.575, 'total_flos': 32482032353280.0, 'train_loss': 6.293196105957032, 'epoch': 3.0})

In [None]:
def classify_text(text):
    input_text = f"Classify: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Run inference
    outputs = model.generate(**inputs)

    # Decode output
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model on 5 random samples
for i in range(5):
    sample_text = test_df.iloc[i]["content"]
    actual_label = test_df.iloc[i]["folder"]
    predicted_label = classify_text(sample_text)

    print(f"{actual_label},{predicted_label}")  # Output in CSV-like format

1,Classified: Draft version August 30, 2018 Typeset using LATEX RNAAS style in A
1,In-situ sensors and Wireless Sensor Networks (WSNs) have become more and
1,Object Mapping - Object Mapping - a UML specification for information systems
1,00, No. 00, Month 200x, 1 -- 23 3 1 0 2
0,Lessons of the Isotropic Schwarzschild Metric’s Horizon Steven Kenneth Kauf
