In [None]:
!pip install transformers
!pip install sentence_transformers
!pip install wtpsplit
!pip install accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [None]:
# Mount Google Drive at /content/drive (Colab only); main() reads its dataset from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from transformers import LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from wtpsplit import SaT  # For text segmentation (optional with Longformer)
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
#from datasets import Dataset
import accelerate


In [None]:
# Longformer tokenizer shared by generate_embeddings().
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
# Disabled alternatives that would load an encoder model instead of only tokenizing:
# model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
# model = SentenceTransformer('allenai/longformer-base-4096')


# Paths. output_folder is relative to the working directory and is created here if missing;
# train_classifier() writes its checkpoints and the saved model underneath it.
# NOTE(review): data_folder is not referenced by any live code in this section - confirm it is still needed.
data_folder = "data"
output_folder = "embeddings"
os.makedirs(output_folder, exist_ok=True)

# Sentence/paragraph segmenter (wtpsplit SaT) used by segment_text().
sat_model = SaT("sat-3l-sm")

# Sub-folder names of the paper sources.
# NOTE(review): paper_sources is not referenced by any live code in this section - confirm it is still needed.
paper_sources = ['arxiv_papers', 'conservapedia_papers', 'vixra_papers', 'wiki_papers']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/855M [00:00<?, ?B/s]

In [None]:
def segment_text(text, max_length=2000):  # Reduced max_length for CPU efficiency
    """
    Segment text into chunks of at most ``max_length`` characters.

    Texts that already fit are returned unchanged as a single chunk. Longer
    texts are split with the module-level SaT model into paragraphs (each a
    list of sentences), and consecutive paragraphs are packed greedily into
    chunks. A paragraph that is itself longer than ``max_length`` is packed
    sentence by sentence, and a sentence that is still too long is sliced
    into ``max_length``-sized pieces, so no chunk exceeds the budget.

    Args:
        text: Full text of a paper.
        max_length: Maximum number of characters per chunk.

    Returns:
        List of text chunks, in document order.
    """
    # Short texts need no segmentation (and no SaT inference).
    if len(text) <= max_length:
        return [text]

    # Slice size for oversized sentences; guarded so range() never gets a step of 0.
    piece_size = max(max_length, 1)

    segments = []
    # Ask SaT for paragraphs; each paragraph is iterated below as a list of sentences.
    for split in sat_model.split([text], remove_whitespace_before_inference=True, do_paragraph_segmentation=True):
        current_segment = ""
        for paragraph in split:
            # Flatten newlines so every chunk is a single line of text.
            sentences = [sentence.replace("\n", " ") for sentence in paragraph]
            paragraph_text = " ".join(sentences)

            if len(paragraph_text) <= max_length:
                # Normal case: keep the paragraph together as one semantic unit.
                units = [paragraph_text]
            else:
                # Oversized paragraph: fall back to sentences, hard-slicing any
                # sentence that alone would still exceed the budget.
                units = [
                    sentence[start:start + piece_size]
                    for sentence in sentences
                    for start in range(0, len(sentence), piece_size)
                ]

            for unit in units:
                if not current_segment:
                    current_segment = unit
                elif len(current_segment) + 1 + len(unit) > max_length:
                    # The +1 accounts for the joining space, which also counts
                    # against the budget.
                    segments.append(current_segment)
                    current_segment = unit
                else:
                    current_segment += " " + unit

        # Flush the trailing chunk for this text.
        if current_segment:
            segments.append(current_segment)

    return segments

# Embeddings using SentenceTransformer (disabled alternative, kept for reference)
# def generate_embeddings(text_segments):
#     """
#     Generate embeddings using SentenceTransformer with Longformer.
#     """
#     # Use SentenceTransformer's encode method
#     embeddings = model.encode(text_segments)
#     return embeddings

def generate_embeddings(text_segments):
    """
    Tokenize text segments with the module-level Longformer tokenizer.

    Despite its name, this function runs no model: it only tokenizes. Every
    segment is padded or truncated to exactly 4096 tokens, and the result is
    left as plain Python lists (no tensor conversion is requested).

    Args:
        text_segments: List of text segments to tokenize

    Returns:
        The tokenizer output for all segments; callers read its
        'input_ids' and 'attention_mask' entries.
    """
    # One batched tokenizer call covers every segment of the paper.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 4096,
    }
    return tokenizer(text_segments, **tokenizer_options)

def process_paper(file_path, source):
    """
    Read one paper from disk, split it into chunks and tokenize the chunks.

    Args:
        file_path: Path to the text file
        source: Source of the paper (arxiv, conservapedia, etc.)

    Returns:
        Dictionary with keys 'file_path', 'source', 'segments' and
        'embeddings', or None when the file cannot be read or is empty.
    """
    print(f"Processing {file_path}")

    # Load the raw text; undecodable bytes are replaced instead of raising.
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            raw_text = handle.read()
    except Exception as read_error:
        print(f"Error reading {file_path}: {read_error}")
        return None

    # Nothing to segment in an empty file.
    if not raw_text:
        print(f"Empty text from {file_path}")
        return None

    # Coarse chunking first, then tokenization of every chunk.
    chunks = segment_text(raw_text)
    print(f"  Segmented into {len(chunks)} chunks")

    tokenized = generate_embeddings(chunks)
    print("  Generated embeddings")

    return dict(
        file_path=file_path,
        source=source,
        segments=chunks,
        embeddings=tokenized,
    )

def process_batch(file_paths, source, batch_size=10):
    """
    Run process_paper over a list of files, walking the list in slices.

    Papers for which process_paper returns a falsy result (unreadable or
    empty files) are dropped. The slicing only groups the work; all results
    end up in one flat list.

    Args:
        file_paths: List of paths to text files
        source: Source of the papers
        batch_size: Number of papers to process in each batch

    Returns:
        List of processed paper data
    """
    processed = []

    for start in range(0, len(file_paths), batch_size):
        chunk = file_paths[start:start + batch_size]

        # Process the whole slice first, then keep only the successful papers.
        outcomes = [process_paper(path, source) for path in chunk]
        processed.extend(outcome for outcome in outcomes if outcome)

        # (Saving each slice to a timestamped pickle under output_folder is
        # currently disabled.)

    return processed

def prepare_classification_datasets(data):
    """
    Build train/test datasets for binary classification from processed papers.

    Every tokenized segment becomes one example. Segments of papers whose
    'source' is 'arxiv' get label 1; all other sources get label 0. The
    examples are split 80/20 with a fixed random seed.

    Args:
        data: List of paper dictionaries as returned by process_paper
            (keys 'source' and 'embeddings' are read).

    Returns:
        Dictionary with 'train_dataset' and 'test_dataset'. Each is a dict of
        parallel lists under the keys 'input_ids', 'attention_mask' and 'label'.

    Raises:
        ValueError: If `data` yields no segments at all.
    """
    print("Preparing datasets for classification...")

    # Flatten papers into one example per segment.
    data_points = []
    for item in data:
        label = 1 if item['source'] == 'arxiv' else 0
        encodings = item['embeddings']

        for input_ids, attention_mask in zip(encodings['input_ids'], encodings['attention_mask']):
            data_points.append({
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "label": label
            })

    if not data_points:
        raise ValueError("No segments available to build classification datasets.")

    # NOTE(review): the split is per segment, so segments of the same paper can
    # land in both train and test.
    train_data, test_data = train_test_split(data_points, test_size=0.2, random_state=42)

    def to_columns(rows):
        # Turn a list of per-example dicts into a dict of parallel lists.
        return {
            "input_ids": [row["input_ids"] for row in rows],
            "attention_mask": [row["attention_mask"] for row in rows],
            "label": [row["label"] for row in rows]
        }

    train_dataset = to_columns(train_data)
    test_dataset = to_columns(test_data)

    # Count examples, not the columns of the dict (which is always 3).
    print(f"Training on {len(train_data)} segments")
    print(f"Testing on {len(test_data)} segments")

    return {
        "train_dataset": train_dataset,
        "test_dataset": test_dataset
    }

def _as_row_dataset(dataset):
    """Convert a dict of parallel column lists into a list of per-example dicts; pass anything else through."""
    if isinstance(dataset, dict):
        column_names = list(dataset)
        return [dict(zip(column_names, row)) for row in zip(*dataset.values())]
    return dataset


def train_classifier(datasets, num_classes=2):
    """
    Train a Longformer classifier on the prepared datasets.

    Trains for 3 epochs, evaluates on the test dataset, prints the evaluation
    results and saves the model under `output_folder`.

    Args:
        datasets: Dictionary containing train_dataset and test_dataset. Each
            may be a dict of parallel lists ('input_ids', 'attention_mask',
            'label') or any dataset that is already indexable per example.
        num_classes: Number of classes for classification

    Returns:
        Trained classifier model
    """
    print("Training classifier model...")

    # Trainer fetches examples with dataset[i], so a dict of column lists
    # (whose integer indexing raises KeyError) is converted into one dict per example.
    train_rows = _as_row_dataset(datasets["train_dataset"])
    eval_rows = _as_row_dataset(datasets["test_dataset"])

    # Load the classification model
    classifier = LongformerForSequenceClassification.from_pretrained(
        "allenai/longformer-base-4096",
        num_labels=num_classes
    )

    training_args = TrainingArguments(
        output_dir=f"{output_folder}/longformer_results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
    )

    # Define metrics for evaluation
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, predictions)
        }

    # Create trainer
    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_rows,
        eval_dataset=eval_rows,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Evaluate the model
    print("Evaluating model...")
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    # Save the model
    model_path = f"{output_folder}/longformer_classifier"
    trainer.save_model(model_path)

    return classifier

def main(max_papers_per_source=None,
         dataset_folder='/content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24'):
    """
    Main function to orchestrate the entire pipeline.

    For each paper source, collects the .txt files in its folder, processes
    them, then builds the classification datasets and trains the classifier.

    Args:
        max_papers_per_source: Optional non-negative cap on the number of
            papers taken from each source folder; None processes every file.
        dataset_folder: Root folder containing the 'arxiv_papers',
            'conservapedia_papers', 'vixra_papers' and 'wiki_papers' folders.

    Returns:
        None
    """
    print("Starting paper processing pipeline...")

    paper_folders = {
        "arxiv": os.path.join(dataset_folder, "arxiv_papers"),
        "conservapedia": os.path.join(dataset_folder, "conservapedia_papers"),
        "vixra": os.path.join(dataset_folder, "vixra_papers"),
        "wiki": os.path.join(dataset_folder, "wiki_papers"),
    }

    all_results = []

    for source, folder_path in paper_folders.items():
        if not os.path.isdir(folder_path):
            print(f"Warning: Source folder {folder_path} does not exist. Skipping.")
            continue

        # Sorted so that a capped run always picks the same papers
        # (os.listdir order is arbitrary).
        file_paths = sorted(
            os.path.join(folder_path, file)
            for file in os.listdir(folder_path)
            if file.endswith('.txt')
        )

        if not file_paths:
            print(f"Warning: No text files found in {folder_path}. Skipping.")
            continue

        # Honor the per-source cap; previously the argument was ignored.
        if max_papers_per_source is not None:
            file_paths = file_paths[:max_papers_per_source]
            print(f"Found {len(file_paths)} text files in {folder_path} (limited to {max_papers_per_source})")
        else:
            print(f"Found {len(file_paths)} text files in {folder_path}")

        all_results.extend(process_batch(file_paths, source))

    if not all_results:
        print("Error: No papers were successfully processed. Exiting.")
        return

    # Prepare datasets for classification
    datasets = prepare_classification_datasets(all_results)

    # Train classifier
    train_classifier(datasets)

    print("Pipeline complete!")



In [None]:
# Run the pipeline, passing 1 as the per-source paper cap.
main(max_papers_per_source=1)

Starting paper processing pipeline...
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/1605_05870.txt
  Segmented into 24 chunks
  Generated embeddings
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/0803_0964.txt
  Segmented into 19 chunks
  Generated embeddings
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/2301_00612.txt
  Segmented into 36 chunks
  Generated embeddings
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/1812_00890.txt
  Segmented into 29 chunks
  Generated embeddings
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/1811_07477.txt
  Segmented into 9 chunks
  Generated embeddings
Processing /content/drive/MyDrive/arxiv_Automatic_Paper_Moderation_Data_FA24/arxiv_papers/1803_08555.txt
  Segmented into 9 chunks
  Generated embeddings
Processing /content/drive/MyDrive/a