In [None]:
!pip install parascore pyreadr

In [None]:
# Mount Google Drive when running on Google Colab.
# If the google.colab package cannot be imported (i.e. not on Colab), the
# mount is skipped with a message instead of stopping the run with ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/GitHub/paraphrase_py/code

In [None]:
from read_and_write_docs import read_jsonl, write_jsonl
from scorer import ParaphraseScorer
import os
import json
import argparse
import logging
import pandas as pd
import sys
import time

In [None]:
def process_file(input_file, output_file, p_scorer,
                 text_column='text', rephrased_column='paraphrased_text',
                 parascore_diversity_weight=0.05):
    """
    Scores the paraphrases in a single JSONL file and saves the results.

    The input is read with ``read_jsonl``, its text columns are renamed to
    ``original`` / ``rephrased``, the frame is handed to
    ``p_scorer.calculate_score`` and the returned frame is written with
    ``write_jsonl``.

    Parameters:
        input_file (str): Path to input JSONL.
        output_file (str): Path for output JSONL. Its parent directory is
            created when missing.
        p_scorer: Initialised scorer; must expose
            ``calculate_score(df, parascore_diversity_weight)``.
        text_column (str): Column name for original text.
        rephrased_column (str): Column name for paraphrased text.
        parascore_diversity_weight (float): Weighting of diversity in
            parascore_free; passed straight through to the scorer.

    Returns:
        None. An empty input and any exception raised while scoring or
        writing are logged, not raised, so no output file is produced in
        those cases.
    """
    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = read_jsonl(input_file)

    if df.empty:
        logging.error("Input file is empty or could not be read. Skipping processing.")
        return

    # Rename columns to the names used downstream; keys that are not present
    # in the frame are ignored by DataFrame.rename.
    # NOTE(review): if a file carries both "original_sentence" and
    # `text_column`, both become "original" and the frame ends up with
    # duplicate column names - confirm inputs never contain both.
    rename_mapping = {
        "original_sentence": "original",
        text_column: "original",
        rephrased_column: "rephrased"
    }
    df = df.rename(columns=rename_mapping)

    # Process file
    try:
        df_with_score = p_scorer.calculate_score(df, parascore_diversity_weight)
        write_jsonl(df_with_score, output_file)
    except Exception as e:
        # Deliberately best-effort: one failing file should not abort a batch.
        # logging.exception keeps the traceback so the failure can be diagnosed.
        logging.exception(f"Error during processing: {e}")

In [None]:
def list_jsonl_files(directory: str) -> list:
    """
    Return the names of all .jsonl files in the given directory.

    Only the directory itself is scanned (no recursion). The extension check
    is case-insensitive, and entries that are not regular files (for example
    a sub-directory whose name ends in ".jsonl") are ignored.

    Args:
        directory (str): Path to the directory to scan.

    Returns:
        List[str]: File names (NOT full paths) of each .jsonl file found,
        in sorted order. Join with ``directory`` to obtain a path.
    """
    return sorted(
        fname
        for fname in os.listdir(directory)
        if fname.lower().endswith(".jsonl")
        and os.path.isfile(os.path.join(directory, fname))
    )

In [None]:
# Run configuration: which corpus / split to score and which model to use
CORPUS = "Wiki"
DATA_TYPE = "training"
MODEL = "ModernBERT-large"
NUM_LAYERS = 28  # passed to the scorer as num_layers
MODEL_DIR = "/content/drive/MyDrive/models/{model}".format(model=MODEL)

# Locations on the mounted Drive: inputs are read from one sub-folder of
# BASE_DATA and the scored files are written to another
BASE_DATA = (
    "/content/drive/MyDrive/author_verification/{data_type}/{corpus}"
    "/Qwen_2.5_1.5B/gen_t_0.7_tp_0.9"
).format(data_type=DATA_TYPE, corpus=CORPUS)
INPUT_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DATA, subfolder)
    for subfolder in ("full_doc_paraphrase_clean", "parascore")
)

In [None]:
# Build the scorer once; it is reused for every file in the batch run below.
parascore_free = ParaphraseScorer(
    score_type='parascore_free',
    model_type=MODEL_DIR,
    num_layers=NUM_LAYERS,
)

In [None]:
def process_dir(input_dir, output_dir, p_scorer, text_column='text',
                rephrased_column='paraphrased_text',
                parascore_diversity_weight=0.05):
    """
    Scores every .jsonl file in a directory that has no output file yet.

    Files are matched by name: an input file is skipped when a .jsonl file
    with the same name is already present in ``output_dir``. Every remaining
    file is passed to ``process_file`` and the time taken is reported.

    Parameters:
        input_dir (str): Directory containing the input JSONL files.
        output_dir (str): Directory for the scored JSONL files; created if
            it does not exist.
        p_scorer: Initialised scorer, forwarded to ``process_file``.
        text_column (str): Column name for original text.
        rephrased_column (str): Column name for paraphrased text.
        parascore_diversity_weight (float): Forwarded to ``process_file``.
    """
    # The output directory has to exist before it can be listed; on a first
    # run it has not been created yet and os.listdir raises FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    input_files = list_jsonl_files(input_dir)
    already_scored = set(list_jsonl_files(output_dir))

    # Filter out files already processed, then sort
    to_process = sorted(f for f in input_files if f not in already_scored)
    num_files = len(to_process)
    print(f"Found {num_files} files to process in input directory")

    for idx, fname in enumerate(to_process, start=1):
        # idx is 1-based
        print(f"Processing file {fname} ({idx} of {num_files})")

        input_path  = os.path.join(input_dir, fname)
        output_path = os.path.join(output_dir, fname)

        if not os.path.isfile(input_path):
            logging.warning(f"Input file not found – skipping: {input_path}")
            continue  # skip rather than exit

        # Re-checked here because the listing above may be stale by the time
        # this file is reached.
        if os.path.exists(output_path):
            logging.info(f"Output already exists – skipping: {output_path}")
            continue  # skip rather than exit

        file_start = time.perf_counter()

        process_file(
            input_file=input_path,
            output_file=output_path,
            p_scorer=p_scorer,
            text_column=text_column,
            rephrased_column=rephrased_column,
            parascore_diversity_weight=parascore_diversity_weight
        )

        file_elapsed = time.perf_counter() - file_start

        # process_file logs problems instead of raising, so the presence of
        # the output file is the only success signal available here.
        if os.path.exists(output_path):
            print(f"Completed file {fname} ({idx} of {num_files}) in {file_elapsed:.2f} seconds")
        else:
            logging.warning(
                f"No output written for file {fname} ({idx} of {num_files}) "
                f"after {file_elapsed:.2f} seconds"
            )

In [None]:
# Score every pending file in INPUT_DIR; the paraphrases are read from the
# 'clean_text' column rather than the function's default column name.
process_dir(
    input_dir=INPUT_DIR,
    output_dir=OUTPUT_DIR,
    p_scorer=parascore_free,
    text_column='text',
    rephrased_column='clean_text',
    parascore_diversity_weight=0.05,
)