In [25]:
!pip install parascore pyreadr



In [40]:
# Mount Google Drive when running on Google Colab; skip the mount elsewhere
# so this cell does not raise ImportError outside Colab.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside a Colab runtime.
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
cd /content/drive/MyDrive/GitHub/paraphrase_py/code

/content/drive/MyDrive/GitHub/paraphrase_py/code


In [69]:
from read_and_write_docs import read_jsonl, write_jsonl
from scorer import ParaphraseScorer
import os
import json
import argparse
import logging
import pandas as pd
import sys

In [65]:
def process_file(input_file, output_file, p_scorer,
                 text_column='text', rephrased_column='paraphrased_text',
                 parascore_diversity_weight=0.05):
    """
    Score the paraphrases in a single JSONL file and save the results.

    The input is read with ``read_jsonl``, its text columns are renamed to
    ``original`` / ``rephrased``, the frame is handed to
    ``p_scorer.calculate_score`` and the returned frame is written with
    ``write_jsonl``. Errors raised while scoring or writing are logged (with
    traceback) instead of propagated, so a batch run can continue with the
    next file.

    Parameters:
        input_file (str): Path to input JSONL.
        output_file (str): Path for output JSONL. Its parent directory is
            created when it does not exist yet.
        p_scorer: Initialised scorer exposing
            ``calculate_score(df, parascore_diversity_weight)``.
        text_column (str): Column name for original text.
        rephrased_column (str): Column name for paraphrased text.
        parascore_diversity_weight (float): Weighting of diversity in
            parascore_free; forwarded unchanged to the scorer.

    Returns:
        None. Nothing is written when the input is empty or processing fails.
    """
    print(f"Processing file: {input_file}")

    # os.path.dirname() is '' for a bare filename, and os.makedirs('') raises,
    # so only create the directory when the path actually has one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Load input data
    print("Reading input file...")
    df = read_jsonl(input_file)

    print(f"DataFrame loaded: {df.shape}")  # Debug

    if df.empty:
        logging.error("Input file is empty or could not be read. Skipping processing.")
        return

    # Rename columns if necessary
    print("Renaming columns if needed...")
    rename_mapping = {
        "original_sentence": "original",
        text_column: "original",
        rephrased_column: "rephrased"
    }
    # Non-inplace rename: leaves the frame returned by read_jsonl untouched.
    df = df.rename(columns=rename_mapping)
    print(f"Columns after renaming: {list(df.columns)}")

    # Two source columns can map onto the same target (e.g. both
    # 'original_sentence' and `text_column` present); make that visible.
    if df.columns.duplicated().any():
        duplicated = sorted(set(df.columns[df.columns.duplicated()]))
        logging.warning(f"Duplicate column names after renaming: {duplicated}")

    # Process file
    try:
        print("Calculating scores...")
        df_with_score = p_scorer.calculate_score(df, parascore_diversity_weight)
        print("Score calculation completed.")

        print(f"Writing output to {output_file}...")
        write_jsonl(df_with_score, output_file)
        print(f"Processing complete: {output_file}")

    except Exception as e:
        # Deliberately best-effort: keep going, but record the full traceback
        # so the failure can be diagnosed afterwards.
        logging.exception(f"Error during processing: {e}")

In [72]:
def list_jsonl_files(directory: str) -> list:
    """
    Return the names of all .jsonl entries in the given directory.

    The extension match is case-insensitive. Only the entry names are
    returned (not full paths), so callers join them onto whichever directory
    they need. Names are sorted so repeated runs visit files in the same
    order; ``os.listdir`` itself makes no ordering guarantee.

    Args:
        directory (str): Path to the directory to scan.

    Returns:
        List[str]: Sorted entry names ending in ".jsonl".
    """
    return sorted(
        fname
        for fname in os.listdir(directory)
        if fname.lower().endswith(".jsonl")
    )

In [78]:
# Parameters
# Which slice of the author-verification data to score.
CORPUS = "Wiki"
DATA_TYPE = "training"

# Root of the mounted Google Drive; every path below hangs off this.
DRIVE_ROOT = "/content/drive/MyDrive"

# Model used by the scorer, read from a copy stored on Drive.
MODEL = "ModernBERT-large"
MODEL_DIR = f"{DRIVE_ROOT}/models/{MODEL}"
# Forwarded as `num_layers` to ParaphraseScorer.
# NOTE(review): presumably selects the final layer of ModernBERT-large - confirm.
NUM_LAYERS = 28

# Paraphrase run being scored: generator model folder and its sampling-settings folder.
# NOTE(review): folder name suggests temperature 0.7 / top_p 0.9 - confirm.
PARAPHRASE_MODEL = "Qwen_2.5_1.5B"
GENERATION_RUN = "gen_t_0.7_tp_0.9"

# Paths
BASE_DATA = f"{DRIVE_ROOT}/author_verification/{DATA_TYPE}/{CORPUS}/{PARAPHRASE_MODEL}/{GENERATION_RUN}"
INPUT_DIR = os.path.join(BASE_DATA, "full_doc_paraphrase_clean")
OUTPUT_DIR = os.path.join(BASE_DATA, "parascore")

In [79]:
# Build the scorer once; the same instance is passed to process_dir below
# and used for every file.
parascore_free = ParaphraseScorer(
    score_type='parascore_free',
    model_type=MODEL_DIR,
    num_layers=NUM_LAYERS,
)

In [80]:
def process_dir(input_dir, output_dir, p_scorer, text_column='text',
                rephrased_column='paraphrased_text',
                parascore_diversity_weight=0.05):
    """
    Score every .jsonl file in ``input_dir`` and write results to ``output_dir``.

    Each file found by ``list_jsonl_files`` is passed to ``process_file`` and
    written under the same file name in ``output_dir``. Files whose output
    already exists are skipped, so an interrupted run can be resumed by
    calling this function again.

    Parameters:
        input_dir (str): Directory containing the input JSONL files.
        output_dir (str): Directory receiving one output JSONL per input file.
        p_scorer: Initialised scorer, forwarded to ``process_file``.
        text_column (str): Column name for original text.
        rephrased_column (str): Column name for paraphrased text.
        parascore_diversity_weight (float): Forwarded to ``process_file``.

    Returns:
        None
    """
    file_list = list_jsonl_files(input_dir)
    num_files = len(file_list)
    print(f"Found {num_files} files in {input_dir}")

    for idx, fname in enumerate(file_list, start=1):
        # idx is 1-based
        print(f"Processing file {fname} ({idx} of {num_files})")

        input_path = os.path.join(input_dir, fname)
        output_path = os.path.join(output_dir, fname)

        if not os.path.isfile(input_path):
            logging.warning(f"Input file not found – skipping: {input_path}")
            continue  # skip rather than exit

        if os.path.exists(output_path):
            logging.info(f"Output already exists – skipping: {output_path}")
            continue  # skip rather than exit

        process_file(
            input_file=input_path,
            output_file=output_path,
            p_scorer=p_scorer,
            text_column=text_column,
            rephrased_column=rephrased_column,
            parascore_diversity_weight=parascore_diversity_weight
        )

        # process_file logs and swallows its own errors (and returns early on
        # empty input), so the presence of the output file is the only
        # reliable sign that this file was actually scored.
        if os.path.exists(output_path):
            print(f"Completed file {fname} ({idx} of {num_files})")
        else:
            logging.warning(
                f"No output written for {fname} ({idx} of {num_files}): {output_path}"
            )


In [None]:
# Score every cleaned paraphrase file for the configured corpus; the
# paraphrase text is read from the 'clean_text' column of each input file.
process_dir(
    input_dir=INPUT_DIR,
    output_dir=OUTPUT_DIR,
    p_scorer=parascore_free,
    text_column='text',
    rephrased_column='clean_text',
    parascore_diversity_weight=0.05,
)

Found 225 files in /content/drive/MyDrive/author_verification/training/Wiki/Qwen_2.5_1.5B/gen_t_0.7_tp_0.9/full_doc_paraphrase_clean
Processing file enemesis_text_1.jsonl (1 of 225)
Processing file: /content/drive/MyDrive/author_verification/training/Wiki/Qwen_2.5_1.5B/gen_t_0.7_tp_0.9/full_doc_paraphrase_clean/enemesis_text_1.jsonl
Reading input file...
DataFrame loaded: (500, 15)
Renaming columns if needed...
Columns after renaming: ['doc_id', 'orig_doc_id', 'corpus', 'author', 'texttype', 'original', 'generated_text', 'top_p', 'temperature', 'time_sec', 'tokens_per_sec', 'rephrased', 'text_cleaned', 'clean_stage', 'parsing_errors']
Calculating scores...
