# Vectorize Full Document Paraphrases

This notebook tests vectorizing the full-document paraphrases into sentences. It follows the same framework used on the original data: perform named entity recognition, vectorize the text, and then add the named entities back into the data.

In [1]:
import os
import sys
from pathlib import Path

In [2]:
# Make the sibling `src` directory importable from this notebook.
# `os.path.abspath` resolves '../src' against the kernel's current working directory.
sys.path.append(os.path.abspath('../src'))

# Project-local helpers (presumably located in ../src; not shown in this notebook).
from read_and_write_docs import read_jsonl, write_jsonl
from preprocessing import vectorize_df

In [10]:
# To get spacy model, run in terminal
# python -m spacy download en_core_web_sm

In [11]:
# To get nltk to work
# import nltk
# nltk.download('punkt_tab')

## Set Locations

Set the input location of the top impostors (the top-n paraphrases selected by their ParaScore) and the output location where the processed results are saved.

In [12]:
# Dataset root, plus the split and corpus being processed.
base_loc = "/Volumes/BCross/datasets/author_verification"
data_type = "training"
corpus = 'Wiki'

# Folder shared by the input and output locations for this model.
corpus_model_loc = f"{base_loc}/{data_type}/{corpus}/Qwen_2.5_1.5B"

# Input: top impostors for the chosen generation settings (string and Path forms).
data_loc = f"{corpus_model_loc}/gen_t_0.7_tp_0.9/top_impostors"
data_loc_path = Path(data_loc)

# Output: where the processed files are written.
save_loc = f"{corpus_model_loc}/top_impostors_tokenized"

## Load the Data and Preprocess

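Several steps are needed to load the data and preprocess it so that it fits the vectorizing code as it currently stands. These are basic changes to the data: adding an `impostor_id`, keeping only a subset of the columns, and renaming the `rephrased` column to `text`.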

In [13]:
# Load one impostor file to develop the pipeline on.
sample_impostors = read_jsonl(f"{data_loc}/142_196_88_228_text_1.jsonl")

# Give every row an id derived from its index, offset by one.
df = sample_impostors.assign(impostor_id=sample_impostors.index + 1)

In [14]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,doc_id,orig_doc_id,corpus,author,texttype,original,rephrased,top_p,temperature,time_sec,tokens_per_sec,clean_text,text_cleaned,clean_stage,parsing_errors,similarity_score,diversity_score,diversity_weighting,parascore_free,impostor_id
0,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,142.196.88.228,known,The article that is being referred to via the ...,"{""new_document"": ""Winemaking's origins might b...",0.9,0.7,11.183702,107.298996,Winemaking's origins might be traced back to S...,1,fix_salvage_quotes,[original: Invalid control character at: line ...,0.825557,0.35,0.05,0.843057,1
1,142_196_88_228_text_1,known [142.196.88.228 - Text-1].txt,Wiki,142.196.88.228,known,The article that is being referred to via the ...,"{""new_document"": ""The recent investigation con...",0.9,0.7,9.610908,118.511176,The recent investigation concludes that winema...,1,fix_salvage_quotes,[original: Invalid control character at: line ...,0.813687,0.35,0.05,0.831187,2


In [15]:
# Keep a subset of the columns and expose the paraphrase under the generic
# `text` name. The rename is chained rather than done with `inplace=True`,
# because an in-place rename on a column subset can trigger pandas'
# SettingWithCopyWarning.
df = df[
    ['doc_id', 'corpus', 'impostor_id', 'author', 'texttype', 'rephrased', 'parascore_free']
].rename(columns={'rephrased': 'text'})
df.head()

Unnamed: 0,doc_id,corpus,impostor_id,author,texttype,text,parascore_free
0,142_196_88_228_text_1,Wiki,1,142.196.88.228,known,"{""new_document"": ""Winemaking's origins might b...",0.843057
1,142_196_88_228_text_1,Wiki,2,142.196.88.228,known,"{""new_document"": ""The recent investigation con...",0.831187
2,142_196_88_228_text_1,Wiki,3,142.196.88.228,known,"```json\n{\n ""new_document"": ""A recent stud...",0.830041
3,142_196_88_228_text_1,Wiki,4,142.196.88.228,known,"{""new_document"": ""Winemaking's origins might h...",0.826515
4,142_196_88_228_text_1,Wiki,5,142.196.88.228,known,"{""new_document"": ""Winemaking's origins might h...",0.825814


In [16]:
# Run the project's vectorizer in impostor mode, then rename its `sentence`
# column to `text`.
vectorized_df = vectorize_df(df, impostors=True).rename(columns={'sentence': 'text'})

In [17]:
# Display the vectorized frame (pandas truncates the middle rows in the output).
vectorized_df

Unnamed: 0,corpus,doc_id,impostor_id,chunk_id,author,texttype,text,parascore_free
0,Wiki,142_196_88_228_text_1,1,1,142.196.88.228,known,"{""new_document"": ""Winemaking's origins might b...",0.843057
1,Wiki,142_196_88_228_text_1,1,2,142.196.88.228,known,This study fails to provide substantial eviden...,0.843057
2,Wiki,142_196_88_228_text_1,1,3,142.196.88.228,known,"Instead, the Georgian theory seems to be an ar...",0.843057
3,Wiki,142_196_88_228_text_1,1,4,142.196.88.228,known,Scholars generally believe that since Western ...,0.843057
4,Wiki,142_196_88_228_text_1,1,5,142.196.88.228,known,"Therefore, it should be categorized as part of...",0.843057
...,...,...,...,...,...,...,...,...
3529,Wiki,142_196_88_228_text_1,300,6,142.196.88.228,known,"However, despite claims to the contrary, many ...",0.758463
3530,Wiki,142_196_88_228_text_1,300,7,142.196.88.228,known,"For instance, the British Museum's geographica...",0.758463
3531,Wiki,142_196_88_228_text_1,300,8,142.196.88.228,known,"Moreover, the notion of cultural diversity pre...",0.758463
3532,Wiki,142_196_88_228_text_1,300,9,142.196.88.228,known,"Thus, the argument that winemaking could have ...",0.758463


## Get LogProbs Code

**TODO:** move the code in this section into a `.py` module so it can be imported and reused more easily.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import torch
import numpy as np

In [None]:
# Load the tokenizer and causal LM a single time. They are kept as
# notebook-level globals and used by `compute_log_probs_with_median`.
model_name = "Qwen2.5-1.5B-Instruct"
model_loc = f"C:/Users/benjc/Documents/local models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_loc)

# `.eval()` switches the module to evaluation mode and returns the module itself.
model = AutoModelForCausalLM.from_pretrained(model_loc).eval()
model

In [None]:
def compute_log_probs_with_median(text: str):
    """
    Score every token of ``text`` except the first under the causal LM.

    The first token is skipped because there is no preceding context from
    which the model could predict it. Uses the notebook-level ``tokenizer``
    and ``model`` globals.

    Args:
        text: The text to score.

    Returns:
        A tuple of three lists of equal length, one entry per scored token:
        - tokens: the tokenizer's token strings
        - log_probs: log-probability assigned to each actual token
        - median_logprobs: median log-probability over the vocabulary at
          each position
        All three are empty when ``text`` tokenizes to fewer than two tokens.
    """
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    token_ids = input_ids[0]

    # With fewer than two tokens there is nothing to condition on, so there is
    # nothing to score (and an empty sequence should not be fed to the model).
    if token_ids.numel() < 2:
        return [], [], []

    with torch.no_grad():
        outputs = model(input_ids)

    # logits has shape [batch_size=1, seq_len, vocab_size]. Row i holds the
    # prediction for token i + 1, so the last row predicts a token that is not
    # in the input and is dropped.
    log_dist = torch.log_softmax(outputs.logits[0, :-1], dim=-1)
    target_ids = token_ids[1:]

    # Pick out the log-probability of the token that actually came next.
    log_probs = log_dist.gather(1, target_ids.unsqueeze(1)).squeeze(1).tolist()
    median_logprobs = log_dist.median(dim=-1).values.tolist()

    # Map ids to token strings one-to-one so `tokens` lines up with the scores
    # (splitting the decoded text on whitespace does not give one item per token).
    tokens = tokenizer.convert_ids_to_tokens(target_ids.tolist())

    return tokens, log_probs, median_logprobs

In [None]:
def score_dataframe(df: pd.DataFrame, text_column: str = "text") -> pd.DataFrame:
    """
    Score every text in ``df`` with ``compute_log_probs_with_median``.

    Args:
        df: Frame holding the texts to score. It is copied, not modified.
        text_column: Name of the column containing the texts.

    Returns:
        A copy of ``df`` with these columns added:
        - tokens: list of tokens
        - log_probs: list of log-probs per token
        - med_log_prob: list of median log-probs per token
        - differences: log_probs minus median log-probs, per token
        - abs_differences: absolute value of ``differences``, per token
        - num_tokens: number of scored tokens
        - sum_log_prob: sum of log probs
        - avg_log_prob: average log-prob
        - mean_diff: mean of differences
        - mean_abs_diff: mean of absolute differences
        The mean columns are NaN for a row with no scored tokens. An empty
        input frame yields an empty frame with the same added columns.
    """
    tqdm.pandas(desc="Scoring texts")
    df = df.copy()

    def _list_column(values):
        # dtype=object keeps each list as one cell, even for an empty frame
        # or when every list happens to have the same length.
        return pd.Series(list(values), index=df.index, dtype=object)

    def _mean_or_nan(values):
        # Avoid numpy's "mean of empty slice" warning for rows without tokens.
        return float(np.mean(values)) if len(values) else float("nan")

    # Step 1: Extract tokens, log_probs, and median log_probs.
    # Each result is a (tokens, log_probs, median_logprobs) tuple.
    scored = df[text_column].progress_apply(compute_log_probs_with_median)
    for position, column in enumerate(('tokens', 'log_probs', 'med_log_prob')):
        df[column] = _list_column(result[position] for result in scored)

    # Step 2: Compute per-token differences once and derive the absolute
    # values from them.
    differences = [
        [lp - mlp for lp, mlp in zip(lps, mlps)]
        for lps, mlps in zip(df['log_probs'], df['med_log_prob'])
    ]
    df['differences'] = _list_column(differences)
    df['abs_differences'] = _list_column(
        [abs(d) for d in row_diffs] for row_diffs in differences
    )

    # Compute summary stats
    df["num_tokens"] = df["log_probs"].apply(len)
    df["sum_log_prob"] = df["log_probs"].apply(sum)
    df["avg_log_prob"] = df["sum_log_prob"] / df["num_tokens"]

    # Compute mean of differences and absolute differences
    df["mean_diff"] = df["differences"].apply(_mean_or_nan)
    df["mean_abs_diff"] = df["abs_differences"].apply(_mean_or_nan)

    return df

## Score all files in data loc

In [None]:
# Leave two cores free for the rest of the system, but never drop below one thread.
max_threads = os.cpu_count() or 1  # os.cpu_count() may return None
n_threads = max(1, max_threads - 2)
print(f"Maximum threads available: {max_threads} - Using {n_threads}")

os.environ["OMP_NUM_THREADS"] = str(n_threads)
# torch is imported before this cell runs, so the environment variable may be
# read too late to take effect; set torch's thread count directly as well.
torch.set_num_threads(n_threads)

In [None]:
# Collect the .jsonl files that sit directly in the input folder. glob() does
# not guarantee an order, so sort to make runs (and the progress log) reproducible.
jsonl_paths = sorted(data_loc_path.glob("*.jsonl"))

# Extract only the filename for each
jsonl_names = [p.name for p in jsonl_paths]

# Make sure the output folder exists before anything is written to it.
os.makedirs(save_loc, exist_ok=True)

total = len(jsonl_names)
for idx, j in enumerate(jsonl_names, start=1):
    print(f"Processing file {idx} of {total}: {j}")
    save_path = os.path.join(save_loc, j)

    # Resume support: a file that already has an output is not reprocessed.
    if os.path.exists(save_path):
        print(f"  → Skipping {j} (already exists)")
        continue

    df = read_jsonl(os.path.join(data_loc, j))
    df['impostor_id'] = df.index + 1

    # Chain the rename instead of using inplace=True on a column subset,
    # which can trigger pandas' SettingWithCopyWarning.
    df = df[
        ['doc_id', 'corpus', 'impostor_id', 'author', 'texttype', 'rephrased']
    ].rename(columns={'rephrased': 'text'})

    vectorized_df = vectorize_df(df, impostors=True).rename(columns={'sentence': 'text'})

    scored_df = score_dataframe(vectorized_df)

    write_jsonl(scored_df, save_path)
    print(f"  ✓ Finished {j}")