# Run Paraphrase Common Token N-grams

## Load Libraries

Import the standard-library and third-party modules, then the project's own helper modules from `src`.

In [6]:
import sys
import os
import json
import math
import re

import pandas as pd

from typing import List

sys.path.append(os.path.abspath('../../../src'))

from read_and_write_docs import read_jsonl, read_rds
from tokenize_and_score import load_model
from utils import get_base_location, apply_temp_doc_id, build_metadata_df
from n_gram_functions import (
    common_ngrams,
    pretty_print_common_ngrams,
    keep_before_phrase,
    compute_log_probs_with_median
)
from open_ai import initialise_client, llm

## Set Locations & Load Base Data

In [7]:
corpus = "Wiki"
data_type = "training"

# Resolve the NAS root through the project helper so the notebook also runs on the Windows laptop
nas_base_loc = get_base_location()

# Load known data
known_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

# Load unknown data
unknown_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
# Rebind `unknown` itself, mirroring the `known` pipeline above: `unknown` (not `unknown_df`)
# is what the metadata build below and the later doc_id lookup read, so the processed
# frame must live under that name.
unknown = apply_temp_doc_id(unknown)
unknown_df = unknown  # previous name kept as an alias for any code that still refers to it

# Load and build metadata (restricted to the selected corpus)
metadata_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

# Load the pre-made problem datasets for speed
problem_dataset_base = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}"
problem_dataset_agg = read_jsonl(f"{problem_dataset_base}/{corpus}_{data_type}_agg.jsonl")
problem_dataset_profile = read_jsonl(f"{problem_dataset_base}/{corpus}_{data_type}_profile.jsonl")


In [8]:
# Load the tokenizer/model pair from the local Qwen2.5-0.5B-Instruct folder on the NAS
model_loc = f"{nas_base_loc}/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"
tokenizer, model = load_model(model_loc)

## View Same and Different-Author Datasets

In [9]:
# Same-author problems, ordered by the size of their largest shared n-gram (largest first)
is_same_author = problem_dataset_agg['known_author'] == problem_dataset_agg['unknown_author']
same_probs = problem_dataset_agg[is_same_author].sort_values(["highest_common_count"], ascending=[False])

# Show only the problems whose largest shared n-gram is between 3 and 10 tokens long
same_counts = same_probs['highest_common_count']
same_probs[(same_counts >= 3) & (same_counts <= 10)]

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
379,Greg_L vs Greg_L,Greg_L,Greg_L,greg_l_text_11,greg_l_text_10,9,", Ġthey Ġshould Ġhave Ġparticipated Ġin Ġthe Ġ..."
432,Haymaker vs Haymaker,Haymaker,Haymaker,haymaker_text_3,haymaker_text_2,9,"Ġat Ġthe Ġend Ġof Ġthe Ġday , Ġwe 're"
354,Fragments_of_Jade vs Fragments_of_Jade,Fragments_of_Jade,Fragments_of_Jade,fragments_of_jade_text_2,fragments_of_jade_text_10,8,"Ġme , Ġand Ġit 's Ġgetting Ġold .Ċ"
337,Fixentries vs Fixentries,Fixentries,Fixentries,fixentries_text_2,fixentries_text_5,8,Ġthe Ġindividual Ġher it ability Ġof Ġintellig...
248,DonaNobisPacem vs DonaNobisPacem,DonaNobisPacem,DonaNobisPacem,donanobispacem_text_5,donanobispacem_text_2,8,Ġafter Ġ 1 8 - 2 0 Ġweeks
...,...,...,...,...,...,...,...
222,David_Shankbone vs David_Shankbone,David_Shankbone,David_Shankbone,david_shankbone_text_1,david_shankbone_text_4,3,", Ġwhich Ġis"
211,D7G1DX~0 vs D7G1DX~0,D7G1DX~0,D7G1DX~0,d7g1dx_0_text_2,d7g1dx_0_text_5,3,Ġdon 't Ġthink
187,Collect vs Collect,Collect,Collect,collect_text_12,collect_text_11,3,", Ġand Ġi"
198,Cptnono vs Cptnono,Cptnono,Cptnono,cptnono_text_1,cptnono_text_12,3,", Ġthough .Ċ"


In [10]:
# Different-author problems, ordered by the size of their largest shared n-gram (largest first)
is_diff_author = problem_dataset_agg['known_author'] != problem_dataset_agg['unknown_author']
diff_probs = problem_dataset_agg[is_diff_author].sort_values(["highest_common_count"], ascending=[False])

# Show only the problems whose largest shared n-gram is between 3 and 10 tokens long
diff_counts = diff_probs['highest_common_count']
diff_probs[(diff_counts >= 3) & (diff_counts <= 10)]

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
287,Enemesis vs Equanimous1,Enemesis,Equanimous1,enemesis_text_3,equanimous1_text_5,5,Ġthe Ġcurrent Ġstate Ġof Ġthe
161,Caboga vs Chanakyathegreat,Caboga,Chanakyathegreat,caboga_text_5,chanakyathegreat_text_1,5,Ġ 2 0 0 8
262,Dweller vs Ecelan,Dweller,Ecelan,dweller_text_3,ecelan_text_2,5,.Ċ i Ġdon 't Ġthink
364,Fyunck(click) vs Garda40,Fyunck(click),Garda40,fyunck_click_text_12,garda40_text_1,5,Ġ 2 0 1 1
365,Fyunck(click) vs Garda40,Fyunck(click),Garda40,fyunck_click_text_2,garda40_text_1,5,Ġ 2 0 1 2
...,...,...,...,...,...,...,...
243,DIREKTOR vs DonaNobisPacem,DIREKTOR,DonaNobisPacem,direktor_text_10,donanobispacem_text_2,3,.Ċ the Ġpoint
237,Dennis_Brown vs DIREKTOR,Dennis_Brown,DIREKTOR,dennis_brown_text_1,direktor_text_1,3,", Ġand Ġhad"
245,DIREKTOR vs DonaNobisPacem,DIREKTOR,DonaNobisPacem,direktor_text_3,donanobispacem_text_2,3,Ġi 'm Ġnot
238,Dennis_Brown vs DIREKTOR,Dennis_Brown,DIREKTOR,dennis_brown_text_10,direktor_text_1,3,", Ġbut Ġit"


## Select Known and Unknown Documents

In [11]:
# Documents that make up the problem under inspection
known_doc = "dweller_text_3"
unknown_doc = "ecelan_text_2"

# Text of each selected document: first row whose doc_id matches
known_matches = known[known['doc_id'] == known_doc].reset_index()
known_text = known_matches.loc[0, 'text']

unknown_matches = unknown[unknown['doc_id'] == unknown_doc].reset_index()
unknown_text = unknown_matches.loc[0, 'text']

# Metadata for this known/unknown pairing; it is written to the Excel workbook later on
is_current_pair = (agg_metadata['known_doc_id'] == known_doc) & (agg_metadata['unknown_doc_id'] == unknown_doc)
p_metadata = agg_metadata[is_current_pair].reset_index()
# target is True when both documents share an author
p_metadata['target'] = p_metadata['known_author'] == p_metadata['unknown_author']
specific_problem = p_metadata.loc[0, 'problem']

print(f"Working on problem: {specific_problem}")

Working on problem: Dweller vs Ecelan


In [12]:
# Side-by-side summary of the two documents: one column per document, one labelled row per field
doc_fields = ["corpus", "data type", "doc", "text"]
known_column = [corpus, data_type, known_doc, known_text]
unknown_column = [corpus, data_type, unknown_doc, unknown_text]

docs_df = pd.DataFrame({"known": known_column, "unknown": unknown_column}, index=doc_fields)

## Get N-Grams in Common

In [13]:
# Token n-grams shared by the two documents, compared in lowercase
common = common_ngrams(
    known_text,
    unknown_text,
    2,  # presumably the minimum n-gram length - TODO confirm against n_gram_functions
    model,
    tokenizer,
    lowercase=True,
)

# Flatten to a plain list of phrase strings using the 'len_desc' ordering
n_gram_list = pretty_print_common_ngrams(
    common,
    tokenizer=tokenizer,
    order='len_desc',
    return_format='flat',
)
n_gram_list

[".\ni don't think",
 ' and the',
 " didn't",
 ' from the',
 ' in the',
 ' is a',
 " it's",
 ' of my',
 ' of the',
 ' on the',
 ' questions,',
 ' should be',
 ' the article',
 ' to do',
 ' what i',
 ', even',
 ', i',
 ', in',
 ', it',
 ', or',
 '.\nbut',
 '.\ncan',
 '.\nthe']

## Initialise OpenAI Client and Prompts

In [14]:
# Credentials are read from a JSON file three directories above this notebook
credentials_loc = "../../../credentials.json"
client = initialise_client(credentials_loc)

In [15]:
def create_user_prompt(known_text, phrase):
    """Build the user message sent to the paraphrasing model.

    The document is wrapped in <DOC> tags and the target n-gram, surrounded by
    double quotes, in <NGRAM> tags. The message starts and ends with a newline.

    Args:
        known_text: Document text that gives the n-gram its context.
        phrase: The n-gram to paraphrase.

    Returns:
        The formatted prompt string.
    """
    # Empty first/last entries yield the leading and trailing newline once joined.
    prompt_lines = [
        "",
        "<DOC>",
        f"{known_text}",
        "</DOC>",
        "<NGRAM>",
        f'"{phrase}"',
        "</NGRAM>",
        "",
    ]
    return "\n".join(prompt_lines)

In [16]:
def create_system_prompt(prompt_loc):
    """Read the system prompt text from a file.

    Args:
        prompt_loc: Path to the text file holding the system prompt.

    Returns:
        The full file contents as a string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without an encoding, open() uses the platform
    # default (e.g. cp1252 on Windows), which mangles non-ASCII prompt characters
    # such as curly quotes or dashes.
    with open(prompt_loc, "r", encoding="utf-8") as f:
        system_prompt = f.read()

    return system_prompt

In [17]:
def parse_paraphrases(response, phrase):
    """Collect the distinct paraphrases proposed across all completions of a response.

    Each completion's message content is expected to be a JSON object with a
    'paraphrases' list. Completions that are empty, are not valid JSON, or lack
    that key are skipped (best-effort parsing).

    Args:
        response: Chat-completion style object exposing `choices`, each with
            `message.content`.
        phrase: The original n-gram; paraphrases identical to it are dropped.

    Returns:
        List of unique paraphrase strings in first-seen order.
    """
    # A dict is used as an ordered set so the result order is stable between runs
    # (a plain set of strings iterates in a hash-randomised order).
    unique_paraphrases = {}

    # Every choice is inspected, including index 0 (it used to be skipped, which
    # discarded one completion and returned nothing for single-choice responses).
    for choice in response.choices:
        content = choice.message.content

        try:
            candidates = json.loads(content)['paraphrases']
        except (json.JSONDecodeError, KeyError, TypeError):
            # Malformed or empty completion: ignore it and keep the others
            continue

        if not isinstance(candidates, list):
            continue

        for para in candidates:
            if isinstance(para, str) and para != phrase:
                unique_paraphrases.setdefault(para, None)

    return list(unique_paraphrases)

In [18]:
# System prompt for the n-gram paraphraser (the JSON-output variant)
prompt_loc = "../../../prompts/exhaustive_constrained_ngram_paraphraser_prompt_JSON.txt"
system_prompt = create_system_prompt(prompt_loc)

In [19]:
# Request paraphrases for every shared n-gram and store them under zero-padded keys
n_gram_dict = {}
width = len(str(len(n_gram_list)))  # digits in the largest index, e.g. 10 -> 2, 100 -> 3

# Settings shared by every request; n=10 asks for ten completions per phrase
request_options = dict(
    model="gpt-4.1",
    max_tokens=5000,
    temperature=0.7,
    n=10,
    response_format={"type": "json_object"},
)

for idx, phrase in enumerate(n_gram_list, start=1):
    response = llm(
        system_prompt,
        create_user_prompt(known_text, phrase),
        client,
        **request_options,
    )
    # Keys look like phrase_01, phrase_002, ... so that they sort in numeric order
    n_gram_dict[f"phrase_{idx:0{width}d}"] = {
        "phrase": phrase,
        "paraphrases": parse_paraphrases(response, phrase),
    }


In [20]:
def score_phrases(
    base_text: str,
    ref_phrase: str,
    paraphrases: list[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """Score a reference phrase and its paraphrases as continuations of `base_text`.

    Each candidate is appended to `base_text`, the combined text is scored with
    `compute_log_probs_with_median`, and the trailing log-probs (as many as the
    candidate has tokens when scored on its own) are attributed to the candidate.

    Args:
        base_text: Context that precedes the phrase.
        ref_phrase: The original phrase (row type "reference").
        paraphrases: Alternative phrases (row type "paraphrase").
        tokenizer: Passed through to `compute_log_probs_with_median`.
        model: Passed through to `compute_log_probs_with_median`.

    Returns:
        DataFrame with one row per candidate (reference first) and the columns
        phrase_type, phrase, tokens, sum_log_probs_base, sum_log_probs_inc_phrase,
        difference, phrase_log_probs, sum_log_probs_phrase, raw_prob, where
        raw_prob = exp(sum_log_probs_phrase).
    """
    output_columns = [
        "phrase_type", "phrase", "tokens",
        "sum_log_probs_base", "sum_log_probs_inc_phrase",
        "difference", "phrase_log_probs", "sum_log_probs_phrase",
        "raw_prob",
    ]

    # Context-only total; note the context is whitespace-trimmed here but not below.
    _, context_log_probs, _ = compute_log_probs_with_median(base_text.strip(), tokenizer, model)
    base_total = sum(context_log_probs)

    candidates = [("reference", ref_phrase)]
    candidates.extend(("paraphrase", alternative) for alternative in paraphrases)

    records = []
    for label, candidate in candidates:
        # The standalone pass is only used to count the candidate's tokens.
        standalone_tokens, _, _ = compute_log_probs_with_median(candidate, tokenizer, model)
        n_tail = len(standalone_tokens)

        # Score context + candidate as one sequence.
        joined_tokens, joined_log_probs, _ = compute_log_probs_with_median(
            base_text + candidate, tokenizer, model
        )
        joined_total = sum(joined_log_probs)

        # The last n_tail positions are taken to be the candidate's tokens.
        # NOTE(review): if n_tail is 0 the [-0:] slices select the whole sequence.
        tail_tokens = joined_tokens[-n_tail:]
        tail_log_probs = joined_log_probs[-n_tail:]
        tail_total = sum(tail_log_probs)

        records.append({
            "phrase_type":               label,
            "phrase":                    candidate,
            "tokens":                    tail_tokens,
            "sum_log_probs_base":        base_total,
            "sum_log_probs_inc_phrase":  joined_total,
            # context total minus combined total; typically equals -tail_total
            "difference":                base_total - joined_total,
            "phrase_log_probs":          tail_log_probs,
            "sum_log_probs_phrase":      tail_total,
            # unnormalised probability; may underflow to 0.0 for long phrases
            "raw_prob":                  math.exp(tail_total),
        })

    return pd.DataFrame(records, columns=output_columns)

def get_scored_df(n_gram_dict, full_text, tokenizer, model):
    """Score every phrase in `n_gram_dict` in the context of `full_text` and rank its paraphrases.

    For each entry the text preceding the phrase is obtained with
    `keep_before_phrase`, and the phrase and its paraphrases are scored with
    `score_phrases`. The per-phrase frames are stacked, tagged with phrase_num
    and original_phrase, and given a rank column: 0 for the reference row,
    1..k for paraphrases by descending raw_prob within each phrase_num.

    Args:
        n_gram_dict: Mapping of phrase_num -> {"phrase": str, "paraphrases": list}.
        full_text: Document in which each phrase is located.
        tokenizer: Passed through to `score_phrases`.
        model: Passed through to `score_phrases`.

    Returns:
        DataFrame sorted by (phrase_num, rank); a frame with only a phrase_num
        column when `n_gram_dict` is empty.
    """
    frames = []
    for phrase_num, entry in n_gram_dict.items():  # dict insertion order
        print(f"Processing Phrase - {phrase_num}")
        original = entry["phrase"]
        alternatives = entry["paraphrases"]

        # NOTE(review): how keep_before_phrase behaves when the phrase is absent
        # (e.g. a case mismatch with lowercased n-grams) is not visible here - confirm.
        context = keep_before_phrase(full_text, original)

        scored = score_phrases(context, original, alternatives, tokenizer, model).copy()
        scored.insert(0, "original_phrase", original)
        scored.insert(0, "phrase_num", phrase_num)  # ends up as the first column
        frames.append(scored)

    if len(frames) == 0:
        return pd.DataFrame(columns=["phrase_num"])

    # Zero-padded keys sort lexicographically in numeric order; mergesort keeps row order stable
    combined = (
        pd.concat(frames, ignore_index=True)
          .sort_values("phrase_num", kind="mergesort")
          .reset_index(drop=True)
    )

    is_paraphrase = combined["phrase_type"].eq("paraphrase")
    is_reference = combined["phrase_type"].eq("reference")

    # Ties are broken by row order (method="first"), so ranks within a phrase are unique
    paraphrase_ranks = (
        combined.loc[is_paraphrase]
                .groupby("phrase_num")["raw_prob"]
                .rank(method="first", ascending=False)
                .astype(int)
    )

    combined["rank"] = None
    combined.loc[is_paraphrase, "rank"] = paraphrase_ranks
    combined.loc[is_reference, "rank"] = 0
    combined["rank"] = combined["rank"].astype(int)

    return combined.sort_values(["phrase_num", "rank"], kind="mergesort").reset_index(drop=True)

In [None]:
# Score each phrase and its paraphrases in the context of the known document
known_scored = get_scored_df(
    n_gram_dict=n_gram_dict,
    full_text=known_text,
    tokenizer=tokenizer,
    model=model,
)

Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10


In [None]:
# Score each phrase and its paraphrases in the context of the unknown document
unknown_scored = get_scored_df(
    n_gram_dict=n_gram_dict,
    full_text=unknown_text,
    tokenizer=tokenizer,
    model=model,
)

In [None]:
def score_phrases_no_context(
    ref_phrase: str,
    paraphrases: List[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """Score the reference phrase and each paraphrase on their own, with no preceding context.

    No ranking is applied here. A progress line is printed for every item scored.

    Args:
        ref_phrase: The original phrase (row type "reference").
        paraphrases: Alternative phrases (row type "paraphrase").
        tokenizer: Passed through to `compute_log_probs_with_median`.
        model: Passed through to `compute_log_probs_with_median`.

    Returns:
        DataFrame with columns phrase_type, phrase, tokens, log_probs,
        sum_log_probs and raw_prob (= exp(sum_log_probs)), reference row first.
    """
    output_columns = ["phrase_type", "phrase", "tokens", "log_probs", "sum_log_probs", "raw_prob"]

    labelled = [("reference", ref_phrase), *(("paraphrase", alt) for alt in paraphrases)]
    total = len(labelled)

    records = []
    position = 0
    for label, text in labelled:
        position += 1
        print(f"→ [{position}/{total}] Processing {label}…")

        tokens, log_probs, _ = compute_log_probs_with_median(text, tokenizer, model)
        log_likelihood = sum(log_probs)

        records.append({
            "phrase_type":   label,
            "phrase":        text,
            "tokens":        tokens,
            "log_probs":     log_probs,
            "sum_log_probs": log_likelihood,
            "raw_prob":      math.exp(log_likelihood),  # unnormalised probability
        })

    return pd.DataFrame(records, columns=output_columns)

def get_scored_df_no_context(n_gram_dict, tokenizer, model):
    """Score every phrase in `n_gram_dict` without context and rank its paraphrases.

    Each entry is scored with `score_phrases_no_context`; the resulting frames
    are stacked and tagged with phrase_num and original_phrase. Within each
    phrase_num, paraphrases are ranked 1..k by descending raw_prob and the
    reference row always gets rank 0.

    Args:
        n_gram_dict: Mapping of phrase_num -> {"phrase": str, "paraphrases": list}.
        tokenizer: Passed through to `score_phrases_no_context`.
        model: Passed through to `score_phrases_no_context`.

    Returns:
        DataFrame sorted by (phrase_num, rank); a frame with only a phrase_num
        column when `n_gram_dict` is empty.
    """
    per_phrase = []
    for phrase_num, entry in n_gram_dict.items():  # dict insertion order
        print(f"Processing Phrase - {phrase_num}")
        original = entry["phrase"]
        alternatives = entry["paraphrases"]

        scored = score_phrases_no_context(original, alternatives, tokenizer, model).copy()
        scored.insert(0, "original_phrase", original)
        scored.insert(0, "phrase_num", phrase_num)  # ends up as the first column
        per_phrase.append(scored)

    if len(per_phrase) == 0:
        return pd.DataFrame(columns=["phrase_num"])

    stacked = pd.concat(per_phrase, ignore_index=True)
    # Zero-padded keys mean lexicographic order matches numeric order
    stacked = stacked.sort_values("phrase_num", kind="mergesort").reset_index(drop=True)

    paraphrase_rows = stacked["phrase_type"].eq("paraphrase")
    reference_rows = stacked["phrase_type"].eq("reference")

    # method="first" breaks ties by row order, so ranks within a phrase are unique
    ranks = stacked.loc[paraphrase_rows].groupby("phrase_num")["raw_prob"].rank(
        method="first", ascending=False
    ).astype(int)

    stacked["rank"] = None
    stacked.loc[paraphrase_rows, "rank"] = ranks
    stacked.loc[reference_rows, "rank"] = 0
    stacked["rank"] = stacked["rank"].astype(int)

    stacked = stacked.sort_values(["phrase_num", "rank"], kind="mergesort").reset_index(drop=True)

    return stacked


In [None]:
# Score each phrase and its paraphrases on their own, with no document context
score_df_no_context = get_scored_df_no_context(
    n_gram_dict=n_gram_dict,
    tokenizer=tokenizer,
    model=model,
)

→ [1/58] Processing reference…
→ [2/58] Processing paraphrase…
→ [3/58] Processing paraphrase…
→ [4/58] Processing paraphrase…
→ [5/58] Processing paraphrase…
→ [6/58] Processing paraphrase…
→ [7/58] Processing paraphrase…
→ [8/58] Processing paraphrase…
→ [9/58] Processing paraphrase…
→ [10/58] Processing paraphrase…
→ [11/58] Processing paraphrase…
→ [12/58] Processing paraphrase…
→ [13/58] Processing paraphrase…
→ [14/58] Processing paraphrase…
→ [15/58] Processing paraphrase…
→ [16/58] Processing paraphrase…
→ [17/58] Processing paraphrase…
→ [18/58] Processing paraphrase…
→ [19/58] Processing paraphrase…
→ [20/58] Processing paraphrase…
→ [21/58] Processing paraphrase…
→ [22/58] Processing paraphrase…
→ [23/58] Processing paraphrase…
→ [24/58] Processing paraphrase…
→ [25/58] Processing paraphrase…
→ [26/58] Processing paraphrase…
→ [27/58] Processing paraphrase…
→ [28/58] Processing paraphrase…
→ [29/58] Processing paraphrase…
→ [30/58] Processing paraphrase…
→ [31/58] Processing

In [None]:
# One row per phrase: its key and original text (exported as the "LLR" sheet)
phrase_key_columns = ['phrase_num', 'original_phrase']
distinct_phrases = score_df_no_context.loc[:, phrase_key_columns].drop_duplicates()

In [None]:
# Control characters stripped before export; tab (\t), newline (\n) and carriage return (\r) are kept
_ILLEGAL_RE = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]")

def _clean_cell(x):
    """Return `x` with illegal control characters removed if it is a string, else `x` unchanged."""
    if isinstance(x, str):
        return _ILLEGAL_RE.sub("", x)
    return x

def clean_for_excel(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` whose object-dtype columns have illegal control characters removed.

    Only string cells are altered; other values (numbers, lists, None) pass
    through untouched. The input frame is not modified.

    Args:
        df: Frame about to be written to an Excel sheet.

    Returns:
        A cleaned copy of `df`.
    """
    df = df.copy()
    obj_cols = df.select_dtypes(include=["object"]).columns
    if len(obj_cols) == 0:
        # Nothing that could hold text, so there is nothing to clean
        return df

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
    # use map when available and fall back to applymap on older pandas.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df[obj_cols] = elementwise(df[obj_cols], _clean_cell)
    return df


In [None]:
# One workbook per problem, named after the problem itself
save_loc = f"{nas_base_loc}/paraphrase examples/{specific_problem}.xlsx"

# Sheets written without their (purely positional) index, in workbook order
plain_sheets = {
    "metadata": p_metadata,
    "no context": score_df_no_context,
    "known": known_scored,
    "unknown": unknown_scored,
    "LLR": distinct_phrases,
}

with pd.ExcelWriter(save_loc, engine="openpyxl") as xls:
    # docs_df keeps its field names (corpus / data type / doc / text) in the index,
    # so the index has to be written or the rows of the "docs" sheet are unlabelled.
    clean_for_excel(docs_df).to_excel(xls, sheet_name="docs", index=True)

    for sheet_name, frame in plain_sheets.items():
        clean_for_excel(frame).to_excel(xls, sheet_name=sheet_name, index=False)
    


