In [71]:
import sys
import os
import json

import pandas as pd

In [78]:
sys.path.append(os.path.abspath('../../../src'))

from read_and_write_docs import read_jsonl, read_rds
from tokenize_and_score import load_model
from utils import apply_temp_doc_id, build_metadata_df
from n_gram_functions import (
    common_ngrams,
    pretty_print_common_ngrams,
    keep_before_phrase,
    score_phrases,
    add_pmf_column,
    score_phrases_no_context,
    compute_log_probs_with_median
)
from open_ai import initialise_client, llm

In [22]:
corpus = "Wiki"
data_type = "training"

known_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

unknown_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
unknown_df = apply_temp_doc_id(unknown)

metadata_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [23]:
problem_dataset_agg = read_jsonl("/Users/user/Documents/test_data/n-gram_tracing/Wiki_training_agg.jsonl")
problem_dataset_profile = read_jsonl("/Users/user/Documents/test_data/n-gram_tracing/Wiki_training_profile.jsonl")

In [24]:
same_author_problems = problem_dataset_agg[problem_dataset_agg['known_author'] == problem_dataset_agg['unknown_author']].copy()
same_author_problems.sort_values(["highest_common_count"], ascending=[False], inplace=True)
same_author_problems[(same_author_problems['highest_common_count'] >= 3) & (same_author_problems['highest_common_count'] <= 10)]

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
379,Greg_L vs Greg_L,Greg_L,Greg_L,greg_l_text_11,greg_l_text_10,9,", Ġthey Ġshould Ġhave Ġparticipated Ġin Ġthe Ġ..."
432,Haymaker vs Haymaker,Haymaker,Haymaker,haymaker_text_3,haymaker_text_2,9,"Ġat Ġthe Ġend Ġof Ġthe Ġday , Ġwe 're"
354,Fragments_of_Jade vs Fragments_of_Jade,Fragments_of_Jade,Fragments_of_Jade,fragments_of_jade_text_2,fragments_of_jade_text_10,8,"Ġme , Ġand Ġit 's Ġgetting Ġold .Ċ"
337,Fixentries vs Fixentries,Fixentries,Fixentries,fixentries_text_2,fixentries_text_5,8,Ġthe Ġindividual Ġher it ability Ġof Ġintellig...
248,DonaNobisPacem vs DonaNobisPacem,DonaNobisPacem,DonaNobisPacem,donanobispacem_text_5,donanobispacem_text_2,8,Ġafter Ġ 1 8 - 2 0 Ġweeks
...,...,...,...,...,...,...,...
222,David_Shankbone vs David_Shankbone,David_Shankbone,David_Shankbone,david_shankbone_text_1,david_shankbone_text_4,3,", Ġwhich Ġis"
211,D7G1DX~0 vs D7G1DX~0,D7G1DX~0,D7G1DX~0,d7g1dx_0_text_2,d7g1dx_0_text_5,3,Ġdon 't Ġthink
187,Collect vs Collect,Collect,Collect,collect_text_12,collect_text_11,3,", Ġand Ġi"
198,Cptnono vs Cptnono,Cptnono,Cptnono,cptnono_text_1,cptnono_text_12,3,", Ġthough .Ċ"


In [None]:
known_doc = "greg_l_text_11"
known_text = known[known['doc_id'] == known_doc].reset_index().loc[0, 'text']

unknown_doc = "greg_l_text_10"
unknown_text = unknown[unknown['doc_id'] == unknown_doc].reset_index().loc[0, 'text']

specific_problem_metadata = agg_metadata[(agg_metadata['known_doc_id'] == known_doc) & ((agg_metadata['unknown_doc_id'] == unknown_doc))].reset_index()
specific_problem_metadata['target'] = specific_problem_metadata['known_author'] == specific_problem_metadata['unknown_author']
specific_problem = specific_problem_metadata.loc[0, 'problem']

'Greg_L vs Greg_L'

In [26]:
tokenizer, model = load_model("/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct")

In [27]:
common = common_ngrams(known_text, unknown_text, 2, model, tokenizer, lowercase=True)

In [30]:
n_gram_list = pretty_print_common_ngrams(common, tokenizer=tokenizer, order='len_desc', return_format='flat')
n_gram_list

[' allege harbor views that are contrary to the consensus',
 ', they should have participated in the poll.\n',
 ' had felt they had',
 ' dicklyon',
 ' meritorious',
 ' the eight editors',
 ' consensus view',
 ' of the',
 ' such as',
 ' was a',
 '.\nyou']

## Initialise OpenAI Stuff

In [31]:
client = initialise_client("../../../credentials.json")

In [47]:
def create_user_prompt(known_text, phrase):
    user_prompt = f"""
<DOC>
{known_text}
</DOC>
<NGRAM>
"{phrase}"
</NGRAM>
"""
    
    return user_prompt

In [48]:
def create_system_prompt(prompt_loc):
    with open(prompt_loc,"r") as f:
        system_prompt = f.read()
        
    return system_prompt

In [49]:
def parse_paraphrases(response, phrase):
    
    paraphrase_list = []
    for i in range(1, len(response.choices)):
        content = response.choices[i].message.content
        
        try:
            content_json = json.loads(content)
            for para in content_json['paraphrases']:
                if para != phrase:
                    paraphrase_list.append(para)  
        except:
            pass
        
    unique_list = list(set(paraphrase_list))
    
    return unique_list

In [50]:
system_prompt = create_system_prompt("../../../prompts/exhaustive_constrained_ngram_paraphraser_prompt_JSON.txt")

In [55]:
n_gram_dict = {}
width = len(str(len(n_gram_list)))  # e.g., 10 -> 2, 100 -> 3

for idx, phrase in enumerate(n_gram_list, start=1):
    user_prompt = create_user_prompt(known_text, phrase)
    response = llm(
        system_prompt,
        user_prompt,
        client,
        model="gpt-4.1",
        max_tokens=5000,
        temperature=0.7,
        n=10,
        response_format={"type": "json_object"},
    )
    paraphrases = parse_paraphrases(response, phrase)
    key = f"phrase_{idx:0{width}d}"  # -> phrase_01, phrase_002, etc.
    n_gram_dict[key] = {"phrase": phrase, "paraphrases": paraphrases}


In [56]:
n_gram_dict

{'phrase_01': {'phrase': ' allege harbor views that are contrary to the consensus',
  'paraphrases': ['allege have views that differ from the consensus',
   'allege harbour beliefs which are contrary to the consensus',
   'allege harbor views that run counter to the consensus',
   ' allege harbor opinions which are opposed to the consensus',
   'allege have opinions that are contrary to the consensus',
   'allege have beliefs that are contrary to the consensus',
   'allege possess views that go against the consensus',
   ' allege harbor opinions that are contrary to consensus',
   ' allege harbor beliefs which are opposed to the consensus',
   'allege harbour opinions that run counter to the consensus',
   'allege harbor opinions that are contrary to consensus',
   ' allege have opinions that are contrary to consensus',
   ' allege harbor beliefs that go against the consensus',
   ' allege harbour beliefs which are contrary to the consensus',
   'allege possess views which are contrary

In [74]:

def get_scored_df(n_gram_dict, full_text, tokenizer, model):
    
    dfs=[]
    
    for phrase_num, entry in n_gram_dict.items():  # keeps insertion order
        phrase = entry["phrase"]
        paraphrases = entry["paraphrases"]
        
        print(f"Completing {phrase_num} - {phrase}")
        base_text = keep_before_phrase(known_text, phrase)
        
        df = score_phrases(base_text, phrase, paraphrases, tokenizer, model).copy()
        df.insert(0, "phrase_num", phrase_num)  # put at beginning
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [75]:
known_scored = get_scored_df(n_gram_dict, known_text, tokenizer, model)

Completing phrase_01 -  allege harbor views that are contrary to the consensus
→ Scoring base_text alone…
   base_total = -167.7171

→ [1/277] Processing reference…
→ [2/277] Processing paraphrase…
→ [3/277] Processing paraphrase…
→ [4/277] Processing paraphrase…
→ [5/277] Processing paraphrase…
→ [6/277] Processing paraphrase…
→ [7/277] Processing paraphrase…
→ [8/277] Processing paraphrase…
→ [9/277] Processing paraphrase…
→ [10/277] Processing paraphrase…
→ [11/277] Processing paraphrase…
→ [12/277] Processing paraphrase…
→ [13/277] Processing paraphrase…
→ [14/277] Processing paraphrase…
→ [15/277] Processing paraphrase…
→ [16/277] Processing paraphrase…
→ [17/277] Processing paraphrase…
→ [18/277] Processing paraphrase…
→ [19/277] Processing paraphrase…
→ [20/277] Processing paraphrase…
→ [21/277] Processing paraphrase…
→ [22/277] Processing paraphrase…
→ [23/277] Processing paraphrase…
→ [24/277] Processing paraphrase…
→ [25/277] Processing paraphrase…
→ [26/277] Processing parap

In [76]:
known_scored

Unnamed: 0,phrase_num,phrase_type,phrase,tokens,sum_log_probs_base,sum_log_probs_inc_phrase,difference,phrase_log_probs,sum_log_probs_phrase
0,phrase_01,reference,allege harbor views that are contrary to the ...,"[Ġallege, Ġharbor, Ġviews, Ġthat, Ġare, Ġcontr...",-167.717062,-214.570143,46.853081,"[-13.314923286437988, -15.109182357788086, -5....",-46.853080
1,phrase_01,paraphrase,allege have views that differ from the consensus,"[al, lege, Ġhave, Ġviews, Ġthat, Ġdiffer, Ġfro...",-167.717062,-218.790776,51.073714,"[-14.468381881713867, -8.374942779541016, -8.5...",-51.073713
2,phrase_01,paraphrase,allege harbour beliefs which are contrary to t...,"[al, lege, Ġharbour, Ġbeliefs, Ġwhich, Ġare, Ġ...",-167.717062,-225.757962,58.040900,"[-14.468381881713867, -8.374942779541016, -13....",-58.040899
3,phrase_01,paraphrase,allege harbor views that run counter to the co...,"[al, lege, Ġharbor, Ġviews, Ġthat, Ġrun, Ġcoun...",-167.717062,-224.819965,57.102903,"[-14.468381881713867, -8.374942779541016, -14....",-57.102902
4,phrase_01,paraphrase,allege harbor opinions which are opposed to t...,"[Ġallege, Ġharbor, Ġopinions, Ġwhich, Ġare, Ġo...",-167.717062,-221.924563,54.207500,"[-13.314923286437988, -15.109182357788086, -6....",-54.207499
...,...,...,...,...,...,...,...,...,...
439,phrase_11,paraphrase,""".\nyou""","[.Ċ, you, ""]",-892.965883,-923.521836,30.555953,"[-3.666156530380249, -8.921257019042969, -10.1...",-22.751129
440,phrase_11,paraphrase,'. you',"[.'., Ġ, Ġyou, ']",-892.965883,-930.174937,37.209054,"[-16.047304153442383, -6.903303146362305, -9.4...",-40.340238
441,phrase_11,paraphrase,""".\nYou""","[.Ċ, You, ""]",-892.965883,-920.921140,27.955257,"[-3.666156530380249, -4.338685989379883, -12.1...",-20.150432
442,phrase_11,paraphrase,'.\nyou',"[.Ċ, you, ']",-892.965883,-918.430422,25.464539,"[-2.4166526794433594, -8.296977043151855, -7.2...",-17.987002


In [83]:
import math
import pandas as pd

def score_phrases(
    base_text: str,
    ref_phrase: str,
    paraphrases: list[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """
    Returns rows for the reference and each paraphrase with:
      sum_log_probs_phrase (log-likelihood for the phrase tokens) and
      raw_prob = exp(sum_log_probs_phrase)
    """
    # 1) score base_text
    _, log_probs_base, _ = compute_log_probs_with_median(base_text.strip(), tokenizer, model)
    base_total = sum(log_probs_base)

    items = [("reference", ref_phrase)] + [("paraphrase", p) for p in paraphrases]
    rows = []

    for ptype, phrase in items:
        # a) phrase alone → token count
        tokens_phrase, log_probs_phrase, _ = compute_log_probs_with_median(phrase, tokenizer, model)
        n_phrase_tokens = len(tokens_phrase)

        # b) full sequence
        full_text = base_text + phrase
        tokens_full, log_probs_full, _ = compute_log_probs_with_median(full_text, tokenizer, model)

        # c) full sum (base + phrase)
        sum_before = sum(log_probs_full)

        # d/e) last n tokens correspond to phrase
        phrase_tokens    = tokens_full[-n_phrase_tokens:]
        phrase_log_probs = log_probs_full[-n_phrase_tokens:]

        # f) totals
        phrase_total = sum(phrase_log_probs)
        difference   = base_total - sum_before  # typically == -phrase_total

        # raw (unnormalized) probability of the phrase given the base
        raw_prob = math.exp(phrase_total)  # may underflow to 0.0 for long phrases; that's fine

        rows.append({
            "phrase_type":               ptype,
            "phrase":                    phrase,
            "tokens":                    phrase_tokens,
            "sum_log_probs_base":        base_total,
            "sum_log_probs_inc_phrase":  sum_before,
            "difference":                difference,
            "phrase_log_probs":          phrase_log_probs,
            "sum_log_probs_phrase":      phrase_total,
            "raw_prob":                  raw_prob,
        })

    return pd.DataFrame(rows, columns=[
        "phrase_type", "phrase", "tokens",
        "sum_log_probs_base", "sum_log_probs_inc_phrase",
        "difference", "phrase_log_probs", "sum_log_probs_phrase",
        "raw_prob",
    ])


def get_scored_df(n_gram_dict, full_text, tokenizer, model):
    """Row-concat each scored df, add phrase_num, sort, then rank paraphrases within each phrase_num."""
    dfs = []
    for phrase_num, entry in n_gram_dict.items():  # relies on insertion order
        phrase = entry["phrase"]
        paraphrases = entry["paraphrases"]

        base_text = keep_before_phrase(full_text, phrase)

        df = score_phrases(base_text, phrase, paraphrases, tokenizer, model).copy()
        df.insert(0, "phrase_num", phrase_num)  # first column
        dfs.append(df)

    if not dfs:
        return pd.DataFrame(columns=["phrase_num"])

    out = pd.concat(dfs, ignore_index=True)

    # sort by phrase_num (zero-padded → lexicographic == numeric)
    out = out.sort_values("phrase_num", kind="mergesort").reset_index(drop=True)

    # rank within phrase_num: reference -> 0; paraphrases ranked by descending raw_prob starting at 1
    out["rank"] = None
    mask = out["phrase_type"].eq("paraphrase")
    out.loc[mask, "rank"] = (
        out.loc[mask]
           .groupby("phrase_num")["raw_prob"]
           .rank(method="first", ascending=False)
           .astype(int)
    )
    out.loc[out["phrase_type"].eq("reference"), "rank"] = 0
    out["rank"] = out["rank"].astype(int)

    return out


In [84]:
known_scored = get_scored_df(n_gram_dict, known_text, tokenizer, model)

In [85]:
unknown_scored = get_scored_df(n_gram_dict, unknown_text, tokenizer, model)

In [102]:
import math
import pandas as pd
from typing import List

def score_phrases_no_context(
    ref_phrase: str,
    paraphrases: List[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """
    Score the reference and each paraphrase *without* any ranking.
    Returns:
      phrase_type, phrase, tokens, log_probs, sum_log_probs, raw_prob
    """
    items = [("reference", ref_phrase)] + [("paraphrase", p) for p in paraphrases]
    rows = []

    for idx, (ptype, phrase) in enumerate(items, start=1):
        print(f"→ [{idx}/{len(items)}] Processing {ptype}…")
        tokens_phrase, log_probs_phrase, _ = compute_log_probs_with_median(phrase, tokenizer, model)
        phrase_total = sum(log_probs_phrase)
        raw_prob = math.exp(phrase_total)  # unnormalized prob

        rows.append({
            "phrase_type":   ptype,
            "phrase":        phrase,
            "tokens":        tokens_phrase,
            "log_probs":     log_probs_phrase,
            "sum_log_probs": phrase_total,
            "raw_prob":      raw_prob,
        })

    return pd.DataFrame(rows, columns=[
        "phrase_type", "phrase", "tokens", "log_probs", "sum_log_probs", "raw_prob"
    ])

def get_scored_df_no_context(n_gram_dict, tokenizer, model):
    """
    Row-concat each score_phrases_no_context df, add phrase_num, sort by phrase_num,
    then rank paraphrases within each phrase_num by descending raw_prob.
    'reference' rows always get rank 0.
    """
    dfs = []
    for phrase_num, entry in n_gram_dict.items():  # insertion order preserved
        phrase = entry["phrase"]
        paraphrases = entry["paraphrases"]

        df = score_phrases_no_context(phrase, paraphrases, tokenizer, model).copy()
        df.insert(0, "phrase_num", phrase_num)  # make it the first column
        dfs.append(df)

    if not dfs:
        return pd.DataFrame(columns=["phrase_num"])

    out = pd.concat(dfs, ignore_index=True)

    # zero-padded keys => lexicographic equals numeric order
    out = out.sort_values("phrase_num", kind="mergesort").reset_index(drop=True)

    # Rank within phrase_num: reference -> 0; paraphrases ranked by descending raw_prob starting at 1
    out["rank"] = None
    mask_para = out["phrase_type"].eq("paraphrase")
    out.loc[mask_para, "rank"] = (
        out.loc[mask_para]
           .groupby("phrase_num")["raw_prob"]
           .rank(method="first", ascending=False)  # use "dense" if you prefer 1,2,3 without gaps
           .astype(int)
    )
    out.loc[out["phrase_type"].eq("reference"), "rank"] = 0
    out["rank"] = out["rank"].astype(int)

    return out


In [103]:
score_df_no_context = get_scored_df_no_context(n_gram_dict, tokenizer, model)

→ [1/277] Processing reference…
→ [2/277] Processing paraphrase…
→ [3/277] Processing paraphrase…
→ [4/277] Processing paraphrase…
→ [5/277] Processing paraphrase…
→ [6/277] Processing paraphrase…
→ [7/277] Processing paraphrase…
→ [8/277] Processing paraphrase…
→ [9/277] Processing paraphrase…
→ [10/277] Processing paraphrase…
→ [11/277] Processing paraphrase…
→ [12/277] Processing paraphrase…
→ [13/277] Processing paraphrase…
→ [14/277] Processing paraphrase…
→ [15/277] Processing paraphrase…
→ [16/277] Processing paraphrase…
→ [17/277] Processing paraphrase…
→ [18/277] Processing paraphrase…
→ [19/277] Processing paraphrase…
→ [20/277] Processing paraphrase…
→ [21/277] Processing paraphrase…
→ [22/277] Processing paraphrase…
→ [23/277] Processing paraphrase…
→ [24/277] Processing paraphrase…
→ [25/277] Processing paraphrase…
→ [26/277] Processing paraphrase…
→ [27/277] Processing paraphrase…
→ [28/277] Processing paraphrase…
→ [29/277] Processing paraphrase…
→ [30/277] Processing pa

In [104]:
import re
import pandas as pd

# remove illegal control chars (keep \t, \n, \r)
_ILLEGAL_RE = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]")

def _clean_cell(x):
    if isinstance(x, str):
        return _ILLEGAL_RE.sub("", x)
    return x

def clean_for_excel(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    obj_cols = df.select_dtypes(include=["object"]).columns
    df[obj_cols] = df[obj_cols].applymap(_clean_cell)
    return df


In [105]:
save_loc = f"/Users/user/Documents/PhD Stuff/Supervisor Meetings/paraphrase examples/{specific_problem}_v2.xlsx"

with pd.ExcelWriter(save_loc, engine="openpyxl") as xls:
    clean_for_excel(specific_problem_metadata).to_excel(xls, sheet_name="metadata", index=False)
    clean_for_excel(score_df_no_context).to_excel(xls, sheet_name="no context", index=False)
    clean_for_excel(known_scored).to_excel(xls, sheet_name="known", index=False)
    clean_for_excel(unknown_scored).to_excel(xls, sheet_name="unknown", index=False)
    


