In [1]:
import sys
import os
import re

import pandas as pd
from typing import Any, Dict, List, Sequence, Set, Tuple, Literal
from collections import defaultdict

In [2]:
# Make '../../src' importable; the relative path is resolved against the current
# working directory at the time this cell runs.
sys.path.append(os.path.abspath('../../src'))

# Helper modules — presumably located in the directory added above (confirm).
from read_and_write_docs import read_jsonl, read_rds
# NOTE(review): compute_log_probs_with_median is defined again later in this notebook,
# and that later definition shadows the one imported here.
from tokenize_and_score import load_model, compute_log_probs_with_median
from utils import apply_temp_doc_id, build_metadata_df

## Load the Data

We load the known documents, the unknown documents and the metadata. I have already identified documents of interest for a same_author = True test and a same_author = False test.

In [3]:
# Known-author documents for the Wiki training corpus.
known_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

# Unknown (questioned) documents for the same corpus.
unknown_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
# Rebind `unknown` exactly as `known` is rebound above: every later cell filters
# `unknown` on `doc_id`, so it must be the frame returned by apply_temp_doc_id.
# Previously the result was stored only in `unknown_df`, which nothing else used.
# NOTE(review): assumes apply_temp_doc_id returns the frame carrying `doc_id` — confirm in utils.
unknown = apply_temp_doc_id(unknown)
unknown_df = unknown  # alias kept so code written against the old name still works

# Problem-level metadata, restricted to the Wiki corpus.
metadata_loc = "/Volumes/BCross/datasets/author_verification/training/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == 'Wiki']
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [4]:
# Local checkpoint directory for the scoring model; load_model returns the
# (tokenizer, model) pair used by the rest of the notebook.
model_loc = "/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"
tokenizer, model = load_model(model_loc)

## Select Texts

Some texts I identified from the Wiki training corpus that share common n-grams within the metadata problem space, while also ensuring that the same author appears in both problems, are listed below:

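* Same Author = True
    * fipplet_text_2
    * fipplet_text_5
* Same Author = False
    * falcon9x5_text_3
    * fipplet_text_5

Note: the selection cell below currently uses different documents: `greg_l_text_11` (known) with `greg_l_text_10` (unknown) for the same-author pair, and `britmax_text_1` (known) with `brotherdarksoul_text_4` (unknown) for the different-author pair.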

In [5]:
def _text_for_doc(docs: pd.DataFrame, doc_id: str) -> str:
    """
    Return the `text` value of the first row in `docs` whose `doc_id` equals `doc_id`.

    :param docs:    DataFrame with at least `doc_id` and `text` columns.
    :param doc_id:  Identifier of the document to fetch.
    :return:        The text of the first matching row.
    :raises KeyError: If no row has the requested `doc_id`.
    """
    matches = docs.loc[docs['doc_id'] == doc_id, 'text']
    if matches.empty:
        # Fail with the offending id instead of an opaque positional lookup error.
        raise KeyError(f"doc_id {doc_id!r} not found")
    return matches.iloc[0]


# Same-author pair: both documents come from author `greg_l`.
same_author_known = _text_for_doc(known, 'greg_l_text_11')
same_author_unknown = _text_for_doc(unknown, 'greg_l_text_10')

# Different-author pair.
diff_author_known = _text_for_doc(known, 'britmax_text_1')
diff_author_unknown = _text_for_doc(unknown, 'brotherdarksoul_text_4')

In [6]:
# Display the raw text of the same-author unknown document for a visual check.
same_author_unknown

"It s a practice I suspect would benefit Wikipedia if more editors did so.\nThink of my user page as being a  Well done, Guy, on your exploration into the discipline of TDOTSCIFOGLH.\nYou seem to have an engineering bent in your makeup as you have properly touched upon real-world shortcomings and omissions in the measure tolerance, calibration, traceability, etc.\nI 'can' tell you that though American I am all-things-metric and have been for decades.\nSo I used a tape measure from a major manufacturer that was marked in millimeters, so the value was not the product of a conversion.\nAs for the 'weight' of the cover, GFHandel, due to your previous prompting, I actually went out there in the middle of the road one day armed with a prying tool and quickly realized it was a monster.\nOnly recently did I read that manhole covers were disappearing in some U.S. city so they could be cut up and sold to recycling centers.\nThey apparently weigh nearly a hundred kilos.\nI had been planning if I 

## Common n-grams 

Return the common n-grams between the two texts.

In [None]:
def common_ngrams(
    text1: str,
    text2: str,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    include_subgrams: bool = False,
) -> Dict[int, Set[Tuple[Any, ...]]]:
    """
    Return the n-grams of length >= n that occur in both texts, grouped by length.

    Tokenisation:
      - If BOTH `model` and `tokenizer` are given, the texts are split with the
        tokenizer (token strings when available, otherwise token ids). `model`
        is only used as a switch; it is never called.
      - Otherwise the texts are case-folded and split into `\\w+` word tokens.

    If include_subgrams is False (default), any shared n-gram that is a
    contiguous subspan of a longer shared n-gram is dropped. (So a 5-gram that
    is part of a shared 6-gram is excluded; unrelated 5-grams remain.)

    :param text1:            First text.
    :param text2:            Second text.
    :param n:                Minimum n-gram length (must be >= 1).
    :param model:            Optional model; enables token mode together with `tokenizer`.
    :param tokenizer:        Optional HuggingFace-style tokenizer.
    :param include_subgrams: Keep n-grams contained in longer shared n-grams.
    :return:                 Mapping {length: set of shared n-gram tuples}; lengths
                             with no shared n-gram are absent.
    :raises ValueError:      If n < 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    def _word_tokens(s: str) -> List[str]:
        # Case-insensitive word tokens; punctuation and whitespace are discarded.
        return re.findall(r"\w+", s.casefold())

    def _hf_tokens(txt: str) -> List[Any]:
        # Prefer token strings; fall back to ids for tokenizers without `tokenize`.
        if hasattr(tokenizer, "tokenize"):
            return list(tokenizer.tokenize(txt))
        enc = tokenizer(
            txt,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        input_ids = enc.get("input_ids", [])
        # Unwrap a batch of size one.
        if input_ids and isinstance(input_ids[0], (list, tuple)):
            input_ids = input_ids[0]
        if hasattr(tokenizer, "convert_ids_to_tokens"):
            return tokenizer.convert_ids_to_tokens(input_ids)
        return input_ids

    def _ngrams(seq: Sequence[Any], k: int) -> Set[Tuple[Any, ...]]:
        # All distinct contiguous k-grams of `seq` (empty when k > len(seq)).
        return {tuple(seq[i : i + k]) for i in range(len(seq) - k + 1)}

    token_mode = (model is not None) and (tokenizer is not None)
    seq1 = _hf_tokens(text1) if token_mode else _word_tokens(text1)
    seq2 = _hf_tokens(text2) if token_mode else _word_tokens(text2)

    # Build the shared sets one length at a time. Every shared (k+1)-gram
    # contains a shared k-gram, so once a length has no shared n-gram no longer
    # length can have one either. Stopping there avoids materialising every
    # n-gram of every length up to the document size.
    common: Dict[int, Set[Tuple[Any, ...]]] = {}
    for k in range(n, min(len(seq1), len(seq2)) + 1):
        shared = _ngrams(seq1, k) & _ngrams(seq2, k)
        if not shared:
            break
        common[k] = shared

    if include_subgrams or not common:
        return common

    # Keep only maximal n-grams. Any k-gram inside a longer shared n-gram is
    # also inside a shared (k+1)-gram, so checking the next length up suffices.
    maximal: Dict[int, Set[Tuple[Any, ...]]] = {}
    for k, grams in common.items():
        longer = common.get(k + 1)
        if longer:
            covered = {g[i : i + k] for g in longer for i in (0, 1)}
            grams = grams - covered
        if grams:
            maximal[k] = grams

    return maximal

def pretty_print_common_ngrams(
    common: Dict[int, Set[Tuple[Any, ...]]],
    sep: str = " ",
    order: str = "count_desc",  # "count_desc" | "len_asc" | "len_desc"
) -> None:
    """
    Print shared n-grams, one line per n-gram length.

    Each n-gram tuple is rendered as its items joined by `sep`; the rendered
    strings for a length are sorted alphabetically. Lines look like
    "3-grams (5): ['a b c', 'd e f', ...]".

    :param common: Mapping {length: set of n-gram tuples}; an empty mapping prints "{}".
    :param sep:    Separator placed between the items of an n-gram.
    :param order:  Line ordering: "count_desc" (most n-grams first, ties by length),
                   "len_asc" or "len_desc". Any other value behaves like "count_desc".
    """
    if not common:
        print("{}")
        return

    def _by_count(entry):
        length, rendered = entry
        return (-len(rendered), length)

    # Dispatch table for the supported orderings; unknown values use the default.
    sort_keys = {
        "count_desc": _by_count,
        "len_asc": lambda entry: entry[0],
        "len_desc": lambda entry: -entry[0],
    }
    key_fn = sort_keys.get(order, _by_count)

    # Render every n-gram to a string, keeping the strings grouped by length.
    rendered_groups = []
    for length, grams in common.items():
        rendered = [sep.join(str(item) for item in gram) for gram in grams]
        rendered.sort()
        rendered_groups.append((length, rendered))

    for length, rendered in sorted(rendered_groups, key=key_fn):
        print(f"{length}-grams ({len(rendered)}): {rendered}")
        
def keep_before_phrase(text: str, phrase: str, case_insensitive: bool = False) -> str:
    """
    Return everything in `text` before the first occurrence of `phrase`.
    If `phrase` isn't found, returns the entire `text`.

    :param text:       The full string you want to trim.
    :param phrase:     The substring (phrase) you want to stop at; matched literally.
    :param case_insensitive:  If True, match phrase ignoring case.
    :return:           The portion of `text` before `phrase`.
    """
    if case_insensitive:
        # Search the ORIGINAL text so the match offset is valid for slicing it.
        # Lower-casing first is unsafe: str.lower() can change the string length
        # (e.g. U+0130 becomes two code points), which shifts every later index.
        match = re.search(re.escape(phrase), text, flags=re.IGNORECASE)
        idx = match.start() if match else -1
    else:
        idx = text.find(phrase)

    return text[:idx] if idx != -1 else text

## Same Author = True Test

In [8]:
# Shared n-grams of length >= 2 between the same-author known and unknown texts.
# Passing both `model` and `tokenizer` switches common_ngrams to tokenizer tokens.
common_same_author = common_ngrams(
    same_author_known,
    same_author_unknown,
    n=2,
    model=model,
    tokenizer=tokenizer,
)
pretty_print_common_ngrams(common_same_author)

2-grams (5): ['.Ċ You', 'Ġconsensus Ġview', 'Ġof Ġthe', 'Ġsuch Ġas', 'Ġwas Ġa']
3-grams (3): ['ĠDick ly on', 'Ġmer itor ious', 'Ġthe Ġeight Ġeditors']
9-grams (2): [', Ġthey Ġshould Ġhave Ġparticipated Ġin Ġthe Ġpoll .Ċ', 'Ġallege Ġharbor Ġviews Ġthat Ġare Ġcontrary Ġto Ġthe Ġconsensus']
4-grams (1): ['Ġhad Ġfelt Ġthey Ġhad']


In [11]:
from typing import Any, Dict, List, Set, Tuple

def pretty_print_common_ngrams(
    common: Dict[int, Set[Tuple[Any, ...]]],
    sep: str = " ",
    order: str = "count_desc",  # "count_desc" | "len_asc" | "len_desc"
    tokenizer=None,             # Optional HuggingFace tokenizer
) -> None:
    """
    Print shared n-grams, one line per n-gram length.

    Rendering of each n-gram tuple:
      - `tokenizer` is None: items are joined with `sep`.
      - all items are ints: `tokenizer.decode` is used (special tokens skipped).
      - otherwise: int items are mapped to token strings, special tokens are
        dropped, and `tokenizer.convert_tokens_to_string` rebuilds the text.

    :param common:    Mapping {length: set of n-gram tuples}; an empty mapping prints "{}".
    :param sep:       Separator used only when no tokenizer is given.
    :param order:     "count_desc" (most n-grams first, ties by length), "len_asc"
                      or "len_desc"; any other value behaves like "count_desc".
    :param tokenizer: Optional tokenizer used to turn tokens back into readable text.
    """
    if not common:
        print("{}")
        return

    def _render(ngram: Tuple[Any, ...]) -> str:
        if tokenizer is None:
            return sep.join(map(str, ngram))

        items = list(ngram)
        if all(isinstance(item, int) for item in items):
            # Pure id n-gram: let the tokenizer decode it directly.
            return tokenizer.decode(
                items,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

        # Token strings, or a mix of ids and strings: normalise to strings first.
        special_tokens = set(getattr(tokenizer, "all_special_tokens", []))
        as_strings: List[str] = [
            tokenizer.convert_ids_to_tokens(item) if isinstance(item, int) else str(item)
            for item in items
        ]
        kept = [tok for tok in as_strings if tok not in special_tokens]
        # The tokenizer decides how tokens are glued (spaces, newlines, word pieces).
        return tokenizer.convert_tokens_to_string(kept)

    rendered_by_len: Dict[int, List[str]] = {}
    for length, grams in common.items():
        rendered_by_len[length] = sorted(_render(gram) for gram in grams)

    # Pick the sort key; everything that is not a length ordering sorts by count.
    if order == "len_asc":
        def sort_key(entry):
            return entry[0]
    elif order == "len_desc":
        def sort_key(entry):
            return -entry[0]
    else:
        def sort_key(entry):
            return (-len(entry[1]), entry[0])

    for length, rendered in sorted(rendered_by_len.items(), key=sort_key):
        print(f"{length}-grams ({len(rendered)}): {rendered}")


In [19]:
# Inspect the shared 9-grams as raw token tuples (before decoding).
common_same_author[9]

{(',',
  'Ġthey',
  'Ġshould',
  'Ġhave',
  'Ġparticipated',
  'Ġin',
  'Ġthe',
  'Ġpoll',
  '.Ċ'),
 ('Ġallege',
  'Ġharbor',
  'Ġviews',
  'Ġthat',
  'Ġare',
  'Ġcontrary',
  'Ġto',
  'Ġthe',
  'Ġconsensus')}

In [15]:
# Same shared n-grams, decoded to readable text and listed from shortest to longest.
pretty_print_common_ngrams(
    common_same_author,
    order='len_asc',
    tokenizer=tokenizer,
)

2-grams (5): [' consensus view', ' of the', ' such as', ' was a', '.\nYou']
3-grams (3): [' Dicklyon', ' meritorious', ' the eight editors']
4-grams (1): [' had felt they had']
9-grams (2): [' allege harbor views that are contrary to the consensus', ', they should have participated in the poll.\n']


In [73]:
same_phrase_1 = "allege harbor views that are contrary to the consensus"
same_para_1 = [
    "allege harbor views that are opposed to the consensus",
    "allege harbor views that are antithetical to the consensus",
    "allege harbor opinions that are contrary to the consensus",
    "allege harbor opinions that are opposed to the consensus",
    "allege harbor opinions that are antithetical to the consensus",
    "allege harbor beliefs that are contrary to the consensus",
    "allege harbor beliefs that are opposed to the consensus",
    "allege harbor beliefs that are antithetical to the consensus",
    "allege harbor positions that are contrary to the consensus",
    "allege harbor positions that are opposed to the consensus",
    "allege harbor positions that are antithetical to the consensus",
    "allege hold views that are contrary to the consensus",
    "allege hold views that are opposed to the consensus",
    "allege hold views that are antithetical to the consensus",
    "allege hold opinions that are contrary to the consensus",
    "allege hold opinions that are opposed to the consensus",
    "allege hold opinions that are antithetical to the consensus",
    "allege hold beliefs that are contrary to the consensus",
    "allege hold beliefs that are opposed to the consensus",
    "allege hold beliefs that are antithetical to the consensus",
    "allege hold positions that are contrary to the consensus",
    "allege hold positions that are opposed to the consensus",
    "allege hold positions that are antithetical to the consensus", 
    "allege have views that are contrary to the consensus",
    "allege have views that are opposed to the consensus", "allege have views that are antithetical to the consensus", "allege have opinions that are contrary to the consensus", "allege have opinions that are opposed to the consensus", "allege have opinions that are antithetical to the consensus", "allege have beliefs that are contrary to the consensus", "allege have beliefs that are opposed to the consensus", "allege have beliefs that are antithetical to the consensus", "allege have positions that are contrary to the consensus", "allege have positions that are opposed to the consensus", "allege have positions that are antithetical to the consensus", "claim harbor views that are contrary to the consensus", "claim harbor views that are opposed to the consensus", "claim harbor views that are antithetical to the consensus", "claim harbor opinions that are contrary to the consensus", "claim harbor opinions that are opposed to the consensus", "claim harbor opinions that are antithetical to the consensus", "claim harbor beliefs that are contrary to the consensus", "claim harbor beliefs that are opposed to the consensus", "claim harbor beliefs that are antithetical to the consensus", "claim harbor positions that are contrary to the consensus", "claim harbor positions that are opposed to the consensus", "claim harbor positions that are antithetical to the consensus", "claim hold views that are contrary to the consensus", "claim hold views that are opposed to the consensus", "claim hold views that are antithetical to the consensus", "claim hold opinions that are contrary to the consensus", "claim hold opinions that are opposed to the consensus", "claim hold opinions that are antithetical to the consensus", "claim hold beliefs that are contrary to the consensus", "claim hold beliefs that are opposed to the consensus", "claim hold beliefs that are antithetical to the consensus", "claim hold positions that are contrary to the consensus", "claim hold positions that are opposed 
to the consensus", "claim hold positions that are antithetical to the consensus", "claim have views that are contrary to the consensus", "claim have views that are opposed to the consensus", "claim have views that are antithetical to the consensus", "claim have opinions that are contrary to the consensus", "claim have opinions that are opposed to the consensus", "claim have opinions that are antithetical to the consensus", "claim have beliefs that are contrary to the consensus", "claim have beliefs that are opposed to the consensus", "claim have beliefs that are antithetical to the consensus", "claim have positions that are contrary to the consensus", "claim have positions that are opposed to the consensus", "claim have positions that are antithetical to the consensus", "assert harbor views that are contrary to the consensus", "assert harbor views that are opposed to the consensus", "assert harbor views that are antithetical to the consensus", "assert harbor opinions that are contrary to the consensus", "assert harbor opinions that are opposed to the consensus", "assert harbor opinions that are antithetical to the consensus", "assert harbor beliefs that are contrary to the consensus", "assert harbor beliefs that are opposed to the consensus", "assert harbor beliefs that are antithetical to the consensus", "assert harbor positions that are contrary to the consensus", "assert harbor positions that are opposed to the consensus", "assert harbor positions that are antithetical to the consensus", "assert hold views that are contrary to the consensus", "assert hold views that are opposed to the consensus", "assert hold views that are antithetical to the consensus", "assert hold opinions that are contrary to the consensus", "assert hold opinions that are opposed to the consensus", "assert hold opinions that are antithetical to the consensus", "assert hold beliefs that are contrary to the consensus", "assert hold beliefs that are opposed to the consensus", "assert hold 
beliefs that are antithetical to the consensus", "assert hold positions that are contrary to the consensus", "assert hold positions that are opposed to the consensus", "assert hold positions that are antithetical to the consensus", "assert have views that are contrary to the consensus", "assert have views that are opposed to the consensus", "assert have views that are antithetical to the consensus", "assert have opinions that are contrary to the consensus", "assert have opinions that are opposed to the consensus", "assert have opinions that are antithetical to the consensus", "assert have beliefs that are contrary to the consensus", "assert have beliefs that are opposed to the consensus", "assert have beliefs that are antithetical to the consensus", "assert have positions that are contrary to the consensus", "assert have positions that are opposed to the consensus", "assert have positions that are antithetical to the consensus", "contend harbor views that are contrary to the consensus", "contend harbor views that are opposed to the consensus", "contend harbor views that are antithetical to the consensus", "contend harbor opinions that are contrary to the consensus", "contend harbor opinions that are opposed to the consensus", "contend harbor opinions that are antithetical to the consensus", "contend harbor beliefs that are contrary to the consensus", "contend harbor beliefs that are opposed to the consensus", "contend harbor beliefs that are antithetical to the consensus", "contend harbor positions that are contrary to the consensus", "contend harbor positions that are opposed to the consensus", "contend harbor positions that are antithetical to the consensus", "contend hold views that are contrary to the consensus", "contend hold views that are opposed to the consensus", "contend hold views that are antithetical to the consensus", "contend hold opinions that are contrary to the consensus", "contend hold opinions that are opposed to the consensus", "contend hold 
opinions that are antithetical to the consensus", "contend hold beliefs that are contrary to the consensus", "contend hold beliefs that are opposed to the consensus", "contend hold beliefs that are antithetical to the consensus", "contend hold positions that are contrary to the consensus", "contend hold positions that are opposed to the consensus", "contend hold positions that are antithetical to the consensus", "contend have views that are contrary to the consensus", "contend have views that are opposed to the consensus", "contend have views that are antithetical to the consensus", "contend have opinions that are contrary to the consensus", "contend have opinions that are opposed to the consensus", "contend have opinions that are antithetical to the consensus", "contend have beliefs that are contrary to the consensus", "contend have beliefs that are opposed to the consensus", "contend have beliefs that are antithetical to the consensus", "contend have positions that are contrary to the consensus", "contend have positions that are opposed to the consensus", "contend have positions that are antithetical to the consensus", "allege harbour views that are contrary to the consensus", "allege harbour opinions that are contrary to the consensus", "allege harbour beliefs that are contrary to the consensus", "allege harbour positions that are contrary to the consensus", "claim harbour views that are contrary to the consensus", "claim harbour opinions that are contrary to the consensus", "claim harbour beliefs that are contrary to the consensus", "claim harbour positions that are contrary to the consensus", "assert harbour views that are contrary to the consensus", "assert harbour opinions that are contrary to the consensus", "assert harbour beliefs that are contrary to the consensus", "assert harbour positions that are contrary to the consensus", "contend harbour views that are contrary to the consensus", "contend harbour opinions that are contrary to the consensus", 
"contend harbour beliefs that are contrary to the consensus", "contend harbour positions that are contrary to the consensus", "allege harbour views that are opposed to the consensus", "claim harbour views that are opposed to the consensus", "assert harbour views that are opposed to the consensus", "contend harbour views that are opposed to the consensus", "allege harbour opinions that are opposed to the consensus", "claim harbour opinions that are opposed to the consensus", "assert harbour opinions that are opposed to the consensus", "contend harbour opinions that are opposed to the consensus", "allege harbour beliefs that are opposed to the consensus", "claim harbour beliefs that are opposed to the consensus", "assert harbour beliefs that are opposed to the consensus", "contend harbour beliefs that are opposed to the consensus", "allege harbour positions that are opposed to the consensus", "claim harbour positions that are opposed to the consensus", "assert harbour positions that are opposed to the consensus", "contend harbour positions that are opposed to the consensus", "allege harbour views that are antithetical to the consensus", "claim harbour views that are antithetical to the consensus", "assert harbour views that are antithetical to the consensus", "contend harbour views that are antithetical to the consensus", "allege harbour opinions that are antithetical to the consensus", "claim harbour opinions that are antithetical to the consensus", "assert harbour opinions that are antithetical to the consensus", "contend harbour opinions that are antithetical to the consensus", "allege harbour beliefs that are antithetical to the consensus", "claim harbour beliefs that are antithetical to the consensus", "assert harbour beliefs that are antithetical to the consensus", "contend harbour beliefs that are antithetical to the consensus", "allege harbour positions that are antithetical to the consensus", "claim harbour positions that are antithetical to the consensus", 
"assert harbour positions that are antithetical to the consensus", "contend harbour positions that are antithetical to the consensus", "alledge harbor views that are contrary to the consensus", "aver harbor views that are contrary to the consensus", "allege keep views that are contrary to the consensus"]
# Context preceding the shared phrase in each document. keep_before_phrase returns
# the whole text unchanged when the phrase is absent.
same_known_text_1 = keep_before_phrase(text=same_author_known, phrase=same_phrase_1)
same_unknown_text_1 = keep_before_phrase(text=same_author_unknown, phrase=same_phrase_1)

In [75]:
same_phrase_2 = "they should have participated in the poll"
same_para_2 = ["they should've participated in the poll", "they should have taken part in the poll", "they should've taken part in the poll", "they should have partaken in the poll", "they should've partaken in the poll", "in the poll, they should have participated", "in the poll, they should've participated", "in the poll, they should have taken part", "in the poll, they should've taken part", "in the poll, they should have partaken", "in the poll, they should've partaken", "they should of participated in the poll", "they should of taken part in the poll", "they should of partaken in the poll", "they should have particpated in the poll", "they should've particpated in the poll", "they should have participated in the vote", "they should've participated in the vote", "they should have taken part in the vote", "they should've taken part in the vote", "they should have partaken in the vote", "they should've partaken in the vote", "in the vote, they should have participated", "in the vote, they should've participated", "in the vote, they should have taken part", "in the vote, they should've taken part", "in the vote, they should have partaken", "in the vote, they should've partaken", "they shoulda participated in the poll", "they shoulda taken part in the poll", "they shoulda partaken in the poll"]
# Context preceding the shared phrase in each document. keep_before_phrase returns
# the whole text unchanged when the phrase is absent.
same_known_text_2 = keep_before_phrase(text=same_author_known, phrase=same_phrase_2)
same_unknown_text_2 = keep_before_phrase(text=same_author_unknown, phrase=same_phrase_2)

In [77]:
same_phrase_3 = "truly had felt they had"
same_para_3 = ["truely had felt they had", "had truly felt they had", "truly had felt they'd", "had truly felt they'd", "truly had felt they possessed", "had truly felt they possessed", "truly had believed they had", "had truly believed they had", "truly had believed they'd", "had truly believed they'd", "truly had believed they possessed", "had truly believed they possessed", "truly had thought they had", "had truly thought they had", "truly had thought they'd", "had truly thought they'd", "truly had thought they possessed", "had truly thought they possessed", "truly had reckoned they had", "had truly reckoned they had", "truly had reckoned they'd", "had truly reckoned they'd", "truly had reckoned they possessed", "had truly reckoned they possessed", "truly had supposed they had", "had truly supposed they had", "truly had supposed they'd", "had truly supposed they'd", "truly had supposed they possessed", "had truly supposed they possessed", "truly had figured they had", "had truly figured they had", "truly had figured they'd", "had truly figured they'd", "truly had figured they possessed", "had truly figured they possessed", "really had felt they had", "had really felt they had", "really had felt they'd", "had really felt they'd", "really had felt they possessed", "had really felt they possessed", "really had believed they had", "had really believed they had", "really had believed they'd", "had really believed they'd", "really had believed they possessed", "had really believed they possessed", "really had thought they had", "had really thought they had", "really had thought they'd", "had really thought they'd", "really had thought they possessed", "had really thought they possessed", "really had reckoned they had", "had really reckoned they had", "really had reckoned they'd", "had really reckoned they'd", "really had reckoned they possessed", "had really reckoned they possessed", "really had supposed they had", "had really supposed they had", "really had supposed 
they'd", "had really supposed they'd", "really had supposed they possessed", "had really supposed they possessed", "really had figured they had", "had really figured they had", "really had figured they'd", "had really figured they'd", "really had figured they possessed", "had really figured they possessed", "genuinely had felt they had", "had genuinely felt they had", "genuinely had felt they'd", "had genuinely felt they'd", "genuinely had felt they possessed", "had genuinely felt they possessed", "genuinely had believed they had", "had genuinely believed they had", "genuinely had believed they'd", "had genuinely believed they'd", "genuinely had believed they possessed", "had genuinely believed they possessed", "genuinely had thought they had", "had genuinely thought they had", "genuinely had thought they'd", "had genuinely thought they'd", "genuinely had thought they possessed", "had genuinely thought they possessed", "genuinely had reckoned they had", "had genuinely reckoned they had", "genuinely had reckoned they'd", "had genuinely reckoned they'd", "genuinely had reckoned they possessed", "had genuinely reckoned they possessed", "genuinely had supposed they had", "had genuinely supposed they had", "genuinely had supposed they'd", "had genuinely supposed they'd", "genuinely had supposed they possessed", "had genuinely supposed they possessed", "genuinely had figured they had", "had genuinely figured they had", "genuinely had figured they'd", "had genuinely figured they'd", "genuinely had figured they possessed", "had genuinely figured they possessed", "indeed had felt they had", "had indeed felt they had", "indeed had felt they'd", "had indeed felt they'd", "indeed had felt they possessed", "had indeed felt they possessed", "indeed had believed they had", "had indeed believed they had", "indeed had believed they'd", "had indeed believed they'd", "indeed had believed they possessed", "had indeed believed they possessed", "indeed had thought they had", "had indeed 
thought they had", "indeed had thought they'd", "had indeed thought they'd", "indeed had thought they possessed", "had indeed thought they possessed", "indeed had reckoned they had", "had indeed reckoned they had", "indeed had reckoned they'd", "had indeed reckoned they'd", "indeed had reckoned they possessed", "had indeed reckoned they possessed", "indeed had supposed they had", "had indeed supposed they had", "indeed had supposed they'd", "had indeed supposed they'd", "indeed had supposed they possessed", "had indeed supposed they possessed", "indeed had figured they had", "had indeed figured they had", "indeed had figured they'd", "had indeed figured they'd", "indeed had figured they possessed", "had indeed figured they possessed", "actually had felt they had", "had actually felt they had", "actually had felt they'd", "had actually felt they'd", "actually had felt they possessed", "had actually felt they possessed", "actually had believed they had", "had actually believed they had", "actually had believed they'd", "had actually believed they'd", "actually had believed they possessed", "had actually believed they possessed", "actually had thought they had", "had actually thought they had", "actually had thought they'd", "had actually thought they'd", "actually had thought they possessed", "had actually thought they possessed", "actually had reckoned they had", "had actually reckoned they had", "actually had reckoned they'd", "had actually reckoned they'd", "actually had reckoned they possessed", "had actually reckoned they possessed", "actually had supposed they had", "had actually supposed they had", "actually had supposed they'd", "had actually supposed they'd", "actually had supposed they possessed", "had actually supposed they possessed", "actually had figured they had", "had actually figured they had", "actually had figured they'd", "had actually figured they'd", "actually had figured they possessed", "had actually figured they possessed", "truly had had a 
feeling they had", "had truly had a feeling they had", "truly had had a feeling they'd", "had truly had a feeling they'd", "truly had had a feeling they possessed", "had truly had a feeling they possessed", "really had had a feeling they had", "had really had a feeling they had", "really had had a feeling they'd", "had really had a feeling they'd", "really had had a feeling they possessed", "had really had a feeling they possessed"]
# Context preceding the shared phrase in each document. keep_before_phrase returns
# the whole text unchanged when the phrase is absent.
same_known_text_3 = keep_before_phrase(text=same_author_known, phrase=same_phrase_3)
same_unknown_text_3 = keep_before_phrase(text=same_author_unknown, phrase=same_phrase_3)

In [78]:
same_phrase_4 = "the consensus view"
same_para_4 = ["the consensus opinion", "the consensus position", "the consensus stance", "the consensus perspective", "the consensus viewpoint", "the consensus standpoint", "the consensus take", "the consensus outlook", "the consensus judgment", "the consensus judgement", "the consensus POV", "the consensus's view", "the consensus' view", "the consensus's opinion", "the consensus' opinion", "the consensus's position", "the consensus' position", "the consensus's stance", "the consensus' stance", "the consensus's perspective", "the consensus' perspective", "the consensus's viewpoint", "the consensus' viewpoint", "the consensus's standpoint", "the consensus' standpoint", "the consensus's take", "the consensus' take", "the consensus's outlook", "the consensus' outlook", "the consensus's judgment", "the consensus' judgment", "the consensus's judgement", "the consensus' judgement", "the consensus's POV", "the consensus' POV", "the view of the consensus", "the opinion of the consensus", "the position of the consensus", "the stance of the consensus", "the perspective of the consensus", "the viewpoint of the consensus", "the standpoint of the consensus", "the take of the consensus", "the outlook of the consensus", "the judgment of the consensus", "the judgement of the consensus", "the POV of the consensus", "the concensus view", "the consencus view", "the consensus veiw", "the concensus opinion", "the consencus opinion", "the concensus viewpoint", "the consencus viewpoint", "the concensus POV", "the consencus POV"]
# Context preceding the shared phrase in each document. keep_before_phrase returns
# the whole text unchanged when the phrase is absent.
same_known_text_4 = keep_before_phrase(text=same_author_known, phrase=same_phrase_4)
same_unknown_text_4 = keep_before_phrase(text=same_author_unknown, phrase=same_phrase_4)

In [79]:
same_phrase_5 = "they elected to"
same_para_5 = ["they 'elected' to", "they chose to", "they decided to", "they opted to", "they resolved to", "they determined to", "they made the decision to", "they made a decision to", "they took the decision to", "they took a decision to", "they reached a decision to", "they reached the decision to", "they came to a decision to", "they came to the decision to", "they arrived at a decision to", "they made the choice to", "they made a choice to", "they made the call to", "they made up their minds to", "they made up their mind to", "they made a determination to", "they reached a determination to", "they elcted to", "they decieded to", "they optd to", "they resovled to", "they determind to", "they made a desision to",]
# Context preceding the shared phrase in each document. keep_before_phrase returns
# the whole text unchanged when the phrase is absent.
same_known_text_5 = keep_before_phrase(text=same_author_known, phrase=same_phrase_5)
same_unknown_text_5 = keep_before_phrase(text=same_author_unknown, phrase=same_phrase_5)

In [80]:
import torch
import pandas as pd
from typing import List

def compute_log_probs_with_median(text: str, tokenizer, model):
    """
    Score every token of `text` under `model`.

    Returns a 3-tuple of equal-length sequences:
      - tokens: `tokenizer.convert_ids_to_tokens` applied to the encoded ids
      - log_probs: log-softmax value of each token id, read from the logits
        of the preceding position
      - median_logprobs: median of that same log-softmax distribution

    Position 0 has no preceding position, so it reuses the logits at index 0.
    NOTE(review): those logits are only a "prediction of the first token" if
    the tokenizer puts a BOS token at index 0 — confirm for the tokenizer in use.
    """
    encoded = tokenizer(text, return_tensors="pt")
    ids = encoded["input_ids"]  # shape [1, seq_len]
    tokens = tokenizer.convert_ids_to_tokens(ids[0])

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(ids).logits  # [1, seq_len, vocab_size]

    log_probs, median_logprobs = [], []
    for position, token_id in enumerate(ids[0]):
        # Logits at index k are paired with the token at index k + 1;
        # clamp so that position 0 falls back to index 0.
        source = max(position - 1, 0)
        step_dist = torch.log_softmax(logits[0, source], dim=-1)
        log_probs.append(step_dist[token_id].item())
        median_logprobs.append(float(step_dist.median().item()))

    return tokens, log_probs, median_logprobs

def score_phrases(
    base_text: str,
    ref_phrase: str,
    paraphrases: List[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """
    Score a reference phrase and its paraphrases as continuations of `base_text`.

    1) Score base_text.strip() alone → base_total
    2) For each phrase (reference first, then each paraphrase):
         a) Count the phrase's tokens with the tokenizer only (no model call)
         b) Score base_text + phrase → full tokens & log-probs
         c) sum_before       = sum(full log-probs)
         d) phrase_tokens    = last n_phrase tokens of the full tokens
         e) phrase_log_probs = last n_phrase values of the full log-probs
         f) phrase_total     = sum(phrase_log_probs)
         g) difference       = base_total - sum_before
         h) Append a row
    3) Return a DataFrame with columns:
       phrase_type, phrase, tokens, sum_log_probs_base,
       sum_log_probs_inc_phrase, difference, phrase_log_probs,
       sum_log_probs_phrase

    Args:
        base_text: context that each phrase is appended to (no separator is added).
        ref_phrase: the reference phrase; it becomes the first row.
        paraphrases: alternative phrases, one row each.
        tokenizer, model: passed through to compute_log_probs_with_median.

    NOTE(review): the tail slice assumes a phrase tokenizes to the same number
    of tokens on its own as it does when appended to base_text — confirm for
    the tokenizer in use. Also note that base_total is computed on the stripped
    text while the full sequence uses base_text unstripped.
    """
    # 1) score base_text
    print("→ Scoring base_text alone…")
    _, log_probs_base, _ = compute_log_probs_with_median(base_text.strip(), tokenizer, model)
    base_total = sum(log_probs_base)
    print(f"   base_total = {base_total:.4f}\n")

    items = [("reference", ref_phrase)] + [("paraphrase", p) for p in paraphrases]
    rows = []

    for idx, (ptype, phrase) in enumerate(items, start=1):
        print(f"→ [{idx}/{len(items)}] Processing {ptype}…")

        # a) token count only: tokenize with the same call the scorer uses,
        #    but skip the model forward pass whose output was never used.
        n_phrase_tokens = tokenizer(phrase, return_tensors="pt")["input_ids"].shape[-1]
        # b) full sequence
        full_text = base_text + phrase
        tokens_full, log_probs_full, _ = compute_log_probs_with_median(full_text, tokenizer, model)
        # c) full sum
        sum_before = sum(log_probs_full)
        # d/e) slice the last n_phrase_tokens. An explicit start index is used
        #      because seq[-0:] would return the whole sequence for a
        #      zero-token phrase.
        start = max(len(tokens_full) - n_phrase_tokens, 0)
        phrase_tokens    = tokens_full[start:]
        phrase_log_probs = log_probs_full[start:]
        # f/g) compute sums
        phrase_total = sum(phrase_log_probs)
        difference   = base_total - sum_before
        # h) collect
        rows.append({
            "phrase_type":  ptype,
            "phrase":       phrase,
            "tokens":       phrase_tokens,
            "sum_log_probs_base":   base_total,
            "sum_log_probs_inc_phrase":   sum_before,
            "difference":   difference,
            "phrase_log_probs":    phrase_log_probs,
            "sum_log_probs_phrase": phrase_total,
        })

    return pd.DataFrame(rows, columns=[
        "phrase_type", "phrase", "tokens",
        "sum_log_probs_base", "sum_log_probs_inc_phrase",
        "difference", "phrase_log_probs", "sum_log_probs_phrase",
    ])


In [86]:
import math
import pandas as pd
from typing import Sequence, Union, Optional

def _logsumexp(xs: Sequence[float]) -> float:
    m = max(xs)
    return m + math.log(sum(math.exp(x - m) for x in xs))

def add_pmf_column(
    df: pd.DataFrame,
    logprob_col: str,
    priors: Optional[Union[str, Sequence[float]]] = None,
    out_col: str = "pmf",
    keep_logZ: bool = False,
) -> pd.DataFrame:
    """
    Return a copy of `df` with a probability mass over its rows in `out_col`.

    Each row is one candidate whose `logprob_col` cell holds a list/tuple of
    token log-probs. The row weight is exp(sum(log-probs)) * prior, normalised
    so the weights sum to 1 across rows. A cell that is not a non-empty
    list/tuple gets log-likelihood -inf and therefore probability 0.0.

    priors:
      - None: no prior term is added (uniform)
      - str: name of a column holding one prior probability per row
      - Sequence[float]: prior probabilities in row order; the length must
        match the number of rows, otherwise ValueError is raised
    A prior that is None or not > 0 contributes log-prior -inf.

    keep_logZ: when True, also write the log normaliser to column "_logZ".
    """
    neg_inf = float("-inf")

    def _row_loglik(values):
        # Sum of token log-probs; anything but a non-empty list/tuple → -inf.
        if isinstance(values, (list, tuple)) and len(values) > 0:
            return sum(values)
        return neg_inf

    seq_loglik = df[logprob_col].apply(_row_loglik).tolist()

    if priors is None:
        weighted = seq_loglik
    else:
        if isinstance(priors, str):
            prior_probs = df[priors].tolist()
        else:
            prior_probs = list(priors)
            if len(prior_probs) != len(seq_loglik):
                raise ValueError("Length of `priors` must match number of rows.")
        # Priors are given as probabilities, so add their logs.
        weighted = []
        for loglik, prior in zip(seq_loglik, prior_probs):
            usable = prior is not None and prior > 0
            weighted.append(loglik + (math.log(prior) if usable else neg_inf))

    # Normalise in log space for stability.
    log_norm = _logsumexp(weighted)
    probabilities = [0.0 if w == neg_inf else math.exp(w - log_norm) for w in weighted]

    result = df.copy()
    result[out_col] = probabilities
    if keep_logZ:
        result["_logZ"] = log_norm  # identical for every row
    return result


In [83]:
# Score each reference phrase and its paraphrases as continuations of the
# matching known-text prefix. Each call runs the model once for the prefix and
# then once or more per phrase, printing a progress line per phrase.
same_known_result_1 = score_phrases(same_known_text_1, same_phrase_1, same_para_1, tokenizer, model)
same_known_result_2 = score_phrases(same_known_text_2, same_phrase_2, same_para_2, tokenizer, model)
same_known_result_3 = score_phrases(same_known_text_3, same_phrase_3, same_para_3, tokenizer, model)
same_known_result_4 = score_phrases(same_known_text_4, same_phrase_4, same_para_4, tokenizer, model)
same_known_result_5 = score_phrases(same_known_text_5, same_phrase_5, same_para_5, tokenizer, model)


→ Scoring base_text alone…
   base_total = -167.7171

→ [1/195] Processing reference…
→ [2/195] Processing paraphrase…
→ [3/195] Processing paraphrase…
→ [4/195] Processing paraphrase…
→ [5/195] Processing paraphrase…
→ [6/195] Processing paraphrase…
→ [7/195] Processing paraphrase…
→ [8/195] Processing paraphrase…
→ [9/195] Processing paraphrase…
→ [10/195] Processing paraphrase…
→ [11/195] Processing paraphrase…
→ [12/195] Processing paraphrase…
→ [13/195] Processing paraphrase…
→ [14/195] Processing paraphrase…
→ [15/195] Processing paraphrase…
→ [16/195] Processing paraphrase…
→ [17/195] Processing paraphrase…
→ [18/195] Processing paraphrase…
→ [19/195] Processing paraphrase…
→ [20/195] Processing paraphrase…
→ [21/195] Processing paraphrase…
→ [22/195] Processing paraphrase…
→ [23/195] Processing paraphrase…
→ [24/195] Processing paraphrase…
→ [25/195] Processing paraphrase…
→ [26/195] Processing paraphrase…
→ [27/195] Processing paraphrase…
→ [28/195] Processing paraphrase…
→ [2

In [92]:
# Same scoring as for the known text, but against the unknown-text prefixes.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so these
# results may be missing or stale — re-run to completion before relying on them.
same_unknown_result_1 = score_phrases(same_unknown_text_1, same_phrase_1, same_para_1, tokenizer, model)
same_unknown_result_2 = score_phrases(same_unknown_text_2, same_phrase_2, same_para_2, tokenizer, model)
same_unknown_result_3 = score_phrases(same_unknown_text_3, same_phrase_3, same_para_3, tokenizer, model)
same_unknown_result_4 = score_phrases(same_unknown_text_4, same_phrase_4, same_para_4, tokenizer, model)
same_unknown_result_5 = score_phrases(same_unknown_text_5, same_phrase_5, same_para_5, tokenizer, model)

→ Scoring base_text alone…


KeyboardInterrupt: 

In [93]:
# Add a "pmf" column to each known-text result: the per-token phrase_log_probs of
# every row are summed and normalised across the rows of that frame (no priors).
same_pmf_known_1 = add_pmf_column(same_known_result_1, logprob_col='phrase_log_probs')
same_pmf_known_2 = add_pmf_column(same_known_result_2, logprob_col='phrase_log_probs')
same_pmf_known_3 = add_pmf_column(same_known_result_3, logprob_col='phrase_log_probs')
same_pmf_known_4 = add_pmf_column(same_known_result_4, logprob_col='phrase_log_probs')
same_pmf_known_5 = add_pmf_column(same_known_result_5, logprob_col='phrase_log_probs')

In [106]:
# Add a "pmf" column to each unknown-text result, normalised across the rows of
# that frame (no priors). Requires all five same_unknown_result_* frames, so the
# unknown-text scoring cell must have run to completion first.
same_pmf_unknown_1 = add_pmf_column(same_unknown_result_1, logprob_col='phrase_log_probs')
same_pmf_unknown_2 = add_pmf_column(same_unknown_result_2, logprob_col='phrase_log_probs')
same_pmf_unknown_3 = add_pmf_column(same_unknown_result_3, logprob_col='phrase_log_probs')
same_pmf_unknown_4 = add_pmf_column(same_unknown_result_4, logprob_col='phrase_log_probs')
same_pmf_unknown_5 = add_pmf_column(same_unknown_result_5, logprob_col='phrase_log_probs')

In [111]:
# Manual export: copy the frame to the system clipboard. The saved output shows
# a huggingface/tokenizers fork warning emitted when this cell ran.
same_pmf_unknown_1.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [118]:
def score_phrases_no_context(
    ref_phrase: str,
    paraphrases: List[str],
    tokenizer,
    model
) -> pd.DataFrame:
    """
    Score the reference phrase and every paraphrase on their own, with no
    preceding context.

    For each phrase (reference first, then each paraphrase):
      a) Score the phrase alone → its tokens & per-token log-probs
      b) sum_log_probs = sum of those log-probs
      c) Append a row

    Returns a DataFrame with columns:
      phrase_type, phrase, tokens, log_probs, sum_log_probs
    """
    column_order = [
        "phrase_type", "phrase", "tokens",
        "log_probs", "sum_log_probs",
    ]

    candidates = [("reference", ref_phrase)]
    candidates += [("paraphrase", para) for para in paraphrases]
    total = len(candidates)

    records = []
    for position, (kind, text) in enumerate(candidates, start=1):
        print(f"→ [{position}/{total}] Processing {kind}…")

        # Every token of the phrase is scored, including the first one.
        phrase_tokens, phrase_log_probs, _ = compute_log_probs_with_median(text, tokenizer, model)
        records.append({
            "phrase_type":  kind,
            "phrase":       text,
            "tokens":       phrase_tokens,
            "log_probs":    phrase_log_probs,
            "sum_log_probs": sum(phrase_log_probs),
        })

    return pd.DataFrame(records, columns=column_order)

In [120]:
# Score each reference phrase and its paraphrases in isolation (no preceding
# text), one result frame per phrase set.
phrase_1_no_context = score_phrases_no_context(same_phrase_1, same_para_1, tokenizer, model)
phrase_2_no_context = score_phrases_no_context(same_phrase_2, same_para_2, tokenizer, model)
phrase_3_no_context = score_phrases_no_context(same_phrase_3, same_para_3, tokenizer, model)
phrase_4_no_context = score_phrases_no_context(same_phrase_4, same_para_4, tokenizer, model)
phrase_5_no_context = score_phrases_no_context(same_phrase_5, same_para_5, tokenizer, model)

→ [1/195] Processing reference…
→ [2/195] Processing paraphrase…
→ [3/195] Processing paraphrase…
→ [4/195] Processing paraphrase…
→ [5/195] Processing paraphrase…
→ [6/195] Processing paraphrase…
→ [7/195] Processing paraphrase…
→ [8/195] Processing paraphrase…
→ [9/195] Processing paraphrase…
→ [10/195] Processing paraphrase…
→ [11/195] Processing paraphrase…
→ [12/195] Processing paraphrase…
→ [13/195] Processing paraphrase…
→ [14/195] Processing paraphrase…
→ [15/195] Processing paraphrase…
→ [16/195] Processing paraphrase…
→ [17/195] Processing paraphrase…
→ [18/195] Processing paraphrase…
→ [19/195] Processing paraphrase…
→ [20/195] Processing paraphrase…
→ [21/195] Processing paraphrase…
→ [22/195] Processing paraphrase…
→ [23/195] Processing paraphrase…
→ [24/195] Processing paraphrase…
→ [25/195] Processing paraphrase…
→ [26/195] Processing paraphrase…
→ [27/195] Processing paraphrase…
→ [28/195] Processing paraphrase…
→ [29/195] Processing paraphrase…
→ [30/195] Processing pa

In [121]:
# Add a "pmf" column to each no-context result, built from the per-token
# 'log_probs' of every row and normalised across the rows of that frame.
pmf_phrase_1 = add_pmf_column(phrase_1_no_context, logprob_col='log_probs')
pmf_phrase_2 = add_pmf_column(phrase_2_no_context, logprob_col='log_probs')
pmf_phrase_3 = add_pmf_column(phrase_3_no_context, logprob_col='log_probs')
pmf_phrase_4 = add_pmf_column(phrase_4_no_context, logprob_col='log_probs')
pmf_phrase_5 = add_pmf_column(phrase_5_no_context, logprob_col='log_probs')

In [128]:
# Manual export: copy the fifth no-context PMF frame to the system clipboard.
pmf_phrase_5.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Manual export to the system clipboard.
# NOTE(review): phrase_no_context_results is not assigned in any cell visible
# here — confirm it is defined earlier in the notebook before running this.
phrase_no_context_results.to_clipboard()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Pass the per-row token log-prob lists to ref_metrics; the saved output below
# is a dict with keys such as 'p_ref', 'pmf' and 'log_den'.
# NOTE(review): neither ref_metrics nor phrase_no_context_results is defined in
# the cells visible here — confirm both exist earlier in the notebook.
ref_metrics(phrase_no_context_results['log_probs'].tolist())

{'p_ref': 0.02942703608435714,
 'pmf': [0.02942703608435714,
  0.08518705975877248,
  0.00153309522815474,
  0.02223181571976897,
  0.49502898376941395,
  0.006620540822840208,
  0.05334360039450061,
  0.005839424315377672,
  0.19029799722300392,
  0.0279927686630204,
  0.00036593318764960883,
  4.4279190953408884e-07,
  0.016561591261916014,
  0.012231002196408955,
  0.015443991910654193,
  0.015744169946309096,
  0.01598870792759976,
  0.0019328462398086792,
  0.00202757444268726,
  0.00030843112278552016,
  0.0005150109174092671,
  0.0007774717419987056,
  0.00018063978406505414,
  0.00011064914097456609,
  0.00016519444097009707,
  2.539807869046445e-05,
  1.2079753541652437e-05,
  3.586496373663749e-05,
  4.6547386795370134e-05,
  9.675178073893586e-06,
  1.0786488684721038e-05,
  1.4202459703150708e-07,
  1.32861536726391e-06,
  1.0233554992817302e-10,
  5.5139786685459756e-14,
  2.1847368120972857e-06,
  1.3638956227979283e-08],
 'log_den': -33.29816781462455,
 'den': 3.45771712