In [1]:

import sys
import os
import re

import pandas as pd

from typing import Any, Dict, List, Sequence, Set, Tuple, Optional, Iterable
from collections import defaultdict

In [2]:
sys.path.append(os.path.abspath('../../src'))

from read_and_write_docs import read_jsonl, read_rds, write_jsonl
from tokenize_and_score import load_model
from utils import apply_temp_doc_id, build_metadata_df

In [3]:
def common_ngrams(
    text1: str,
    text2: str,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    include_subgrams: bool = False,
    lowercase: bool = True,
) -> Dict[int, Set[Tuple[Any, ...]]]:
    """
    Return shared n-grams of length >= n between two texts.

    If include_subgrams is False (default), remove any shared n-gram that is a
    contiguous subspan of a longer shared n-gram. (So a 5-gram that's part of a
    shared 6-gram is excluded; unrelated 5-grams remain.)

    Parameters
    ----------
    text1, text2 : str
        The two texts to compare.
    n : int
        Minimum n-gram length (in tokens). Must be >= 1.
    model : Any, optional
        Not used for computation; it only acts as a switch. The tokenizer path
        is taken only when BOTH `model` and `tokenizer` are not None.
    tokenizer : Any, optional
        Object exposing either `.tokenize(str)` or a `__call__` that returns a
        mapping with "input_ids". Otherwise the `\\w+` regex tokenization is used.
    include_subgrams : bool, default False
        If True, keep every shared n-gram, including those contained in longer
        shared n-grams.
    lowercase : bool, default True
        If True, normalize text using str.casefold() before tokenization.
        Applies to both the simple regex tokenization path and the Hugging Face
        tokenizer path (by case-folding the raw text before calling the tokenizer).

    Returns
    -------
    dict
        Maps n-gram length -> set of shared n-grams (tuples of tokens). Lengths
        with no surviving n-gram are absent; the dict is empty if nothing is shared.

    Raises
    ------
    ValueError
        If n < 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    def _word_tokens(s: str) -> List[str]:
        s2 = s.casefold() if lowercase else s
        return re.findall(r"\w+", s2)

    def _hf_tokens(txt: str) -> List[Any]:
        src = txt.casefold() if lowercase else txt
        # Prefer token strings when the tokenizer can produce them directly.
        if hasattr(tokenizer, "tokenize"):
            return list(tokenizer.tokenize(src))
        enc = tokenizer(
            src,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        input_ids = enc.get("input_ids", [])
        # Unwrap a batch-of-one result into a flat list of ids.
        if input_ids and isinstance(input_ids[0], (list, tuple)):
            input_ids = input_ids[0]
        if hasattr(tokenizer, "convert_ids_to_tokens"):
            return tokenizer.convert_ids_to_tokens(input_ids)
        return input_ids

    def _ngrams_of_len(seq: Sequence[Any], k: int) -> Set[Tuple[Any, ...]]:
        return {tuple(seq[i : i + k]) for i in range(len(seq) - k + 1)}

    token_mode = (model is not None) and (tokenizer is not None)
    seq1 = _hf_tokens(text1) if token_mode else _word_tokens(text1)
    seq2 = _hf_tokens(text2) if token_mode else _word_tokens(text2)

    # Intersect one length at a time. Every contiguous subspan of a shared
    # n-gram is itself shared, so once a length has no shared n-gram no longer
    # length can have one either. Stopping there avoids materialising every
    # n-gram of every length (cubic in the token count) for long texts.
    common: Dict[int, Set[Tuple[Any, ...]]] = {}
    for k in range(n, min(len(seq1), len(seq2)) + 1):
        shared = _ngrams_of_len(seq1, k) & _ngrams_of_len(seq2, k)
        if not shared:
            break
        common[k] = shared

    if include_subgrams or not common:
        return common

    # Remove n-grams that are contiguous subspans of any longer shared n-gram.
    # A k-gram inside a longer shared n-gram is always the prefix or suffix of
    # some shared (k+1)-gram, so only the next length needs to be inspected.
    # `common` is left untouched while pruning so each length is compared
    # against the unpruned set of the next length.
    pruned: Dict[int, Set[Tuple[Any, ...]]] = {}
    for k, grams in common.items():
        longer = common.get(k + 1)
        if longer:
            covered = {g[:-1] for g in longer} | {g[1:] for g in longer}
            grams = grams - covered
        if grams:
            pruned[k] = grams

    return pruned

def pretty_print_common_ngrams(
    common: Dict[int, Set[Tuple[Any, ...]]],
    sep: str = " ",
    order: str = "count_desc",  # "count_desc" | "len_asc" | "len_desc"
    tokenizer=None,             # Optional HuggingFace tokenizer
) -> None:
    """
    Print the shared n-grams, one line per n-gram length.

    Each line has the form "<n>-grams (<count>): [<sorted strings>]".

    - Without a `tokenizer`, every n-gram tuple is rendered by joining its
      items with `sep`.
    - With a `tokenizer`, n-grams made only of ints go through
      `tokenizer.decode` (special tokens skipped); otherwise ids are converted
      to token strings, special tokens are dropped and the remainder is passed
      to `tokenizer.convert_tokens_to_string`.
    - `order` selects the line ordering: "count_desc" (most n-grams first,
      ties by length), "len_asc" or "len_desc". Any other value behaves like
      "count_desc".
    - An empty/falsy `common` prints "{}".
    """
    if not common:
        print("{}")
        return

    def _render(gram: Tuple[Any, ...]) -> str:
        # Plain mode: no tokenizer available, just join the items.
        if tokenizer is None:
            return sep.join(str(item) for item in gram)

        pieces = list(gram)

        # Pure id n-grams can be decoded in one call.
        if all(isinstance(piece, int) for piece in pieces):
            return tokenizer.decode(
                pieces,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

        # Token strings (possibly mixed with ids): normalise to strings,
        # filter out special tokens, then let the tokenizer rebuild the text.
        special_tokens = set(getattr(tokenizer, "all_special_tokens", []))
        as_strings: List[str] = [
            tokenizer.convert_ids_to_tokens(piece) if isinstance(piece, int) else str(piece)
            for piece in pieces
        ]
        kept = [token for token in as_strings if token not in special_tokens]
        return tokenizer.convert_tokens_to_string(kept)

    # length -> alphabetically sorted rendered n-grams
    rendered: Dict[int, List[str]] = {}
    for length, grams in common.items():
        rendered[length] = sorted(map(_render, grams))

    def _by_count(entry):
        return (-len(entry[1]), entry[0])

    if order == "len_asc":
        sort_key = lambda entry: entry[0]
    elif order == "len_desc":
        sort_key = lambda entry: -entry[0]
    else:
        # "count_desc" and any unrecognised value
        sort_key = _by_count

    for length, strings in sorted(rendered.items(), key=sort_key):
        print(f"{length}-grams ({len(strings)}): {strings}")
        
def highest_common(common: Dict[int, Set[Tuple[Any, ...]]]) -> Tuple[int, Set[Tuple[Any, ...]]]:
    """
    Pick the longest shared n-gram length from a `common_ngrams` result.

    Returns a pair (longest_length, n-grams of that length). For an empty
    input the pair is (0, set()).
    """
    if common:
        longest = max(common)
        return longest, common[longest]
    return 0, set()


In [4]:
def largest_common_ngram_problems(
    metadata: pd.DataFrame,
    known: pd.DataFrame,
    unknown: pd.DataFrame,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    print_progress: bool = True,
) -> pd.DataFrame:
    """
    Find the longest shared n-gram for every known/unknown document pair.

    For each metadata row the known and unknown texts are looked up by doc_id
    (the first matching row wins), `common_ngrams(...)` is computed and the
    result is reduced with `highest_common(...)`.

    Parameters
    ----------
    metadata : pd.DataFrame
        Needs columns 'problem', 'known_author', 'unknown_author',
        'known_doc_id', 'unknown_doc_id'. One output row per metadata row.
    known, unknown : pd.DataFrame
        Need columns 'doc_id' and 'text'.
    n : int
        Minimum n-gram length passed to `common_ngrams`.
    model, tokenizer : Any, optional
        Passed through to `common_ngrams`.
    print_progress : bool, default True
        If True, print an in-place progress line.

    Returns
    -------
    pd.DataFrame
        Columns, in order:
        ['problem', 'known_author', 'unknown_author', 'known_doc_id',
         'unknown_doc_id', 'highest_common_count', 'highest_common_ngram'].

        - highest_common_count: length (in tokens) of the longest shared
          n-gram; 0 when nothing of length >= n is shared; None when either
          document could not be found or its text is not a string.
        - highest_common_ngram: that n-gram's tokens joined by spaces. When
          several n-grams share the maximum length the smallest one in sort
          order is reported. None when there is no shared n-gram.

    Raises
    ------
    ValueError
        If a required column is missing from any of the inputs.
    """
    required_meta_cols = [
        "problem", "known_author", "unknown_author",
        "known_doc_id", "unknown_doc_id",
    ]
    output_cols = required_meta_cols + ["highest_common_count", "highest_common_ngram"]

    missing_meta = [c for c in required_meta_cols if c not in metadata.columns]
    if missing_meta:
        raise ValueError(f"metadata missing columns: {missing_meta}")

    for df_name, df in [("known", known), ("unknown", unknown)]:
        if "doc_id" not in df.columns:
            raise ValueError(f"'{df_name}' is missing required column 'doc_id'")
        if "text" not in df.columns:
            raise ValueError(f"'{df_name}' is missing required column 'text'")

    def _first_text_by_doc_id(df: pd.DataFrame) -> Dict[Any, Any]:
        """Map each doc_id to the text of its first row (built once, not per metadata row)."""
        usable = df.dropna(subset=["doc_id"]).drop_duplicates(subset="doc_id", keep="first")
        return dict(zip(usable["doc_id"], usable["text"]))

    def _pick_ngram_string(ngrams: Any) -> Optional[str]:
        """Join one deterministically chosen n-gram with spaces; None if there is none."""
        if not ngrams:
            # An empty set used to be rendered as the literal string "set()".
            return None
        try:
            chosen = min(ngrams)
        except TypeError:
            # Tokens that cannot be ordered: fall back to an arbitrary member.
            chosen = next(iter(ngrams))
        return " ".join(map(str, chosen))

    known_texts = _first_text_by_doc_id(known)
    unknown_texts = _first_text_by_doc_id(unknown)

    rows: List[Dict[str, Any]] = []
    total = len(metadata)
    it = metadata[required_meta_cols].itertuples(index=False, name="MetaRow")

    for i, row in enumerate(it, 1):
        problem, known_author, unknown_author, known_doc_id, unknown_doc_id = row

        text_known = known_texts.get(known_doc_id)
        text_unknown = unknown_texts.get(unknown_doc_id)

        if isinstance(text_known, str) and isinstance(text_unknown, str):
            common = common_ngrams(text_known, text_unknown, n=n, model=model, tokenizer=tokenizer)
            # highest_common always returns (max_length, ngrams); (0, set()) if empty.
            count_val, ngrams_obj = highest_common(common)
            ngram_str = _pick_ngram_string(ngrams_obj)
        else:
            # Document not found, or its text is missing / not a string.
            count_val = None
            ngram_str = None

        rows.append({
            "problem": problem,
            "known_author": known_author,
            "unknown_author": unknown_author,
            "known_doc_id": known_doc_id,
            "unknown_doc_id": unknown_doc_id,
            "highest_common_count": count_val,      # length of the longest shared n-gram
            "highest_common_ngram": ngram_str,      # tokens joined by ' '
        })

        if print_progress and total:
            # Refresh roughly 50 times over the whole run, plus once at the end.
            if (i % max(1, total // 50) == 0) or (i == total):
                pct = int(i * 100 / total)
                print(f"\rProcessed {i}/{total} ({pct}%)", end="")

    if print_progress:
        print()

    # Passing the columns keeps the schema stable even when metadata is empty.
    return pd.DataFrame(rows, columns=output_cols)


In [5]:
corpus = "Enron"
data_type = "training"

# Root folder of the author-verification data for the selected split.
data_root = f"/Volumes/BCross/datasets/author_verification/{data_type}"

known_loc = f"{data_root}/{corpus}/known_raw.jsonl"
known = apply_temp_doc_id(read_jsonl(known_loc))

unknown_loc = f"{data_root}/{corpus}/unknown_raw.jsonl"
# Bind the result to `unknown`, exactly as for `known`: every later cell reads
# `unknown`, so the processed frame must live under that name.
# NOTE(review): presumably apply_temp_doc_id adds/normalises the doc_id column — confirm.
unknown = apply_temp_doc_id(read_jsonl(unknown_loc))
unknown_df = unknown  # alias kept for any code that still refers to the old name

metadata_loc = f"{data_root}/metadata.rds"
metadata = read_rds(metadata_loc)
# Keep only the problems that belong to the selected corpus.
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [6]:
# Preview the first rows of the metadata restricted to the selected corpus.
filtered_metadata.head()

Unnamed: 0,problem,corpus,known_author,unknown_author
3214,Andy.zipper vs Andy.zipper,Enron,Andy.zipper,Andy.zipper
3215,Andy.zipper vs Barry.tycholiz,Enron,Andy.zipper,Barry.tycholiz
3216,Barry.tycholiz vs Barry.tycholiz,Enron,Barry.tycholiz,Barry.tycholiz
3217,Barry.tycholiz vs Benjamin.rogers,Enron,Barry.tycholiz,Benjamin.rogers
3218,Benjamin.rogers vs Benjamin.rogers,Enron,Benjamin.rogers,Benjamin.rogers


In [7]:
# Load the tokenizer and model from the local Qwen2.5-0.5B-Instruct directory.
# NOTE(review): the (tokenizer, model) return order is taken from this unpacking — confirm against load_model.
tokenizer, model = load_model("/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct")

In [8]:
# problems = largest_common_ngram_problems(
#     agg_metadata,
#     known,
#     unknown,
#     n=2,
#     model=model,
#     tokenizer=tokenizer
# )

# write_jsonl(problems, f"/Users/user/Documents/test_data/n-gram_tracing/{corpus}_{data_type}_agg.jsonl")

In [9]:
# Read the per-document results back from disk; the cell that computes and
# writes this file is currently commented out, so the file must already exist.
problems = read_jsonl(f"/Users/user/Documents/test_data/n-gram_tracing/{corpus}_{data_type}_agg.jsonl")

In [10]:
# Preview the first rows of the per-document results.
problems.head()

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
0,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,andy_zipper_mail_1,andy_zipper_mail_2,5,. Ġi Ġam Ġworking Ġon
1,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,andy_zipper_mail_3,andy_zipper_mail_2,6,"Ġin Ġh ouston , Ġbut Ġi"
2,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,andy_zipper_mail_4,andy_zipper_mail_2,5,Ġare Ġtrying Ġto Ġaccomplish .
3,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,andy_zipper_mail_5,andy_zipper_mail_2,4,. Ġi Ġthink Ġthe
4,Andy.zipper vs Barry.tycholiz,Andy.zipper,Barry.tycholiz,andy_zipper_mail_1,barry_tycholiz_mail_2,5,. Ġi Ġdon 't Ġreally


In [11]:
# Same-author pairs, ordered from the longest shared n-gram downwards.
same_author_problems = (
    problems.loc[problems["known_author"] == problems["unknown_author"]]
    .sort_values(by="highest_common_count", ascending=False)
)
# Show the pairs whose longest shared n-gram is longer than 5 and at most 20 tokens.
same_author_problems.loc[
    same_author_problems["highest_common_count"].gt(5)
    & same_author_problems["highest_common_count"].le(20)
]

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
16,Benjamin.rogers vs Benjamin.rogers,Benjamin.rogers,Benjamin.rogers,benjamin_rogers_mail_2,benjamin_rogers_mail_1,13,. Ġplease Ġgive Ġus Ġa Ġcall Ġif Ġyou Ġhave Ġa...
124,Elizabeth.sager vs Elizabeth.sager,Elizabeth.sager,Elizabeth.sager,elizabeth_sager_mail_1,elizabeth_sager_mail_5,13,? Ġhope Ġall Ġis Ġwell Ġand Ġi 'll Ġtalk Ġto Ġ...
175,Jeff.skilling vs Jeff.skilling,Jeff.skilling,Jeff.skilling,jeff_skilling_mail_3,jeff_skilling_mail_1,11,Ġd ottie Ġk err Ġd ottie Ġk err -s olutions .
18,Benjamin.rogers vs Benjamin.rogers,Benjamin.rogers,Benjamin.rogers,benjamin_rogers_mail_4,benjamin_rogers_mail_1,10,. Ġplease Ġlet Ġme Ġknow Ġif Ġyou Ġhave Ġany Ġ...
211,Kam.keiser vs Kam.keiser,Kam.keiser,Kam.keiser,kam_keiser_mail_2,kam_keiser_mail_4,9,. Ġlet Ġme Ġknow Ġif Ġthere Ġis Ġanything Ġelse
68,D.thomas vs D.thomas,D.thomas,D.thomas,d_thomas_mail_5,d_thomas_mail_3,9,"a , Ġhow Ġare Ġyou Ġdoing Ġthis Ġmorning ?"
212,Kam.keiser vs Kam.keiser,Kam.keiser,Kam.keiser,kam_keiser_mail_3,kam_keiser_mail_4,9,Ġbut Ġi Ġwanted Ġto Ġget Ġyou Ġsomething Ġto Ġ...
85,Daren.farmer vs Daren.farmer,Daren.farmer,Daren.farmer,daren_farmer_mail_2,daren_farmer_mail_3,8,"Ġhave Ġ 1 0 , 0 0 0"
132,Errol.mclaughlin vs Errol.mclaughlin,Errol.mclaughlin,Errol.mclaughlin,errol_mclaughlin_mail_1,errol_mclaughlin_mail_3,8,"9 , Ġ 2 0 0 1 ."
135,Errol.mclaughlin vs Errol.mclaughlin,Errol.mclaughlin,Errol.mclaughlin,errol_mclaughlin_mail_5,errol_mclaughlin_mail_3,8,"9 , Ġ 2 0 0 1 ."


In [12]:
# Different-author pairs, ordered from the longest shared n-gram downwards.
diff_author_problems = (
    problems.loc[problems["known_author"] != problems["unknown_author"]]
    .sort_values(by="highest_common_count", ascending=False)
)
# Show the top 10 pairs whose longest shared n-gram has between 5 and 200 tokens.
diff_author_problems.loc[
    diff_author_problems["highest_common_count"].ge(5)
    & diff_author_problems["highest_common_count"].le(200)
].head(10)

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
188,Jeffrey.shankman vs Joannie.williamson,Jeffrey.shankman,Joannie.williamson,jeffrey_shankman_mail_4,joannie_williamson_mail_4,8,. Ġif Ġyou Ġare Ġnot Ġthe Ġintended Ġrecipient
27,Bill.williams vs Cara.semperger,Bill.williams,Cara.semperger,bill_williams_mail_4,cara_semperger_mail_4,8,. Ġplease Ġlet Ġme Ġknow Ġif Ġyou Ġhave
129,Elizabeth.sager vs Errol.mclaughlin,Elizabeth.sager,Errol.mclaughlin,elizabeth_sager_mail_2,errol_mclaughlin_mail_3,7,Ġi Ġwill Ġbe Ġout Ġof Ġthe Ġoffice
186,Jeffrey.shankman vs Joannie.williamson,Jeffrey.shankman,Joannie.williamson,jeffrey_shankman_mail_2,joannie_williamson_mail_4,7,Ġplease Ġdo Ġnot Ġhesitate Ġto Ġcontact Ġme
103,Darron.giron vs David.delainey,Darron.giron,David.delainey,darron_giron_mail_3,david_delainey_mail_3,6,Ġin Ġthe Ġnext Ġcouple Ġof Ġweeks
55,Cooper.richey vs D.steffes,Cooper.richey,D.steffes,cooper_richey_mail_1,d_steffes_mail_3,6,Ġand Ġi Ġwanted Ġto Ġmake Ġsure
219,Kate.symes vs Andy.zipper,Kate.symes,Andy.zipper,kate_symes_mail_3,andy_zipper_mail_2,6,. Ġplease Ġlet Ġme Ġknow Ġif
21,Benjamin.rogers vs Bill.williams,Benjamin.rogers,Bill.williams,benjamin_rogers_mail_4,bill_williams_mail_1,6,Ġand Ġif Ġyou Ġhave Ġany Ġquestions
128,Elizabeth.sager vs Errol.mclaughlin,Elizabeth.sager,Errol.mclaughlin,elizabeth_sager_mail_1,errol_mclaughlin_mail_3,6,. Ġif Ġyou Ġhave Ġany Ġquestions
77,Dan.hyvl vs Dana.davis,Dan.hyvl,Dana.davis,dan_hyvl_mail_4,dana_davis_mail_1,5,Ġlet Ġme Ġknow Ġif Ġthere


In [13]:
# Inspect every shared n-gram (length >= 2) for one different-author pair.
known_doc, unknown_doc = 'jeffrey_shankman_mail_4', 'joannie_williamson_mail_4'

# First row matching each doc_id supplies the text.
known_text = known.loc[known['doc_id'] == known_doc].reset_index(drop=True).loc[0, 'text']
unknown_text = unknown.loc[unknown['doc_id'] == unknown_doc].reset_index(drop=True).loc[0, 'text']

common_ngram_dict = common_ngrams(
    known_text,
    unknown_text,
    n=2,
    model=model,
    tokenizer=tokenizer,
)
pretty_print_common_ngrams(common_ngram_dict, tokenizer=tokenizer)

2-grams (28): [' a great', ' and other', ' and then', ' back in', ' e-mail', ' for the', ' i believe', ' jeff', ' me to', ' of our', ' of the', ' of this', ' on the', ' the sender', ' to be', ' to contact', ' to the', ' we will', ' week,', ' with the', ' would be', ' you have', ' you on', ', but', ', i', ', please', '. please', '9,']
3-grams (6): [' 24', ' and any attachments', ' and will be', ' please do not', ' would like to', '. i will']
4-grams (2): [' this message and any', ' this message, and']
8-grams (1): ['. if you are not the intended recipient']


## Combined Text Option

We can also concatenate all of an author's texts in the known dataframe into a single profile, and then compare the profile-level results with the per-document results above.

In [14]:
from typing import Iterable, Sequence, Union
import pandas as pd

def concat_text_by_ids(
    doc_ids: Iterable,
    df: pd.DataFrame,
    id_col: str = "doc_id",
    text_col: str = "text",
    sep: str = "\n",
    unique: bool = False,   # de-dupe doc_ids while preserving order
    strict: bool = False,   # raise if any doc_id is missing
    dropna: bool = True,    # drop rows where text is NaN
) -> str:
    """
    Concatenate the texts belonging to `doc_ids` into one string.

    - The output follows the order of `doc_ids`.
    - All rows of one doc_id are joined with `sep` (in row order) to form a
      block; the blocks are then joined with `sep` as well.
    - `unique=True` keeps only the first occurrence of each requested id.
    - `dropna=True` ignores rows whose text is NaN.
    - Ids not present in `df` are skipped, unless `strict=True`, in which case
      a KeyError listing them is raised.
    """
    # dict.fromkeys keeps the first occurrence of each id, in order.
    requested = list(dict.fromkeys(doc_ids)) if unique else doc_ids

    source = df[df[text_col].notna()].copy() if dropna else df

    # doc_id -> list of its texts, in row order
    texts_by_id = source.groupby(id_col, sort=False)[text_col].apply(list).to_dict()

    blocks = []
    not_found = []
    for doc_id in requested:
        found = texts_by_id.get(doc_id)
        if found is None:
            not_found.append(doc_id)
        else:
            blocks.append(sep.join(map(str, found)))

    if strict and missing_ids_present(not_found) if False else (strict and not_found):
        raise KeyError(f"Missing doc_ids in dataframe: {not_found}")

    return sep.join(blocks)

def concat_text_by_group(
    df: pd.DataFrame,
    group_cols: Union[str, Sequence[str]] = "author",
    text_col: str = "text",
    sep: str = "\n",
    dropna: bool = True,
    keep_group_order: bool = True,              # keep first-seen group order
    keep_row_order_within_group: bool = True,   # keep row order inside each group
    output_col: str = "concat_text",
) -> pd.DataFrame:
    """
    Group `df` by one or more columns and concatenate each group's `text_col`
    joined by `sep`. Returns a DataFrame with the group columns plus `output_col`.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    group_cols : str | Sequence[str]
        A single column name or a list/tuple of column names to group by.
    text_col : str
        Column holding the texts; values are converted to str before joining.
    sep : str
        Separator placed between the texts of one group.
    dropna : bool
        If True, rows whose text is NaN are ignored.
    keep_group_order : bool
        If True, groups appear in first-seen order; otherwise sorted by key.
    keep_row_order_within_group : bool
        If True, texts keep their row order inside each group; otherwise they
        are sorted lexicographically inside each group. This flag does not
        affect the order of the groups themselves.
    output_col : str
        Name of the column holding the concatenated text.

    Raises
    ------
    KeyError
        If any of `group_cols` or `text_col` is not a column of `df`; the
        message lists the missing names in the order they were requested.
    """
    # Normalize group_cols to a list
    group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Validate columns (dict.fromkeys de-duplicates while keeping the order,
    # so the error message is deterministic).
    needed = list(dict.fromkeys(group_cols + [text_col]))
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns in DataFrame: {missing}")

    selected = df[group_cols + [text_col]]
    if dropna:
        selected = selected[selected[text_col].notna()]

    # Copy AFTER filtering: assigning a column on a boolean-filtered frame
    # triggers SettingWithCopyWarning on pandas without copy-on-write.
    texts = selected.copy()
    texts[text_col] = texts[text_col].astype(str)

    def _join_group(series: pd.Series) -> str:
        values = series.tolist()
        if not keep_row_order_within_group:
            # Sort inside the group only, so the group order is left alone.
            values = sorted(values)
        return sep.join(values)

    # Group and concatenate
    out = (
        texts.groupby(group_cols, sort=not keep_group_order)[text_col]
             .apply(_join_group)
             .reset_index()
             .rename(columns={text_col: output_col})
    )
    return out



In [15]:
# Build one concatenated text per (corpus, author, texttype) group of the known documents.
known_profile = concat_text_by_group(known, group_cols=['corpus', 'author', 'texttype'])

In [16]:
# known_author = "Jeffrey.shankman"
# unknown_author = "Joannie.williamson"

# known_docs = known[known['author'] == known_author]['doc_id'].unique().tolist()
# known_text = concat_text_by_ids(known_docs, known)

# unknown_text = unknown[unknown['author'] == unknown_author].reset_index().loc[0, 'text']

# common_ngram_dict = common_ngrams(known_text, unknown_text, n=2, model=model, tokenizer=tokenizer)
# pretty_print_common_ngrams(common_ngram_dict, tokenizer=tokenizer)

In [17]:
from typing import Any, Dict, List, Optional, Sequence, Tuple
from collections import defaultdict
import pandas as pd

def largest_common_ngram_profile_problems(
    metadata: pd.DataFrame,
    known: pd.DataFrame,
    unknown: pd.DataFrame,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    lowercase: bool = True,         # passed through to common_ngrams
    include_subgrams: bool = False, # passed through to common_ngrams
    sep: str = "\n",                # newline separator when concatenating
    print_progress: bool = True,
) -> pd.DataFrame:
    """
    Find the longest shared n-gram between author-level text profiles.

    For each metadata row (problem, known_author, unknown_author):
      1) In `known`, filter to `known_author`, then group by (corpus, author, texttype).
         Concatenate each group's texts (preserving row order) with `sep`, then
         concatenate those group blobs (also with `sep`) to make ONE big `known_text`.
      2) In `unknown`, filter to `unknown_author`. If multiple rows exist, concatenate with `sep`
         to make ONE big `unknown_text`.
      3) Compute common_ngrams(known_text, unknown_text, n, ...) and reduce it with
         `highest_common`.

    Rows with NaN text are ignored when building the profiles. The profile of
    an author is built once and reused for every problem that mentions them.

    Parameters
    ----------
    metadata : pd.DataFrame
        Needs columns 'problem', 'known_author', 'unknown_author'.
    known : pd.DataFrame
        Needs columns 'corpus', 'author', 'texttype', 'text'.
    unknown : pd.DataFrame
        Needs columns 'author', 'text'.
    n : int
        Minimum n-gram length passed to `common_ngrams`.
    model, tokenizer, lowercase, include_subgrams
        Passed through to `common_ngrams`.
    sep : str
        Separator used when concatenating texts.
    print_progress : bool
        If True, print an in-place progress line.

    Returns
    -------
    pd.DataFrame
        Columns in this exact order:
          ['problem','known_author','unknown_author',
           'known_text','unknown_text',
           'highest_common_count','highest_common_ngram']

        - highest_common_count: length (in tokens) of the longest shared n-gram.
        - highest_common_ngram: that n-gram's tokens joined by spaces (the
          smallest in sort order when several share the maximum length).
        Both are None when either profile is missing/empty or nothing of
        length >= n is shared.

    Raises
    ------
    ValueError
        If a required column is missing from any of the inputs.
    """

    # --- validations ---
    required_meta = ["problem", "known_author", "unknown_author"]
    miss_meta = [c for c in required_meta if c not in metadata.columns]
    if miss_meta:
        raise ValueError(f"`metadata` missing columns: {miss_meta}")

    known_cols = ["corpus", "author", "texttype", "text"]
    unknown_cols = ["author", "text"]
    for df_name, df, req_cols in [
        ("known", known, known_cols),
        ("unknown", unknown, unknown_cols),
    ]:
        missing = [c for c in req_cols if c not in df.columns]
        if missing:
            raise ValueError(f"`{df_name}` missing columns: {missing}")

    # --- helpers ---
    def _pick_ngram_string(ngrams: Any) -> Optional[str]:
        """Pick a deterministic n-gram and join tokens with spaces."""
        if not ngrams:
            return None
        try:
            chosen = min(ngrams)
        except TypeError:
            # Tokens that cannot be ordered: fall back to an arbitrary member.
            chosen = next(iter(ngrams))
        return " ".join(map(str, chosen))

    def _author_texts(frame: pd.DataFrame, author: Any, cols: List[str]) -> pd.DataFrame:
        """Rows of `author` with non-null text, text cast to str."""
        sub = frame.loc[frame["author"] == author, cols]
        # Copy after filtering so the column assignment below acts on an
        # independent frame (avoids SettingWithCopyWarning).
        sub = sub.loc[sub["text"].notna()].copy()
        sub["text"] = sub["text"].astype(str)
        return sub

    def _concat_known_for_author(author: Any) -> Optional[str]:
        ksub = _author_texts(known, author, known_cols)
        if ksub.empty:
            return None
        # First-seen order for groups and rows within groups
        blobs = [
            sep.join(g["text"].tolist())
            for _, g in ksub.groupby(["corpus", "author", "texttype"], sort=False)
        ]
        parts = [blob for blob in blobs if blob]
        return sep.join(parts) if parts else None

    def _concat_unknown_for_author(author: Any) -> Optional[str]:
        usub = _author_texts(unknown, author, unknown_cols)
        if usub.empty:
            return None
        return sep.join(usub["text"].tolist())

    # Authors recur across problems; build each profile only once.
    known_cache: Dict[Any, Optional[str]] = {}
    unknown_cache: Dict[Any, Optional[str]] = {}

    # --- main loop ---
    rows: List[Dict[str, Any]] = []
    total = len(metadata)
    it = metadata[required_meta].itertuples(index=False, name="MetaRow")

    for i, meta_row in enumerate(it, 1):
        problem, known_author, unknown_author = meta_row

        if known_author not in known_cache:
            known_cache[known_author] = _concat_known_for_author(known_author)
        if unknown_author not in unknown_cache:
            unknown_cache[unknown_author] = _concat_unknown_for_author(unknown_author)
        known_text = known_cache[known_author]
        unknown_text = unknown_cache[unknown_author]

        count_val = None
        ngram_str = None
        if known_text and unknown_text:
            # Called without a TypeError fallback on purpose: retrying without
            # `lowercase`/`include_subgrams` would silently ignore the caller's
            # options and could hide genuine TypeErrors.
            common = common_ngrams(
                known_text,
                unknown_text,
                n=n,
                model=model,
                tokenizer=tokenizer,
                include_subgrams=include_subgrams,
                lowercase=lowercase,
            )
            if common:
                # (length of the longest shared n-gram, n-grams of that length)
                count_val, ngrams_obj = highest_common(common)
                ngram_str = _pick_ngram_string(ngrams_obj)

        rows.append({
            "problem": problem,
            "known_author": known_author,
            "unknown_author": unknown_author,
            "known_text": known_text,
            "unknown_text": unknown_text,
            "highest_common_count": count_val,
            "highest_common_ngram": ngram_str,
        })

        if print_progress and total:
            # Refresh roughly 50 times over the whole run, plus once at the end.
            if (i % max(1, total // 50) == 0) or (i == total):
                pct = int(i * 100 / total)
                print(f"\rProcessed {i}/{total} ({pct}%)", end="")

    if print_progress:
        print()

    # Ensure column order as requested
    cols = [
        "problem", "known_author", "unknown_author",
        "known_text", "unknown_text",
        "highest_common_count", "highest_common_ngram",
    ]
    df_out = pd.DataFrame(rows)
    return df_out.reindex(columns=cols)

In [None]:
# Run the profile-level comparison: one row per problem in `filtered_metadata`,
# comparing the concatenated known texts of `known_author` with the
# concatenated unknown texts of `unknown_author` (minimum n-gram length 2).
profile_problems = largest_common_ngram_profile_problems(
    metadata=filtered_metadata,
    known=known,
    unknown=unknown,
    n=2,
    model=model,
    tokenizer=tokenizer,
    lowercase=True,
    include_subgrams=False,
    sep="\n",
    print_progress=True,
)

In [None]:
# Save the profile-level results; the file name is derived from `corpus` and `data_type`.
write_jsonl(profile_problems, f"/Users/user/Documents/test_data/n-gram_tracing/{corpus}_{data_type}_profile.jsonl")

In [None]:
# Display the profile-level results.
# NOTE(review): the recorded output below lists authors such as "142.196.88.228"
# and "A_Man_In_Black", unlike the Enron authors shown in the earlier cells —
# confirm which `corpus` value this output was produced with.
profile_problems

Unnamed: 0,problem,known_author,unknown_author,known_text,unknown_text,highest_common_count,highest_common_ngram
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,The article that is being referred to via the ...,"Furthermore, given the nearly parallel emergen...",89,Ġlove Ġthe Ġway Ġthe Ġarticle Ġincludes Ġthe Ġ...
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,The article that is being referred to via the ...,Leopold Amery also notes in his diary on Aug 4...,3,".Ċ also ,"
2,A_Man_In_Black vs A_Man_In_Black,A_Man_In_Black,A_Man_In_Black,Nobody's seen fit to comment on these organiza...,These characters frequently wield a large gun ...,52,"i Ġthink Ġan Ġimage Ġof Ġa Ġbusty , Ġlong -leg..."
3,A_Man_In_Black vs Bankhallbretherton,A_Man_In_Black,Bankhallbretherton,Nobody's seen fit to comment on these organiza...,Maybe you need to realise that you need to kno...,3,"Ġat Ġall ,"
4,Aban1313 vs Aban1313,Aban1313,Aban1313,Oh.and enjoy our 10 billion donation to the IM...,Leopold Amery also notes in his diary on Aug 4...,6,.Ċ aban 1 3 1 3
...,...,...,...,...,...,...,...
145,Haymaker vs HeadleyDown,Haymaker,HeadleyDown,Even if you choose not to have any faith in th...,I'm just wondering how unique it may or may no...,3,.Ċ i 'm
146,HeadleyDown vs HeadleyDown,HeadleyDown,HeadleyDown,I made these points Remove redundancy Meta mod...,I'm just wondering how unique it may or may no...,4,.Ċ the Ġn lp
147,HeadleyDown vs Hipocrite,HeadleyDown,Hipocrite,I made these points Remove redundancy Meta mod...,Elect someone who gets all of this to ArbCom.\...,5,Ġon Ġthis Ġtalk Ġpage .Ċ
148,Hipocrite vs Hipocrite,Hipocrite,Hipocrite,It appears that reliable sources are using the...,Elect someone who gets all of this to ArbCom.\...,4,Ġi Ġdon 't Ġsee


In [None]:
# Same-author profile pairs, ordered from the longest shared n-gram downwards.
same_author_problems = (
    profile_problems.loc[profile_problems["known_author"] == profile_problems["unknown_author"]]
    .sort_values(by="highest_common_count", ascending=False)
)
# Show the top 20 pairs whose longest shared n-gram is longer than 3 and at most 10 tokens.
same_author_problems.loc[
    same_author_problems["highest_common_count"].gt(3)
    & same_author_problems["highest_common_count"].le(10)
].head(20)

Unnamed: 0,problem,known_author,unknown_author,known_text,unknown_text,highest_common_count,highest_common_ngram
126,Greg_L vs Greg_L,Greg_L,Greg_L,It amounts to obviousness obscured with talk o...,It s a practice I suspect would benefit Wikipe...,9,", Ġthey Ġshould Ġhave Ġparticipated Ġin Ġthe Ġ..."
144,Haymaker vs Haymaker,Haymaker,Haymaker,Even if you choose not to have any faith in th...,"See, I can play that game to, consensus is sol...",9,"Ġat Ġthe Ġend Ġof Ġthe Ġday , Ġwe 're"
56,Cla68 vs Cla68,Cla68,Cla68,More insults and confrontational language dire...,Tempers appear to be getting a little short in...,8,i Ġagree Ġwith Ġnorth 8 0 0 0
112,Fixentries vs Fixentries,Fixentries,Fixentries,Given that any of it came from electronic medi...,I will help on the article but I haven't been ...,8,Ġthe Ġindividual Ġher it ability Ġof Ġintellig...
82,DonaNobisPacem vs DonaNobisPacem,DonaNobisPacem,DonaNobisPacem,It points out the medical community does not u...,Agreed that this is crucial to point out - the...,8,Ġafter Ġ 1 8 - 2 0 Ġweeks
118,Fragments_of_Jade vs Fragments_of_Jade,Fragments_of_Jade,Fragments_of_Jade,"You have no room to talk about it, since you'v...",You have been stalking me all over Wikipedia a...,8,"Ġme , Ġand Ġit 's Ġgetting Ġold .Ċ"
58,Classicjupiter2 vs Classicjupiter2,Classicjupiter2,Classicjupiter2,"I have been editing on here for years, sir, pl...",Its just a trolling attempt at flamebait.\nI w...,7,Ġon Ġthe Ġsurreal ism Ġdiscussion Ġpage .Ċ
130,Gwen_Gale vs Gwen_Gale,Gwen_Gale,Gwen_Gale,"As an aside, throughout that war, by far most ...","I linked to above, seems off by a few years - ...",6,"Ġspeaking Ġonly Ġfor Ġmyself , Ġi"
110,Fipplet vs Fipplet,Fipplet,Fipplet,I don't edit any other articles for the moment...,Some extracts There is not a supermajority vie...,6,Ġdoesn 't Ġmean Ġwe Ġshould Ġstate
90,EdJohnston vs EdJohnston,EdJohnston,EdJohnston,"Thank you, If you have limited space and you c...","Also, people who are aggrieved want a chance t...",6,Ġdon 't Ġsee Ġany Ġreason Ġto


In [None]:
# Different-author profile pairs, ordered from the longest shared n-gram downwards.
diff_author_problems = (
    profile_problems.loc[profile_problems["known_author"] != profile_problems["unknown_author"]]
    .sort_values(by="highest_common_count", ascending=False)
)
# Show the top 20 pairs whose longest shared n-gram is longer than 3 and at most 10 tokens.
diff_author_problems.loc[
    diff_author_problems["highest_common_count"].gt(3)
    & diff_author_problems["highest_common_count"].le(10)
].head(20)

Unnamed: 0,problem,known_author,unknown_author,known_text,unknown_text,highest_common_count,highest_common_ngram
147,HeadleyDown vs Hipocrite,HeadleyDown,Hipocrite,I made these points Remove redundancy Meta mod...,Elect someone who gets all of this to ArbCom.\...,5,Ġon Ġthis Ġtalk Ġpage .Ċ
131,Gwen_Gale vs Habap,Gwen_Gale,Habap,"As an aside, throughout that war, by far most ...",I had no idea he'd polled better with the radi...,5,", Ġbut Ġi Ġdon 't"
121,Fyunck(click) vs Garda40,Fyunck(click),Garda40,But I will always look to make things better f...,Well considering the fact that edit summaries ...,5,Ġ 2 0 1 1
53,Caboga vs Chanakyathegreat,Caboga,Chanakyathegreat,Direktor you might have forgotten this edit la...,The report from the government of India about ...,5,Ġ 2 0 0 8
87,Dweller vs Ecelan,Dweller,Ecelan,Take your pick from hundreds of non-specialist...,"As conclusion, Mogroviejo states that no gover...",5,.Ċ i Ġdon 't Ġthink
95,Enemesis vs Equanimous1,Enemesis,Equanimous1,That link that siafu left sort of backs up wha...,If you have graduated with a doctorate in heal...,5,Ġthe Ġcurrent Ġstate Ġof Ġthe
75,David_Shankbone vs Delicious_carbuncle,David_Shankbone,Delicious_carbuncle,I definitely think the Met Opera one looks muc...,"Lord G n, once again, this particular discussi...",4,.Ċ i 'm Ġnot
89,Ecelan vs EdJohnston,Ecelan,EdJohnston,"You delete a reference by a scholar, one of th...","Also, people who are aggrieved want a chance t...",4,.Ċ i Ġdon 't
57,Cla68 vs Classicjupiter2,Cla68,Classicjupiter2,More insults and confrontational language dire...,Its just a trolling attempt at flamebait.\nI w...,4,", Ġin Ġorder Ġto"
67,Cptnono vs CyberAnth,Cptnono,CyberAnth,Further edit warring without using the talk pa...,"Yes, I can understand how that would feel.\nIf...",4,Ġto Ġfix Ġit .Ċ
