In [None]:
import sys
import os
import re
import itertools

import pandas as pd

from typing import Any, Dict, List, Sequence, Set, Tuple
from collections import defaultdict

In [7]:
sys.path.append(os.path.abspath('../../src'))

from read_and_write_docs import read_jsonl, read_rds
from tokenize_and_score import load_model
from utils import apply_temp_doc_id, build_metadata_df

In [8]:
def common_ngrams(
    text1: str,
    text2: str,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    include_subgrams: bool = False,
) -> Dict[int, Set[Tuple[Any, ...]]]:
    """
    Return the n-grams of length >= n that occur in both texts, keyed by length.

    Tokenisation:
      - If BOTH `model` and `tokenizer` are given, the tokenizer is used
        (`tokenizer.tokenize` when it exists; otherwise the tokenizer is called
        and its `input_ids` are converted back to tokens when the tokenizer
        offers `convert_ids_to_tokens`). `model` only acts as part of this
        switch; it is never called here.
      - Otherwise each text is casefolded and split into runs of word
        characters, so punctuation (including apostrophes) separates tokens.

    Args:
        text1, text2: the two texts to compare.
        n: minimum n-gram length to report (must be >= 1).
        model, tokenizer: optional; see "Tokenisation" above.
        include_subgrams: if False (default), drop every shared n-gram that is
            a contiguous sub-span of a longer shared n-gram (so a 5-gram that
            is part of a shared 6-gram is excluded; unrelated 5-grams remain).
            If True, return every shared n-gram.

    Returns:
        Dict mapping length k to the set of shared k-grams (tuples of tokens).
        Lengths with no (remaining) shared n-grams are omitted.

    Raises:
        ValueError: if n < 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    def _word_tokens(s: str) -> List[str]:
        return re.findall(r"\w+", s.casefold())

    def _hf_tokens(txt: str) -> List[Any]:
        if hasattr(tokenizer, "tokenize"):
            return list(tokenizer.tokenize(txt))
        enc = tokenizer(
            txt,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        input_ids = enc.get("input_ids", [])
        # Batched encodings nest the ids one level deeper; take the first item.
        if input_ids and isinstance(input_ids[0], (list, tuple)):
            input_ids = input_ids[0]
        if hasattr(tokenizer, "convert_ids_to_tokens"):
            return tokenizer.convert_ids_to_tokens(input_ids)
        return input_ids

    def _ngram_set(seq: Sequence[Any], k: int) -> Set[Tuple[Any, ...]]:
        # Empty when k exceeds len(seq) because the range is then empty.
        return {tuple(seq[i : i + k]) for i in range(len(seq) - k + 1)}

    token_mode = (model is not None) and (tokenizer is not None)
    seq1 = _hf_tokens(text1) if token_mode else _word_tokens(text1)
    seq2 = _hf_tokens(text2) if token_mode else _word_tokens(text2)

    # Grow k one step at a time and stop at the first length with nothing in
    # common: every (k+1)-gram shared by both texts contains a shared k-gram,
    # so once a length is empty all longer lengths are empty too. This avoids
    # materialising every n-gram of every length up to the document length.
    common: Dict[int, Set[Tuple[Any, ...]]] = {}
    upper = min(len(seq1), len(seq2))
    k = n
    while k <= upper:
        shared = _ngram_set(seq1, k) & _ngram_set(seq2, k)
        if not shared:
            break
        common[k] = shared
        k += 1

    if include_subgrams or not common:
        return common

    # A k-gram lies inside some longer shared n-gram exactly when it is the
    # prefix or suffix of a shared (k+1)-gram (any longer shared span also
    # contributes its (k+1)-sub-spans to `common[k + 1]`). So only the
    # next-longer level of the unpruned result has to be inspected.
    pruned: Dict[int, Set[Tuple[Any, ...]]] = {}
    for length, grams in common.items():
        longer = common.get(length + 1)
        if longer:
            covered = {g[:-1] for g in longer} | {g[1:] for g in longer}
            grams = grams - covered
        if grams:
            pruned[length] = grams

    return pruned


def highest_common(common: Dict[int, Set[Tuple[Any, ...]]]) -> Tuple[int, Set[Tuple[Any, ...]]]:
    """
    Pick the longest length present in a `common_ngrams` result.

    Returns `(length, ngrams)` for the largest key of `common`, or
    `(0, set())` when `common` is empty.
    """
    if common:
        longest = max(common)
        return longest, common[longest]
    return 0, set()

In [9]:
def largest_common_ngram_problems(
    metadata: pd.DataFrame,
    known: pd.DataFrame,
    unknown: pd.DataFrame,
    n: int,  # minimum n-gram length handed to common_ngrams
    model: Any = None,
    tokenizer: Any = None,
    print_progress: bool = True,
) -> pd.DataFrame:
    """
    Find the longest shared n-gram for every known/unknown document pair in `metadata`.

    For each metadata row the first `text` whose `doc_id` equals `known_doc_id`
    (in `known`) and `unknown_doc_id` (in `unknown`) is looked up, the pair is
    passed to `common_ngrams(...)`, and `highest_common(...)` selects the
    longest shared length.

    Args:
        metadata: needs the columns 'problem', 'known_author', 'unknown_author',
            'known_doc_id' and 'unknown_doc_id'.
        known, unknown: each needs the columns 'doc_id' and 'text'.
        n: minimum n-gram length.
        model, tokenizer: forwarded to `common_ngrams` (token mode is used only
            when both are given).
        print_progress: print a single-line progress counter while running.

    Returns:
        DataFrame with one row per metadata row and the columns
          ['problem', 'known_author', 'unknown_author', 'known_doc_id',
           'unknown_doc_id', 'highest_common_count', 'highest_common_ngram']
        where
          - highest_common_count is the longest shared length, 0 when nothing
            of length >= n is shared, or None when either document is missing;
          - highest_common_ngram is one n-gram of that length (the smallest in
            sort order) joined by spaces, or None when there is none.

    Raises:
        ValueError: if a required column is missing from any input frame.
    """
    required_meta_cols = [
        "problem", "known_author", "unknown_author",
        "known_doc_id", "unknown_doc_id",
    ]
    missing_meta = [c for c in required_meta_cols if c not in metadata.columns]
    if missing_meta:
        raise ValueError(f"metadata missing columns: {missing_meta}")

    for df_name, df in [("known", known), ("unknown", unknown)]:
        if "doc_id" not in df.columns:
            raise ValueError(f"'{df_name}' is missing required column 'doc_id'")
        if "text" not in df.columns:
            raise ValueError(f"'{df_name}' is missing required column 'text'")

    def _first_text_by_doc_id(df: pd.DataFrame) -> Dict[Any, Any]:
        # One pass per frame instead of one boolean filter per metadata row.
        # Keeps the first row per doc_id; NaN ids are dropped because they
        # never compared equal to anything in the per-row filter either.
        firsts = df.dropna(subset=["doc_id"]).drop_duplicates(subset="doc_id", keep="first")
        return dict(zip(firsts["doc_id"], firsts["text"]))

    def _pick_ngram_string(ngrams: Set[Tuple[Any, ...]]) -> Any:
        # Deterministically choose one n-gram from the set and join its tokens.
        if not ngrams:
            return None
        try:
            chosen = min(ngrams)
        except TypeError:
            # Tokens of mixed types cannot be ordered; compare their string forms.
            chosen = min(ngrams, key=lambda g: tuple(map(str, g)))
        return " ".join(map(str, chosen))

    known_texts = _first_text_by_doc_id(known)
    unknown_texts = _first_text_by_doc_id(unknown)

    rows: List[Dict[str, Any]] = []
    total = len(metadata)
    step = max(1, total // 50)  # refresh the progress line roughly 50 times
    it = metadata[required_meta_cols].itertuples(index=False, name="MetaRow")

    for i, row in enumerate(it, 1):
        problem, known_author, unknown_author, known_doc_id, unknown_doc_id = row

        if known_doc_id not in known_texts or unknown_doc_id not in unknown_texts:
            # At least one side of the pair has no document: nothing to compare.
            count_val = None
            ngram_str = None
        else:
            common = common_ngrams(
                known_texts[known_doc_id],
                unknown_texts[unknown_doc_id],
                n=n,
                model=model,
                tokenizer=tokenizer,
            )
            count_val, ngrams_at_max = highest_common(common)
            ngram_str = _pick_ngram_string(ngrams_at_max)

        rows.append({
            "problem": problem,
            "known_author": known_author,
            "unknown_author": unknown_author,
            "known_doc_id": known_doc_id,
            "unknown_doc_id": unknown_doc_id,
            "highest_common_count": count_val,      # longest shared length
            "highest_common_ngram": ngram_str,      # tokens joined by ' '
        })

        if print_progress and ((i % step == 0) or (i == total)):
            pct = int(i * 100 / total)
            print(f"\rProcessed {i}/{total} ({pct}%)", end="")

    if print_progress:
        print()

    # Explicit columns so an empty `metadata` still yields the documented schema.
    out_cols = required_meta_cols + ["highest_common_count", "highest_common_ngram"]
    return pd.DataFrame(rows, columns=out_cols)


In [10]:
# Wiki training corpus: raw known documents, tagged with temporary doc ids.
known_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

# One pre-split known document, kept as plain text for spot checks.
# Uses its own path variable so `known_loc` keeps pointing at the raw corpus.
known_split_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/hipocrite_text_3.jsonl"
known_text = read_jsonl(known_split_loc).loc[0, 'text']

# Raw unknown documents, handled the same way as `known`: later cells filter
# `unknown` on 'doc_id', so the tagged frame is bound to `unknown` itself.
unknown_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
unknown = apply_temp_doc_id(unknown)
unknown_df = unknown  # alias kept for cells that still refer to `unknown_df`

unknown_text = unknown.loc[unknown['doc_id'] == 'hipocrite_text_4'].reset_index(drop=True).loc[0, 'text']

# Problem metadata restricted to the Wiki corpus, joined against both frames.
metadata_loc = "/Volumes/BCross/datasets/author_verification/training/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == 'Wiki']
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

In [9]:
# Longest shared word n-gram (length >= 2) for every Wiki known/unknown pair.
wiki_problems = largest_common_ngram_problems(
    metadata=agg_metadata,
    known=known,
    unknown=unknown,
    n=2,
)

Processed 450/450 (100%)


In [10]:
# Same-author problems, longest shared n-gram first. sort_values returns a new
# frame here; sorting the filtered slice in place is what raised the
# SettingWithCopyWarning.
same_author_problems = (
    wiki_problems[wiki_problems['known_author'] == wiki_problems['unknown_author']]
    .sort_values(["highest_common_count"], ascending=[False])
)
same_author_problems[(same_author_problems['highest_common_count'] > 3) & (same_author_problems['highest_common_count'] <= 10)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
379,Greg_L vs Greg_L,Greg_L,Greg_L,greg_l_text_11,greg_l_text_10,9,allege harbor views that are contrary to the c...
432,Haymaker vs Haymaker,Haymaker,Haymaker,haymaker_text_3,haymaker_text_2,8,at the end of the day we re
270,EdJohnston vs EdJohnston,EdJohnston,EdJohnston,edjohnston_text_2,edjohnston_text_5,7,i don t see any reason to
72,Atama vs Atama,Atama,Atama,atama_text_1,atama_text_5,7,the article there shouldn t be any
331,Fipplet vs Fipplet,Fipplet,Fipplet,fipplet_text_2,fipplet_text_5,6,doesn t mean we should state
...,...,...,...,...,...,...,...
356,Fragments_of_Jade vs Fragments_of_Jade,Fragments_of_Jade,Fragments_of_Jade,fragments_of_jade_text_4,fragments_of_jade_text_10,4,on my talk page
421,Hardyplants vs Hardyplants,Hardyplants,Hardyplants,hardyplants_text_2,hardyplants_text_4,4,that it is a
367,Garda40 vs Garda40,Garda40,Garda40,garda40_text_3,garda40_text_1,4,is not an issue
373,George vs George,George,George,george_text_10,george_text_13,4,i don t have


In [11]:
# Different-author problems, longest shared n-gram first. sort_values returns a
# new frame here; sorting the filtered slice in place is what raised the
# SettingWithCopyWarning.
diff_author_problems = (
    wiki_problems[wiki_problems['known_author'] != wiki_problems['unknown_author']]
    .sort_values(["highest_common_count"], ascending=[False])
)
diff_author_problems[(diff_author_problems['highest_common_count'] > 2) & (diff_author_problems['highest_common_count'] <= 10)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
135,Britmax vs BrotherDarksoul,Britmax,BrotherDarksoul,britmax_text_1,brotherdarksoul_text_4,5,there is no need to
328,Falcon9x5 vs Fipplet,Falcon9x5,Fipplet,falcon9x5_text_3,fipplet_text_5,5,i don t know if
287,Enemesis vs Equanimous1,Enemesis,Equanimous1,enemesis_text_3,equanimous1_text_5,5,the current state of the
286,Enemesis vs Equanimous1,Enemesis,Equanimous1,enemesis_text_2,equanimous1_text_5,5,the current state of the
364,Fyunck(click) vs Garda40,Fyunck(click),Garda40,fyunck_click_text_12,garda40_text_1,4,i m not sure
...,...,...,...,...,...,...,...
27,AlanBarnet vs Alanyst,AlanBarnet,Alanyst,alanbarnet_text_10,alanyst_text_10,3,out of the
75,Atama vs Athenean,Atama,Athenean,atama_text_1,athenean_text_5,3,a lot of
166,Chanakyathegreat vs Cla68,Chanakyathegreat,Cla68,chanakyathegreat_text_11,cla68_text_1,3,added to the
29,AlanBarnet vs Alanyst,AlanBarnet,Alanyst,alanbarnet_text_4,alanyst_text_10,3,in their own


## Pick a test

Choose a same-author test and a different-author test that share at least one author.

In [20]:
# Same-author pair: two Fipplet documents.
same_author_known = known.loc[known['doc_id'] == 'fipplet_text_2'].reset_index(drop=True).loc[0, 'text']
same_author_unknown = unknown.loc[unknown['doc_id'] == 'fipplet_text_5'].reset_index(drop=True).loc[0, 'text']

# Different-author pair: a Falcon9x5 document against the same Fipplet unknown.
diff_author_known = known.loc[known['doc_id'] == 'falcon9x5_text_3'].reset_index(drop=True).loc[0, 'text']
diff_author_unknown = unknown.loc[unknown['doc_id'] == 'fipplet_text_5'].reset_index(drop=True).loc[0, 'text']

In [27]:
# Shared word n-grams (length >= 2, sub-grams removed) for both test pairs.
same_author_common = common_ngrams(text1=same_author_known, text2=same_author_unknown, n=2)
diff_author_common = common_ngrams(text1=diff_author_known, text2=diff_author_unknown, n=2)

In [28]:
# Display the shared n-grams of the different-author pair, grouped by length.
diff_author_common

{2: {('about', 'the'),
  ('at', 'least'),
  ('been', 'a'),
  ('calling', 'it'),
  ('definition', 'of'),
  ('if', 'you'),
  ('is', 'the'),
  ('isn', 't'),
  ('it', 's'),
  ('that', 'the'),
  ('that', 'was'),
  ('this', 'is'),
  ('this', 'to'),
  ('to', 'be'),
  ('want', 'to'),
  ('was', 'the'),
  ('with', 'the')},
 3: {('part', 'of', 'the')},
 4: {('i', 'don', 't', 'think')},
 5: {('i', 'don', 't', 'know', 'if')}}

In [29]:
# Longest shared length and its n-grams for the different-author pair.
highest_common(diff_author_common)

(5, {('i', 'don', 't', 'know', 'if')})

In [None]:
def largest_common_ngram_df(
    directory: str,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    ordered_pairs: bool = False,
    print_progress: bool = True,
) -> pd.DataFrame:
    """
    Compare every pair of .jsonl documents in `directory` and report, per pair,
    the longest n-gram length (>= n) they share together with all shared
    n-grams of that length.

    Returns a DataFrame with the columns
      ['text_1', 'text_2', 'largest_n', 'largest_ngrams']
    sorted by `largest_n` (descending), then `text_1`, `text_2`.
    `largest_ngrams` is a sorted list of space-joined n-grams; pairs sharing
    nothing of length >= n get `largest_n == 0` and an empty list.

    Notes:
      - Each file is read with `read_jsonl(path)` and only row 0 of its 'text'
        column is used.
      - Tokenizer mode is used only when BOTH `model` and `tokenizer` are
        given; otherwise texts are casefolded and split on non-word characters.
      - Documents are tokenized once and their n-gram sets are cached per
        length, so the sets are reused across pairs.
      - `ordered_pairs=True` also emits (B, A) for every (A, B).
      - `print_progress=False` silences all progress output.

    Raises:
      ValueError: if n < 1, fewer than two .jsonl files are found, or a file
        has no 'text' column or no rows.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    # Only .jsonl files (extension matched case-insensitively), in sorted order.
    paths = sorted(
        os.path.join(directory, fn)
        for fn in os.listdir(directory)
        if fn.lower().endswith(".jsonl")
    )
    if len(paths) < 2:
        raise ValueError("Need at least two .jsonl files in the directory.")

    if print_progress:
        print(f"Found {len(paths)} .jsonl files in: {directory}")

    # Read the first row's text of every file before any tokenization starts.
    texts: Dict[str, str] = {}
    for p in paths:
        frame = read_jsonl(p)
        if "text" not in frame.columns:
            raise ValueError(f"'text' column not found in {p}")
        if frame.shape[0] < 1:
            raise ValueError(f"No rows found in {p}")
        texts[p] = str(frame.loc[0, "text"])

    def _word_tokens(s: str) -> List[str]:
        return re.findall(r"\w+", s.casefold())

    def _hf_tokens(txt: str) -> List[Any]:
        if hasattr(tokenizer, "tokenize"):
            return list(tokenizer.tokenize(txt))
        enc = tokenizer(
            txt,
            add_special_tokens=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        ids = enc.get("input_ids", [])
        # Batched encodings nest the ids one level deeper; take the first item.
        if ids and isinstance(ids[0], (list, tuple)):
            ids = ids[0]
        if not hasattr(tokenizer, "convert_ids_to_tokens"):
            return ids
        return tokenizer.convert_ids_to_tokens(ids)

    tokenize = _hf_tokens if (model is not None and tokenizer is not None) else _word_tokens

    # Tokenize each document exactly once.
    if print_progress:
        print("Tokenizing documents...")
    docs: Dict[str, List[Any]] = {}
    tok_interval = max(1, len(paths) // 20)
    for i, p in enumerate(paths, 1):
        docs[p] = tokenize(texts[p])
        if print_progress and (i % tok_interval == 0 or i == len(paths)):
            pct = int(i * 100 / len(paths))
            sys.stdout.write(f"\r  Tokenized {i}/{len(paths)} ({pct}%)")
            sys.stdout.flush()
    if print_progress:
        print()

    # path -> {k -> set of k-grams}; filled lazily as lengths are requested.
    ngram_cache: Dict[str, Dict[int, Set[Tuple[Any, ...]]]] = {p: {} for p in paths}

    def ngram_set(seq: Sequence[Any], k: int, key: str) -> Set[Tuple[Any, ...]]:
        per_doc = ngram_cache[key]
        if k not in per_doc:
            # The range is empty when k > len(seq), which yields an empty set.
            per_doc[k] = {tuple(seq[i : i + k]) for i in range(len(seq) - k + 1)}
        return per_doc[k]

    n_files = len(paths)
    if ordered_pairs:
        pair_iter = itertools.permutations(paths, 2)
        total_pairs = n_files * (n_files - 1)
    else:
        pair_iter = itertools.combinations(paths, 2)
        total_pairs = (n_files * (n_files - 1)) // 2
    if print_progress:
        print(f"Comparing {total_pairs} {'ordered' if ordered_pairs else 'unordered'} pairs...")

    pair_interval = max(1, total_pairs // 100)
    rows = []
    for idx, (path_a, path_b) in enumerate(pair_iter, 1):
        tokens_a = docs[path_a]
        tokens_b = docs[path_b]
        best_k = 0
        best_grams: Set[Tuple[Any, ...]] = set()

        # Walk k upward from n and stop at the first length with no overlap.
        # The range is empty when either document has fewer than n tokens.
        for k in range(n, min(len(tokens_a), len(tokens_b)) + 1):
            shared = ngram_set(tokens_a, k, path_a) & ngram_set(tokens_b, k, path_b)
            if not shared:
                break
            best_k, best_grams = k, shared

        rows.append({
            "text_1": os.path.basename(path_a),
            "text_2": os.path.basename(path_b),
            "largest_n": int(best_k),
            # list[str]: each winning tuple joined by spaces, in sorted order
            "largest_ngrams": [" ".join(map(str, tup)) for tup in sorted(best_grams)],
        })

        if print_progress and (idx % pair_interval == 0 or idx == total_pairs):
            pct = int(idx * 100 / total_pairs)
            sys.stdout.write(f"\r  Progress {idx}/{total_pairs} ({pct}%)")
            sys.stdout.flush()

    if print_progress:
        print("\nDone.")

    result = pd.DataFrame(rows)
    result = result.sort_values(
        ["largest_n", "text_1", "text_2"], ascending=[False, True, True]
    )
    return result.reset_index(drop=True)


In [None]:
import os
import sys
from typing import Any, Dict, Set, Tuple
import pandas as pd

# assumes `read_jsonl` and `common_ngrams` are already defined/imported

def largest_common_ngram_df(
    directory: str,
    n: int,
    model: Any = None,
    tokenizer: Any = None,
    ordered_pairs: bool = False,
    print_progress: bool = True,
) -> pd.DataFrame:
    """
    Scan .jsonl files in `directory` and, for each file pair, compute:
      - largest shared n-gram length (>= n)
      - all n-grams at that largest length (as strings joined by spaces)

    Returns a DataFrame with columns:
      ['text_1', 'text_2', 'largest_n', 'largest_ngrams']
    sorted by `largest_n` (descending), then `text_1`, `text_2`. Pairs that
    share nothing of length >= n get `largest_n == 0` and an empty list.

    Notes:
      - Uses `read_jsonl(path).loc[0, 'text']` to load each file (only row 0 of
        the 'text' column is read).
      - Delegates the n-gram logic to `common_ngrams` and picks the top length
        with `highest_common`. Sub-gram pruning is skipped because only the
        longest length is read and that level is never pruned.
      - If BOTH `model` and `tokenizer` are provided, `common_ngrams` will use
        token mode; otherwise word mode.
      - Every pair is tokenized from scratch inside `common_ngrams`, so the
        run time grows with the number of pairs.
      - `ordered_pairs=False` yields unordered pairs (A,B) once. Set True to include (B,A).
      - Set `print_progress=False` to silence progress prints.

    Raises:
      ValueError: if n < 1, fewer than two .jsonl files are found, or a file
        has no 'text' column or no rows.
    """
    if n < 1:
        raise ValueError("n must be >= 1")

    # Gather just .jsonl files, in sorted order
    paths = sorted(
        os.path.join(directory, fn)
        for fn in os.listdir(directory)
        if fn.lower().endswith(".jsonl")
    )
    if len(paths) < 2:
        raise ValueError("Need at least two .jsonl files in the directory.")

    if print_progress:
        print(f"Found {len(paths)} .jsonl files in: {directory}")

    # --- Load texts using read_jsonl ---
    texts: Dict[str, str] = {}
    for p in paths:
        df = read_jsonl(p)
        if "text" not in df.columns:
            raise ValueError(f"'text' column not found in {p}")
        if df.shape[0] < 1:
            raise ValueError(f"No rows found in {p}")
        texts[p] = str(df.loc[0, "text"])

    # Pair generator
    pair_iter = (
        itertools.permutations(paths, 2) if ordered_pairs
        else itertools.combinations(paths, 2)
    )
    total_pairs = len(paths) * (len(paths) - 1) if ordered_pairs else (len(paths) * (len(paths) - 1)) // 2
    if print_progress:
        print(f"Comparing {total_pairs} {'ordered' if ordered_pairs else 'unordered'} pairs...")

    interval = max(1, total_pairs // 100)  # refresh the progress line ~100 times
    rows = []
    for idx, (p1, p2) in enumerate(pair_iter, 1):
        # Only the longest shared length is used below. That level has no
        # longer shared n-gram above it, so pruning can never change it;
        # asking for sub-grams to be kept simply skips the pruning pass.
        cmn = common_ngrams(
            texts[p1],
            texts[p2],
            n,
            model=model,
            tokenizer=tokenizer,
            include_subgrams=True,
        )
        largest_k, ng_set = highest_common(cmn)

        # Convert the winning n-grams (tuples) into space-joined strings
        largest_ngrams = [" ".join(map(str, tup)) for tup in sorted(ng_set)]

        rows.append({
            "text_1": os.path.basename(p1),
            "text_2": os.path.basename(p2),
            "largest_n": int(largest_k),
            "largest_ngrams": largest_ngrams,   # list[str]
        })

        if print_progress and ((idx % interval == 0) or (idx == total_pairs)):
            pct = int(idx * 100 / total_pairs)
            sys.stdout.write(f"\r  Progress {idx}/{total_pairs} ({pct}%)")
            sys.stdout.flush()

    if print_progress:
        print("\nDone.")

    df = (
        pd.DataFrame(rows)
        .sort_values(["largest_n", "text_1", "text_2"], ascending=[False, True, True])
        .reset_index(drop=True)
    )
    return df


In [39]:
# Pairwise comparison of all split known documents in word mode
# (model, tokenizer and ordered_pairs are left at their defaults).
results = largest_common_ngram_df(
    directory='/Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/',
    n=2,
)

Found 225 .jsonl files in: /Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/
Tokenizing documents...
  Tokenized 225/225 (100%)
Comparing 25200 unordered pairs...
  Progress 25200/25200 (100%)
Done.


In [46]:
# Pairs whose longest shared n-gram has between 4 and 7 tokens.
results[(results['largest_n']>3) & (results['largest_n']<8)]

Unnamed: 0,text_1,text_2,largest_n,largest_ngrams
19,bfigura_text_4.jsonl,falcon9x5_text_2.jsonl,7,[as far as i m aware it]
20,caboga_text_3.jsonl,caboga_text_5.jsonl,7,[a reliable source for the name kabudzic]
21,classicjupiter2_text_4.jsonl,david_shankbone_text_5.jsonl,7,[just wanted to drop you a note]
22,classicjupiter2_text_4.jsonl,habap_text_3.jsonl,7,[just wanted to drop you a note]
23,david_shankbone_text_5.jsonl,habap_text_3.jsonl,7,[just wanted to drop you a note]
...,...,...,...,...
2318,hardyplants_text_2.jsonl,hardyplants_text_3.jsonl,4,[that it is a]
2319,hardyplants_text_3.jsonl,headleydown_text_2.jsonl,4,[on the talk page]
2320,hardyplants_text_3.jsonl,headleydown_text_3.jsonl,4,[on the talk page]
2321,haymaker_text_3.jsonl,hipocrite_text_3.jsonl,4,[you choose not to]


In [31]:
# Same pairwise comparison, run against whichever `largest_common_ngram_df`
# definition was executed last (model, tokenizer and ordered_pairs at defaults).
results = largest_common_ngram_df(
    directory='/Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/',
    n=2,
)

Found 225 .jsonl files in: /Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/
Comparing 25200 unordered pairs...
  Progress 252/25200 (1%)

KeyboardInterrupt: 

In [None]:
# Pairs whose longest shared n-gram has between 4 and 7 tokens.
results[(results['largest_n']>3) & (results['largest_n']<8)]

In [14]:
# Load the Qwen2.5-0.5B-Instruct tokenizer and model for token-mode comparisons.
# NOTE(review): unpacking assumes load_model returns (tokenizer, model) in that order — confirm.
tokenizer, model = load_model("/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct")

In [15]:
# Two split known documents, compared in word mode (no tokenizer/model).
base_dir = '/Volumes/BCross/datasets/author_verification/training/Wiki/known_corpus_split/'

text_1_frame = read_jsonl(f"{base_dir}caboga_text_1.jsonl")
text_2_frame = read_jsonl(f"{base_dir}hardyplants_text_2.jsonl")
text_1 = text_1_frame.loc[0, 'text']
text_2 = text_2_frame.loc[0, 'text']

common = common_ngrams(text1=text_1, text2=text_2, n=2)

In [16]:
# Same comparison in token mode: both tokenizer and model are passed.
common_tokens = common_ngrams(text_1, text_2, n=2, tokenizer=tokenizer, model=model)

In [17]:
# Display the token-mode result, grouped by n-gram length.
common_tokens

{2: {('.Ċ', 'The'),
  ('Ġ', '1'),
  ('Ġand', 'Ġthe'),
  ('Ġduring', 'Ġthe'),
  ('Ġin', 'Ġthe'),
  ('Ġof', 'Ġa'),
  ('Ġof', 'Ġnon'),
  ('Ġof', 'Ġthe'),
  ('Ġon', 'Ġthe'),
  ('Ġthat', 'Ġis'),
  ('Ġthat', 'Ġwas'),
  ('Ġwithin', 'Ġthe'),
  ('Ġyou', 'Ġare')},
 3: {('Ġa', 'Ġnumber', 'Ġof')},
 6: {('Ġfrom', 'Ġone', 'Ġgeneration', 'Ġto', 'Ġthe', 'Ġnext')}}

In [19]:
# Display the word-mode result for comparison with the token-mode one.
common

{2: {('and', 'the'),
  ('during', 'the'),
  ('if', 'you'),
  ('in', 'the'),
  ('of', 'a'),
  ('of', 'non'),
  ('of', 'the'),
  ('on', 'the'),
  ('that', 'is'),
  ('that', 'was'),
  ('within', 'the'),
  ('you', 'are')},
 3: {('a', 'number', 'of')},
 6: {('from', 'one', 'generation', 'to', 'the', 'next')}}

In [None]:
def highest_common(common: Dict[int, Set[Tuple[Any, ...]]]) -> Tuple[int, Set[Tuple[Any, ...]]]:
    """
    Pick the longest length present in a `common_ngrams` result.

    Returns `(length, ngrams)` for the largest key of `common`, or
    `(0, set())` when `common` is empty.
    """
    if common:
        longest = max(common)
        return longest, common[longest]
    return 0, set()

# Usage: report the longest shared length for `common`, then list its n-grams.
max_k, ngrams_at_max = highest_common(common)
if max_k != 0:
    print(f"Largest shared n = {max_k} ({len(ngrams_at_max)} n-grams)")
    # One readable, space-joined line per n-gram, in sorted order
    readable = (" ".join(map(str, tup)) for tup in sorted(ngrams_at_max))
    for line in readable:
        print(line)
else:
    print("No shared n-grams ≥ n.")


Largest shared n = 6 (1 n-grams)
from one generation to the next
