# Common n-gram filtering test

Initial exploration showed that some 2-grams consist only of punctuation plus a single word. This notebook tests how to remove such n-grams automatically before sending them to the LLM.

In [1]:
import sys

from from_root import from_root

In [2]:

# Ensure we can import from src/
sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_jsonl
from utils import apply_temp_doc_id
from n_gram_functions import (
    common_ngrams,
    filter_ngrams,
    pretty_print_common_ngrams
)
from model_loading import load_model, distinct_special_chars

## Load model and get special characters

In [3]:
# Load the tokenizer and model from a machine-specific local path
# (a Qwen2.5-0.5B-Instruct directory); adjust the path when running elsewhere.
tokenizer, model = load_model("/Users/user/Documents/models/Qwen2.5-0.5B-Instruct")

In [4]:
# Special characters derived from the tokenizer; passed to filter_ngrams
# further down. NOTE(review): the exact contents depend on
# distinct_special_chars, which is defined outside this notebook.
special_tokens = distinct_special_chars(tokenizer=tokenizer)

## Load texts

In [5]:
corpus = "Wiki"
data_type = "test"

# Each location is assigned twice on purpose: the first value is the dataset
# path on the external volume, the second is a local copy that overrides it.
# Drop the second assignment to read from the external volume instead.
known_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"
known_loc = f"/Users/user/Documents/test_data/{corpus}/known_raw.jsonl"
known = read_jsonl(known_loc)
known = apply_temp_doc_id(known)

unknown_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/unknown_raw.jsonl"
unknown_loc = f"/Users/user/Documents/test_data/{corpus}/unknown_raw.jsonl"
unknown = read_jsonl(unknown_loc)
# Rebind `unknown` itself (mirroring `known`) so that later cells filtering on
# unknown['doc_id'] see the doc_id column regardless of whether
# apply_temp_doc_id works in place or returns a new frame.
unknown = apply_temp_doc_id(unknown)
# Backward-compatible alias for code that still refers to `unknown_df`.
unknown_df = unknown

In [6]:
# Pick one known and one unknown document by doc_id and take the 'text' of the
# first matching row (the index is reset so that row is addressable as 0).
t1 = known[known['doc_id'] == 'hootmag_text_1'].reset_index().loc[0, 'text']
t2 = unknown[unknown['doc_id'] == 'hootmag_text_13'].reset_index().loc[0, 'text']

In [7]:
print(t1)

The article indicates that the HDI data are estimates for 2012.
I answered all of your questions - even though I thought they were irrelevant to this discussion, and that's why I expect you to answer all of my questions - even though you think they are irrelevant to this discussion.
I understand your position, so you don't have to repeat it.
I agree with you that if userSilvertrial has external sources only, then that won't be sufficient.
However, what if userSilvertrial has 'more than just external' sources?


In [8]:
print(t2)

Indeed, this is pure logic Just think about the following hypothetical case On Saturday, the first user posts a hidden post, which is going to be unhidden on Tuesday, automatically.
In your view, this case involves a prohibitted kind of changing the first user's post, because it's a hidden post on Sunday when it has already been responded to, and it's an automatically unhidden post on Tuseday.
Now, if a third party un-hides according to your suggestion the first user's post - before Tuesday, say on Monday, then we get to a second case being very similar to the first prohibited one it's a hidden post on Sunday when it has already been responded to, and it's an unhidden post on Monday after it has already been un-hidden by the third party, who has done that according to your suggestion.
Logically, one must infer the following just as in your opinion the first case is prohibited because it involves a change between Sunday when the post is hidden and Tuesday when it has already been unhidd

## Get common n-grams between the two texts

In [9]:
# N-grams shared by the two texts. The positional 2 presumably selects the
# n-gram length (the result shown below is keyed by 2) -- confirm against
# common_ngrams' signature.
common = common_ngrams(t1, t2, 2, model=model, tokenizer=tokenizer)

In [10]:
common

{2: {("'t", 'Ġhave'),
  (',', 'Ġand'),
  (',', 'Ġso'),
  (',', 'Ġthen'),
  ('.Ċ', 'i'),
  ('Ġthis', 'Ġdiscussion'),
  ('Ġto', 'Ġthis')}}

In [11]:
# Apply the filter under test to the shared n-grams, using the tokenizer's
# special characters collected earlier.
filtered = filter_ngrams(common, special_tokens=special_tokens)

In [14]:
filtered

{2: {("'t", 'Ġhave'), ('Ġthis', 'Ġdiscussion'), ('Ġto', 'Ġthis')}}

In [21]:
from typing import Any, Dict, Set, Tuple, List, Union
import string

def pretty_print_common_ngrams(
    common: Dict[int, Set[Tuple[Any, ...]]],
    sep: str = " ",
    order: str = "count_desc",      # "count_desc" | "len_asc" | "len_desc"
    tokenizer=None,                 # Optional HuggingFace tokenizer
    return_format: str = "print",   # "print" | "flat" | "grouped"
    show_raw: bool = False          # If True, include raw token forms
) -> Union[None, List[Union[str, Tuple[str, str]]], Dict[int, List[Union[str, Tuple[str, str]]]]]:
    """
    Pretty-print or return shared n-grams, optionally paired with raw form.

    Args:
        common: Mapping from n-gram length ``n`` to a set of n-gram tuples.
            Tokens may be strings, integer ids, or a mix of both.
        sep: Separator used to join tokens when no tokenizer is given.
        order: How the ``n`` groups are ordered: ``"count_desc"`` (most
            n-grams first, ties by ascending n), ``"len_asc"`` or
            ``"len_desc"`` (by n). Any other value falls back to
            ``"count_desc"``.
        tokenizer: Optional tokenizer used to turn tokens back into text.
            All-int n-grams go through ``decode``; otherwise ids are mapped
            with ``convert_ids_to_tokens``, tokens listed in
            ``all_special_tokens`` are dropped, and the rest is joined with
            ``convert_tokens_to_string``.
        return_format: ``"print"`` prints one line per group and returns
            ``None``; ``"flat"`` returns one list with all groups
            concatenated in ``order``; ``"grouped"`` returns a dict
            ``n -> list`` whose keys are inserted in ``order``. Any other
            value behaves like ``"print"``.
        show_raw: If True, each output element is a tuple
            ``(pretty_str, raw_repr)`` where ``raw_repr`` is a string
            representing the token tuple, e.g. ``"('Ġhello', 'Ġworld')"``.

    Returns:
        ``None`` in print mode, otherwise the flat list or grouped dict
        described above. Empty input yields ``[]`` / ``{}`` (or prints
        ``{}`` in print mode).
    """

    if not common:
        if return_format == "print":
            print("{}")
            return None
        return [] if return_format == "flat" else {}

    # Loop-invariant: look the special tokens up once, not once per n-gram.
    specials = (
        set(getattr(tokenizer, "all_special_tokens", []))
        if tokenizer is not None
        else set()
    )

    def stringify_ngram(ngram: Tuple[Any, ...]) -> str:
        """Convert to human-readable text, using tokenizer if available."""
        if tokenizer is None:
            return sep.join(map(str, ngram))

        toks = list(ngram)
        # If all ints, decode in one shot
        if all(isinstance(t, int) for t in toks):
            return tokenizer.decode(
                toks,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

        # Otherwise, convert each token (id or str) to string form
        norm_tokens: List[str] = [
            tokenizer.convert_ids_to_tokens(t) if isinstance(t, int) else str(t)
            for t in toks
        ]

        # Filter out special tokens like <s>, </s>
        norm_tokens = [t for t in norm_tokens if t not in specials]

        return tokenizer.convert_tokens_to_string(norm_tokens)

    def raw_repr_of_ngram(ngram: Tuple[Any, ...]) -> str:
        """Return a string showing the raw token tuple, as tokens or ints."""
        # We want something like "('Ġhello', 'Ġworld')" or "(12, 34, 56)" or mixed
        return "(" + ", ".join(repr(tok) for tok in ngram) + ")"

    def sort_ngrams(grams: Set[Tuple[Any, ...]]) -> List[Tuple[Any, ...]]:
        """Sort n-grams deterministically, tolerating mixed int/str tokens."""
        try:
            return sorted(grams)
        except TypeError:
            # int and str tokens cannot be compared directly; order by type
            # name first so values are only compared within the same type.
            return sorted(
                grams,
                key=lambda g: tuple((type(t).__name__, t) for t in g),
            )

    # Build grouped mapping: for each n, a list of pretty or (pretty, raw)
    grouped: Dict[int, List[Union[str, Tuple[str, str]]]] = {}
    for n, grams in common.items():
        out_list: List[Union[str, Tuple[str, str]]] = []
        for g in sort_ngrams(grams):
            pretty = stringify_ngram(g)
            if show_raw:
                out_list.append((pretty, raw_repr_of_ngram(g)))
            else:
                out_list.append(pretty)
        grouped[n] = out_list

    # Order the groups by `order`; unknown values fall back to "count_desc".
    def by_count_desc(kv):
        return (-len(kv[1]), kv[0])

    def by_len_asc(kv):
        return kv[0]

    def by_len_desc(kv):
        return -kv[0]

    group_sort_keys = {
        "count_desc": by_count_desc,
        "len_asc": by_len_asc,
        "len_desc": by_len_desc,
    }
    items = sorted(grouped.items(), key=group_sort_keys.get(order, by_count_desc))

    if return_format == "flat":
        flat: List[Union[str, Tuple[str, str]]] = []
        for _, lst in items:
            flat.extend(lst)
        return flat

    if return_format == "grouped":
        # Rebuild from the sorted items so the dict's insertion order honours
        # `order` (returning `grouped` directly would ignore it).
        return dict(items)

    # print mode
    for n, lst in items:
        if show_raw:
            # print each as "pretty  [raw: …]"
            pretty_with_raw = [f"{p}  [raw: {r}]" for (p, r) in lst]
            print(f"{n}-grams ({len(lst)}): {pretty_with_raw}")
        else:
            print(f"{n}-grams ({len(lst)}): {lst}")

    return None


In [23]:
# Decode the filtered n-grams with the tokenizer and return them as a single
# flat list of strings (groups ordered by descending n, no raw token forms).
pretty_print_common_ngrams(filtered, tokenizer=tokenizer, order='len_desc', return_format='flat', show_raw=False)

["'t have", ' this discussion', ' to this']

## Trial with some edge cases

In [None]:
# Hand-written fixture for filter_ngrams: maps n-gram length -> set of token
# tuples. The comment above each tuple records the outcome we expect (or are
# still undecided about) from the filter; nothing here asserts it.
# Marker glyphs used in the tokens: "Ġ" = leading space, "Ċ" = newline,
# "▁" = SentencePiece-style space marker.
edge_case_ngrams = {
    2: {
        # ---------- original cases ----------
        # Word + punctuation — should be filtered
        ("Ġhello", "!"),
        # Punctuation + word — should be filtered
        (",", "Ġhowever"),
        # Word + contraction suffix (no leading space) — should be filtered (single word)
        ("Ġwe", "'re"),
        # Both punctuation — should be filtered
        ("!", "?"),
        # Two real words — should be kept
        ("Ġmachine", "Ġlearning"),
        # Both tokens without space — should be filtered (likely one word)
        ("un", "breakable"),
        # First token has no space marker, second does — should be kept
        ("Super", "Ġfragile"),
        # First token is special (e.g., newline), second is word — may be kept or filtered
        ("Ċ", "Ġstart"),
        # Two formatting characters — should be filtered
        ("Ċ", "Ċ"),
        # Punctuation+newline token followed by a word-initial subword chunk —
        # should be filtered (only a fragment of a single word)
        (".Ċ", "pneu"),
        # Word with trailing punctuation inside same token — should be kept
        ("Ġhello!", "Ġworld"),
        # Word with attached contraction — likely one word
        ("Ġi", "'m"),

        # ---------- additions: punctuation + word / word + punctuation ----------
        # Word + punctuation — should be filtered
        ("Ġword", "."),
        # Punctuation + word — should be filtered
        ("(", "Ġparenthetical"),
        # Word + punctuation — should be filtered
        ("Ġend", ")"),
        # Punctuation + word (quote) — should be filtered
        ('"', "Ġquoted"),
        # Word + Unicode dash — should be filtered
        ("Ġdash", "—"),

        # ---------- additions: explicit whitespace markers before/after words ----------
        # Newline marker + word — may be kept or filtered, depending on the chosen policy
        ("Ċ", "ĠTitle"),
        # Word + newline marker — may be kept or filtered
        ("Ġtrail", "Ċ"),
        # Tab/VT/FF/CR markers + word — usually filtered as formatting
        ("ĉ", "Ġtabbed"),          # tab
        ("ċ", "Ġvtabbed"),         # vertical tab
        ("Č", "Ġformfeed"),        # form feed
        ("č", "Ġcarriage"),        # carriage return

        # ---------- additions: NBSP remap glyphs (byte-level) ----------
        # NOTE(review): confirm these glyphs match what the target tokenizer
        # actually emits for NBSP (bytes 0xC2 0xA0); under a GPT-2-style
        # byte-to-unicode map the second byte may render as "ł" rather than "Ơ".
        # Two visible glyphs (NBSP bytes) — should be filtered (all-special)
        ("Â", "Ơ"),
        # NBSP pair then a word — should be filtered (special + word)
        ("ÂƠ", "Ġword"),
        # Word then NBSP pair — should be filtered (word + special)
        ("Ġword", "ÂƠ"),

        # ---------- additions: SentencePiece-style space marker ----------
        # Bare SP space marker + word — should be filtered (special + word)
        ("▁", "Hello"),
        # SP "<unk>"-like token + word — should be filtered (special-ish + word)
        ("▁<unk>", "world"),
        # Two SP-prefixed words (each token carries its own space marker) — should be kept
        ("▁we", "▁test"),

        # ---------- additions: pure-special bigrams ----------
        # Newline + tab — should be filtered
        ("Ċ", "ĉ"),
        # Two SP markers — should be filtered
        ("▁", "▁"),
    },

    3: {
        # ---------- original cases ----------
        # Subword sequence from a single long word
        ("Ġsuper", "cal", "ifragilistic"),
        # Newline + word + contraction suffix — likely formatting garbage
        ("Ċ", "i", "'m"),
        # Real phrase with spacing — should be kept
        ("Ġnatural", "Ġlanguage", "Ġprocessing"),
        # Repeated newline markers — should be filtered
        ("Ċ", "Ċ", "Ċ"),
        # A word broken badly into subtokens without spacing — should be filtered
        ("bio", "tech", "nology"),
        # First token has no space, rest do — should be kept
        ("Bio", "Ġtech", "Ġboom"),
        # Two contractions — likely one semantic word
        ("Ġthey", "'d", "Ġ've"),

        # ---------- additions: punctuation + word + punctuation ----------
        # Punct + word + punct — should be filtered
        ("(", "Ġaside", ")"),
        # Word + punct + newline — should be filtered
        ("Ġhello", "!", "Ċ"),
        # ASCII punct around word — should be filtered
        ("-", "Ġbreak", "-"),

        # ---------- additions: whitespace markers across positions ----------
        # Newline + words — may be kept or filtered per rule
        ("Ċ", "Ġnew", "Ġparagraph"),
        # Tab + words — should be filtered (formatting)
        ("ĉ", "Ġtabbed", "Ġline"),
        # Form feed + words — should be filtered (formatting)
        ("Č", "Ġform", "Ġfeed"),
        # Carriage return + words — should be filtered (formatting)
        ("č", "Ġcarriage", "Ġreturn"),

        # ---------- additions: NBSP glyphs around words ----------
        # NBSP pair then a word — should be filtered (mostly special)
        ("Â", "Ơ", "Ġspace"),
        # Word + NBSP pair + word — may be filtered (special sandwiched)
        ("Ġbefore", "ÂƠ", "Ġafter"),

        # ---------- additions: SentencePiece mixes ----------
        # SP marker + word + punct — should be filtered
        ("▁", "Hello", ","),
        # SP "<unk>" + word + newline — should be filtered
        ("▁<unk>", "Ġtoken", "Ċ"),
        # Three SP-prefixed words — should be kept
        ("▁we", "▁are", "▁testing"),
        # Two SP markers then a word — should be filtered
        ("▁", "▁", "word"),

        # ---------- additions: mostly/all special ----------
        # Double newline then a word — may be filtered
        ("Ċ", "Ċ", "Ġstart"),
        # SP marker + newline + SP marker — should be filtered
        ("▁", "Ċ", "▁"),
    }
}


In [None]:
edge_case_ngrams

In [None]:
# Run the filter over the hand-written edge cases; compare the surviving
# n-grams against the expectations noted in the fixture's comments.
filter_ngrams(edge_case_ngrams, special_tokens=special_tokens)
