In [1]:
import ast
import re
import sys
import unicodedata

import pandas as pd

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from model_loading import load_model, distinct_special_chars

In [2]:
# Directory that holds the experiment's input workbook.
# NOTE(review): absolute, machine-specific path — confirm the volume is mounted
# before running. "logrpobs" looks like a transposition of "logprobs"; it has to
# match the folder name on disk, so verify before renaming anything.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

# Name of the workbook inside base_loc.
file_name = "raw_all_tokens.xlsx"

# Full path to the workbook; this is what the pipeline cell hands to pd.read_excel.
file_loc = f"{base_loc}/{file_name}"

In [3]:
# Location handed to load_model.
# NOTE(review): presumably a local GPT-2 checkpoint, going by the path — confirm.
model_loc = "/Volumes/BCross/models/gpt2"

# load_model comes from the project's model_loading module; its result is
# unpacked into a (tokenizer, model) pair here.
tokenizer, model = load_model(model_loc)

In [4]:
# Marker strings reported by distinct_special_chars for this tokenizer; the
# special-aware filters in the pipeline cell all receive this list.
special_tokens = distinct_special_chars(tokenizer=tokenizer)
# Preview the first five entries.
special_tokens[0:5]

['Ġ', 'ĉ', 'Ċ', 'č', 'ċ']

In [5]:
def ensure_tokens_are_lists(df):
    """
    Return a copy of `df` whose 'tokens' cells are parsed Python objects.

    Cells that are strings (e.g. "['a', 'b']") are evaluated with
    ast.literal_eval; every other cell is passed through untouched.
    The caller's frame is not modified.
    """
    def _parse_cell(cell):
        # Only strings need decoding; anything else is assumed to be parsed already.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    parsed = df.copy()
    parsed["tokens"] = parsed["tokens"].apply(_parse_cell)
    return parsed

In [6]:
def filter_min_length(df, min_tokens: int = 2):
    """
    KEEP rows whose 'num_tokens' value is at least `min_tokens`.
    Returns an independent copy of the surviving rows.
    """
    long_enough = df["num_tokens"] >= min_tokens
    return df[long_enough].copy()

In [7]:
def filter_only_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows whose 'tokens' list is non-empty and made up exclusively of
    entries from special_token_list (exact, whole-token membership).
    Rows with an empty token list are kept.
    Assumes df['tokens'] is a list of token strings per row.
    """
    special_lookup = set(special_token_list)

    def _all_special(toks) -> bool:
        # An empty list has nothing "special" in it, so it is never dropped.
        if not toks:
            return False
        return all(tok in special_lookup for tok in toks)

    drop_mask = df["tokens"].apply(_all_special)
    return df.loc[~drop_mask].copy()

In [8]:
def filter_only_numbers_and_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows whose tokens, once joined into one string and cleared of every
    special substring and all whitespace, consist of digits only.
    Example removed: ["<special>19", "<special>97", "112"]  (with "<special>" in special_token_list)
    Rows that end up empty after stripping are kept.
    Assumes df['tokens'] is a list (items can be str/int).
    """
    unique_specials = set(map(str, special_token_list))
    # Longest markers are removed first so that a short marker cannot eat part of a longer one.
    ordered_specials = sorted(unique_specials, key=len, reverse=True)

    def _digits_only(toks) -> bool:
        text = "".join(str(tok) for tok in toks)
        for marker in ordered_specials:
            text = text.replace(marker, "")
        text = "".join(text.split())
        # \d+ needs at least one character, so an empty remainder is never a match.
        return re.fullmatch(r"\d+", text) is not None

    drop_mask = df["tokens"].apply(_digits_only)
    return df.loc[~drop_mask].copy()

In [9]:
def filter_only_punct_and_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows where df['tokens'] contains ONLY punctuation and/or special tokens,
    allowing special tokens to appear as substrings inside tokens (e.g. "<special>,").
    Assumes df['tokens'] is a list (items will be cast to str).
    """
    specials = sorted(set(map(str, special_token_list)), key=len, reverse=True)
    special_re = re.compile("|".join(re.escape(s) for s in specials)) if specials else None

    def strip_specials(s: str) -> str:
        s = str(s)
        return special_re.sub("", s) if special_re else s

    def is_punct_or_empty_after_strip(tok) -> bool:
        rem = strip_specials(tok)
        rem = "".join(rem.split())  # drop whitespace
        if rem == "":
            return True
        return all(unicodedata.category(ch).startswith("P") for ch in rem)

    only_punct_or_special = df["tokens"].apply(
        lambda toks: bool(toks) and all(is_punct_or_empty_after_strip(t) for t in toks)
    )

    return df.loc[~only_punct_or_special].copy()


In [10]:
def filter_at_least_n_minus_1_specials(df: pd.DataFrame, special_token_list, n: int | None = None) -> pd.DataFrame:
    """
    REMOVE rows where:
      - if n is set: len(tokens) == n AND (# special-only tokens) >= n-1
      - if n is None: (# special-only tokens) >= len(tokens)-1  (i.e., at most 1 non-special)

    Handles specials embedded inside tokens by stripping special substrings first.
    Assumes df['tokens'] is a list (items cast to str).
    """
    specials = sorted(set(map(str, special_token_list)), key=len, reverse=True)
    special_re = re.compile("|".join(re.escape(s) for s in specials)) if specials else None

    def is_special_only(tok) -> bool:
        s = str(tok)
        if special_re:
            s = special_re.sub("", s)
        s = "".join(s.split())  # drop whitespace
        return s == ""

    def should_remove(toks) -> bool:
        if not toks:
            return False
        L = len(toks)
        if n is not None and L != n:
            return False
        special_count = sum(is_special_only(t) for t in toks)
        threshold = (n - 1) if n is not None else (L - 1)
        return special_count >= threshold

    mask = df["tokens"].apply(should_remove)
    return df.loc[~mask].copy()


In [11]:
def filter_zero_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows in which no token contains any special substring.
    A special counts wherever it occurs inside a token (substring match).
    With an empty special_token_list no row can qualify, so every row is removed.
    Assumes df['tokens'] is a list (items cast to str).
    """
    unique_specials = set(map(str, special_token_list))
    ordered_specials = sorted(unique_specials, key=len, reverse=True)
    if ordered_specials:
        special_pattern = re.compile("|".join(map(re.escape, ordered_specials)))
    else:
        special_pattern = None

    def _lacks_special(toks) -> bool:
        # Without any specials to look for, every row counts as special-free.
        if special_pattern is None:
            return True
        for tok in toks:
            if special_pattern.search(str(tok)):
                return False
        return True

    drop_mask = df["tokens"].apply(_lacks_special)
    return df.loc[~drop_mask].copy()


In [12]:
def filter_leading_specials_single_word(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows that satisfy BOTH conditions:
      1) the tokens containing a special substring form a non-empty run at the
         START of the list, followed by at least one token without any special
         (flags look like True...True, False...False)
      2) the joined tokens, with special substrings removed and outer whitespace
         stripped, are one single word: ASCII letters with optional ' / ’ / -
         joined parts (e.g. david's, co-op)

    Rows with an empty token list are kept; with no specials nothing is removed.
    Assumes df['tokens'] is a list (items cast to str).
    """
    unique_specials = set(map(str, special_token_list))
    ordered_specials = sorted(unique_specials, key=len, reverse=True)
    if ordered_specials:
        special_pattern = re.compile("|".join(map(re.escape, ordered_specials)))
    else:
        special_pattern = None
    single_word = re.compile(r"[A-Za-z]+(?:[’'-][A-Za-z]+)*")

    def _leading_special_word(toks) -> bool:
        if not toks or not special_pattern:
            return False

        pieces = [str(tok) for tok in toks]
        flags = [special_pattern.search(piece) is not None for piece in pieces]

        # Both kinds of token must be present.
        if all(flags) or not any(flags):
            return False
        # Once the first special-free token is reached, no later token may carry a special.
        first_plain = flags.index(False)
        if any(flags[first_plain:]):
            return False

        remainder = special_pattern.sub("", "".join(pieces)).strip()
        return bool(remainder) and single_word.fullmatch(remainder) is not None

    drop_mask = df["tokens"].apply(_leading_special_word)
    return df.loc[~drop_mask].copy()


In [13]:

# Load the workbook and make sure every 'tokens' cell is a real list.
raw_tokens = ensure_tokens_are_lists(pd.read_excel(file_loc))
print("Original token count:", len(raw_tokens))

# Filters run in this order, each on the output of the previous one; the
# row count left after every stage is printed next to its label.
filtered_tokens = raw_tokens
for stage_label, stage_filter in (
    ("After filter_min_length:",
     lambda frame: filter_min_length(frame, min_tokens=2)),
    ("After filter_only_special_tokens:",
     lambda frame: filter_only_special_tokens(frame, special_tokens)),
    ("After filter_only_number_and_special_tokens:",
     lambda frame: filter_only_numbers_and_special_tokens(frame, special_tokens)),
    ("After filter_only_punct_and_special_tokens:",
     lambda frame: filter_only_punct_and_special_tokens(frame, special_tokens)),
    ("After filter_at_least_n_minus_1_specials:",
     lambda frame: filter_at_least_n_minus_1_specials(frame, special_tokens)),
    ("After filter_zero_special_tokens:",
     lambda frame: filter_zero_special_tokens(frame, special_tokens)),
):
    filtered_tokens = stage_filter(filtered_tokens)
    print(stage_label, len(filtered_tokens))

Original token count: 71857
After filter_min_length: 71850
After filter_only_special_tokens: 71849
After filter_only_number_and_special_tokens: 71834
After filter_only_punct_and_special_tokens: 71690
After filter_at_least_n_minus_1_specials: 70954
After filter_zero_special_tokens: 69529


In [14]:
# Random spot-check of the surviving rows. The seed is fixed so the preview is
# identical on every Restart & Run All, and n is capped at the frame length
# because DataFrame.sample raises when asked for more rows than exist.
filtered_tokens.sample(min(50, len(filtered_tokens)), random_state=42)

Unnamed: 0,phrase,tokens,num_tokens
28740,big brother,"[Ġbig, Ġbrother]",2
59326,. my fianc,"[., Ġmy, Ġfian, c]",4
14811,surnames,"[Ġsurn, ames]",2
6426,locke,"[Ġloc, ke]",2
21168,spaceland,"[Ġspac, eland]",2
2460,. let,"[., Ġlet]",2
43518,"the sand,","[Ġthe, Ġsand, ,]",3
63586,he is going to,"[Ġhe, Ġis, Ġgoing, Ġto]",4
34192,up\ni,"[Ġup, Ċ, i]",3
10863,houston,"[Ġh, ouston]",2


In [15]:
# Show the first 50 rows that survived the filters, in their original order.
filtered_tokens.head(50)

Unnamed: 0,phrase,tokens,num_tokens
218,", i","[,, Ġi]",2
222,nw,"[Ġn, w]",2
226,", a","[,, Ġa]",2
229,wp,"[Ġw, p]",2
236,uk,"[Ġu, k]",2
242,ae,"[Ġa, e]",2
244,gt,"[Ġg, t]",2
247,dp,"[Ġd, p]",2
250,1-,"[Ġ1, -]",2
254,. g,"[., Ġg]",2
