In [58]:
import ast
import re
import sys
import unicodedata

import pandas as pd

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from model_loading import load_model, distinct_special_chars

In [4]:
# Directory holding the experiment outputs, and the spreadsheet inside it
# that stores the raw token rows read later in the notebook.
base_loc = "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"
file_name = "raw_all_tokens.xlsx"

# Full path to the spreadsheet: directory and file name joined with "/".
file_loc = "/".join((base_loc, file_name))

In [16]:
# Local directory passed to the project's `load_model` helper.
model_loc = "/Volumes/BCross/models/gpt2"

# `load_model` (imported from model_loading) is unpacked as a (tokenizer, model) pair.
tokenizer, model = load_model(model_loc)

In [18]:
# Collect the tokenizer's special characters via the project helper; the
# result is passed as `special_token_list` to the filters below.
special_tokens = distinct_special_chars(tokenizer=tokenizer)
# Show the first five entries as a quick sanity check.
special_tokens[0:5]

['Ġ', 'ĉ', 'Ċ', 'č', 'ċ']

In [26]:
def ensure_tokens_are_lists(df):
    """
    Return a copy of ``df`` whose ``tokens`` column holds parsed Python objects.

    Cells that are strings (e.g. ``"['a', 'b']"``) are parsed with
    ``ast.literal_eval``; any other value is passed through unchanged.
    The input frame is not modified.
    """

    def _parse(cell):
        # Only string cells need decoding; everything else is kept as-is.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    parsed = df.copy()
    parsed["tokens"] = parsed["tokens"].apply(_parse)
    return parsed

In [27]:
def filter_min_length(df, min_tokens: int = 2):
    """
    Keep only rows whose ``num_tokens`` value is at least ``min_tokens``.

    Returns an independent copy of the surviving rows (original index kept).
    """
    long_enough = df["num_tokens"] >= min_tokens
    return df.loc[long_enough].copy()

In [32]:
def filter_only_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    Drop rows whose ``tokens`` list is non-empty and made up entirely of
    entries from ``special_token_list``.

    Tokens are compared by exact membership (no substring matching).
    Rows with an empty token list are kept. Returns a copy.
    """
    special_vocab = set(special_token_list)

    def _all_special(toks) -> bool:
        # An empty row has nothing "special" about it, so it is never dropped.
        if not toks:
            return False
        for token in toks:
            if token not in special_vocab:
                return False
        return True

    drop_mask = df["tokens"].apply(_all_special)
    return df.loc[~drop_mask].copy()

In [54]:
def filter_only_numbers_and_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    REMOVE rows where df['tokens'] contains ONLY numbers plus any strings in special_token_list.
    Example removed: ["<special>19", "<special>97", "112"]  (assuming "<special>" is in special_token_list)
    Assumes df['tokens'] is a list (items can be str/int).

    Special strings are stripped from each token in a single longest-first
    regex pass (the same approach as the other special-aware filters here).
    Stripping per token, in one pass, means that:
      - removing one special can never splice the surrounding characters into
        a *new* special that then also gets removed, and
      - a special is never matched across the boundary between two tokens.

    Args:
        df: frame with a ``tokens`` column of per-row token lists.
        special_token_list: iterable of special strings (each cast to ``str``).

    Returns:
        A copy of ``df`` without the rows that consist solely of digits once
        specials and whitespace are removed. Rows with no tokens are kept.
    """
    specials = sorted(set(map(str, special_token_list)), key=len, reverse=True)
    # Longest-first alternation so a long special wins over a shorter prefix of it.
    special_re = re.compile("|".join(re.escape(s) for s in specials)) if specials else None

    def is_only_numbers_plus_special(toks) -> bool:
        stripped = []
        for tok in toks:
            text = str(tok)
            if special_re is not None:
                text = special_re.sub("", text)
            stripped.append(text)
        # Drop any whitespace left over, then require at least one digit and nothing else.
        remainder = "".join("".join(stripped).split())
        return re.fullmatch(r"\d+", remainder) is not None

    mask = df["tokens"].apply(is_only_numbers_plus_special)
    return df.loc[~mask].copy()

In [59]:
def filter_only_punct_and_special_tokens(df: pd.DataFrame, special_token_list) -> pd.DataFrame:
    """
    Drop rows in which every token is nothing but punctuation and/or special strings.

    Each token is cast to ``str``, has every occurrence of a special string
    removed (specials may be embedded, e.g. "<special>,"), and has whitespace
    dropped. A token counts as "noise" when whatever is left is empty or made
    only of characters whose Unicode category starts with "P".
    Rows with an empty token list are kept. Returns a copy.
    """
    ordered_specials = sorted(set(map(str, special_token_list)), key=len, reverse=True)
    # Longest-first alternation; no pattern at all when there are no specials.
    pattern = re.compile("|".join(map(re.escape, ordered_specials))) if ordered_specials else None

    def _residue(tok) -> str:
        text = str(tok)
        if pattern is not None:
            text = pattern.sub("", text)
        return "".join(text.split())

    def _row_is_noise(toks) -> bool:
        if not toks:
            return False
        for tok in toks:
            for ch in _residue(tok):
                if not unicodedata.category(ch).startswith("P"):
                    return False
        return True

    drop_mask = df["tokens"].apply(_row_is_noise)
    return df.loc[~drop_mask].copy()


In [70]:
def filter_at_least_n_minus_1_specials(df: pd.DataFrame, special_token_list, n: int | None = None) -> pd.DataFrame:
    """
    REMOVE rows where:
      - if n is set: len(tokens) == n AND (# special-only tokens) >= n-1
      - if n is None: (# special-only tokens) >= len(tokens)-1  (i.e., at most 1 non-special)

    Handles specials embedded inside tokens by stripping special substrings first.
    Assumes df['tokens'] is a list (items cast to str).
    """
    specials = sorted(set(map(str, special_token_list)), key=len, reverse=True)
    special_re = re.compile("|".join(re.escape(s) for s in specials)) if specials else None

    def is_special_only(tok) -> bool:
        s = str(tok)
        if special_re:
            s = special_re.sub("", s)
        s = "".join(s.split())  # drop whitespace
        return s == ""

    def should_remove(toks) -> bool:
        if not toks:
            return False
        L = len(toks)
        if n is not None and L != n:
            return False
        special_count = sum(is_special_only(t) for t in toks)
        threshold = (n - 1) if n is not None else (L - 1)
        return special_count >= threshold

    mask = df["tokens"].apply(should_remove)
    return df.loc[~mask].copy()


In [71]:
# Load the raw rows and make sure the `tokens` column holds real lists.
raw_tokens = ensure_tokens_are_lists(pd.read_excel(file_loc))
print("Original token count:", len(raw_tokens))

# Filtering stages, applied in order. Each entry is
# (label printed after the stage, callable applied to the running frame).
_filter_stages = [
    (
        "After filter_min_length:",
        lambda frame: filter_min_length(frame, min_tokens=2),
    ),
    (
        "After filter_only_special_tokens:",
        lambda frame: filter_only_special_tokens(frame, special_tokens),
    ),
    (
        "After filter_only_number_and_special_tokens:",
        lambda frame: filter_only_numbers_and_special_tokens(frame, special_tokens),
    ),
    (
        "After filter_only_punct_and_special_tokens:",
        lambda frame: filter_only_punct_and_special_tokens(frame, special_tokens),
    ),
    (
        "After filter_at_least_n_minus_1_specials:",
        lambda frame: filter_at_least_n_minus_1_specials(frame, special_tokens),
    ),
]

# Run the stages, reporting the surviving row count after each one.
filtered_tokens = raw_tokens
for _stage_label, _stage in _filter_stages:
    filtered_tokens = _stage(filtered_tokens)
    print(_stage_label, len(filtered_tokens))

Original token count: 49866
After filter_min_length: 49862
After filter_only_special_tokens: 49861
After filter_only_number_and_special_tokens: 49850
After filter_only_punct_and_special_tokens: 49728
After filter_at_least_n_minus_1_specials: 49021


In [73]:
# Spot-check 50 random rows. The seed is pinned so the same rows are shown
# on every Restart & Run All (an unseeded sample changes on each run).
filtered_tokens.sample(50, random_state=42)

Unnamed: 0,phrase,tokens,num_tokens
4735,. dear,"[., Ġdear]",2
16349,"purposes,","[Ġpurposes, ,]",2
32389,", thank god","[,, Ġthank, Ġgod]",3
31369,if we meet,"[Ġif, Ġwe, Ġmeet]",3
4897,mmm so,"[mmm, Ġso]",2
31799,in any way,"[Ġin, Ġany, Ġway]",3
39397,"language models,","[Ġlanguage, Ġmodels, ,]",3
32481,", when they","[,, Ġwhen, Ġthey]",3
10751,which is,"[Ġwhich, Ġis]",2
37956,a single token,"[Ġa, Ġsingle, Ġtoken]",3


In [74]:
# Preview the first 50 rows of the filtered frame.
filtered_tokens.head(50)

Unnamed: 0,phrase,tokens,num_tokens
4,’,"[âĢ, Ļ]",2
12,"a,","[a, ,]",2
13,-i,"[-, i]",2
16,"e,","[e, ,]",2
17,c.,"[c, .]",2
32,",i","[,, i]",2
33,",k","[,, k]",2
34,",s","[,, s]",2
41,;t,"[;, t]",2
49,.i,"[., i]",2


In [15]:
# Preview the first 20 rows of the filtered frame.
# NOTE(review): this cell's execution count (In [15]) is lower than the
# pipeline cell's (In [71]), so its recorded output probably comes from an
# older `filtered_tokens` - confirm, then re-run or delete this cell.
filtered_tokens.head(20)

Unnamed: 0,phrase,tokens,num_tokens
4,’,"['âĢ', 'Ļ']",2
5,\ni,"['Ċ', 'i']",2
6,?\n,"['?', 'Ċ']",2
7,\na,"['Ċ', 'a']",2
8,!\n,"['!', 'Ċ']",2
9,.\n,"['.', 'Ċ']",2
10,",","[',', 'Ġ']",2
11,\nd,"['Ċ', 'd']",2
12,"a,","['a', ',']",2
13,-i,"['-', 'i']",2


In [7]:
from transformers import AutoTokenizer

# Load the tokenizer straight from the local checkpoint directory.
tok = AutoTokenizer.from_pretrained("/Volumes/BCross/models/gpt2")

# Print each special-token attribute as "<attribute name>: <value>".
for _attr_name in ("special_tokens_map", "all_special_tokens", "all_special_ids"):
    print(f"{_attr_name}:", getattr(tok, _attr_name))

special_tokens_map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
all_special_tokens: ['<|endoftext|>']
all_special_ids: [50256]


In [10]:
# Rebuild the special-character list from `tok` (the AutoTokenizer instance).
# Note: this rebinds `special_tokens`, which an earlier cell also assigns.
special_tokens = distinct_special_chars(tokenizer=tok)


In [12]:
# Show the first five special characters as a quick sanity check.
special_tokens[0:5]

['Ġ', 'ĉ', 'Ċ', 'č', 'ċ']