# TL;DR
Removing stopwords reduces the number of tokens per narrative: the percentage of narratives whose length exceeds max_length is divided by 3-4. We also tried filtering out words whose TF-IDF scores fall within the stopwords' TF-IDF range, but this method is not scalable.

In [None]:
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tiktoken
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast

In [None]:
# One-time setup: uncomment to download the small English spaCy model.
# spacy.cli.download("en_core_web_sm")
# Shared spaCy pipeline; passed as `lang` to the helpers defined further down.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Label -> column-index mapping consumed by `one_hot`.
# The encoding is given explicitly so the JSON is decoded the same way on every
# platform instead of depending on the locale default.
with open("../data/01_primary/one_hot_mapping.json", encoding="utf-8") as f:
    mapping = json.load(f)

# Abbreviation -> expansion table consumed by `build_decoder_pattern` / `decode_abs`.
with open(
    "../data/01_primary/abs_decoder_domain_specific.json", encoding="utf-8"
) as f:
    decoder = json.load(f)

In [None]:
# Raw training split; kept untouched so the "before" token counts can be plotted.
df1b_train = pd.read_parquet("../data/01_primary/asrs_data_primary_train.parquet")
# Working copy whose `narrative` / `anomaly` columns are rewritten in later cells.
df1b_trained_filtered = df1b_train.copy()

In [None]:
# Set before the tokenizers are used; presumably silences the Hugging Face
# tokenizers fork/parallelism warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Two tokenizers are compared throughout: BERT (bert-base-uncased) and
# tiktoken's cl100k_base encoding.
berttokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tiktokenizer = tiktoken.get_encoding("cl100k_base")

# Count tokens per narratives

## W/ berttokenizer

In [None]:
# Number of BERT token ids per raw training narrative.
bert_count = [
    len(token_ids)
    for token_ids in map(berttokenizer.encode, df1b_train.narrative.tolist())
]

# Share (in %) of narratives longer than the 512-token limit.
token_limit = 512
share_over_limit = (
    np.count_nonzero(np.array(bert_count) > token_limit) / len(bert_count)
) * 100

sns.histplot(bert_count, bins=200, stat="percent")
plt.axvline(x=token_limit, color="red", linestyle="--")

# Vertical label next to the limit line, placed relative to the current y-range.
limit_label_y = plt.ylim()[1] * 0.7
plt.text(
    token_limit + 30,
    limit_label_y,
    f"max length = {token_limit}",
    color="red",
    rotation=90,
    fontdict={"fontsize": 8},
)

# Annotation giving the share of over-length narratives.
share_label_y = plt.ylim()[1] * 0.5
plt.text(
    700,
    share_label_y,
    f"{share_over_limit:.2f}% > max length",
    color="blue",
    fontdict={"fontsize": 12},
)

plt.title("Train set narratives lengths distribution")
plt.xlabel("Token Length")
plt.ylabel("Percentage (%)")
plt.show()

## W/ tiktokenizer

In [None]:
# Number of tiktoken token ids per raw training narrative.
tik_count = [
    len(token_ids)
    for token_ids in map(tiktokenizer.encode, df1b_train.narrative.tolist())
]

# Share (in %) of narratives longer than the 512-token limit.
token_limit = 512
share_over_limit = (
    np.count_nonzero(np.array(tik_count) > token_limit) / len(tik_count)
) * 100

sns.histplot(tik_count, bins=200, stat="percent")
plt.axvline(x=token_limit, color="red", linestyle="--")

# Vertical label next to the limit line, placed relative to the current y-range.
limit_label_y = plt.ylim()[1] * 0.7
plt.text(
    token_limit + 30,
    limit_label_y,
    f"max length = {token_limit}",
    color="red",
    rotation=90,
    fontdict={"fontsize": 8},
)

# Annotation giving the share of over-length narratives.
share_label_y = plt.ylim()[1] * 0.5
plt.text(
    700,
    share_label_y,
    f"{share_over_limit:.2f}% > max length",
    color="blue",
    fontdict={"fontsize": 12},
)

plt.title("Train set narratives lengths distribution")
plt.xlabel("Token Length")
plt.ylabel("Percentage (%)")
plt.show()

# Remove stop words

In [None]:
# Whitespace that directly precedes a punctuation mark (left over after joining
# tokens with single spaces).
_SPACE_BEFORE_PUNCT = re.compile(r"\s+([.,!?;:])")


def remove_stopwords(text: "str", lang: "spacy.Language"):
    """Return `text` with every stop-word token removed.

    Args:
        text: Text to filter.
        lang: Callable pipeline that tokenizes `text`; each yielded token must
            expose `.text` and `.is_stop`.

    Returns:
        The remaining token texts joined by single spaces, with any whitespace
        in front of ``. , ! ? ; :`` removed. The original spacing of `text` is
        not preserved.
    """
    # Use the pipeline that was passed in rather than the notebook-global `nlp`,
    # so the `lang` argument is actually honoured.
    kept = [token.text for token in lang(text) if not token.is_stop]
    return _SPACE_BEFORE_PUNCT.sub(r"\1", " ".join(kept))

In [None]:
# Overwrite every training narrative with its stop-word-free version.
df1b_trained_filtered["narrative"] = df1b_trained_filtered["narrative"].apply(
    remove_stopwords, lang=nlp
)

In [None]:
# Number of BERT token ids per stop-word-filtered training narrative.
bert_count = [
    len(token_ids)
    for token_ids in map(
        berttokenizer.encode, df1b_trained_filtered.narrative.tolist()
    )
]

# Share (in %) of narratives longer than the 512-token limit.
token_limit = 512
share_over_limit = (
    np.count_nonzero(np.array(bert_count) > token_limit) / len(bert_count)
) * 100

sns.histplot(bert_count, bins=200, stat="percent")
plt.axvline(x=token_limit, color="red", linestyle="--")

# Vertical label next to the limit line, placed relative to the current y-range.
limit_label_y = plt.ylim()[1] * 0.7
plt.text(
    token_limit + 30,
    limit_label_y,
    f"max length = {token_limit}",
    color="red",
    rotation=90,
    fontdict={"fontsize": 8},
)

# Annotation giving the share of over-length narratives.
share_label_y = plt.ylim()[1] * 0.5
plt.text(
    700,
    share_label_y,
    f"{share_over_limit:.2f}% > max length",
    color="blue",
    fontdict={"fontsize": 12},
)

plt.title("Train set narratives W/o stopwords lengths distribution")
plt.xlabel("Token Length")
plt.ylabel("Percentage (%)")
plt.show()

In [None]:
# Number of tiktoken token ids per stop-word-filtered training narrative.
tik_count = [
    len(token_ids)
    for token_ids in map(
        tiktokenizer.encode, df1b_trained_filtered.narrative.tolist()
    )
]

# Share (in %) of narratives longer than the 512-token limit.
token_limit = 512
share_over_limit = (
    np.count_nonzero(np.array(tik_count) > token_limit) / len(tik_count)
) * 100

sns.histplot(tik_count, bins=200, stat="percent")
plt.axvline(x=token_limit, color="red", linestyle="--")

# Vertical label next to the limit line, placed relative to the current y-range.
limit_label_y = plt.ylim()[1] * 0.7
plt.text(
    token_limit + 30,
    limit_label_y,
    f"max length = {token_limit}",
    color="red",
    rotation=90,
    fontdict={"fontsize": 8},
)

# Annotation giving the share of over-length narratives.
share_label_y = plt.ylim()[1] * 0.5
plt.text(
    700,
    share_label_y,
    f"{share_over_limit:.2f}% > max length",
    color="blue",
    fontdict={"fontsize": 12},
)

plt.title("Train set narratives W/o stopwords lengths distribution")
plt.xlabel("Token Length")
plt.ylabel("Percentage (%)")
plt.show()

# CLEAN OUT FREQUENT WORDS, DECODE ABBS AND CLEAN USELESS PUNCS

## Compute TFIDF for filtering

In [None]:
# Case-sensitive TF-IDF (lowercase=False) with L1 row normalisation, in float32.
vectorizer = TfidfVectorizer(dtype=np.float32, lowercase=False, norm="l1")

# Fit on the stop-word-filtered training narratives: one row per narrative.
tfidf_matrix = vectorizer.fit_transform(df1b_trained_filtered.narrative.tolist())

# Wrap the sparse matrix in a sparse DataFrame whose columns are the vocabulary
# terms, so rows can be filtered by term name later on.
df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=vectorizer.get_feature_names_out(),
)

##  decode abbs

In [None]:
def build_decoder_pattern(decoder: dict) -> re.Pattern:
    """Compile a regex that matches any abbreviation listed in `decoder`.

    For every all-uppercase key, its lower-case and capitalised spellings are
    matched as well; other keys are matched exactly as written. A term only
    matches when it is not glued to a word character on either side.

    Args:
        decoder: Mapping whose keys are the abbreviations to look for.

    Returns:
        A compiled pattern with a single capturing group holding the matched
        term. For an empty `decoder` the pattern never matches.
    """
    terms = set()
    for key in decoder:
        if not key:
            # An empty key would match the empty string at every position.
            continue
        terms.add(key)
        if key.isupper():
            terms.update((key.lower(), key.capitalize()))

    if not terms:
        # Empty negative lookahead: a pattern that can never match.
        return re.compile(r"((?!))")

    # Longest first, so "A/C" wins over "A" at the same position; the secondary
    # alphabetical key makes the compiled pattern deterministic across runs.
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))
    alternatives = [
        r"(?<!\w)" + re.escape(term) + r"(?!\w)" for term in ordered_terms
    ]
    return re.compile(r"(" + "|".join(alternatives) + r")")

In [None]:
def decode_abs(text: "str", pattern: "re.Pattern", decoder: "dict[str, str]"):
    """Expand every abbreviation matched by `pattern` using `decoder`.

    The text is first stripped and `" / "` is collapsed to `"/"`, so slash
    abbreviations that were split by tokenization can be matched. The returned
    text is this normalised form with the expansions applied.

    Each match is replaced exactly where it was found. This means lower-case
    and capitalised variants are expanded too, longer words that merely contain
    an abbreviation are left alone, and already-expanded text is never
    re-scanned.

    Args:
        text: Text to decode.
        pattern: Compiled pattern whose matches are abbreviations.
        decoder: Abbreviation -> expansion mapping.

    Returns:
        The normalised text with abbreviations expanded. A match with no entry
        in `decoder` (neither as matched nor upper-cased) is left unchanged.
    """

    def _expand(match: "re.Match") -> str:
        abb = match.group(0)
        if abb in decoder:
            return decoder[abb]
        # Case variants of an upper-case key are looked up by the upper-case key.
        return decoder.get(abb.upper(), abb)

    normalized = text.strip().replace(" / ", "/")
    return pattern.sub(_expand, normalized)

## Clean punc

In [None]:
def clean_punc(text: "str"):
    """Normalise punctuation and spacing in `text`.

    The substitutions below are applied in order; each one works on the output
    of the previous one. The result is stripped of leading/trailing whitespace.
    """
    substitutions = (
        # Attach punctuation to the preceding alphanumeric character.
        (r"([a-zA-Z0-9])\s+([.,!?;:])", r"\1\2"),
        # Collapse a run of the same mark ("!!", "??") into a single one.
        (r"(\w)([!?])\2+", r"\1\2"),
        # Replace mixed runs of "!" and "?" ("?!", "!?!?", ...) with "?".
        (r"(\w)[!?]{2,}", r"\1?"),
        # Shrink groups of three or more punctuation marks to the first one.
        (r"([.,;!?])[.,;!?]{2,}", r"\1"),
        # Drop orphan punctuation standing alone between spaces or at an edge.
        (r"(^|\s)[.,;!?](?=\s|$)", r"\1"),
        (r"^[.,;!?]\s+", ""),
        # Collapse runs of whitespace into a single space.
        (r"\s{2,}", " "),
    )
    cleaned = text
    for regex, replacement in substitutions:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned.strip()


clean_punc(
    "Bonjour , comment ça va ? Très bien , merci ! ;,, . Bonjour ; ensuite. Incroyable!! Quoi?? C'est fou!!?? Non?! ok. Bonjour ?!;;!! ok. Bonjour?!;;!!"
)
#

## Compute filterings

In [None]:
def process_single_narrative(
    i: "int",
    text: "str",
    lang: "spacy.Language",
    tdidf: "pd.DataFrame",
    code_patterns: "re.Pattern",
    decoder: "dict[str, str]",
    thd: "float",
    decode: "bool" = True,
):
    """Filter one narrative down to its punctuation and its high-TF-IDF words.

    Args:
        i: Positional row of `tdidf` holding this narrative's scores.
        text: The narrative text.
        lang: Pipeline used to tokenize `text`.
        tdidf: Score frame with one row per narrative and one column per term.
        code_patterns: Abbreviation pattern forwarded to `decode_abs`.
        decoder: Abbreviation -> expansion mapping forwarded to `decode_abs`.
        thd: A word is kept only if its score in row `i` is strictly above this.
        decode: When true, kept words are passed through `decode_abs`.

    Returns:
        The kept tokens joined by spaces, cleaned by `clean_punc`, lower-cased.
    """
    scores = tdidf.iloc[i]
    kept_words = scores[scores > thd].index

    pieces = []
    for tok in lang(text):
        # Punctuation is always kept; `clean_punc` tidies it up afterwards.
        if tok.is_punct:
            pieces.append(tok.text)
            continue
        # Words at or below the threshold (or unknown to the frame) are dropped.
        if tok.text not in kept_words:
            continue
        if decode:
            pieces.append(
                decode_abs(text=tok.text, pattern=code_patterns, decoder=decoder)
            )
        else:
            pieces.append(tok.text)

    joined = " ".join(pieces)
    return clean_punc(joined).lower()


def filter_narratives(
    narratives: "list[str]",
    lang: "spacy.Language",
    tdidf: "pd.DataFrame",
    code_patterns: "re.Pattern",
    decoder: "dict[str, str]",
    thd: "float" = 0.0,
    decode: "bool" = True,
    max_workers: "int | None" = None,
):
    """Run `process_single_narrative` over `narratives` in a thread pool.

    The position of each narrative in the list is used as its row position in
    `tdidf`. All other arguments are forwarded unchanged to
    `process_single_narrative`; `max_workers` is forwarded to the thread pool.

    Returns:
        The filtered narratives, in the same order as the input.
    """
    worker = partial(
        process_single_narrative,
        lang=lang,
        tdidf=tdidf,
        code_patterns=code_patterns,
        decoder=decoder,
        thd=thd,
        decode=decode,
    )

    def _run(indexed_narrative):
        # Unpack the (row position, text) pair produced by `enumerate`.
        position, narrative = indexed_narrative
        return worker(position, narrative)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = pool.map(_run, enumerate(narratives))
        # Draining the iterator through tqdm shows progress as results arrive.
        progress = tqdm(pending, total=len(narratives), desc="Filtering...")
        results = list(progress)
    return results

In [None]:
# Compile the abbreviation regex once; it is reused by every decoding call below.
pattern = build_decoder_pattern(decoder)

## W/o Decoding ABBS

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Trial run of the TF-IDF filter on the first 100 training narratives only,
# without abbreviation decoding, keeping words whose score is above 1e-10.
filtered_narratives = filter_narratives(
    narratives=df1b_trained_filtered.narrative.tolist()[:100],
    lang=nlp,
    tdidf=df,
    thd=1e-10,
    code_patterns=pattern,
    decoder=decoder,
    decode=False,
    max_workers=100,
)

It takes 5 minutes to process 100 narratives, so this approach is not scalable to 100k.

## Decoding abs

In [None]:
# `decode_abs` with the pattern and decoder pre-bound, so it takes only the text.
func = partial(decode_abs, pattern=pattern, decoder=decoder)

In [None]:
# Stop-word-free training narratives with abbreviations expanded, then
# punctuation cleaned (same order as the rows of `df1b_trained_filtered`).
dec_narratives_wo_stopwords = [
    clean_punc(func(text)) for text in df1b_trained_filtered.narrative.tolist()
]

In [None]:
# Number of BERT token ids per decoded, stop-word-free training narrative.
bert_count = [
    len(token_ids)
    for token_ids in map(berttokenizer.encode, dec_narratives_wo_stopwords)
]

# Share (in %) of narratives longer than the 512-token limit.
token_limit = 512
share_over_limit = (
    np.count_nonzero(np.array(bert_count) > token_limit) / len(bert_count)
) * 100

sns.histplot(bert_count, bins=200, stat="percent")
plt.axvline(x=token_limit, color="red", linestyle="--")

# Vertical label next to the limit line, placed relative to the current y-range.
limit_label_y = plt.ylim()[1] * 0.7
plt.text(
    token_limit + 30,
    limit_label_y,
    f"max length = {token_limit}",
    color="red",
    rotation=90,
    fontdict={"fontsize": 8},
)

# Annotation giving the share of over-length narratives.
share_label_y = plt.ylim()[1] * 0.5
plt.text(
    700,
    share_label_y,
    f"{share_over_limit:.2f}% > max length",
    color="blue",
    fontdict={"fontsize": 12},
)

plt.title("Decoded Train set narratives lengths distribution")
plt.xlabel("Token Length")
plt.ylabel("Percentage (%)")
plt.show()

# ONE HOT

In [None]:
def one_hot(text: "str", mapping: "dict[str, int]"):
    """Encode a `;`-separated label string as a multi-hot list.

    Each label is stripped and `" / "` is collapsed to `"/"` before it is
    looked up in `mapping`. Empty segments (empty input, trailing or doubled
    `;`) are ignored instead of being looked up as the label `""`.

    Args:
        text: Labels separated by `;`.
        mapping: Label -> position in the output list.

    Returns:
        A list of `len(mapping)` ints: 1 at the position of every label present
        in `text`, 0 elsewhere.

    Raises:
        KeyError: If a non-empty label is not a key of `mapping`.
    """
    num_labels = [0] * len(mapping)
    for raw_label in text.split(";"):
        label = raw_label.strip().replace(" / ", "/")
        if not label:
            continue
        num_labels[mapping[label]] = 1
    return num_labels

In [None]:
df1b_trained_filtered[[True] * len(df1b_trained_filtered), "anomaly"] = (
    df1b_trained_filtered.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)

In [None]:
df1b_trained_filtered.to_parquet(
    "../data/01_primary/asrs_data_primary_train_stopwords.parquet"
)

In [None]:
df1b_train_decoded = df1b_trained_filtered.copy()

In [None]:
df1b_train_decoded["narrative"] = dec_narratives_wo_stopwords

In [None]:
df1b_train_decoded.loc[[True] * len(df1b_train_decoded), "anomaly"] = (
    df1b_train_decoded.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)

In [None]:
df1b_train_decoded.to_parquet(
    "../data/01_primary/asrs_data_primary_train_decoded.parquet"
)

# PROCESSING VALIDATION AND TEST SET

In [None]:
# Load the validation and test splits.
df1b_validation = pd.read_parquet(
    "../data/01_primary/asrs_data_primary_validation.parquet"
)
df1b_test = pd.read_parquet("../data/01_primary/asrs_data_primary_test.parquet")
# Working copies that are rewritten below; the frames loaded above stay untouched.
df1b_validation_filtered = df1b_validation.copy()
df1b_test_filtered = df1b_test.copy()

In [None]:
# Overwrite every validation narrative with its stop-word-free version.
df1b_validation_filtered["narrative"] = df1b_validation_filtered[
    "narrative"
].apply(remove_stopwords, lang=nlp)

# Same treatment for the test narratives.
df1b_test_filtered["narrative"] = df1b_test_filtered["narrative"].apply(
    remove_stopwords, lang=nlp
)

In [None]:
# Expand abbreviations, then clean punctuation, for the stop-word-free
# validation and test narratives (row order is preserved).
val_dec_narratives_wo_stopwords = [
    clean_punc(func(text)) for text in df1b_validation_filtered.narrative.tolist()
]
test_dec_narratives_wo_stopwords = [
    clean_punc(func(text)) for text in df1b_test_filtered.narrative.tolist()
]

In [None]:
df1b_validation_filtered[[True] * len(df1b_validation_filtered), "anomaly"] = (
    df1b_validation_filtered.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)
df1b_test_filtered[[True] * len(df1b_test_filtered), "anomaly"] = (
    df1b_test_filtered.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)

In [None]:
df1b_validation_filtered.to_parquet(
    "../data/01_primary/asrs_data_primary_validation_stopwords.parquet"
)
df1b_test_filtered.to_parquet(
    "../data/01_primary/asrs_data_primary_test_stopwords.parquet"
)

In [None]:
df1b_validation_decoded = df1b_validation_filtered.copy()
df1b_test_decoded = df1b_test_filtered.copy()

In [None]:
df1b_validation_decoded["narrative"] = val_dec_narratives_wo_stopwords
df1b_test_decoded["narrative"] = test_dec_narratives_wo_stopwords

In [None]:
df1b_validation_decoded.loc[[True] * len(df1b_validation_decoded), "anomaly"] = (
    df1b_validation_decoded.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)
df1b_test_decoded.loc[[True] * len(df1b_test_decoded), "anomaly"] = (
    df1b_test_decoded.anomaly.apply(lambda t: one_hot(text=t, mapping=mapping))
)
df1b_validation_decoded.to_parquet(
    "../data/01_primary/asrs_data_primary_validation_decoded.parquet"
)
df1b_test_decoded.to_parquet(
    "../data/01_primary/asrs_data_primary_test_decoded.parquet"
)