In [1]:
from collections.abc import Callable
import pandas as pd
import stopwordsiso as sw
from nltk.stem import SnowballStemmer
import simplemma

# Maps the two-letter locale codes found in the data to the language names
# passed to SnowballStemmer (see get_stemmer). Locales missing from this
# table are treated as having no stemmer available.
LANG_MAP = {
    "ar": "arabic",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "hu": "hungarian",
    "it": "italian",
    "nb": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
}

In [2]:
def get_stemmer(locale: str) -> Callable[[str], str]:
    """
    Look up the stemming function for a locale.

    Returns the bound ``stem`` method of a SnowballStemmer built for the
    language that LANG_MAP associates with ``locale``. When the locale has
    no entry in LANG_MAP, an identity function is returned so callers can
    apply the result unconditionally.
    """
    language = LANG_MAP.get(locale)
    if language is None:
        # no Snowball language known for this locale: leave tokens as they are
        return lambda token: token
    return SnowballStemmer(language).stem

In [3]:
def stem(stemmer: Callable[[str], str], utterance: list[str]) -> list[str]:
    """
    Stem every token of an already tokenized utterance.

    ``stemmer`` is called once per token, in order, and the results are
    returned as a new list. A plain string is rejected with a TypeError,
    since iterating it would stem single characters instead of tokens.
    """
    if not isinstance(utterance, str):
        return list(map(stemmer, utterance))
    raise TypeError("Utterance must be tokenized to be stemmed.")

In [4]:
def apply_stemming(data: pd.DataFrame) -> pd.DataFrame:
    """
    Stem the tokenized ``utt`` column according to each row's ``locale``.

    The stemmer for a row is obtained from ``get_stemmer``; locales without
    a known stemmer get an identity function there, so their utterances are
    kept unchanged. The ``utt`` column of ``data`` is overwritten in place
    and the same DataFrame is returned.

    Raises TypeError (from ``stem``) if an utterance is a plain string
    rather than a sequence of tokens.
    """
    # Build each locale's stemmer only once instead of once per row.
    stemmers: dict[str, Callable[[str], str]] = {}
    stemmed = []
    for locale, utterance in zip(data["locale"], data["utt"]):
        if locale not in stemmers:
            stemmers[locale] = get_stemmer(locale)
        stemmed.append(stem(stemmers[locale], utterance))

    # Iterating the two columns directly (rather than DataFrame.apply with
    # axis=1) also works on an empty frame, where apply returns a DataFrame
    # that cannot be assigned to a single column.
    # An explicit object Series keeps each token list as one cell value.
    data["utt"] = pd.Series(stemmed, index=data.index, dtype=object)
    return data

In [None]:
def lemmatize(lemmatizer: Callable[..., str], utterance: list[str], language: str) -> list[str]:
    """
    Lemmatize every token of an already tokenized utterance.

    ``lemmatizer`` is called as ``lemmatizer(token, lang=language)`` once per
    token, in order, and the results are returned as a new list.

    Raises TypeError if ``utterance`` is a plain string, since iterating it
    would lemmatize single characters instead of tokens.
    """
    if isinstance(utterance, str):
        raise TypeError("Utterance must be tokenized to be lemmatized.")
    return [lemmatizer(token, lang=language) for token in utterance]

In [5]:
def apply_lemmatization(data: pd.DataFrame) -> pd.DataFrame:
    """
    Lemmatize the tokenized ``utt`` column according to each row's ``locale``.

    Every utterance is passed to ``lemmatize`` together with
    ``simplemma.lemmatize`` and the row's locale as the language. The ``utt``
    column of ``data`` is overwritten in place and the same DataFrame is
    returned.

    Raises TypeError (from ``lemmatize``) if an utterance is a plain string
    rather than a sequence of tokens.
    """
    # NOTE(review): the original comment stated that utterances in languages
    # simplemma does not support are simply left un-lemmatized; that depends
    # on the installed simplemma version -- confirm.
    #
    # Iterating the two columns directly (rather than DataFrame.apply with
    # axis=1) also works on an empty frame, where apply returns a DataFrame
    # that cannot be assigned to a single column.
    lemmatized = [
        lemmatize(simplemma.lemmatize, utterance, locale)
        for locale, utterance in zip(data["locale"], data["utt"])
    ]
    # An explicit object Series keeps each token list as one cell value.
    data["utt"] = pd.Series(lemmatized, index=data.index, dtype=object)
    return data

In [6]:
def remove_stopwords(data: pd.DataFrame) -> pd.DataFrame:
    """
    Drop stopwords from the tokenized ``utt`` column, locale by locale.

    The presence of a ``utt_text`` column is used as the signal that the
    data has been tokenized; an Exception is raised when it is missing.
    For every distinct value in ``locale`` the stopword collection from
    ``sw.stopwords`` is fetched once and each matching row's ``utt`` is
    replaced by the list of its words that are not in that collection.
    ``data`` is modified in place and returned.
    """
    tokenized = "utt_text" in data.columns
    if not tokenized:
        raise Exception(
            "It's not possible to remove stopwords without tokenizing first."
        )

    for locale in data["locale"].unique():
        stopword_set = sw.stopwords(locale)

        def keep_content_words(words):
            # called immediately below, so it always sees this locale's set
            return [word for word in words if word not in stopword_set]

        # rows belonging to the current locale
        in_locale = data["locale"] == locale
        filtered = data.loc[in_locale, "utt"].apply(keep_content_words)
        data.loc[in_locale, "utt"] = filtered
    return data