In [None]:
import pandas as pd
from nltk.stem import SnowballStemmer
import stopwordsiso as sw

# Locale code -> language name handed to SnowballStemmer by get_stemmer.
# A locale that is not a key here gets a NoStemmer instead.
LANG_MAP = {
    "ar": "arabic",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "hu": "hungarian",
    "it": "italian",
    "nb": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
}

In [None]:
class NoStemmer:
    """
    Pass-through stand-in exposing the same stem() method as a real stemmer
    """

    def __init__(self, lang) -> None:
        # Only stored; stem() never reads it.
        self.lang = lang

    def stem(self, word):
        """
        Returns the word exactly as it was given
        """
        return word

In [None]:
def get_stemmer(locale: str):
    """
    Returns the Snowball stemmer mapped to the locale, or a NoStemmer
    when the locale has no entry in LANG_MAP
    """
    # Unmapped locales get the pass-through stemmer.
    if locale not in LANG_MAP:
        return NoStemmer(locale)
    return SnowballStemmer(LANG_MAP[locale])

In [None]:
def stem(stemmer, utterance: list) -> list:
    """
    Stems every token of an already tokenized utterance

    stemmer: object exposing a stem(token) method
    utterance: the tokens to stem; a raw string is rejected because
        iterating it would stem single characters instead of words
    Returns a new list holding the stemmed tokens in their original order.
    Raises TypeError when utterance is a str.
    """
    if isinstance(utterance, str):
        raise TypeError("Utterance must be tokenized to be stemmed.")
    return [stemmer.stem(token) for token in utterance]

In [None]:
def apply_stemming(data: pd.DataFrame):
    """
    Stems the utt column of every row with the stemmer for that row's locale

    data: frame with a locale column and an utt column of tokenized utterances
    Returns the same frame object; its utt column is overwritten in place.
    Raises TypeError (from stem) when an utterance is still a raw string.
    """
    # One stemmer per locale is enough; building one per row is wasted work.
    stemmers = {}
    stemmed = []
    for locale, utterance in zip(data["locale"], data["utt"]):
        stemmer = stemmers.get(locale)
        if stemmer is None:
            stemmer = stemmers[locale] = get_stemmer(locale)
        stemmed.append(stem(stemmer, utterance))

    # An explicit object Series on the frame's own index keeps each token list
    # as a single cell and also covers a frame with no rows.
    data["utt"] = pd.Series(stemmed, index=data.index, dtype=object)
    return data

In [None]:
def remove_stopwords(df):
    """
    Drops stopwords from the token lists in the utt column, per locale

    The frame is modified in place and returned. A missing utt_text column
    is treated as a sign that tokenization has not run, and raises.
    """
    if 'utt_text' not in df.columns:
        raise Exception("It's not possible to remove stopwords without tokenizing first.")

    for locale in df['locale'].unique():
        stop_words = sw.stopwords(locale)
        # rows written in this locale
        in_locale = df['locale'] == locale

        def keep_content_words(tokens):
            return [token for token in tokens if token not in stop_words]

        df.loc[in_locale, 'utt'] = df.loc[in_locale, 'utt'].apply(keep_content_words)
    return df