# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Two-letter locale code -> language name passed to SnowballStemmer.
# Locales missing from this table have no stemmer configured here.
LANG_MAP = dict(
    ar="arabic",
    da="danish",
    nl="dutch",
    en="english",
    fi="finnish",
    fr="french",
    de="german",
    hu="hungarian",
    it="italian",
    nb="norwegian",
    pt="portuguese",
    ro="romanian",
    ru="russian",
    es="spanish",
    sv="swedish",
)


class _IdentityStemmer:
    """
    No-op stand-in used when no Snowball stemmer exists for a locale.

    Exposes a ``stem(token)`` method so it can be used wherever a
    ``SnowballStemmer`` is expected, and remains callable so code that
    invoked the previous identity function directly keeps working.
    """

    def stem(self, token: str) -> str:
        """Return the token unchanged."""
        return token

    # Preserve the ``stemmer(token)`` calling convention of the old fallback.
    __call__ = stem


def get_stemmer(locale: str):
    """
    Returns the appropriate stemmer, if one exists

    :param locale: locale code looked up in ``LANG_MAP``
    :return: a ``SnowballStemmer`` for the mapped language, or a no-op
        stemmer (whose ``stem`` returns its input unchanged) when the
        locale has no entry in ``LANG_MAP``
    """
    language = LANG_MAP.get(locale)
    if language is None:
        # A bare function has no ``.stem`` attribute, so return an object
        # that offers the same method as the real stemmers.
        return _IdentityStemmer()
    return SnowballStemmer(language)


def stem(stemmer, utterance: str) -> str:
    """
    Stems the utterance after tokenizing it

    :param stemmer: either an object exposing ``stem(token)`` (such as a
        ``SnowballStemmer``) or a plain callable taking a token and
        returning a string
    :param utterance: the text to tokenize with ``word_tokenize``
    :return: the processed tokens joined with single spaces
    """
    # Fall back to calling the stemmer itself when it has no ``.stem``
    # method, so a plain identity function is accepted as well.
    stem_token = getattr(stemmer, "stem", stemmer)
    return " ".join(stem_token(token) for token in word_tokenize(utterance))


def apply_stemming(data: pd.DataFrame) -> pd.DataFrame:
    """
    Apply stemming when there is a language available

    :param data: frame with a ``locale`` column (used to pick the stemmer)
        and an ``utt`` column (the text to stem)
    :return: a new frame in which ``utt`` is replaced by its stemmed form;
        ``data`` itself is not modified
    """
    # Build each stemmer once per distinct locale rather than once per row.
    stemmers = {}
    stemmed = []
    for locale, utterance in zip(data["locale"], data["utt"]):
        if locale not in stemmers:
            stemmers[locale] = get_stemmer(locale)
        stemmed.append(stem(stemmers[locale], utterance))
    # A plain list is assigned positionally, which also covers an empty frame.
    return data.assign(utt=stemmed)
