In [None]:
import numpy as np
import pandas as pd
from polyglot.text import Text
from IPython.utils import io
from sklearn.preprocessing import LabelEncoder

In [None]:
def remove_punctuation(data: pd.DataFrame) -> pd.DataFrame:
    """Strip punctuation from the "utt" column.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is deleted. The frame is
    modified in place and the same object is returned.
    """
    cleaned = data["utt"].str.replace(r"[^\w\s]", "", regex=True)
    data["utt"] = cleaned
    return data

In [None]:
def lowercase(data: pd.DataFrame) -> pd.DataFrame:
    """Convert the "utt" column to lower case.

    The frame is modified in place and the same object is returned.
    """
    lowered = data["utt"].str.lower()
    data["utt"] = lowered
    return data

In [None]:
def tokenize(data: pd.DataFrame) -> pd.DataFrame:
    """Split each utterance into words using polyglot.

    Two columns are written in place on ``data``:

    * "utt_text" holds the ``Text`` object built from each utterance;
    * "utt" is overwritten with the ``.words`` attribute of that object.

    The same frame is returned.
    """

    def _words(text):
        return text.words

    # Anything printed while polyglot runs is captured and discarded
    # (presumably to keep notebook output clean).
    with io.capture_output():
        data["utt_text"] = data["utt"].apply(Text)
        data["utt"] = data["utt_text"].apply(_words)
    return data

In [None]:
def drop_cols(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` without the unused metadata columns.

    The columns "worker_id", "slot_method" and "judgments" are removed.
    The input frame itself is left unchanged.
    """
    unwanted = ["worker_id", "slot_method", "judgments"]
    trimmed = data.drop(columns=unwanted)
    return trimmed

In [None]:
def encode_labels(data: pd.DataFrame) -> "tuple[pd.DataFrame, LabelEncoder]":
    """Encode the "intent" column as integer class labels.

    A fresh ``LabelEncoder`` is fitted on ``data["intent"]`` and the column
    is overwritten in place with the encoded values.

    Returns:
        A ``(data, le)`` pair: the same frame that was passed in, and the
        fitted encoder, which can later map the codes back to the original
        labels (e.g. via ``inverse_transform``).
    """
    le = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one call.
    data["intent"] = le.fit_transform(data["intent"])
    return data, le

In [None]:
def decode_labels(data: np.ndarray, le: LabelEncoder) -> np.ndarray:
    """Map encoded labels back to their original values.

    ``data`` is passed to ``le.inverse_transform`` and the result is
    returned unchanged.
    """
    return le.inverse_transform(data)