In [None]:
import nltk
import pandas as pd

from gensim import corpora
from gensim.parsing import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

# Register tqdm's `progress_apply` / `progress_map` methods on pandas objects;
# later cells in this notebook call both.
tqdm.pandas()

In [None]:
# Fetch every NLTK resource the notebook needs, not only the stopword lists:
# `word_tokenize` (used when cleaning titles) loads the Punkt tokenizer models
# and raises LookupError on a fresh environment that does not have them.
# "punkt_tab" is the resource id newer NLTK releases look up; on releases whose
# index does not list it the downloader only reports an error and returns False.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Per-language stopword lookup, keyed by the NLTK language name.
#
# This cell rebinds the name `stopwords` (imported above as the NLTK corpus
# reader) to a plain dict. Reading the corpus through a private alias keeps the
# cell re-runnable: calling `stopwords.words(...)` here would raise
# AttributeError on the second execution, once `stopwords` is already the dict.
from nltk.corpus import stopwords as _stopwords_corpus

# Sets instead of lists: the membership test in the title-cleaning step runs
# once per token, so constant-time lookup matters on a large dataset.
stopwords = {
    lang: set(_stopwords_corpus.words(lang))
    for lang in ("spanish", "portuguese")
}

In [None]:
# Load every (language, split) CSV, tag each frame with the split it came from,
# and concatenate them into the single `data` DataFrame used by the rest of
# the notebook.
#
# The per-file frames are collected under their own name (`frames`) so that
# `data` is only ever a DataFrame. Previously `data` was first a list and then
# overwritten with the concatenated frame, so re-running the concat step on its
# own called `pd.concat` on a DataFrame and raised TypeError.
frames = []

for language in tqdm(["spanish", "portuguese"]):
    for split in tqdm(["train", "test", "validation"]):
        df = pd.read_csv(f"./data/meli-challenge-2019/{language}.{split}.csv.gz")
        df["split"] = split
        frames.append(df)

# ignore_index=True discards the per-file row numbers and gives a fresh
# 0..n-1 index, which later cells use for index-aligned `.loc` assignment.
data = pd.concat(frames, ignore_index=True)
data.head()

In [None]:
def clean_titles(row):
    """Normalize and tokenize the title of one row.

    The title is lowercased and passed through gensim's ``strip_tags``,
    ``strip_punctuation`` and ``strip_numeric`` helpers, then split with NLTK's
    ``word_tokenize`` using the row's language. Tokens that are stopwords for
    that language, or shorter than three characters, are dropped.

    Parameters
    ----------
    row : mapping
        Must provide ``"title"`` (str) and ``"language"`` (a key of the
        module-level ``stopwords`` dict); here, a DataFrame row.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    text = row["title"].lower()
    for strip in (
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
    ):
        text = strip(text)

    lang = row["language"]
    ignored = stopwords[lang]
    return [
        token
        for token in word_tokenize(text, language=lang)
        if token not in ignored and len(token) >= 3
    ]

# Row-wise apply (axis=1) because the cleaning depends on two columns.
data["tokenized_title"] = data.progress_apply(clean_titles, axis=1)

In [None]:
# Build one vocabulary per language and encode each tokenized title as a list
# of token ids, stored in the "data" column.

# Reserved ids for the special tokens. The unknown-word id passed to `doc2idx`
# is taken from the same constant used to patch the dictionary, so the two
# cannot drift apart.
PAD_TOKEN, PAD_INDEX = "[PAD]", 0
UNK_TOKEN, UNK_INDEX = "[UNK]", 1

# Group by the plain column name rather than a one-element list: the list form
# emits a FutureWarning on pandas 1.5 and yields 1-tuples as keys on pandas 2.x.
for language, lang_df in data.groupby("language"):
    dictionary = corpora.Dictionary(lang_df["tokenized_title"].tolist())
    # Drop tokens seen in fewer than 2 titles; no_above=1 (a fraction of the
    # corpus) disables the upper bound; cap the vocabulary at 50000 entries.
    dictionary.filter_extremes(no_below=2, no_above=1, keep_n=50000)
    dictionary.compactify()
    dictionary.patch_with_special_tokens({
        PAD_TOKEN: PAD_INDEX,
        UNK_TOKEN: UNK_INDEX
    })
    # `lang_df` keeps the row labels of `data`, so the encoded titles are
    # written back to exactly the rows of this language.
    data.loc[lang_df.index, "data"] = lang_df["tokenized_title"].progress_map(
        lambda tokens: dictionary.doc2idx(
            document=tokens,
            unknown_word_index=UNK_INDEX
        )
    )

In [None]:
# Preview the frame after the token-index "data" column has been added.
data.head()

In [None]:
# Give every distinct category an integer id, numbered in order of first
# appearance in `data`, and store the id of each row's category in "target".
categories = data["category"].unique()
label_to_target = dict(zip(categories, range(len(categories))))
data["target"] = data["category"].progress_map(
    lambda label: label_to_target[label]
)

In [None]:
# Preview the frame after the integer "target" column has been added.
data.head()

In [None]:
# Write one JSON Lines file (one record per line) for each (language, split)
# pair, next to the source CSVs. Compression is left to pandas' default
# handling of the path suffix.
for group_key, group_rows in data.groupby(["language", "split"]):
    language, split = group_key
    output_path = f"./data/meli-challenge-2019/{language}.{split}.jsonl.gz"
    group_rows.to_json(output_path, orient="records", lines=True)