# Dataset Preprocessing

In [38]:
import os
import sys
import re

import polars as pl
from tqdm.notebook import tqdm
from IPython.display import clear_output

import nltk
import spacy
import unidecode

In [2]:
# Folder (relative to "My Drive") that holds the project when running on Colab.
DRIVE_PATH = 'Colab/ToxicityClassification'

if 'google.colab' not in sys.modules:
    # Local run: data paths are resolved relative to the parent directory.
    ROOT_PATH = '../'
else:
    # Colab run: mount Google Drive and use the project folder inside it as
    # both the root path and the working directory, so data is persisted
    # to and loaded from Drive.
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

## Load Dataset

In [3]:
# Read the joint dataset from <ROOT_PATH>/data/joint/ into a polars DataFrame.
_input_path = os.path.join(ROOT_PATH, 'data', 'joint', 'data.parquet.zstd')
df = pl.read_parquet(_input_path)

## Setup Toolkits

In [7]:
# Fetch the external resources for this notebook: the NLTK 'punkt' resource
# and the spaCy 'pt_core_news_sm' pipeline package.
# NOTE(review): no visible cell uses NLTK after this download — confirm the
# 'punkt' download is still needed.
nltk.download('punkt')
spacy.cli.download('pt_core_news_sm')
# Wipe the cell output once both downloads have returned.
clear_output()

In [30]:
# Load the spaCy 'pt_core_news_sm' pipeline; the Normalize cell calls it on
# each cleaned text to obtain tokens, lemmas and stop-word flags.
nlp = spacy.load('pt_core_news_sm')

## Normalize

In [39]:
# Matches any single non-word character (punctuation, symbols) or whitespace
# character; each match is replaced by a space.
cleanup_re = re.compile(r'[\W\s]')
# Matches a run of whitespace so it can be collapsed to one space.
remove_double_spaces_re = re.compile(r'\s+')

# One accumulator per derived column. After the loop each list holds exactly
# one entry per row of `df`, in row order.
base_clean = []                      # cleaned text (str)
base_clean_lower = []                # cleaned + lowercased text (str)
tokenized = []                       # token texts (list[str])
lemmatized = []                      # token lemmas (list[str])
no_accents = []                      # token texts passed through unidecode
lemma_no_accents = []                # token lemmas passed through unidecode
no_stop_words = []                   # token texts, stop words dropped
lemma_no_stop_words = []             # token lemmas, stop words dropped
no_stop_words_no_accents = []        # unidecoded texts, stop words dropped
lemma_no_stop_words_no_accents = []  # unidecoded lemmas, stop words dropped


# TODO: generalize each pre-processing approach into a separate function
# in a separate file.
# Only the 'text' column is read, so iterate that column directly instead of
# materializing a dict of every column for each row via `iter_rows`.
for text in tqdm(df.get_column('text').to_list(), total=len(df)):
    # Remove bad characters
    text = cleanup_re.sub(' ', text)
    text = remove_double_spaces_re.sub(' ', text)
    text = text.strip()
    base_clean.append(text)

    # Lowercase
    text = text.lower()
    base_clean_lower.append(text)

    # Tokenize
    # TODO: go deeper into tokenization
    tokens = list(nlp(text))

    # Compute every per-token form exactly once; the stop-word-free variants
    # below are filtered views of these lists rather than fresh computations.
    words = [token.text for token in tokens]
    lemmas = [token.lemma_ for token in tokens]
    words_plain = [unidecode.unidecode(word) for word in words]
    lemmas_plain = [unidecode.unidecode(lemma) for lemma in lemmas]
    keep = [not token.is_stop for token in tokens]

    tokenized.append(words)

    # Lemmatized
    lemmatized.append(lemmas)

    # No accents
    no_accents.append(words_plain)
    lemma_no_accents.append(lemmas_plain)

    # No stop words
    no_stop_words.append([w for w, ok in zip(words, keep) if ok])
    lemma_no_stop_words.append([l for l, ok in zip(lemmas, keep) if ok])

    # No stop words, no accents
    no_stop_words_no_accents.append(
        [w for w, ok in zip(words_plain, keep) if ok]
    )
    lemma_no_stop_words_no_accents.append(
        [l for l, ok in zip(lemmas_plain, keep) if ok]
    )


# Explicit dtypes keep the output schema stable even when nothing can be
# inferred from the values (empty frame, or every token list empty).
_text_dtype = pl.Utf8
_tokens_dtype = pl.List(pl.Utf8)

df_ext = df.with_columns([
    pl.Series('base_clean', base_clean, dtype=_text_dtype),
    pl.Series('base_clean_lower', base_clean_lower, dtype=_text_dtype),
    pl.Series('tokenized', tokenized, dtype=_tokens_dtype),
    pl.Series('lemmatized', lemmatized, dtype=_tokens_dtype),
    pl.Series('no_accents', no_accents, dtype=_tokens_dtype),
    pl.Series('lemma_no_accents', lemma_no_accents, dtype=_tokens_dtype),
    pl.Series('no_stop_words', no_stop_words, dtype=_tokens_dtype),
    pl.Series('lemma_no_stop_words', lemma_no_stop_words, dtype=_tokens_dtype),
    pl.Series('no_stop_words_no_accents', no_stop_words_no_accents, dtype=_tokens_dtype),
    pl.Series('lemma_no_stop_words_no_accents', lemma_no_stop_words_no_accents, dtype=_tokens_dtype),
])

  0%|          | 0/27952 [00:00<?, ?it/s]

In [41]:
# Persist the extended frame as a zstd-compressed (level 9) parquet file
# under <ROOT_PATH>/data/joint/.
_output_path = os.path.join(ROOT_PATH, "data/joint/pre_processed_data.parquet.zstd")
df_ext.write_parquet(_output_path, compression="zstd", compression_level=9)