## Setup

In [1]:
from pathlib import Path

import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer

In [2]:
# All locations are relative to the working directory. The input parquet
# file and the tokenized output both live under data/interim.
data_path = Path("..", "data")
input_path = data_path.joinpath("interim")
input_file_name = "train.parquet"
output_path = data_path.joinpath("interim")
output_file_name = "toxic_comments"

In [3]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained further down.
model_name = "distilbert-base-uncased"

## Load Data

In [4]:
# Load the input parquet file into a DataFrame.
df = pd.read_parquet(input_path.joinpath(input_file_name))

In [5]:
# Keep only the text and its label; no other columns are used below.
df_subset = df.loc[:, ["comment_text", "label"]]

## Process Data

In [6]:
# Convert the frame to a Hugging Face Dataset. preserve_index=False stops a
# non-default pandas index (e.g. one left over from upstream filtering) from
# being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df_subset, preserve_index=False)
# Hold out 20% of the rows as the test split; the fixed seed keeps the
# shuffle reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=32)
dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 360975
    })
})

In [7]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [8]:
def tokenize(batch):
    """Tokenize the ``comment_text`` entries of a batch.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"comment_text"`` entry holding the texts to
            encode, as passed in by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # Truncate only; do not pad here. `padding=True` pads to the longest
    # sequence of each call, and `map` calls this once per batch, so the
    # stored rows would have batch-dependent lengths with pad tokens baked
    # into the saved dataset. Pad at training time instead, e.g. with
    # DataCollatorWithPadding.
    return tokenizer(batch["comment_text"], truncation=True)

In [9]:
# Apply `tokenize` to both splits, 500 rows per call.
dataset = dataset.map(
    function=tokenize,
    batched=True,
    batch_size=500,
)

100%|██████████| 2888/2888 [02:20<00:00, 20.53ba/s]
100%|██████████| 722/722 [00:35<00:00, 20.20ba/s]


In [10]:
# Display the splits to check which columns tokenization added.
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 360975
    })
})

## Save Data

In [11]:
# Persist both splits under the interim data directory.
dataset.save_to_disk(output_path.joinpath(output_file_name))

In [12]:
# Read the saved dataset back from disk so the round trip can be inspected.
dataset_reload = load_from_disk(output_path.joinpath(output_file_name))

In [13]:
# Display the reloaded splits; columns and row counts should match what was saved.
dataset_reload

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 1443899
    })
    test: Dataset({
        features: ['attention_mask', 'comment_text', 'input_ids', 'label'],
        num_rows: 360975
    })
})