In [None]:
from datasets import load_dataset

# Built-in loading scripts for local and remote files:
# csv, text, json, pandas

# Question-answering datasets like this one keep all of their text under a single "data" field,
# so load them by passing that field name through the `field` argument:

# By default, loading local files creates a DatasetDict object with a train split.
# squad_ko_dataset = load_dataset("json", data_files="./data/KorQuAD_v1.0_train.json", field="data")
# squad_ko_dataset["train"][0]

# Put both the train and test splits into one DatasetDict by mapping each
# split name to its local file.
data_files = dict(
    train="./data/KorQuAD_v1.0_train.json",
    test="./data/KorQuAD_v1.0_dev.json",
)
# The data_files argument is flexible: it also accepts Unix shell patterns such as "*.json".
# For details, see the documentation:
# https://huggingface.co/docs/datasets/loading#local-and-remote-files
squad_ko_dataset = load_dataset(path="json", data_files=data_files, field="data")

In [None]:
# Display the loaded object to inspect its splits.
squad_ko_dataset

In [None]:
# The file is tab-separated, so use the CSV loader with a tab delimiter.
dataset = load_dataset(
    path="csv",
    data_files="./data/smilestyle_dataset.tsv",
    delimiter="\t",
)

In [None]:
# Draw a reproducible random sample of up to 1,000 training rows.
# Dataset.select() raises an IndexError for out-of-range indices, so the
# sample size is capped at the number of rows actually available.
shuffled_train = dataset["train"].shuffle(seed=42)
sample_size = min(1000, len(shuffled_train))
dataset_sample = shuffled_train.select(range(sample_size))
print(dataset_sample[:3])

In [None]:
# Peek at the first two distinct "formal" values of every split (train, test, etc.).
for split, split_dataset in dataset.items():
    print(split_dataset.unique("formal")[:2])

In [None]:
# Rename the "formal" column to "Formal" (the pandas step further down renames it back).
dataset = dataset.rename_column(original_column_name="formal", new_column_name="Formal")

In [None]:
# Drop rows whose "android" value is missing (None).
dataset = dataset.filter(lambda example: example["android"] is not None)


# Normalize the field to lowercase.
def lowercase_android(example):
    """Return an update dict holding the lowercased "android" text of ``example``."""
    lowered = example["android"].lower()
    return {"android": lowered}


# Apply the lowercasing to each example; the returned dict overwrites the "android" column.
dataset = dataset.map(lowercase_android)

In [None]:
# Derive a new column from an existing one.
def compute_android_length(example):
    """Count the whitespace-separated tokens in the "android" text of ``example``."""
    words = example["android"].split()
    return {"android_length": len(words)}


# Apply to each example; the returned key becomes the new "android_length" column.
dataset = dataset.map(compute_android_length)

In [None]:
# To inspect the longest rows instead: dataset["train"].sort("android_length", reverse=True)[:2]
# Keep only rows whose "android" text has more than 10 whitespace-separated tokens.
dataset = dataset.filter(lambda example: example["android_length"] > 10)

In [None]:
# Report how many rows remain after filtering.
print(dataset.num_rows)

In [None]:
import html

# Column whose HTML character references (e.g. "&amp;") are decoded, one example at a time.
field = "android"
dataset = dataset.map(lambda example: {field: html.unescape(example[field])})

In [None]:
# Batched execution is faster than batched=False.
# The batch size is configurable and defaults to 1,000.

# It is especially useful with fast tokenizers (AutoTokenizer returns a fast tokenizer by default).
# Fast tokenizers achieve this speedup because the tokenization code runs in Rust behind the scenes,
# a language that makes it easy to parallelize code execution.
dataset = dataset.map(
    # With batched=True each column arrives as a list, so unescape element-wise.
    lambda batch: {field: list(map(html.unescape, batch[field]))},
    batched=True,
    batch_size=1000,
)

In [None]:
# Multiprocessing is also available through num_proc (this parallelism is not backed by Rust).
# In general, Python multiprocessing is not recommended for fast tokenizers with batched=True.
# NOTE(review): num_proc=1 presumably keeps the work in a single process; raise it to
# actually parallelize — confirm against the Dataset.map documentation.
dataset = dataset.map(
    lambda x: {field: [html.unescape(o) for o in x[field]]},
    batched=True,
    batch_size=1000,
    num_proc=1,
)

In [None]:
from transformers import AutoTokenizer


def tokenize_and_split(examples: dict):
    """Tokenize the "android" texts of a batch with the module-level ``tokenizer``.

    Sequences are truncated to 5 tokens and the overflowing tokens are
    requested as well, so the result may hold more entries than the batch had
    rows.

    Args:
        examples: Batch mapping column names to lists; only "android" is read.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "max_length": 5,
        "return_overflowing_tokens": True,
    }
    return tokenizer(examples["android"], **encode_options)


# Load the pretrained tokenizer, then try the function on the first two training rows.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
print(tokenize_and_split(dataset["train"][:2]))

In [None]:
tokenized_dataset = dataset.map(
    tokenize_and_split,
    batched=True,
    # With return_overflowing_tokens=True the function can return more rows than
    # it received, so the old columns would no longer match the new length.
    # That does not work for a Dataset, so we either remove the old columns
    # (done here) or make them the same size as the new ones.
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Inspect the result: only the tokenizer outputs remain because the old columns were removed.
tokenized_dataset

In [None]:
from transformers import AutoTokenizer


def tokenize_and_split(examples: dict):
    """Tokenize the "android" texts and keep the original columns aligned.

    The tokenizer is called with truncation to 5 tokens and overflowing tokens
    enabled. Its "overflow_to_sample_mapping" entry is removed from the result
    and used to repeat each original column value once per produced entry, so
    every returned column has the same length.

    Args:
        examples: Batch mapping column names to lists of values.

    Returns:
        The tokenizer output extended with the re-aligned original columns.
    """
    encoded = tokenizer(
        examples["android"],
        truncation=True,
        max_length=5,
        return_overflowing_tokens=True,
    )
    # Index of the source row for every entry the tokenizer produced.
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded


# Load the tokenizer and map without remove_columns: the function itself now
# returns the original columns at the new length.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
tokenized_dataset = dataset.map(tokenize_and_split, batched=True)

In [None]:
# Inspect the result: the original columns are kept alongside the tokenizer outputs.
tokenized_dataset

In [None]:
# Convert to pandas DataFrame output.
# set_format switches the output format without affecting the underlying data format, which is Apache Arrow.
tokenized_dataset.set_format("pandas")

In [None]:
# Slice the whole train split; with the pandas format set, this is used as a DataFrame below.
train_df = tokenized_dataset["train"][:]

In [None]:
# Do whatever pandas work is needed: here, restore the lowercase column name
# and then replace the index with a fresh one (dropping the old index).
train_df = train_df.rename(columns={"Formal": "formal"})
train_df = train_df.reset_index(drop=True)

# Create a new Dataset object from the DataFrame with Dataset.from_pandas():
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)

In [None]:
# Inspect the Dataset rebuilt from the DataFrame.
train_dataset

In [None]:
# Undo set_format("pandas") so later cells get the default output format again.
tokenized_dataset.reset_format()

In [None]:
# Datasets provides Dataset.train_test_split(), which is modeled on the well-known scikit-learn function.
# Use 80% of the rows for training; the fixed seed makes the split reproducible.
dataset_clean = tokenized_dataset["train"].train_test_split(train_size=0.8, seed=42)

In [None]:
# Rename the default `test` split to `validation` by moving it to a new key.
dataset_clean["validation"] = dataset_clean.pop("test")

In [None]:
# Save the dataset. Available formats and the method for each:

# Arrow: Dataset.save_to_disk()
# CSV: Dataset.to_csv()
# JSON: Dataset.to_json()
from datasets import load_from_disk

# Write the splits to a local directory, then read them back from the same path.
dataset_clean.save_to_disk("./data/smilestyle_dataset")
dataset_clean = load_from_disk("./data/smilestyle_dataset")

# # save as JSON Lines (one file per split)
# for split, split_dataset in dataset_clean.items():
#     split_dataset.to_json(f"./data/dataset-{split}.jsonl")

# # load from JSON Lines
# data_files = {
#     "train": "./data/dataset-train.jsonl",
#     "validation": "./data/dataset-validation.jsonl"
# }
# dataset_clean = load_dataset("json", data_files=data_files)