In [1]:
from datasets import load_dataset

# Built-in loading scripts for local files: "csv", "text", "json", "pandas".

# SQuAD-style question-answering files keep all of their records under a
# top-level "data" field, so pass field="data" to load that field as the rows.

# By default, loading local files creates a DatasetDict with only a "train" split:
# squad_ko_dataset = load_dataset("json", data_files="./data/KorQuAD_v1.0_train.json", field="data")
# squad_ko_dataset["train"][0]

# Map split names to files to get several splits in a single DatasetDict.
# Here the KorQuAD dev file is loaded under the "test" split name.
data_files = {
    "train": "./data/KorQuAD_v1.0_train.json",
    "test": "./data/KorQuAD_v1.0_dev.json",
}
# The data_files argument is flexible: it also accepts Unix shell patterns such as "*.json".
# For more detailed information, refer to the documentation:
# https://huggingface.co/docs/datasets/loading#local-and-remote-files
squad_ko_dataset = load_dataset("json", data_files=data_files, field="data")

In [3]:
# Show the splits of the loaded DatasetDict with their column names and row counts.
print(squad_ko_dataset)

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 1420
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 140
    })
})


In [4]:
# Load the tab-separated SmileStyle file with the "csv" loader; delimiter="\t" makes it parse TSV.
dataset = load_dataset(
    "csv", data_files="./data/smilestyle_dataset.tsv", delimiter="\t"
)
# Reproducible random sample: shuffle with a fixed seed, then keep the first 1,000 rows.
dataset_sample = dataset["train"].shuffle(seed=42).select(range(1000))
# Slicing returns a dict that maps each column name to a list of values (3 rows here).
print(dataset_sample[:3])

{'formal': ['네, 기말 시험 두개가 오늘 있어서 공부하느라 잠을 못 잤어요.', None, '세계여행을 하는데 가장 기대되는 나라가 있나요?'], 'informal': ['응, 기말 시험 두개가 오늘 있어서 공부하느라 잠을 못 잤어.', None, '세계여행을 하는데 가장 기대되는 나라가 있어?'], 'android': [None, None, '세계여행. 하기. 기대됨. 국가. 있는가.'], 'azae': [None, None, None], 'chat': ['ㅇㅇ 기말 시험 두개가 오늘 있어서 공부하느라 잠을 못 잠', None, '세계여행 하는데 가장 기대되는 나라 있음?'], 'choding': ['ㅇㅇ 기말 셤 두개가 오늘이라 공부하다가 잠을 못 잠', None, '세계여행 하는데 젤 기대되는 나라 잇음?'], 'emoticon': [None, None, '세계여행 하는데 가장 기대되는 나라 있어? (⊙_⊙)？'], 'enfp': [None, None, '세계여행하는데, 가장 기대되는 나라 있어??'], 'gentle': [None, None, '세계여행을 하는데 가장 기대되는 나라가 있으십니까?'], 'halbae': [None, None, '세계여행 하는디 가장 기대가 되는 나라가 있으신가?...'], 'halmae': [None, None, None], 'joongding': ['ㅇ 기말시험 두개 오늘 있어서 공부하느라 잠 못잤음', None, '세계여행 하는데 어디가 제일 기대되냐 ㅋ'], 'king': [None, None, '세계여행을 하는 데 가장 기대되는 나라가 있는가?'], 'naruto': [None, None, '세계여행 하는데 가장 기대가 되는 나라 있냐니깐!'], 'seonbi': [None, None, '세계여행 하는데 가장 기대가 되는 나라가 있소? '], 'sosim': [None, None, '세계여행 하는데 가장 기대되는 나라 있어 혹시..?'], 'translator': [None, None, None]}


In [7]:
# For every split, show the first two distinct values of the "formal" column.
for split in dataset.keys():
    # Split names such as "train" or "test"; this dataset only has "train".
    print(dataset[split].unique("formal")[:2])

['안녕하세요. 저는 고양이 6마리 키워요.', '고양이를 6마리나요? 키우는거 안 힘드세요?']


In [8]:
# Rename the "formal" column to "Formal" in every split.
# NOTE: this cell is not idempotent - re-running it fails once "formal" no longer exists.
dataset = dataset.rename_column(original_column_name="formal", new_column_name="Formal")

In [None]:
# Drop rows whose "android" value is missing (None) so the string operations below are safe.
dataset = dataset.filter(lambda x: x["android"] is not None)


# normalize the field
def lowercase_android(example):
    """Return the lowercased "android" text of a single example.

    Args:
        example: Mapping with an "android" key holding a string.

    Returns:
        A one-key dict, ``{"android": <lowercased text>}``.
    """
    lowered = example["android"].lower()
    return dict(android=lowered)


# Apply lowercase_android to every example; the returned dict updates the "android" column.
dataset = dataset.map(lowercase_android)

In [None]:
# create new column
def compute_android_length(example):
    """Count the whitespace-separated words in an example's "android" text.

    Args:
        example: Mapping with an "android" key holding a string.

    Returns:
        A one-key dict, ``{"android_length": <number of words>}``.
    """
    words = example["android"].split()
    return dict(android_length=len(words))


# Apply compute_android_length to every example; the returned key becomes the new "android_length" column.
dataset = dataset.map(compute_android_length)

In [None]:
# To inspect the longest examples first:
# dataset["train"].sort("android_length", reverse=True)[:2]
# Keep only the examples whose "android" text has more than 10 whitespace-separated words.
dataset = dataset.filter(lambda x: x["android_length"] > 10)

In [13]:
# Row count per split after the filters above.
print(dataset.num_rows)

{'train': 98}


In [None]:
import html

# Name of the column to clean.
field = "android"
# Convert HTML character references (e.g. "&amp;") back to plain characters, one example at a time.
dataset = dataset.map(lambda x: {field: html.unescape(x[field])})

In [None]:
# Batched execution: with batched=True the function receives a batch of examples at once,
# which is usually faster than batched=False.
# The batch size is configurable and defaults to 1,000.

# Batching is especially useful with fast tokenizers (AutoTokenizer uses a fast tokenizer by default).
# Fast tokenizers achieve their speedup because the tokenization code is executed in Rust,
# a language that makes it easy to parallelize code execution.
dataset = dataset.map(
    lambda x: {
        field: [html.unescape(o) for o in x[field]]
    },  # x[field] is a list of strings when batched=True, hence the list comprehension
    batched=True,
    batch_size=1000,
)

In [None]:
# Dataset.map() can also use Python multiprocessing through num_proc, which helps when the
# mapped function is not already parallelized by a Rust backend.
# In general, Python multiprocessing is not recommended for fast tokenizers with batched=True.
# NOTE: num_proc=1 runs in a single process; set it higher than 1 to actually use several processes.
dataset = dataset.map(
    lambda x: {field: [html.unescape(o) for o in x[field]]},
    batched=True,
    batch_size=1000,
    num_proc=1,
)

In [19]:
from transformers import AutoTokenizer


def tokenize_and_split(examples: dict):
    """Tokenize the "android" texts of a batch into short, fixed-budget chunks.

    Each text is truncated to ``max_length=5`` token ids and, because
    ``return_overflowing_tokens=True``, the truncated remainder is returned as
    additional chunks instead of being discarded. The result can therefore
    contain more rows than ``examples["android"]`` has entries.

    Args:
        examples: Batch mapping column names to lists; only "android" is read.

    Returns:
        The encoding produced by the notebook-level ``tokenizer``.
    """
    chunking_options = {
        "truncation": True,
        "max_length": 5,
        "return_overflowing_tokens": True,
    }
    texts = examples["android"]
    return tokenizer(texts, **chunking_options)


# Load the tokenizer that belongs to the KoELECTRA discriminator checkpoint.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
# Try the function on the first two rows: the output holds more than two chunks because
# every text longer than max_length is split into several overflow chunks.
print(tokenize_and_split(dataset["train"][:2]))

{'input_ids': [[2, 6231, 4176, 18, 3], [2, 2296, 4034, 2048, 3], [2, 18, 3311, 4176, 3], [2, 18, 6288, 18, 3], [2, 3093, 4006, 18, 3], [2, 14793, 18, 9193, 3], [2, 4325, 18, 7926, 3], [2, 18, 11122, 18, 3], [2, 7265, 7796, 18, 3], [2, 7899, 18, 6333, 3], [2, 18, 6820, 18, 3], [2, 7317, 4031, 18, 3], [2, 6288, 18, 27305, 3], [2, 4297, 18, 10652, 3], [2, 4176, 18, 2897, 3], [2, 16465, 18, 7202, 3], [2, 18, 3019, 4176, 3], [2, 18, 6841, 18, 3], [2, 7856, 18, 2884, 3], [2, 4498, 8094, 18, 3], [2, 2897, 16465, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1],

In [None]:
tokenized_dataset = dataset.map(
    tokenize_and_split,
    batched=True,
    # With return_overflowing_tokens=True one example can turn into several rows, so the
    # new columns no longer have the same length as the original ones. A Dataset cannot
    # hold columns of different lengths, so we must either remove the old columns
    # (done here) or expand them to the same size as the new columns.
    remove_columns=dataset["train"].column_names,
)

In [None]:
from transformers import AutoTokenizer


def tokenize_and_split(examples: dict):
    """Tokenize a batch into chunks and keep the original columns aligned with them.

    The "android" texts are truncated to ``max_length=5`` token ids and the
    overflow is returned as extra chunks (``return_overflowing_tokens=True``).
    Every original column is then repeated once per chunk, using the
    tokenizer's ``overflow_to_sample_mapping``, so all returned columns have
    the same length and ``Dataset.map`` can keep the old columns.

    Args:
        examples: Batch mapping each column name to a list of values; must
            contain an "android" column of strings.

    Returns:
        The tokenizer encoding (without ``overflow_to_sample_mapping``) with
        every input column added, expanded to one entry per chunk.

    Raises:
        AssertionError: If the columns of the batch differ in length.
    """
    texts = examples["android"]
    # Validate the batch against itself rather than against the global `dataset`:
    # map() may pass only part of a split (batch_size rows) and may run on any split.
    assert all(len(values) == len(texts) for values in examples.values())
    result = tokenizer(
        texts,
        truncation=True,
        max_length=5,
        return_overflowing_tokens=True,
    )
    # sample_map[k] is the index of the input example that chunk k was cut from.
    sample_map = result.pop("overflow_to_sample_mapping")
    for column, sentences in examples.items():
        result[column] = [sentences[i] for i in sample_map]
    return result


# Load the tokenizer that belongs to the KoELECTRA discriminator checkpoint.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
# No remove_columns needed here: tokenize_and_split returns the original columns
# expanded to one entry per chunk, so all columns have matching lengths.
tokenized_dataset = dataset.map(tokenize_and_split, batched=True)

In [50]:
# Number of rows after tokenization: every chunk is its own row, so this exceeds the input row count.
print(tokenized_dataset.num_rows["train"])

995


In [51]:
# Switch the output format to pandas.
# This only changes what indexing returns; the underlying storage format stays Apache Arrow.
tokenized_dataset.set_format("pandas")

In [57]:
# With the pandas format active, slicing the whole split returns it as a pandas DataFrame.
train_df = tokenized_dataset["train"][:]

In [58]:
# A pandas DataFrame can be turned back into a Dataset with Dataset.from_pandas():
from datasets import Dataset

# Do any pandas-side processing here: restore the original "formal" column name and reset the index.
train_df = train_df.rename(columns={"Formal": "formal"}).reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_df)

In [61]:
# Undo set_format("pandas") so that later cells get the default output format again.
tokenized_dataset.reset_format()

In [62]:
# Datasets provides Dataset.train_test_split(), modeled on the well-known scikit-learn function.
# train_size=0.8 keeps 80% of the rows for training; the fixed seed makes the split reproducible.
dataset_clean = tokenized_dataset["train"].train_test_split(train_size=0.8, seed=42)

In [63]:
# Display the resulting DatasetDict with its "train" and "test" splits.
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['Formal', 'informal', 'android', 'azae', 'chat', 'choding', 'emoticon', 'enfp', 'gentle', 'halbae', 'halmae', 'joongding', 'king', 'naruto', 'seonbi', 'sosim', 'translator', 'android_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 796
    })
    test: Dataset({
        features: ['Formal', 'informal', 'android', 'azae', 'chat', 'choding', 'emoticon', 'enfp', 'gentle', 'halbae', 'halmae', 'joongding', 'king', 'naruto', 'seonbi', 'sosim', 'translator', 'android_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 199
    })
})

In [64]:
# train_test_split() names the held-out split "test"; move it under the "validation" key instead.
dataset_clean["validation"] = dataset_clean.pop("test")

In [None]:
# Save the dataset. Available formats and the method that writes each one:

# Arrow: Dataset.save_to_disk()
# CSV: Dataset.to_csv()
# JSON: Dataset.to_json()
from datasets import load_from_disk

# Write every split in Arrow format to a directory, then read the DatasetDict back from it.
dataset_clean.save_to_disk("./data/smilestyle_dataset")
dataset_clean = load_from_disk("./data/smilestyle_dataset")

# # save as json (one JSON Lines file per split)
# for split, split_dataset in dataset_clean.items():
#     split_dataset.to_json(f"./data/dataset-{split}.jsonl")

# # load from json
# data_files = {
#     "train": "./data/dataset-train.jsonl",
#     "validation": "./data/dataset-validation.jsonl"
# }
# dataset_clean = load_dataset("json", data_files=data_files)