In [131]:
from datasets import load_dataset
import re

# Get WikiText

In [132]:
# Load the WikiText-103 (v1) configuration of the "wikitext" dataset.
dataset = load_dataset(path="wikitext", name="wikitext-103-v1")

  table = cls._concat_blocks(blocks, axis=0)


## Prepare dataset

Wikitext contains a lot of delimiters and empty lines, so we need to clean it up a little bit.

In [133]:
# Whitespace that precedes a punctuation mark which is itself followed by
# whitespace or the end of the string; group 1 keeps the mark and what follows.
space_remover = r'\s([,?.!:;](?:\s|$))'
# A double-quoted span; group 1 is its content without the padding whitespace.
quote_space_remover = r'"\s*([^"]*?)\s*"'

def remove_spaces(samples):
    """Normalize the spacing around punctuation in every sample.

    For each sample, in order: strip surrounding whitespace, drop the
    whitespace before ``, ? . ! : ;``, drop the padding inside double quotes,
    and drop the space before an apostrophe.

    Args:
        samples: Iterable of text samples.

    Returns:
        A new list with the cleaned samples, in the original order.
    """
    cleaned = []
    for raw_text in samples:
        text = raw_text.strip()
        text = re.sub(space_remover, r'\1', text)
        text = re.sub(quote_space_remover, r'"\1"', text)
        text = text.replace(" '", "'")
        cleaned.append(text)
    return cleaned

def remove_empty(samples: list) -> list:
    """Return a new list without the falsy (e.g. empty-string) samples."""
    return list(filter(None, samples))

def remove_titles(samples: list) -> list:
    """Drop every sample that contains an ``=`` character.

    Presumably aimed at heading lines; note that any sample containing ``=``
    anywhere in its text is removed as well.
    """
    kept = []
    for text in samples:
        if "=" in text:
            continue
        kept.append(text)
    return kept

def remove_short(samples: list, min_length: int = 300) -> list:
    """Keep only the samples longer than ``min_length`` characters.

    Args:
        samples: Text samples to filter.
        min_length: Exclusive lower bound on the character count. Defaults to
            300, the threshold that used to be hard-coded.

    Returns:
        A new list with the samples whose length is strictly greater than
        ``min_length``, in the original order.
    """
    return [sample for sample in samples if len(sample) > min_length]

def remove_unk(samples: list) -> list:
    """Drop every sample that contains the literal ``<unk>`` placeholder."""
    kept = []
    for text in samples:
        if "<unk>" in text:
            continue
        kept.append(text)
    return kept

def preprocess_texts(samples: list) -> list:
    """Run the full cleaning pipeline over raw text samples.

    The steps are applied in this order: drop samples containing ``=``,
    normalize spacing, drop empty samples, drop short samples, and drop
    samples containing ``<unk>``.

    Args:
        samples: Raw text samples.

    Returns:
        The list of samples that survived every step.
    """
    pipeline = (remove_titles, remove_spaces, remove_empty, remove_short, remove_unk)
    for step in pipeline:
        samples = step(samples)
    return samples

In [134]:
# Take the "text" column of the first 100,000 rows of the train split.
dataset_train = dataset["train"][:100000]["text"]

In [135]:
# Clean the raw lines with the preprocessing pipeline defined above.
samples = preprocess_texts(dataset_train)

In [136]:
# Number of samples that survived the cleaning filters.
len(samples)

24656

After cleaning, about 25k paragraphs remain (24,656 in this run). This amount of data should be enough for training and validation on this fairly simple task.

In [137]:
# Persist the cleaned samples, one per line.
with open("../data/raw/raw_wiki_lines.txt", "w") as raw_file:
    raw_file.write("".join(sample + "\n" for sample in samples))

## Tokenize and add labels for Token classification task.

In [138]:
import spacy
import json
# Load the spaCy "en_core_web_sm" pipeline; `nlp` is used below to tokenize sentences.
nlp = spacy.load("en_core_web_sm")

In [139]:
# Read the cleaned samples back; every entry keeps its trailing newline.
with open("../data/raw/raw_wiki_lines.txt", "r") as raw_file:
    data = list(raw_file)

We want the following structure for dataset:

Our Labels: 
ID2LABEL = {0: "O", 1: "B-COMMA"}
sample = {"tokens": ["token1", "token2", "token3"], "tags": [0, 1, 0]}  # built from the text "token1 token2, token3"; the comma itself is not kept as a token

In [140]:
def sentence_to_sample(sentence: str) -> str:
    """Convert a sentence into a JSON-encoded token-classification sample.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``.
    Comma tokens are removed from the token list; a token that is directly
    followed by a comma gets tag 1 ("B-COMMA") and every other token gets
    tag 0 ("O").

    Args:
        sentence: Sentence text; surrounding whitespace is stripped first.

    Returns:
        A JSON string of the form ``{"tokens": [...], "tags": [...]}`` whose
        two lists have the same length. Input that yields no tokens produces
        two empty lists instead of raising.
    """
    words = [word.text for word in nlp(sentence.strip())]
    clean_words = []
    tags = []
    for index, word in enumerate(words):
        # Commas are the prediction target, so they never appear as tokens,
        # including a comma that happens to be the last token.
        if word == ",":
            continue
        followed_by_comma = index + 1 < len(words) and words[index + 1] == ","
        clean_words.append(word)
        tags.append(1 if followed_by_comma else 0)
    assert len(tags) == len(clean_words)
    return json.dumps({"tokens": clean_words, "tags": tags})

In [141]:
# Split every cleaned paragraph into sentences on "." and convert each
# sufficiently long sentence into a JSON sample.
# NOTE(review): splitting on "." also cuts at abbreviations and decimals; the
# minimum-length filter below discards most of the resulting fragments.
formatted_samples = []
for line in data:
    for raw_sentence in line.split("."):
        sentence = raw_sentence.strip()
        # Skip fragments with fewer than 10 whitespace-separated words.
        if len(sentence.split()) < 10:
            continue
        # Use the stripped text: with the raw segment, a line's trailing "\n"
        # would end up before the re-appended "." where strip() cannot remove it.
        formatted_samples.append(sentence_to_sample(sentence + "."))

len(formatted_samples)

122918

In [142]:
# Write the JSON samples one per line (JSON Lines layout).
with open("../data/processed/wiki_data.json", "w") as out_file:
    out_file.writelines(sample + "\n" for sample in formatted_samples)

## Let's now split the formatted wiki sentences into train, val, and test datasets.

In [147]:
from sklearn.model_selection import train_test_split

In [148]:
# Read the JSON samples back as raw lines (each keeps its trailing newline).
with open("../data/processed/wiki_data.json", "r") as in_file:
    data = list(in_file)

In [149]:
# Hold out 16% of all lines for testing, then 20% of the remainder for
# validation (about 67% train / 17% validation / 16% test overall).
train_lines, test_lines = train_test_split(
    data, test_size=0.16, random_state=42
)
train_lines, val_lines = train_test_split(
    train_lines, test_size=0.2, random_state=42
)

In [152]:
# Write each split to its own file; the lines already end with a newline.
for step, lines in (
    ("train", train_lines),
    ("validation", val_lines),
    ("test", test_lines),
):
    with open(f"../data/processed/wiki_data_{step}.json", "w") as f:
        f.write("".join(lines))

In [153]:
# Map every split name to the file written for it, then load all three at once.
dataset_structure = dict(
    train="../data/processed/wiki_data_train.json",
    validation="../data/processed/wiki_data_validation.json",
    test="../data/processed/wiki_data_test.json",
)
processed_dataset = load_dataset("json", data_files=dataset_structure)

Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 7570.95it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 365.45it/s]
Generating train split: 82600 examples [00:00, 564409.04 examples/s]
Generating validation split: 20651 examples [00:00, 614027.58 examples/s]
Generating test split: 19667 examples [00:00, 969482.37 examples/s]


In [158]:
# Save the loaded train/validation/test splits to a local dataset directory.
processed_dataset.save_to_disk("../data/processed/wiki_comma_placement")

Saving the dataset (1/1 shards): 100%|██████████| 82600/82600 [00:00<00:00, 645010.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 20651/20651 [00:00<00:00, 777583.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 19667/19667 [00:00<00:00, 720538.22 examples/s]


## Let's upload the processed dataset to Hugging Face for later use

In [163]:
from datasets import load_from_disk
from huggingface_hub import notebook_login

In [165]:
# Log in to the Hugging Face Hub via the notebook widget; presumably required before pushing below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Reload the dataset saved to disk earlier and publish it to the Hub
# under the repository name "wiki-comma-placement".
dd = load_from_disk("../data/processed/wiki_comma_placement")
dd.push_to_hub("wiki-comma-placement")