# Create Smaller Dataset

In [None]:
# Copy the first 10,000 lines of the full corpus into a small sample file.
# Both files are managed by the `with` statement so the input handle is closed
# as well, and the input is opened first so a missing source file does not
# leave an empty output file behind. UTF-8 is stated explicitly so the copy
# does not depend on the platform's default encoding.
with open('data/cc100/pl.txt', 'r', encoding='utf-8') as src, \
        open('data/cc100/pl_1e4.txt', 'w', encoding='utf-8') as dst:
    for i, line in enumerate(src):
        if i >= 10_000:
            break
        dst.write(line)

# Clean Dataset

In [None]:
from datasets import load_dataset
import multiprocessing

In [None]:
# Text file handed to preprocess_dataset as its input corpus.
PATH_IN = 'data/cc100_demo.txt'
# Directory where preprocess_dataset saves the filtered dataset.
PATH_OUT = 'data/cc100_filtered_demo'

In [None]:
def preprocess_dataset(path_in, path_out):
    """Clean and filter a text corpus, then save the result to disk.

    Steps, in order:
      1. drop examples whose text is 15 characters or shorter;
      2. unescape HTML entities, strip markup, remove URLs and collapse every
         whitespace run to a single space;
      3. drop examples that are 15 characters or shorter after cleaning, or in
         which 90% or fewer of the characters are "allowed" (digits, the
         letters listed in the pattern below, comma, period, space).

    Args:
        path_in: Path passed as ``data_files`` to the ``'text'`` loader.
        path_out: Directory the filtered dataset is saved to.
    """
    import html as ihtml
    import re

    from bs4 import BeautifulSoup

    num_proc = multiprocessing.cpu_count()
    min_chars = 15
    min_allowed_ratio = 0.9

    # Raw string: the pattern contains `\,` and `\.`, which are invalid escape
    # sequences in a normal string literal (a SyntaxWarning on recent Python).
    # The regex itself is unchanged. Note the letter list has no q, v or x.
    disallowed_chars = r'[^0-9AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż\,\. ]+'

    def clean_text(text):
        # Unescape entities first so escaped markup is parsed and stripped too.
        text = BeautifulSoup(ihtml.unescape(text), "lxml").text
        text = re.sub(r"http[s]?://\S+", "", text)
        return re.sub(r"\s+", " ", text)

    def allowed_ratio(text):
        # Fraction of characters that survive removal of disallowed ones.
        if not text:
            return 0.0
        return len(re.sub(disallowed_chars, '', text)) / len(text)

    def is_long_enough(example):
        return len(example['text']) > min_chars

    def clean_batch(batch):
        return {'text': [clean_text(text) for text in batch['text']]}

    def is_clean_enough(example):
        text = example['text']
        return len(text) > min_chars and allowed_ratio(text) > min_allowed_ratio

    raw_datasets = load_dataset('text', data_files=path_in)
    raw_datasets = raw_datasets.filter(is_long_enough, num_proc=num_proc)
    raw_datasets = raw_datasets.map(clean_batch, batched=True, num_proc=num_proc)
    raw_datasets = raw_datasets.filter(is_clean_enough, num_proc=num_proc)
    raw_datasets.save_to_disk(path_out)


preprocess_dataset(PATH_IN, PATH_OUT)

In [None]:
from datasets import load_from_disk

# Reload the dataset saved under PATH_OUT and display its summary.
# NOTE(review): despite the name, no deduplication step is visible before this
# point — confirm what `dedup_datasets` is meant to hold.
dedup_datasets = load_from_disk(PATH_OUT)
dedup_datasets

In [None]:
# Shuffle, then show five rows of the 'train' split as a quick sanity check.
dedup_datasets.shuffle()['train'].select(range(5))[:5]

In [None]:
from transformers.models.herbert.tokenization_herbert_fast import HerbertTokenizerFast
# Module-level tokenizer loaded from the "allegro/herbert-base-cased" checkpoint;
# the tokenization and collator cells in this notebook read this global.
tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")

In [None]:
# After deduplication: load every gzipped JSON shard and merge them into one
# dataset.
import glob
from datasets import concatenate_datasets

# glob.glob returns paths in arbitrary order; sort them so the row order of
# the concatenated dataset is the same on every run and machine.
shard_paths = sorted(glob.glob('./datasets/data/*.json.gz'))
# Note: this rebinds `dedup_datasets` to a plain list of per-shard datasets.
dedup_datasets = [load_dataset('json', data_files=path)['train'] for path in shard_paths]
dedup_dataset = concatenate_datasets(dedup_datasets)
dedup_dataset

In [None]:
def tokenize_dataset(dedup_dataset, path_tokenized_out):
    """Tokenize the ``text`` column, split into train/test and save to disk.

    The text is tokenized with the module-level ``tokenizer`` (with
    truncation), the ``text`` and ``token_type_ids`` columns are dropped, the
    output format is set to torch, and 1% of the rows are held out as the
    test split (fixed seed 29).

    Args:
        dedup_dataset: Either a ``DatasetDict`` with a ``'train'`` split or a
            single ``Dataset``; it must have a ``text`` column.
        path_tokenized_out: Directory the train/test splits are saved to.
    """
    num_proc = multiprocessing.cpu_count()

    def tokenize_function(example):
        return tokenizer(example['text'], truncation=True)

    tokenized = dedup_dataset.map(tokenize_function, batched=True, num_proc=num_proc)
    tokenized = tokenized.remove_columns(['text', 'token_type_ids'])
    tokenized = tokenized.with_format('torch')
    # A DatasetDict is a dict of splits, so take its 'train' split; a single
    # Dataset (e.g. the result of concatenate_datasets) is split directly.
    to_split = tokenized['train'] if isinstance(tokenized, dict) else tokenized
    splits = to_split.train_test_split(test_size=0.01, seed=29)
    print(splits)
    splits.save_to_disk(path_tokenized_out)


# NOTE(review): `dedup_datasets` must be a dataset object here (e.g. the one
# loaded from PATH_OUT). The "after_deduplication" cell rebinds the same name
# to a list, which has no `.map`; after running that cell pass `dedup_dataset`
# instead.
tokenize_dataset(dedup_datasets, 'data/tokenized_dataset_demo')

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask, DataCollatorWithPadding
from datasets import load_from_disk
from torch.utils.data import DataLoader

BATCH_SIZE = 8


def get_dataloaders(tokenizer, path_tokenized_dataset, batch_size=None):
    """Build train and test DataLoaders over a tokenized dataset on disk.

    Both loaders collate with ``DataCollatorForWholeWordMask``; only the
    train loader shuffles.

    Args:
        tokenizer: Tokenizer handed to the collators.
        path_tokenized_dataset: Directory holding 'train' and 'test' splits.
        batch_size: Examples per batch. ``None`` (the default) uses the
            module-level ``BATCH_SIZE``, read at call time.

    Returns:
        A ``(train_loader, test_loader)`` tuple.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    splits = load_from_disk(path_tokenized_dataset)
    train = DataLoader(
        dataset=splits['train'],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=DataCollatorForWholeWordMask(tokenizer=tokenizer),
    )
    test = DataLoader(
        dataset=splits['test'],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=DataCollatorForWholeWordMask(tokenizer=tokenizer),
    )
    return train, test


train_loader, test_loader = get_dataloaders(tokenizer, 'data/tokenized_dataset_demo')

In [None]:
# Pull the first collated batch from the test loader and display it.
data = next(iter(test_loader))
data

In [None]:
# Load the dataset saved under 'datasets/tokenized_dataset_1e4' and display it.
tokenized_datasets = load_from_disk('datasets/tokenized_dataset_1e4')
tokenized_datasets

# Whole Word Masking

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask

In [None]:
# Standalone whole-word-mask collator for the experiments in this section.
# NOTE(review): this collator is documented for BERT-style tokenizers (it groups
# sub-tokens by a '##' prefix) — confirm it masks whole words as intended with
# the HerBERT tokenizer used here.
collator = DataCollatorForWholeWordMask(tokenizer)

In [None]:
# Two Polish sample sentences used as input for the tokenizer experiments.
sents = [
    'Rejestracje żetonowe na przedmioty oferowane wspólnie dla wszystkich studentów UW odbywają się w serwisie UL dostępnym pod adresem',
    'Nim przystąpisz do rejestracji, przeczytaj uważnie zasady opisane w zakładce aktualności. Pamiętaj, że w rejestracji mają uczestniczyć jedynie te osoby, które zapisują się na proseminarium lub seminarium danego rodzaju (matematyczne, informatyczne) po raz pierwszy. '
]

In [None]:
def tokenize_function(example):
    """Tokenize ``example`` and attach each sequence's word ids as 'word_ids'.

    Uses the module-level ``tokenizer`` with truncation enabled and returns
    the tokenizer output with the extra 'word_ids' entry added.
    """
    encoding = tokenizer(example, truncation=True)
    n_sequences = len(encoding['input_ids'])
    # One word-id list per tokenized sequence, in batch order.
    encoding['word_ids'] = list(map(encoding.word_ids, range(n_sequences)))
    return encoding


tokenize_function(sents)

In [None]:
# Tokenize the sample sentences with the tokenizer's default settings.
output = tokenizer(sents)

In [None]:
# Word index of each token; with no argument this presumably covers the first
# sequence only — confirm against the transformers BatchEncoding docs.
output.words()

In [None]:
# Word ids of the second (last) sample sentence. `sents` holds two sentences,
# so the valid batch indices are 0 and 1; index 2 is out of range.
output.word_ids(1)

In [None]:
# Feed the raw token-id lists straight to the whole-word-mask collator.
input_ids = output['input_ids']
collator(input_ids)

In [None]:
# Word ids with no batch index given (presumably the first sequence — confirm).
output.word_ids()

In [None]:
# Token ids of the first sample sentence.
input_ids[0]

In [None]:
# Load a single gzipped JSON shard (file-000000000001) on its own.
ds1 = load_dataset('json', data_files='datasets/data/file-000000000001.json.gz')

In [None]:
# Load a second gzipped JSON shard (file-000000000002) on its own.
ds2 = load_dataset('json', data_files='datasets/data/file-000000000002.json.gz')

In [None]:
# Collect the 'train' split of each individually loaded shard. The shards are
# listed explicitly because `datasets` is not bound before this line, so
# iterating over it would raise NameError.
datasets = [data['train'] for data in (ds1, ds2)]

# HTML

In [None]:
import html

In [None]:
# html.unescape converts the numeric entity &#039; back into an apostrophe.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Markup with no character references: html.unescape leaves the tags as they are.
text = """<div>
<h1>Title</h1>
<p>A long text........ </p>
<a href=""> a link </a>
</div>"""
html.unescape(text)

In [None]:
# Round trip: escaping and then unescaping gives back the original markup.
html.unescape(html.escape(text))

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the unescaped markup with the lxml parser and keep only its text.
BeautifulSoup(html.unescape(text), "lxml").text

# Dataset Stats

### Number of Characters

### Number of Tokens