In [None]:
from datasets import load_dataset

In [None]:
# Batch size shared by the train and test DataLoaders built in get_dataloaders.
BATCH_SIZE = 8

In [None]:
# Raw text corpus that preprocess_dataset reads.
PATH_IN = './datasets/cc100_demo.txt'
# Location where preprocess_dataset saves the cleaned and filtered dataset.
PATH_OUT = 'datasets/cc100_filtered'

In [None]:
def preprocess_dataset(path_in, path_out, min_chars=20, min_ratio=0.9):
    """Clean a plain-text corpus and save the filtered result to disk.

    Processing steps, in order:
      1. drop lines that are not longer than ``min_chars`` characters,
      2. unescape HTML entities, strip markup and URLs, collapse whitespace,
      3. keep only cleaned lines that are still longer than ``min_chars`` and
         whose share of allowed characters (digits, the listed Polish letters,
         comma, period, space) exceeds ``min_ratio``.

    Args:
        path_in: Text file(s) passed to ``load_dataset('text', data_files=...)``.
        path_out: Target passed to ``save_to_disk``.
        min_chars: A line is kept only if it is longer than this many characters.
        min_ratio: A cleaned line is kept only if its allowed-character ratio
            is greater than this value.
    """
    import re
    import html as ihtml
    from bs4 import BeautifulSoup

    # Raw string so that `\,` and `\.` reach the regex engine without Python
    # emitting an invalid-escape warning; compiled once and reused per row.
    disallowed_chars = re.compile(
        r'[^0-9AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż\,\. ]+'
    )

    def clean_text(text):
        """Strip HTML and URLs from ``text`` and collapse whitespace runs."""
        text = BeautifulSoup(ihtml.unescape(text), "lxml").text
        text = re.sub(r"http[s]?://\S+", "", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def allowed_ratio(text):
        """Return the fraction of ``text`` left after removing disallowed characters."""
        if not text:
            # Avoid a ZeroDivisionError if a caller lowers min_chars below zero.
            return 0.0
        return len(disallowed_chars.sub('', text)) / len(text)

    def is_clean_enough(example):
        """Length check first, so the ratio is only computed for long-enough lines."""
        text = example['text']
        return len(text) > min_chars and allowed_ratio(text) > min_ratio

    # Fix: the original passed the undefined name `path` here instead of `path_in`.
    raw_datasets = load_dataset('text', data_files=path_in)

    raw_datasets = raw_datasets.filter(lambda x: len(x['text']) > min_chars)
    raw_datasets = raw_datasets.map(
        lambda x: {'text': [clean_text(y) for y in x['text']]},
        batched=True,
    )
    raw_datasets = raw_datasets.filter(is_clean_enough)
    raw_datasets.save_to_disk(path_out)


preprocess_dataset(PATH_IN, PATH_OUT)

In [None]:
from transformers.models.herbert.tokenization_herbert_fast import HerbertTokenizerFast
# Pretrained HerBERT fast tokenizer; later cells use this module-level `tokenizer`.
tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")

In [None]:
import glob
from datasets import concatenate_datasets

# Sort the file paths: glob returns matches in arbitrary order, and the row order
# of the concatenated dataset feeds into the seeded train/test split done later.
shard_paths = sorted(glob.glob('./datasets/data/*.json.gz'))
dedup_datasets = [load_dataset('json', data_files=path)['train'] for path in shard_paths]
dedup_dataset = concatenate_datasets(dedup_datasets)
dedup_dataset

In [None]:
def tokenize_dataset(dedup_dataset, path_tokenized_out):
    """Tokenize the 'text' column, split into train/test and save to disk.

    Uses the module-level ``tokenizer`` with truncation enabled, drops the
    columns listed below, switches the output format to 'torch', holds out
    1% of the rows as the test split (seed 29) and writes the result to
    ``path_tokenized_out``.
    """
    dropped_columns = ['text', 'token_type_ids', 'hash', 'alpha_frac']

    def encode_batch(batch):
        return tokenizer(batch['text'], truncation=True)

    splits = (
        dedup_dataset
        .map(encode_batch, batched=True)
        .remove_columns(dropped_columns)
        .with_format('torch')
        .train_test_split(test_size=0.01, seed=29)
    )
    splits.save_to_disk(path_tokenized_out)


tokenize_dataset(dedup_dataset, 'datasets/tokenized_dataset')

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask, DataCollatorWithPadding
from datasets import load_from_disk
from torch.utils.data import DataLoader
def get_dataloaders(tokenizer, path_tokenized_dataset):
    """Build the train and test DataLoaders for a tokenized dataset saved on disk.

    Each loader gets its own DataCollatorForWholeWordMask built from
    ``tokenizer`` and batches of BATCH_SIZE; only the train loader shuffles.

    Returns:
        A ``(train_loader, test_loader)`` tuple.
    """
    splits = load_from_disk(path_tokenized_dataset)

    def build_loader(split_name, shuffle):
        return DataLoader(
            dataset=splits[split_name],
            shuffle=shuffle,
            batch_size=BATCH_SIZE,
            collate_fn=DataCollatorForWholeWordMask(tokenizer=tokenizer),
        )

    return build_loader('train', True), build_loader('test', False)


train_loader, test_loader = get_dataloaders(tokenizer, 'datasets/tokenized_dataset')

In [None]:
# Fetch a single collated batch from the test loader and display it for inspection.
data = next(iter(test_loader))
data

# Whole Word Masking

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask

In [None]:
# Stand-alone whole-word-masking collator (same class as in get_dataloaders),
# applied to the example sentences in the cells below.
collator = DataCollatorForWholeWordMask(tokenizer)

In [None]:
# Two Polish example sentences used to try out the tokenizer and the collator.
sents = [
    'Rejestracje żetonowe na przedmioty oferowane wspólnie dla wszystkich studentów UW odbywają się w serwisie UL dostępnym pod adresem',
    'Nim przystąpisz do rejestracji, przeczytaj uważnie zasady opisane w zakładce aktualności. Pamiętaj, że w rejestracji mają uczestniczyć jedynie te osoby, które zapisują się na proseminarium lub seminarium danego rodzaju (matematyczne, informatyczne) po raz pierwszy. '
]

In [None]:
def tokenize_function(example):
    """Tokenize ``example`` and attach per-sequence word ids.

    ``example`` is forwarded unchanged to the module-level ``tokenizer`` with
    truncation enabled (the demo below passes a list of sentences). The
    returned encoding gains a 'word_ids' entry holding ``word_ids(i)`` for
    every index ``i`` of its 'input_ids'.
    """
    encoding = tokenizer(example, truncation=True)
    sequence_count = len(encoding['input_ids'])
    encoding['word_ids'] = list(map(encoding.word_ids, range(sequence_count)))
    return encoding
tokenize_function(sents)

In [None]:
# Tokenize both example sentences in one call; `output` is reused by the inspection cells below.
output = tokenizer(sents)

In [None]:
# NOTE(review): presumably lists, for each token of the first sentence, the index
# of the word it belongs to — confirm against the BatchEncoding documentation.
output.words()

In [None]:
# `sents` holds two sentences, so the valid batch indices are 0 and 1;
# the original index 2 is out of range. Inspect the second sentence instead.
output.word_ids(1)

In [None]:
# Keep only the token ids and pass them to the whole-word-masking collator
# to see the batch it produces.
input_ids = output['input_ids']
collator(input_ids)

In [None]:
# Word ids without an explicit batch index
# (presumably defaults to the first sentence — TODO confirm).
output.word_ids()

In [None]:
# Token ids of the first example sentence.
input_ids[0]

In [None]:
# Load a single JSON file from datasets/data on its own.
ds1 = load_dataset('json', data_files='datasets/data/file-000000000001.json.gz')

In [None]:
# Load a second JSON file from datasets/data on its own.
ds2 = load_dataset('json', data_files='datasets/data/file-000000000002.json.gz')

In [None]:
# Collect the 'train' split of each dataset loaded in the cells above (ds1, ds2).
# The original comprehension iterated over `datasets` before it was ever assigned.
datasets = [loaded['train'] for loaded in (ds1, ds2)]

# HTML

In [1]:
import html

In [2]:
# html.unescape decodes HTML entities such as &#039; back to plain characters.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [3]:
# Markup that contains no entities passes through html.unescape unchanged (tags are kept).
text = """<div>
<h1>Title</h1>
<p>A long text........ </p>
<a href=""> a link </a>
</div>"""
html.unescape(text)

'<div>\n<h1>Title</h1>\n<p>A long text........ </p>\n<a href=""> a link </a>\n</div>'

In [5]:
# Escaping and then unescaping round-trips to the original markup.
html.unescape(html.escape(text))

'<div>\n<h1>Title</h1>\n<p>A long text........ </p>\n<a href=""> a link </a>\n</div>'

In [6]:
from bs4 import BeautifulSoup

In [13]:
# Parsing with BeautifulSoup and reading .text drops the tags and keeps only the text content.
BeautifulSoup(html.unescape(text), "lxml").text

'\nTitle\nA long text........ \n a link \n'

# Dataset Stats

### Num of Chars

### Num of Tokens