In [None]:
%load_ext autoreload
%autoreload 2

# Create Smaller Dataset

In [None]:
# Copy only the first MAX_LINES lines of the full corpus into a smaller file.
MAX_LINES = 50_000_000  # integer form of the 5e7 used in the output file name

# Both handles live in the same `with`, so the source file is closed as well.
# The source is opened first so a missing input does not leave an empty output
# file behind; UTF-8 is explicit so decoding does not depend on the locale.
with open('data/cc100/pl.txt', 'r', encoding='utf-8') as src, \
        open('data/cc100/pl_5e7.txt', 'w', encoding='utf-8') as dst:
    for i, line in enumerate(src):
        if i >= MAX_LINES:
            break
        dst.write(line)

# Clean Dataset

In [None]:
from datasets import load_dataset
import multiprocessing

In [None]:
PATH_IN = 'data/cc100/pl_5e7.txt'
PATH_OUT = 'data/cc100_filtered_5e7'

In [None]:
def preprocess_dataset(path_in, path_out):
    """Clean a line-per-example text file and save the filtered dataset to disk.

    Steps, in order:
      1. load ``path_in`` with the ``text`` loader;
      2. drop examples whose text has 15 characters or fewer;
      3. unescape HTML entities, strip markup, remove URLs and collapse runs of
         whitespace into a single space;
      4. drop examples that are (again) 15 characters or fewer, or in which at
         most 90% of the characters are digits, the listed Polish-alphabet
         letters, commas, periods or spaces;
      5. save the result to ``path_out`` with ``save_to_disk``.

    Args:
        path_in: path of the raw text file.
        path_out: directory the filtered dataset is written to.
    """
    import html as ihtml
    import re

    from bs4 import BeautifulSoup

    num_proc = multiprocessing.cpu_count()
    min_chars = 15
    min_allowed_ratio = 0.9

    url_pattern = r"http[s]?://\S+"
    whitespace_pattern = r"\s+"
    # Raw string: the pattern contains `\,` and `\.`, which are invalid escape
    # sequences in an ordinary string literal. The regex itself is unchanged.
    disallowed_pattern = r'[^0-9AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż\,\. ]+'

    def clean_text(text):
        """Return ``text`` without HTML markup/entities or URLs, whitespace collapsed."""
        text = BeautifulSoup(ihtml.unescape(text), "lxml").text
        text = re.sub(url_pattern, "", text)
        text = re.sub(whitespace_pattern, " ", text)
        return text

    def allowed_ratio(text):
        """Fraction of characters left after removing disallowed characters."""
        return len(re.sub(disallowed_pattern, '', text)) / len(text)

    def is_long_enough(example):
        return len(example['text']) > min_chars

    def clean_batch(batch):
        return {'text': [clean_text(text) for text in batch['text']]}

    def is_clean(example):
        text = example['text']
        # The length check runs first, so allowed_ratio never sees an empty
        # string (which would divide by zero).
        return len(text) > min_chars and allowed_ratio(text) > min_allowed_ratio

    raw_datasets = load_dataset('text', data_files=path_in)
    raw_datasets = raw_datasets.filter(is_long_enough, num_proc=num_proc)
    raw_datasets = raw_datasets.map(clean_batch, batched=True, num_proc=num_proc)
    raw_datasets = raw_datasets.filter(is_clean, num_proc=num_proc)
    raw_datasets.save_to_disk(path_out)
    
preprocess_dataset(PATH_IN, PATH_OUT)

In [None]:
from datasets import load_from_disk
# Reload the dataset that preprocess_dataset saved under PATH_OUT and display it.
# NOTE(review): despite the name, no deduplication step is visible before this
# point in the notebook — the name presumably anticipates a later step; confirm.
dedup_datasets = load_from_disk(PATH_OUT)
dedup_datasets

In [None]:
# An empty subscript is a syntax error; show the 'train' split, which is the
# split the rest of the notebook indexes.
dedup_datasets['train']

In [None]:
# Peek at five random examples. Only the train split is shuffled, and the seed
# is fixed so the same sample is displayed on every run.
dedup_datasets['train'].shuffle(seed=29).select(range(5))[:5]

In [None]:
# Import from the package root (the public API) instead of the internal
# per-model module path.
from transformers import HerbertTokenizerFast
tokenizer = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")

In [None]:
# Display the tokenizer's special-token mapping.
tokenizer.special_tokens_map

In [None]:
# After deduplication: reload the gzipped JSON shards and merge them.
# NOTE(review): the deduplication step itself is not visible in this notebook —
# presumably it runs externally and writes ./datasets/data/*.json.gz; confirm.
import glob
from datasets import DatasetDict, concatenate_datasets

# Sorted so the concatenation order (and therefore the seeded train/test split
# made downstream) does not depend on filesystem listing order.
dedup_shard_paths = sorted(glob.glob('./datasets/data/*.json.gz'))
dedup_dataset = concatenate_datasets(
    [load_dataset('json', data_files=path)['train'] for path in dedup_shard_paths]
)
# The tokenize_dataset* functions call .map on their input and then index the
# result with ['train'], so expose the merged data as a DatasetDict with a
# single 'train' split rather than as a plain list of shards.
dedup_datasets = DatasetDict({'train': dedup_dataset})
dedup_dataset

In [None]:
# ver1
def tokenize_dataset1(dedup_dataset, path_tokenized_out):
    """Tokenize every text on its own (with truncation), split, and save.

    The input is mapped through the module-level ``tokenizer``, the ``text``
    and ``token_type_ids`` columns are dropped, the format is set to ``torch``,
    and the ``train`` split is divided into train/test (1% test, seed 29).
    The resulting splits are printed and written to ``path_tokenized_out``.

    Args:
        dedup_dataset: object exposing ``map`` whose result has a ``train``
            split (the code indexes it with ``['train']``).
        path_tokenized_out: directory passed to ``save_to_disk``.
    """
    def tokenize_function(example):
        return tokenizer(example['text'], truncation=True)

    worker_count = multiprocessing.cpu_count()
    encoded = (
        dedup_dataset
        .map(tokenize_function, batched=True, num_proc=worker_count)
        .remove_columns(['text', 'token_type_ids'])
        .with_format('torch')
    )
    splits = encoded['train'].train_test_split(test_size=0.01, seed=29)
    print(splits)
    splits.save_to_disk(path_tokenized_out)
    
tokenize_dataset1(dedup_datasets, 'data/tokenized_dataset_demo')

In [None]:
# ver2
def get_proper_idx1(idx, context_length, words_ids):
    """Pick a chunk end near ``idx + context_length`` that does not split a word.

    Args:
        idx: start position of the current chunk.
        context_length: maximum number of positions in the chunk.
        words_ids: per-position word ids; consecutive equal ids are treated as
            sub-tokens of the same word.

    Returns:
        A pair ``(end, next_start)``; both are always the same position here.
        The pair form matches ``get_proper_idx2``.
    """
    end = idx + context_length
    # The window reaches (or passes) the last position: nothing to align.
    if end >= len(words_ids) - 1:
        return end, end

    # Move the cut left while it would fall between two sub-tokens of one word.
    cut = end
    while cut > idx and words_ids[cut] == words_ids[cut - 1]:
        cut -= 1

    # A single word fills the whole window: backing off would yield an empty
    # chunk (and a caller that never advances), so cut mid-word instead.
    if cut <= idx:
        cut = end
    return cut, cut
            
def get_proper_idx2(idx, context_length, words_ids, min_fragment=20):
    """Pick a chunk end that splits neither a word nor leaves a tiny fragment.

    ``words_ids`` holds one entry per position; ``None`` entries act as
    separators between texts, and consecutive equal ids are sub-tokens of one
    word.

    Args:
        idx: start position of the current chunk.
        context_length: maximum number of positions in the chunk.
        words_ids: per-position word ids with ``None`` separators.
        min_fragment: a piece of text shorter than this many positions on
            either side of the cut is not kept as a fragment.

    Returns:
        ``(end, next_start)``: the chunk is ``[idx, end)`` and the next chunk
        starts at ``next_start``. They differ only when a short tail of the
        current text is skipped.
    """
    n = len(words_ids)
    end = idx + context_length

    # The window reaches (or passes) the last position: nothing to align.
    if end >= n - 1:
        return end, end
    # The window already ends on a separator.
    if words_ids[end - 1] is None:
        return end, end

    # Move the cut left while it would fall between two sub-tokens of one word.
    cut = end
    while cut > idx and words_ids[cut] == words_ids[cut - 1]:
        cut -= 1
    # A single word fills the whole window: cut mid-word rather than return a
    # position that would not advance the caller.
    if cut <= idx:
        return end, end

    # Position right after the nearest separator at or left of the cut
    # (bounded at 0 so the scan never wraps around to negative indices).
    text_start = cut
    while text_start > 0 and words_ids[text_start - 1] is not None:
        text_start -= 1
    # Position right after the nearest separator at or right of the cut
    # (bounded at n so a sequence without a trailing separator cannot overrun).
    text_end = cut
    while text_end < n and words_ids[text_end - 1] is not None:
        text_end += 1

    # Only a short head of the text fits: end the chunk before that text.
    # `text_start > idx` keeps the chunk non-empty.
    if cut - text_start < min_fragment and text_start > idx:
        return text_start, text_start
    # Only a short tail would remain: end here and skip the tail.
    if text_end - cut < min_fragment:
        return cut, text_end
    return cut, cut
            

def tokenize_dataset2(dedup_dataset, path_tokenized_out, context_length=400):
    """Pack tokenized texts into chunks of up to ``context_length`` tokens and save.

    Within each batch the texts are tokenized without special tokens, joined
    into one stream with a separator token after every text, and cut into
    chunks at the positions chosen by ``get_proper_idx2``. Every chunk is
    prefixed with one CLS token, so chunks hold at most ``context_length + 1``
    ids. Chunks with fewer than 30 ids are dropped, the ``train`` split is
    divided into train/test (1% test, seed 29), printed, and written to
    ``path_tokenized_out``.

    Args:
        dedup_dataset: object exposing ``map``/``filter`` whose result has a
            ``train`` split (the code indexes it with ``['train']``).
        path_tokenized_out: directory passed to ``save_to_disk``.
        context_length: maximum number of stream tokens per chunk (excluding
            the prepended CLS token).
    """
    num_proc = multiprocessing.cpu_count()

    def tokenize_function(example):
        cls_id = tokenizer.cls_token_id
        sep_id = tokenizer.sep_token_id

        # Position 0 is a sentinel: a `None` word id marking the left edge of
        # the stream, with a placeholder token so both lists stay aligned.
        all_input_ids = [cls_id]
        all_words_ids = [None]
        # Do not let the tokenizer add special tokens; separators are inserted
        # manually between texts below.
        results = tokenizer(example['text'], add_special_tokens=False)
        for i, input_ids in enumerate(results['input_ids']):
            all_input_ids.extend(input_ids)
            all_input_ids.append(sep_id)

            all_words_ids.extend(results.word_ids(i))
            all_words_ids.append(None)

        chunks = []
        # Start after the sentinel: slicing from 0 would put the sentinel token
        # right behind the CLS token prepended below, i.e. two CLS in a row.
        start = 1
        while start < len(all_input_ids):
            # get_proper_idx2 avoids cutting in the middle of a word.
            j_min, j_max = get_proper_idx2(start, context_length, all_words_ids)
            if j_max <= start:
                # Never loop forever on a non-advancing cut: fall back to a
                # plain fixed-size window.
                j_min = j_max = start + context_length
            chunks.append([cls_id] + all_input_ids[start:j_min])
            start = j_max
        return {'input_ids': chunks}

    tokenized_dataset = dedup_dataset.map(tokenize_function, batched=True, num_proc=num_proc, remove_columns=['text'])
    tokenized_dataset = tokenized_dataset.filter(lambda x: len(x['input_ids']) >= 30, num_proc=num_proc)
    tokenized_dataset = tokenized_dataset['train'].train_test_split(test_size=0.01, seed=29)
    print(tokenized_dataset)
    tokenized_dataset.save_to_disk(path_tokenized_out)
    
# The window is one shorter than model_max_length because tokenize_function
# prepends one extra token to every chunk it emits.
tokenize_dataset2(dedup_datasets, 'data/tokenized_dataset_5e7', context_length=tokenizer.model_max_length-1)

In [None]:
from datasets import load_from_disk
from torch.utils.data import DataLoader
from models.collator import DataCollatorForWholeWordMask
BATCH_SIZE = 8
def get_dataloaders(tokenizer, path_tokenized_dataset):
    """Build train and test ``DataLoader``s from a dataset saved on disk.

    Both loaders use ``BATCH_SIZE`` and their own
    ``DataCollatorForWholeWordMask`` built from ``tokenizer``; only the train
    loader shuffles.

    Args:
        tokenizer: tokenizer handed to the collators.
        path_tokenized_dataset: directory readable by ``load_from_disk`` that
            contains ``train`` and ``test`` splits.

    Returns:
        ``(train_loader, test_loader)``.
    """
    splits = load_from_disk(path_tokenized_dataset)

    def make_loader(split_name, shuffle):
        # A fresh collator per loader, one for each split.
        return DataLoader(
            dataset=splits[split_name],
            shuffle=shuffle,
            batch_size=BATCH_SIZE,
            collate_fn=DataCollatorForWholeWordMask(tokenizer=tokenizer),
        )

    return make_loader('train', True), make_loader('test', False)


# NOTE(review): no cell visible in this notebook writes
# 'data/tokenized_dataset_demo2' (the visible save targets are
# 'data/tokenized_dataset_demo' and 'data/tokenized_dataset_5e7') — confirm
# this directory exists or point at the intended one.
train_loader, test_loader = get_dataloaders(tokenizer, 'data/tokenized_dataset_demo2')

In [None]:
# Pull a single collated batch from the training loader and display it.
data = next(iter(train_loader))
data

In [None]:
from datasets import load_from_disk
# Reload the chunked dataset written by tokenize_dataset2 and display its summary.
tokenized_datasets = load_from_disk('data/tokenized_dataset_5e7')
tokenized_datasets

# HTML

In [None]:
import html

In [None]:
# Demo: html.unescape turns character references such as &#039; back into
# the characters they stand for.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Demo: this snippet contains tags but no character references, so
# html.unescape has nothing to convert — tags are not removed by it.
text = """<div>
<h1>Title</h1>
<p>A long text........ </p>
<a href=""> a link </a>
</div>"""
html.unescape(text)

In [None]:
# Demo: escape the markup into character references, then unescape it again.
html.unescape(html.escape(text))

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Demo of the cleaning step used in preprocess_dataset: unescape, parse with
# the "lxml" parser, and keep only the text content.
BeautifulSoup(html.unescape(text), "lxml").text

# Dataset Stats

In [None]:
from datasets import load_from_disk
# Load the chunked dataset used for the statistics below.
# NOTE(review): an identical load cell appears earlier in the notebook; one of
# the two is probably redundant.
tokenized_datasets = load_from_disk('data/tokenized_dataset_5e7')
tokenized_datasets

### Num of Chars

### Num of Tokens

In [None]:
import numpy as np
from tqdm.auto import tqdm

# Count the ids in the train split that differ from EXCLUDED_TOKEN_ID.
# NOTE(review): id 1 is presumably the tokenizer's padding id — confirm
# against tokenizer.pad_token_id.
EXCLUDED_TOKEN_ID = 1

train_split = tokenized_datasets['train']
nb_tokens = 0
# The progress-bar total comes from the dataset itself, so it stays correct
# when the dataset is regenerated with a different size.
for example in tqdm(train_split, total=len(train_split)):
    ids = np.asarray(example['input_ids'])
    # Vectorized count instead of the Python builtin sum over a NumPy array.
    nb_tokens += int(np.count_nonzero(ids != EXCLUDED_TOKEN_ID))

In [None]:
nb_tokens

In [None]:
# Average number of counted tokens per training example; the divisor is read
# from the dataset rather than hard-coded.
nb_tokens / len(tokenized_datasets['train'])