In [36]:
import os
import pandas as pd
from src.data_utils import clean_text
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split

In [37]:
# Load the raw corpus: one text per line of the file.
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Trim surrounding whitespace (including the trailing newline) and drop blank lines.
texts = [stripped for stripped in map(str.strip, lines) if stripped]

# Run every remaining line through the project's clean_text helper.
cleaned_dataset = list(map(clean_text, texts))


In [38]:
# Peek at the first five entries of cleaned_dataset to sanity-check the clean_text step.
cleaned_dataset[:5]

[' switchfoot http twitpic com 2y1zl awww that s a bummer you shoulda got david carr of third day to do it d',
 'is upset that he can t update his facebook by texting it and might cry as a result school today also blah ',
 ' kenichan i dived many times for the ball managed to save 50 the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 ' nationwideclass no it s not behaving at all i m mad why am i here because i can t see you all over there ']

In [39]:
# Save the cleaned texts, one per line (no trailing newline after the last one).
output_path = 'data/cleaned_data.txt'

# Make sure the target directory exists before opening the file for writing.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_dataset))

In [40]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "bert-base-uncased"

# Load the fast BERT tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [41]:
# Register a padding token only if the loaded tokenizer does not define one.
if tokenizer.pad_token is None:
    # `pad_token` must be a single token string; only
    # `additional_special_tokens` accepts a list of tokens.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Ids of the special tokens that the later cells rely on.
pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
# len(tokenizer) also counts tokens added after pretraining, whereas
# tokenizer.vocab_size reports the base vocabulary only. Using the full size
# keeps every id (including a newly added pad id) inside the reported range.
vocab_size = len(tokenizer)

print(f'pad_id - {pad_id}, sep_id - {sep_id}, unk_id - {unk_id}, vocab_size - {vocab_size}')

pad_id - 0, sep_id - 102, unk_id - 100, vocab_size - 30522


In [42]:
# Tokenize all cleaned texts in one batched call. Special tokens are not
# inserted here and no attention mask is requested; only the ids are kept.
tokenized_data = tokenizer(
    cleaned_dataset,
    add_special_tokens=False,
    return_attention_mask=False,
)

# One list of token ids per cleaned text.
all_ids = tokenized_data['input_ids']

In [43]:
# Spot-check: the first 20 token ids of the first tokenized text.
print(all_ids[0][:20])

[6942, 13064, 8299, 1056, 9148, 25856, 2594, 4012, 1016, 2100, 2487, 2480, 2140, 22091, 2860, 2860, 2008, 1055, 1037, 26352]


In [44]:
# Hold out 10% of the tokenized texts for testing, then carve another 10% of
# the remainder off for validation. The fixed random_state keeps both splits
# reproducible.
train_val_sents, test_sents = train_test_split(all_ids, test_size=0.1, random_state=42)
train_sents, val_sents = train_test_split(train_val_sents, test_size=0.1, random_state=42)

In [45]:
# output_paths = ['data/train_data.txt', 'data/val_data.txt', 'data/test_data.txt']
# datasets = [train_sents, val_sents, test_sents]

# for num, path in enumerate(output_paths):

#     os.makedirs(os.path.dirname(path), exist_ok=True)

#     with open(path, 'w', encoding='utf-8') as f:
#         sents = ['\n'.join(str(sent)) for sent in datasets[num]]
#         f.write('\n'.join(sents))

In [46]:
# Report how many tokenized texts ended up in each split.
print(f'Train size {len(train_sents)}, val size {len(val_sents)}, test_size {len(test_sents)}')

Train size 1296403, val size 144045, test_size 160050


In [53]:
def build_blocks_from_stream(sequences: list[list[int]],
                             seq_len: int,
                             sep_id: int = None,
                             step: int = None
                             ) -> tuple[list[list[int]], list[list[int]]]:
    """Pack token sequences into one stream and cut it into next-token blocks.

    All non-empty sequences are concatenated in order, each followed by
    ``sep_id`` when one is given. The stream is then cut into windows that
    start every ``step`` tokens. Each window yields an input block and a
    target block that is the same span shifted one token to the right.

    Args:
        sequences: Token-id sequences; empty ones are skipped.
        seq_len: Maximum number of tokens per block.
        sep_id: Id appended after every non-empty sequence, or ``None`` to
            concatenate without separators.
        step: Distance between consecutive window starts. Defaults to
            ``seq_len``, which gives non-overlapping windows.

    Returns:
        ``(inputs, targets)`` where ``targets[k][j]`` is the token that
        follows ``inputs[k][j]`` in the stream. Both lists are empty when the
        stream holds fewer than two tokens. Blocks that reach the end of the
        stream may be shorter than ``seq_len``, but an input and its target
        always have the same length.
    """
    stream = []
    for seq in sequences:
        if len(seq) == 0:
            continue
        stream.extend(seq)
        if sep_id is not None:
            stream.append(sep_id)
    # At least two tokens are needed to form a single (input, target) pair.
    if len(stream) < 2:
        return [], []

    if step is None:
        step = seq_len

    inputs, targets = [], []
    # Stop one token early: the last token can only ever be a target.
    for start in range(0, len(stream) - 1, step):
        # Take one extra token so that every input position has a target,
        # then derive both blocks from the same window. Near the end of the
        # stream this shortens the input instead of leaving it one token
        # longer than its target.
        window = stream[start:start + seq_len + 1]
        inp, tgt = window[:-1], window[1:]
        if not tgt:
            continue
        inputs.append(inp)
        targets.append(tgt)
    return inputs, targets

In [54]:
# Block length and window stride, both in tokens. With STEP equal to SEQ_LEN,
# consecutive windows start SEQ_LEN tokens apart.
SEQ_LEN = 64
STEP = 64

# Every split is packed with the same settings, so spell them out once.
block_kwargs = dict(seq_len=SEQ_LEN, sep_id=sep_id, step=STEP)

train_inputs, train_targets = build_blocks_from_stream(train_sents, **block_kwargs)
val_inputs, val_targets = build_blocks_from_stream(val_sents, **block_kwargs)
test_inputs, test_targets = build_blocks_from_stream(test_sents, **block_kwargs)

print(f'Blocks {len(train_inputs)}, {len(val_inputs)}, {len(test_inputs)}')

Blocks 362389, 40228, 44662
