#### Imports

In [1]:
import re
import os
import numpy as np
from src.data_utils import load_texts, clean_text, save_texts, tokenize, save_tokenized, load_tokenized, is_ascii, filter_by_length, train_val_test_split
from src.next_token_dataset import NextTokenDataset

  from .autonotebook import tqdm as notebook_tqdm


#### Clean the dataset and save it to a file, or load the previously cleaned dataset

In [2]:
# Cache file for the cleaned corpus: if it already exists it is loaded as-is,
# otherwise the raw tweets are cleaned, filtered and written to it.
processed_path = 'data/dataset_processed.txt'

if os.path.exists(processed_path):
  # Reuse the result of a previous run instead of re-cleaning the raw data.
  cleaned_ascii_texts = load_texts(processed_path)
else:
  texts = load_texts('data/tweets.txt')
  cleaned_texts = [clean_text(t) for t in texts]
  # Keep only the cleaned texts that pass the is_ascii check.
  cleaned_ascii_texts = [t for t in cleaned_texts if is_ascii(t)]

  # Save to the same path the existence check above reads, so the cache
  # location is defined in exactly one place.
  save_texts(cleaned_ascii_texts, processed_path)
  print(f'Dataset cleaned and saved: {len(cleaned_ascii_texts)} lines')

#### Tokenize the cleaned dataset and save it to a file, or load the previously tokenized dataset

In [3]:
# Cache file for the tokenized corpus.
tokenized_path = 'data/dataset_tokenized.json'

if not os.path.exists(tokenized_path):
  # No cache yet: tokenize the cleaned texts and persist the result.
  tokenized = tokenize(cleaned_ascii_texts)
  save_tokenized(tokenized, tokenized_path)
  print(f'Tokenized and saved: {len(tokenized)} samples')
else:
  # Cache hit: skip tokenization and read the stored samples.
  tokenized = load_tokenized(tokenized_path)
  print(f'Loaded from file: {len(tokenized)} samples')

Loaded from file: 1596158 samples


#### Analyze lengths of samples

In [4]:
# Length of every tokenized sample (len() of each element of `tokenized`).
lengths = list(map(len, tokenized))

# Summary statistics: extremes and the mean, the latter with two decimals.
print(
  f'Min: {min(lengths)}, '
  f'Max: {max(lengths)}, '
  f'Mean: {np.mean(lengths):.2f}'
)

# Selected percentiles, computed in one call and truncated to int for display.
percentile_ranks = (50, 75, 90, 95, 99)
for rank, value in zip(percentile_ranks, np.percentile(lengths, percentile_ranks)):
  print(f'P{rank}: {int(value)}')

Min: 1, Max: 94, Mean: 16.47
P50: 15
P75: 23
P90: 29
P95: 32
P99: 37


#### Split into train, val, and test sets

In [5]:
# Cache files for the three splits. The splits are loaded only when all three
# files exist; otherwise they are recomputed and all three are rewritten.
train_path = 'data/train.json'
val_path = 'data/val.json'
test_path = 'data/test.json'

if all(os.path.exists(p) for p in [train_path, val_path, test_path]):
  train = load_tokenized(train_path)
  val = load_tokenized(val_path)
  test = load_tokenized(test_path)
else:
  # Apply the length filter (min_length=5) before splitting.
  filtered = filter_by_length(tokenized=tokenized, min_length=5)

  # train_ratio=0.8 and val_ratio=0.1 are passed explicitly; the test share is
  # presumably the remainder -- confirm in train_val_test_split. A fixed seed
  # is passed for reproducibility.
  train, val, test = train_val_test_split(data=filtered, train_ratio=0.8, val_ratio=0.1, seed=42)

  print(f'Train: {len(train)}, Val: {len(val)}, Test: {len(test)}')

  # Save to the same paths the existence check above reads, so each cache
  # location is defined in exactly one place.
  save_tokenized(train, train_path)
  save_tokenized(val, val_path)
  save_tokenized(test, test_path)

  print(f'Saved - Train: {len(train)}, Val: {len(val)}, Test: {len(test)}')

#### Create the NextTokenDataset

In [9]:
# Build the dataset from the training split with max_length=40.
nextTokenDataset = NextTokenDataset(tokenized=train, max_length=40)
print(f'nextTokenDataset size: {len(nextTokenDataset)}')

# Sanity-check the first sample: indexing yields an (x, y) pair.
# NOTE(review): in the recorded output y is x shifted left by one token and
# both have length max_length - 1 -- confirm against NextTokenDataset.
first_sample = nextTokenDataset[0]
x, y = first_sample
print(f'X shape: {x.shape}, Y shape: {y.shape}')

# Show the first ten entries of each tensor.
for label, tensor in (('X', x), ('Y', y)):
  print(f'{label}[:10]: {tensor[:10]}')

nextTokenDataset size: 1195832
X shape: torch.Size([39]), Y shape: torch.Size([39])
X[:10]: tensor([   64,  1256,   286,   366, 42478,     1,   389,  1708,   502,   783])
Y[:10]: tensor([ 1256,   286,   366, 42478,     1,   389,  1708,   502,   783,   640])
