#### Imports

In [1]:
import re
import os
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from src.data_utils import (
  load_texts,
  clean_text,
  save_texts,
  tokenize,
  save_tokenized,
  load_tokenized,
  is_ascii,
  filter_by_length,
  train_val_test_split,
  prepare_tensors
)

  from .autonotebook import tqdm as notebook_tqdm


#### Clean the dataset and save it to a file, or load the already-cleaned dataset

In [2]:
# Location of the cleaned-corpus cache. The existence check, the load and the
# save all go through this one variable so they cannot point at different files.
processed_path = 'data/dataset_processed.txt'

if os.path.exists(processed_path):
  # A cleaned corpus was saved by an earlier run: load it instead of re-cleaning.
  cleaned_ascii_texts = load_texts(processed_path)
else:
  texts = load_texts('data/tweets.txt')
  cleaned_texts = [clean_text(t) for t in texts]
  # Keep only the cleaned lines that is_ascii accepts.
  cleaned_ascii_texts = [t for t in cleaned_texts if is_ascii(t)]

  # Save to processed_path (rather than a repeated literal) so the next run's
  # cache check finds exactly the file written here.
  save_texts(cleaned_ascii_texts, processed_path)
  print(f'Dataset cleaned and saved: {len(cleaned_ascii_texts)} lines')

#### Tokenize the cleaned dataset if no tokenized cache exists, then analyze sample lengths

In [3]:
tokenized_path = 'data/dataset_tokenized.json'

# Tokenization and the length report only run when no tokenized cache exists;
# with a cache present this cell does nothing besides setting tokenized_path.
if not os.path.exists(tokenized_path):
  tokenized = tokenize(cleaned_ascii_texts)
  save_tokenized(tokenized, tokenized_path)
  print(f'Tokenized and saved: {len(tokenized)} samples')

  # Length of every tokenized sample, used for the summary below.
  lengths = list(map(len, tokenized))

  shortest, longest, average = min(lengths), max(lengths), np.mean(lengths)
  print(f'Min: {shortest}, Max: {longest}, Mean: {average:.2f}')

  # Selected percentiles of the length distribution, truncated to int.
  for p in (50, 75, 90, 95, 99):
    print(f'P{p}: {int(np.percentile(lengths, p))}')

#### Split into train, val, and test sets, or load the cached splits if they exist

In [4]:
train_path = 'data/train.pt'
val_path = 'data/val.pt'
test_path = 'data/test.pt'

MAX_LENGTH = 40
BATCH_SIZE = 256

# Cache files in train/val/test order; the order is relied on below.
split_paths = (train_path, val_path, test_path)

# NOTE(review): the cache check only looks at file existence, while MAX_LENGTH
# is used solely when the tensors are rebuilt. After changing MAX_LENGTH the
# cached .pt files presumably have to be deleted by hand - confirm.
if all(map(os.path.exists, split_paths)):
  # Every split is cached: load the three dicts instead of rebuilding them.
  train, val, test = (torch.load(path) for path in split_paths)

  print('Datasets loaded from cache:')
  print(f"Train: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")
else:
  def _to_xy(split_tokens):
    """Run prepare_tensors on one split and pack the returned pair as {'x': ..., 'y': ...}."""
    x, y = prepare_tensors(split_tokens, max_length=MAX_LENGTH)
    return {'x': x, 'y': y}

  # At least one split file is missing: rebuild all three from the tokenized data.
  tokenized = load_tokenized(tokenized_path)
  print(f'Loaded tokenized data: {len(tokenized)}')

  filtered = filter_by_length(tokenized=tokenized, min_length=5)
  print(f'After filter: {len(filtered)} (length > 5 tokens)')

  train_tokenized, val_tokenized, test_tokenized = train_val_test_split(
    data=filtered,
    train_ratio=0.8,
    val_ratio=0.1,
    seed=42,
  )
  print(f'Split - Train: {len(train_tokenized)}, Val: {len(val_tokenized)}, Test: {len(test_tokenized)}')

  train = _to_xy(train_tokenized)
  val = _to_xy(val_tokenized)
  test = _to_xy(test_tokenized)

  # Save each split to its cache file so later runs take the load branch.
  for payload, path in zip((train, val, test), split_paths):
    torch.save(payload, path)

  print(f"Saved:\nTrain: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")

Datasets loaded from cache:
Train: 1195832, Val: 149479, Test: 149480


#### Create datasets and dataloaders

In [None]:
# Wrap each split's 'x' and 'y' tensors so the loaders yield (x, y) batches.
train_dataset = TensorDataset(train['x'], train['y'])
val_dataset = TensorDataset(val['x'], val['y'])
test_dataset = TensorDataset(test['x'], test['y'])

# Seed the training shuffle explicitly so that re-running this cell (or a full
# Restart & Run All) produces the same sequence of batches. Without a generator
# the order depends on the global RNG state at the time the loader is iterated.
SHUFFLE_SEED = 42
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)

train_loader = DataLoader(
  train_dataset,
  batch_size=BATCH_SIZE,
  shuffle=True,
  generator=shuffle_generator,
)
# Validation and test batches keep dataset order (no shuffling).
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#### Check batch shapes

In [10]:
# Pull a single batch from the training loader to inspect it.
first_batch = next(iter(train_loader))
x_batch, y_batch = first_batch

# Report the split sizes, then the shapes of the sampled batch.
split_sizes = f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}"
batch_shapes = f"Batch shapes — X: {x_batch.shape}, Y: {y_batch.shape}"
print(split_sizes)
print(batch_shapes)


Train: 1195832, Val: 149479, Test: 149480
Batch shapes — X: torch.Size([256, 39]), Y: torch.Size([256, 39])
