#### Imports

In [1]:
import re
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader
from src.lstm_model import LSTMModel
from src.data_utils import (
  load_texts,
  clean_text,
  save_texts,
  tokenize,
  save_tokenized,
  load_tokenized,
  is_ascii,
  filter_by_length,
  train_val_test_split,
  prepare_tensors
)

  from .autonotebook import tqdm as notebook_tqdm


#### Create global constants

In [2]:
# Name of the pretrained tokenizer; used both to tokenize the dataset and to
# size the model vocabulary.
TOKENIZE_MODEL = 'gpt2'

MAX_LENGTH = 40   # passed to prepare_tensors as max_length
BATCH_SIZE = 128  # batch size shared by all DataLoaders
NUM_EPOCHS = 5    # number of passes over the training set

# Seed torch's global RNG so that weight initialisation and DataLoader
# shuffling are repeatable under Restart Kernel -> Run All.
# (Some GPU kernels may still be nondeterministic.)
SEED = 42
torch.manual_seed(SEED)

#### Clean the dataset and save it to a file, or load the cached cleaned dataset

In [3]:
raw_path = 'data/tweets.txt'
processed_path = 'data/dataset_processed.txt'

# Reuse the cached cleaned dataset when it is on disk; otherwise build it from
# the raw file. Delete the cached file to force a rebuild.
if os.path.exists(processed_path):
  cleaned_ascii_texts = load_texts(processed_path)
else:
  texts = load_texts(raw_path)
  cleaned_texts = [clean_text(t) for t in texts]
  # Keep only the cleaned lines that pass the is_ascii() check.
  cleaned_ascii_texts = [t for t in cleaned_texts if is_ascii(t)]

  # Write to the same path the cache check reads, so the two cannot drift apart.
  save_texts(cleaned_ascii_texts, processed_path)
  print(f'Dataset cleaned and saved: {len(cleaned_ascii_texts)} lines')

#### Tokenize the cleaned dataset if no cached tokenization exists, and analyze sample lengths

In [4]:
tokenized_path = 'data/dataset_tokenized.json'

# Tokenize only when no cached result exists; the length statistics below are
# therefore printed only on a fresh tokenization run.
if not os.path.exists(tokenized_path):
  tokenized = tokenize(cleaned_ascii_texts, model_name=TOKENIZE_MODEL)
  save_tokenized(tokenized, tokenized_path)
  print(f'Tokenized and saved: {len(tokenized)} samples')

  # Length of every tokenized sample, then summary statistics over them.
  lengths = list(map(len, tokenized))
  print(f'Min: {min(lengths)}, Max: {max(lengths)}, Mean: {np.mean(lengths):.2f}')

  # Compute all percentiles in one call, then report each truncated to an int.
  percentile_points = [50, 75, 90, 95, 99]
  percentile_values = np.percentile(lengths, percentile_points)
  for p, cutoff in zip(percentile_points, percentile_values):
    print(f'P{p}: {int(cutoff)}')

#### Split into train, val, and test sets, or load the cached splits if they exist

In [5]:
train_path = 'data/train.pt'
val_path = 'data/val.pt'
test_path = 'data/test.pt'
split_paths = (train_path, val_path, test_path)

# The cache is used only when all three split files are present; otherwise
# every split is rebuilt from the tokenized data and saved again.
# NOTE(review): cached tensors were built with whatever MAX_LENGTH was set at
# save time; delete the .pt files after changing it.
if all(map(os.path.exists, split_paths)):
  train, val, test = (torch.load(path) for path in split_paths)

  print('Datasets loaded from cache:')
  print(f"Train: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")
else:
  tokenized = load_tokenized(tokenized_path)
  print(f'Loaded tokenized data: {len(tokenized)}')

  # NOTE(review): the message below says "> 5" while min_length=5 is passed;
  # confirm whether filter_by_length keeps samples of exactly 5 tokens.
  filtered = filter_by_length(tokenized=tokenized, min_length=5)
  print(f'After filter: {len(filtered)} (length > 5 tokens)')

  # 80% train / 10% val, with a fixed seed for the split.
  train_tokenized, val_tokenized, test_tokenized = train_val_test_split(
    data=filtered,
    train_ratio=0.8,
    val_ratio=0.1,
    seed=42,
  )
  print(f'Split - Train: {len(train_tokenized)}, Val: {len(val_tokenized)}, Test: {len(test_tokenized)}')

  # Turn each split into a pair of tensors (stored under 'x' and 'y').
  train_x, train_y = prepare_tensors(train_tokenized, max_length=MAX_LENGTH)
  val_x, val_y = prepare_tensors(val_tokenized, max_length=MAX_LENGTH)
  test_x, test_y = prepare_tensors(test_tokenized, max_length=MAX_LENGTH)

  train = dict(x=train_x, y=train_y)
  val = dict(x=val_x, y=val_y)
  test = dict(x=test_x, y=test_y)

  # Persist each split to its cache file, in train/val/test order.
  for split_data, split_file in zip((train, val, test), split_paths):
    torch.save(split_data, split_file)

  print(f"Saved:\nTrain: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")

Datasets loaded from cache:
Train: 1195832, Val: 149479, Test: 149480


#### Create datasets and dataloaders

In [6]:
# Pair each split's 'x' and 'y' tensors in a TensorDataset.
train_dataset, val_dataset, test_dataset = (
  TensorDataset(split['x'], split['y']) for split in (train, val, test)
)

# Worker and pinning options shared by the train and validation loaders.
worker_opts = dict(num_workers=4, pin_memory=True)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, **worker_opts)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, **worker_opts)
# The test loader keeps DataLoader's defaults for workers and memory pinning.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

#### Check batch shapes

In [7]:
# Draw one batch from the training loader to sanity-check split sizes and
# tensor shapes before training.
sample_batch = next(iter(train_loader))
x_batch, y_batch = sample_batch

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
print(f"Batch shapes — X: {x_batch.shape}, Y: {y_batch.shape}")

Train: 1195832, Val: 149479, Test: 149480
Batch shapes — X: torch.Size([128, 39]), Y: torch.Size([128, 39])


#### Device setup, create tokenizer and model

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

tokenizer = AutoTokenizer.from_pretrained(TOKENIZE_MODEL)

# The model's vocabulary size is taken from the tokenizer.
model_config = dict(
  vocab_size=tokenizer.vocab_size,
  hidden_dim=128,
  num_layers=2,
  dropout=0.2,
)
model = LSTMModel(**model_config)

# Left as the cell's last expression so the notebook displays the model layout.
model.to(device)

Using device: cuda


LSTMModel(
  (embedding): Embedding(50257, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=128, out_features=50257, bias=True)
)

#### Check model size

In [9]:
# Sum the element counts of every tensor yielded by model.parameters().
total_params = sum(map(torch.numel, model.parameters()))
print(f'Total parameters: {total_params:,}')

Total parameters: 13,180,241


#### Training setup

In [10]:
# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=3e-3)

# Targets equal to 0 do not contribute to the loss.
# NOTE(review): presumably 0 is the padding id written by prepare_tensors.
# Confirm this, since id 0 is also an ordinary token in the GPT-2 vocabulary.
criterion = nn.CrossEntropyLoss(ignore_index=0)

#### Training loop

In [11]:
# Train for NUM_EPOCHS epochs. Each epoch runs one optimisation pass over
# train_loader and one gradient-free pass over val_loader, then prints the
# mean of the per-batch losses for both.
for epoch in range(NUM_EPOCHS):
  # ---- training pass ----
  model.train()
  total_loss = 0.0

  for x_batch, y_batch in train_loader:
    # train_loader is built with pin_memory=True, so non_blocking=True lets
    # the host-to-GPU copies run asynchronously instead of stalling the loop.
    # The flag has no effect when device is the CPU.
    x_batch = x_batch.to(device, non_blocking=True)
    y_batch = y_batch.to(device, non_blocking=True)

    optimizer.zero_grad()
    logits = model(x_batch)

    # Collapse all leading dimensions so that each row of logits is scored
    # against one target id.
    loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))

    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  # Average of the per-batch losses over the epoch.
  total_loss /= len(train_loader)

  # ---- validation pass: eval mode, no gradient tracking ----
  model.eval()
  val_loss = 0.0

  with torch.no_grad():
    for x_batch, y_batch in val_loader:
      # val_loader is pinned as well, so the same asynchronous copy applies.
      x_batch = x_batch.to(device, non_blocking=True)
      y_batch = y_batch.to(device, non_blocking=True)
      logits = model(x_batch)
      loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))
      val_loss += loss.item()

  val_loss /= len(val_loader)

  print(f'Epoch {epoch + 1}/{NUM_EPOCHS} | Train loss: {total_loss} | Val loss: {val_loss}')

KeyboardInterrupt: 