#### Imports

In [1]:
import re
import os
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import TensorDataset, DataLoader

from src.lstm_model import LSTMModel
from src.eval_lstm import evaluate_lstm
from src.lstm_train import train_lstm
from src.eval_transformer_pipeline import evaluate_transformer
from src.data_utils import (
  load_texts,
  clean_text,
  save_texts,
  tokenize,
  save_tokenized,
  load_tokenized,
  is_ascii,
  filter_by_length,
  train_val_test_split,
  prepare_tensors
)

  from .autonotebook import tqdm as notebook_tqdm


#### Create global constants

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

TOKENIZE_MODEL = 'gpt2'

MAX_LENGTH = 40
BATCH_SIZE = 128
NUM_EPOCHS = 5

PRETRAINED_TRANSFORMER = 'distilgpt2'

#### Clean dataset and save to file or load cleaned dataset

In [3]:
processed_path = 'data/dataset_processed.txt'

if os.path.exists(processed_path):
  cleaned_ascii_texts = load_texts(processed_path)
else:
  texts = load_texts('data/tweets.txt')
  cleaned_texts = [clean_text(t) for t in texts]
  cleaned_ascii_texts = [t for t in cleaned_texts if is_ascii(t)]

  save_texts(cleaned_ascii_texts, 'data/dataset_processed.txt')
  print(f'Dataset cleaned and saved: {len(cleaned_ascii_texts)} lines')

#### Tokenize cleaned dataset if not exists and analyze lengths of samples

In [4]:
tokenized_path = 'data/dataset_tokenized.json'

if not os.path.exists(tokenized_path):
  tokenized = tokenize(cleaned_ascii_texts, model_name=TOKENIZE_MODEL)
  
  save_tokenized(tokenized, tokenized_path)
  
  print(f'Tokenized and saved: {len(tokenized)} samples')

  # analyze lengths of samples
  lengths = [len(t) for t in tokenized]

  print(f'Min: {min(lengths)}, Max: {max(lengths)}, Mean: {np.mean(lengths):.2f}')
  for p in [50, 75, 90, 95, 99]:
    print(f'P{p}: {int(np.percentile(lengths, p))}')
else:
  tokenized = load_tokenized(tokenized_path)
  print(f'Loaded tokenized data: {len(tokenized)}')

Loaded tokenized data: 1596158


#### Split for train, val, test or load if exists

In [5]:
train_path = 'data/train.pt'
val_path = 'data/val.pt'
test_path = 'data/test.pt'

if all(os.path.exists(p) for p in [train_path, val_path, test_path]):
  train = torch.load(train_path)
  val = torch.load(val_path)
  test = torch.load(test_path)

  print('Datasets loaded from cache:')
  print(f"Train: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")
else:
  filtered = filter_by_length(tokenized=tokenized, min_length=5)
  print(f'After filter: {len(filtered)} (length > 5 tokens)')

  train_tokenized, val_tokenized, test_tokenized = train_val_test_split(data=filtered, train_ratio=0.8, val_ratio=0.1, seed=42)
  print(f'Split - Train: {len(train_tokenized)}, Val: {len(val_tokenized)}, Test: {len(test_tokenized)}')

  train_x, train_y = prepare_tensors(train_tokenized, max_length=MAX_LENGTH)
  val_x, val_y = prepare_tensors(val_tokenized, max_length=MAX_LENGTH)
  test_x, test_y = prepare_tensors(test_tokenized, max_length=MAX_LENGTH)

  train = {'x': train_x, 'y': train_y}
  val = {'x': val_x, 'y': val_y}
  test = {'x': test_x, 'y': test_y}

  torch.save(train, train_path)
  torch.save(val, val_path)
  torch.save(test, test_path)
  
  print(f"Saved:\nTrain: {len(train['x'])}, Val: {len(val['x'])}, Test: {len(test['x'])}")

Datasets loaded from cache:
Train: 1195832, Val: 149479, Test: 149480


#### Create datasets and dataloaders

In [6]:
train_dataset = TensorDataset(train['x'], train['y'])
val_dataset = TensorDataset(val['x'], val['y'])
test_dataset = TensorDataset(test['x'], test['y'])

train_loader = DataLoader(
  train_dataset,
  batch_size=BATCH_SIZE,
  shuffle=True,
  num_workers=4,
  pin_memory=True
)
val_loader = DataLoader(
  val_dataset,
  batch_size=BATCH_SIZE,
  shuffle=False,
  num_workers=4,
  pin_memory=True
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#### Check batch shapes

In [7]:
x_batch, y_batch = next(iter(train_loader))
print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
print(f"Batch shapes — X: {x_batch.shape}, Y: {y_batch.shape}")

Train: 1195832, Val: 149479, Test: 149480
Batch shapes — X: torch.Size([128, 39]), Y: torch.Size([128, 39])


#### Device setup, create tokenizer and model

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

tokenizer = AutoTokenizer.from_pretrained(TOKENIZE_MODEL)

model = LSTMModel(
  vocab_size=tokenizer.vocab_size,
  hidden_dim=128,
  num_layers=2,
  dropout=0.2
)
model.to(device)

Using device: cuda


LSTMModel(
  (embedding): Embedding(50257, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=128, out_features=50257, bias=True)
)

#### Check model size

In [9]:
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params:,}')

Total parameters: 13,180,241


#### Training setup

In [10]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=3e-3)

#### Load original tokenized texts for ROUGE evaluation

In [10]:
_, val_texts, _ = train_val_test_split(tokenized, train_ratio=0.8, val_ratio=0.1, seed=42)
print(f'Val texts for ROUGE: {len(val_texts)}')

Val texts for ROUGE: 159616


#### Training loop (for VPS) uses before creating ./src/lstm_train (!)

##### Оставил ячейку, как доказательство обучения на VPS (не запускать повторно)

In [13]:
for epoch in range(NUM_EPOCHS):
  model.train()
  total_loss = 0.0
  t0 = time.time()

  for i, (x_batch, y_batch) in enumerate(train_loader):
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

    optimizer.zero_grad()
    logits = model(x_batch)

    loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))

    loss.backward()
    optimizer.step()
    total_loss += loss.item()

    if (i + 1) % 500 == 0:
      elapsed = time.time() - t0
      print(f'Epoch {epoch + 1} | Batch {i + 1}/{len(train_loader)} | Loss: {loss.item():.4f} | Time: {elapsed:.1f}s')

  total_loss /= len(train_loader)
  model.eval()
  val_loss = 0.0

  with torch.no_grad():
    for x_batch, y_batch in val_loader:
      x_batch, y_batch = x_batch.to(device), y_batch.to(device)
      logits = model(x_batch)
      loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))
      val_loss += loss.item()

  val_loss /= len(val_loader)

  print(f'Epoch {epoch + 1}/{NUM_EPOCHS} | Train loss: {total_loss} | Val loss: {val_loss}')

Epoch 1 | Batch 500/9343 | Loss: 6.8756 | Time: 49.1s
Epoch 1 | Batch 1000/9343 | Loss: 6.2459 | Time: 98.3s
Epoch 1 | Batch 1500/9343 | Loss: 5.9440 | Time: 148.1s
Epoch 1 | Batch 2000/9343 | Loss: 5.6597 | Time: 198.1s
Epoch 1 | Batch 2500/9343 | Loss: 5.7703 | Time: 248.1s
Epoch 1 | Batch 3000/9343 | Loss: 5.4670 | Time: 298.2s
Epoch 1 | Batch 3500/9343 | Loss: 5.4757 | Time: 348.5s
Epoch 1 | Batch 4000/9343 | Loss: 5.5342 | Time: 398.8s
Epoch 1 | Batch 4500/9343 | Loss: 5.4333 | Time: 449.1s
Epoch 1 | Batch 5000/9343 | Loss: 5.3430 | Time: 499.5s
Epoch 1 | Batch 5500/9343 | Loss: 5.3535 | Time: 549.9s
Epoch 1 | Batch 6000/9343 | Loss: 5.3936 | Time: 600.2s
Epoch 1 | Batch 6500/9343 | Loss: 5.2109 | Time: 650.6s
Epoch 1 | Batch 7000/9343 | Loss: 5.3811 | Time: 701.1s
Epoch 1 | Batch 7500/9343 | Loss: 5.2598 | Time: 751.5s
Epoch 1 | Batch 8000/9343 | Loss: 5.1361 | Time: 801.8s
Epoch 1 | Batch 8500/9343 | Loss: 5.3213 | Time: 852.2s
Epoch 1 | Batch 9000/9343 | Loss: 5.0904 | Time: 90

#### Training

##### Для последующих запусков обучения

In [None]:
history = train_lstm(
  model=model,
  train_loader=train_loader,
  val_loader=val_loader,
  tokenizer=tokenizer,
  val_texts=val_texts,
  device=device,
  num_epochs=NUM_EPOCHS
)

#### Save model

In [14]:
torch.save(model.state_dict(), 'models/lstm_model.pt')
print('Model saved')

Model saved


#### Load trained LSTM model

In [11]:
model = LSTMModel(
  vocab_size=tokenizer.vocab_size,
  hidden_dim=128,
  num_layers=2,
  dropout=0.2
)
model.load_state_dict(torch.load('models/lstm_model.pt'))
model.to(device)
model.eval()

print('LSTM trained model loaded')

LSTM trained model loaded


#### Evaluate LSTM trained model on validation set

In [13]:
lstm_results = evaluate_lstm(
  model=model,
  tokenizer=tokenizer,
  texts=val_texts,
  device=device,
  max_samples=200
)

print(f'LSTM ROUGE-1: {lstm_results["rouge1"]:.4f}')
print(f'LSTM ROUGE-2: {lstm_results["rouge2"]:.4f}')
print(f'LSTM ROUGE-L: {lstm_results["rougeL"]:.4f}')

print('\nExamples:')
for i, ex in enumerate(lstm_results['examples']):
  print(f'\n--- Example {i + 1} ---')
  print(f'Input: {ex["input"]}')
  print(f'Target: {ex["target"]}')
  print(f'Prediction: {ex["prediction"]}')

Evaluating LSTM: 100%|██████████| 200/200 [00:00<00:00, 238.93it/s]

LSTM ROUGE-1: 0.0879
LSTM ROUGE-2: 0.0172
LSTM ROUGE-L: 0.0873

Examples:

--- Example 1 ---
Input: yay! someone got
Target:  it!
Prediction:  a new

--- Example 2 ---
Input: we were forced to turn it down! my friends mom
Target:  is a spoil sport
Prediction:  is going to be

--- Example 3 ---
Input: being honest, i'm exhausted my brain is dry and i miss my
Target:  fiance. just being honest
Prediction:  friends. i'm so

--- Example 4 ---
Input: on my way to race hector hes gonna lose! and also going to the
Target:  batting cages im gonna lose!
Prediction:  airport. i'm so excited

--- Example 5 ---
Input: has cut all her
Target:  nails off
Prediction:  hair.





#### Load pretrained transformer

In [14]:
gpt_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TRANSFORMER)
gpt_model = AutoModelForCausalLM.from_pretrained(PRETRAINED_TRANSFORMER)
gpt_model.to(device)
gpt_model.eval()

print(f'Device: {device}')
print(f'{PRETRAINED_TRANSFORMER} parameters: {sum(p.numel() for p in gpt_model.parameters()):,}')

Device: cuda
distilgpt2 parameters: 81,912,576


#### Evaluate pretrained transformer

In [16]:
gpt_results = evaluate_transformer(
  model=gpt_model,
  tokenizer=gpt_tokenizer,
  texts=val_texts,
  device=device,
  max_samples=200,
  do_sample=False
)

print(f'{PRETRAINED_TRANSFORMER} ROUGE-1: {gpt_results["rouge1"]:.4f}')
print(f'{PRETRAINED_TRANSFORMER} ROUGE-2: {gpt_results["rouge2"]:.4f}')
print(f'{PRETRAINED_TRANSFORMER} ROUGE-L: {gpt_results["rougeL"]:.4f}')

print('\nExamples:')
for i, ex in enumerate(lstm_results['examples']):
  print(f'\n--- Example {i + 1} ---')
  print(f'Input: {ex["input"]}')
  print(f'Target: {ex["target"]}')
  print(f'Prediction: {ex["prediction"]}')

Evaluating Transformer: 100%|██████████| 200/200 [00:07<00:00, 25.49it/s]

distilgpt2 ROUGE-1: 0.0616
distilgpt2 ROUGE-2: 0.0092
distilgpt2 ROUGE-L: 0.0616

Examples:

--- Example 1 ---
Input: yay! someone got
Target:  it!
Prediction:  a new

--- Example 2 ---
Input: we were forced to turn it down! my friends mom
Target:  is a spoil sport
Prediction:  is going to be

--- Example 3 ---
Input: being honest, i'm exhausted my brain is dry and i miss my
Target:  fiance. just being honest
Prediction:  friends. i'm so

--- Example 4 ---
Input: on my way to race hector hes gonna lose! and also going to the
Target:  batting cages im gonna lose!
Prediction:  airport. i'm so excited

--- Example 5 ---
Input: has cut all her
Target:  nails off
Prediction:  hair.



