In [1]:
import sys
import os
import torch
from torch import nn
import pandas as pd
import logging
from pathlib import Path
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Make the working directory and its `src/` folder importable so the local
# modules imported in the next cell can be resolved. Both entries are relative,
# so they are interpreted against the kernel's current working directory.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries onto sys.path.
for extra_import_root in ('.', 'src'):
    if extra_import_root not in sys.path:
        sys.path.append(extra_import_root)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from data_preprocess import create_train_val_test_dataloaders_from_text_file
from common_utils import download_file, setup_logging
from lstm_model import LSTMNextTokenPredictor
from train import train_code_completion_model
from eval_transformer_pipeline import evaluate_distilgpt2_rouge

#### Все стадии логируются в папку /logs

## Скачиваем датасет и делим его в пропорции 80% / 10% / 10% (train / val / test)

In [3]:
# Tokenizer of the pretrained "distilgpt2" checkpoint; reused for building the
# dataloaders and for the ROUGE evaluation further down.
# NOTE(review): GPT-2 tokenizers ship without a pad token — confirm that
# data_preprocess assigns one before padding batches.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="distilgpt2")

In [4]:
# Fetch the tweets corpus into ./data. Per the recorded output, download_file
# skips the transfer when the target file already exists and returns the local
# path, which is why the cell ends with the bare call (the path is displayed).
# The URL carries an explicit https:// scheme so the download also works on a
# clean checkout, where the cached file is absent.
# NOTE(review): assumes download_file passes the URL through unchanged — confirm
# it does not prepend a scheme itself.
download_file("https://code.s3.yandex.net/deep-learning/tweets.txt", "./data/tweets.txt")

Файл ./data/tweets.txt уже существует, пропускаем загрузку


'./data/tweets.txt'

In [None]:
# Build the train / validation / test dataloaders from the raw tweets file.
# NOTE(review): the recorded batch shapes are (batch, 512), i.e. every sample is
# padded to maximum_sequence_length — that looks generous for tweets; confirm.
train_loader, val_loader, test_loader = create_train_val_test_dataloaders_from_text_file(
    # data source and tokenisation
    file_path_to_text_data='data/tweets.txt',
    tokenizer=tokenizer,
    maximum_sequence_length=512,
    # 80 / 10 / 10 split, seeded so the split is reproducible
    train_split_ratio=0.8,
    validation_split_ratio=0.1,
    test_split_ratio=0.1,
    random_seed_for_split=42,
    # batching
    batch_size_for_training=8,
    batch_size_for_validation=16,
    batch_size_for_testing=16,
    shuffle_training_data=True,
    number_of_dataloader_workers=2,
)

# Sanity check: peek at the first batches of the training loader.
divider = "\n" + "=" * 50
print(divider)
print("Проверка train_loader:")
# zip() asks range(2) first, so at most two batches are ever pulled from the loader.
for batch_index, batch_data in zip(range(2), train_loader):
    print(f"\nBatch {batch_index}:")
    print(f"  Input IDs shape: {batch_data['input_ids'].shape}")
    print(f"  Attention mask shape: {batch_data['attention_mask'].shape}")
    print(f"  Labels shape: {batch_data['labels'].shape}")

# Number of batches per split.
print(divider)
print(f"Всего батчей в train: {len(train_loader)}")
print(f"Всего батчей в val: {len(val_loader)}")
print(f"Всего батчей в test: {len(test_loader)}")

Всего строк в датасете: 1600498
Train samples: 1280398 (80.0%)
Validation samples: 160049 (10.0%)
Test samples: 160051 (10.0%)

Проверка train_loader:

Batch 0:
  Input IDs shape: torch.Size([8, 512])
  Attention mask shape: torch.Size([8, 512])
  Labels shape: torch.Size([8, 512])

Batch 1:
  Input IDs shape: torch.Size([8, 512])
  Attention mask shape: torch.Size([8, 512])
  Labels shape: torch.Size([8, 512])

Всего батчей в train_loader: 160050
Всего батчей в val_loader: 10004
Всего батчей в test_loader: 10004


## Затем создаём нашу модель

In [None]:
# Load the pretrained DistilGPT-2 language model.
# It is kept under its own name because the next cell rebinds `model` to the
# LSTM predictor, which would otherwise discard the pretrained weights.
distilgpt2_model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# `model` is still bound here so any cell that reads it right after this one
# sees the same object as before.
model = distilgpt2_model

In [None]:
# LSTM next-token predictor sized to the tokenizer vocabulary, plus the
# optimizer and loss used to train it.
# NOTE(review): tokenizer.vocab_size does not count tokens added after loading
# (e.g. a new pad token) — confirm every token id fits the embedding table.
model = LSTMNextTokenPredictor(
    tokenizer.vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): the default ignore_index is -100 — confirm padded label
# positions use that value so padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss()

In [None]:
# ROUGE evaluation of DistilGPT-2.
# NOTE(review): only the tokenizer is passed; the original call ended with a
# dangling comma, which suggests further arguments (model, dataloader, ...)
# were intended — confirm against evaluate_distilgpt2_rouge's signature.
evaluate_distilgpt2_rouge(tokenizer)