In [None]:
import sys
import os
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModel
torch.backends.cudnn.benchmark = True
sys.path.append('.')
sys.path.append('src')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from data_preprocess import create_train_val_test_dataloaders_from_text_file
from common_utils import download_file, setup_logging
from lstm_model import LSTMNextTokenPredictor
from train import train_code_completion_model
from eval_transformer_pipeline import validate_pretrained_gpt2_model

#### Все стадии логгируются в папку /logs

## Скачиваем датасет, а затем его делим 80% - 10% - 10% (train - val - test)

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained(
    "distilgpt2",
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)

tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

In [4]:
download_file("code.s3.yandex.net/deep-learning/tweets.txt", "./data/tweets.txt")

Файл ./data/tweets.txt уже существует, пропускаем загрузку


'./data/tweets.txt'

In [5]:
train_loader, val_loader, test_loader = create_train_val_test_dataloaders_from_text_file(
    file_path_to_text_data='data/tweets.txt',
    tokenizer=tokenizer,
    maximum_sequence_length=256,
    minimum_sequence_length=20,
    batch_size_for_training=16,
    batch_size_for_validation=16,
    batch_size_for_testing=16,
    train_split_ratio=0.8,
    validation_split_ratio=0.1,
    test_split_ratio=0.1,
    number_of_dataloader_workers=4,
    random_seed_for_split=42,
    shuffle_training_data=True,
    max_rows_all=50000
)

# ==================== ИНСПЕКЦИЯ ДАТАСЕТА ====================
print("\n" + "="*80)
print("ИНСПЕКЦИЯ ВАЛИДАЦИОННОГО ДАТАСЕТА (первые 10 записей)")
print("="*80 + "\n")

samples_to_inspect = 10
samples_inspected = 0

for batch_index, batch_data in enumerate(val_loader):
    if samples_inspected >= samples_to_inspect:
        break

    input_ids_batch = batch_data['input_ids']
    attention_mask_batch = batch_data['attention_mask']
    labels_batch = batch_data['labels']

    actual_sequence_length = attention_mask_batch.sum(dim=1)
    batch_size = input_ids_batch.size(0)

    print(f"Батч {batch_index + 1}:")
    print(f"  Размер батча: {batch_size}")
    print(f"  Форма input_ids: {input_ids_batch.shape}")
    print(f"  Форма attention_mask: {attention_mask_batch.shape}")
    print(f"  Форма labels: {labels_batch.shape}")
    print()

    for sample_index in range(batch_size):
        if samples_inspected >= samples_to_inspect:
            break

        actual_length = int(actual_sequence_length[sample_index].item())
        context_length = actual_length * 3 // 4
        target_length = actual_length - context_length

        # Декодирование полной последовательности
        full_text = tokenizer.decode(
            input_ids_batch[sample_index, :actual_length].tolist(),
            skip_special_tokens=True
        )

        # Декодирование контекста
        context_text = tokenizer.decode(
            input_ids_batch[sample_index, :context_length].tolist(),
            skip_special_tokens=True
        )

        # Декодирование референса (целевой части)
        reference_text = tokenizer.decode(
            input_ids_batch[sample_index, context_length:actual_length].tolist(),
            skip_special_tokens=True
        )

        print(f"  Сэмпл {samples_inspected + 1}:")
        print(f"    Актуальная длина: {actual_length} токенов")
        print(f"    Длина контекста (75%): {context_length} токенов")
        print(f"    Длина цели (25%): {target_length} токенов")
        print(f"    Полный текст: '{full_text[:200]}{'...' if len(full_text) > 200 else ''}'")
        print(f"    Контекст: '{context_text[:150]}{'...' if len(context_text) > 150 else ''}'")
        print(f"    Референс: '{reference_text[:150]}{'...' if len(reference_text) > 150 else ''}'")
        print(f"    Количество уникальных токенов в сэмпле: {len(set(input_ids_batch[sample_index, :actual_length].tolist()))}")
        print()

        samples_inspected += 1

    print("-" * 80 + "\n")

print("="*80)
print("КОНЕЦ ИНСПЕКЦИИ")
print("="*80 + "\n")

Всего валидных строк в датасете: 50000
Train samples: 40000 (80.0%)
Validation samples: 5000 (10.0%)
Test samples: 5000 (10.0%)

ИНСПЕКЦИЯ ВАЛИДАЦИОННОГО ДАТАСЕТА (первые 10 записей)

Батч 1:
  Размер батча: 16
  Форма input_ids: torch.Size([16, 35])
  Форма attention_mask: torch.Size([16, 35])
  Форма labels: torch.Size([16, 35])

  Сэмпл 1:
    Актуальная длина: 26 токенов
    Длина контекста (75%): 19 токенов
    Длина цели (25%): 7 токенов
    Полный текст: 'is sorry she doesnt tweet as often as others :p but i do have a'
    Контекст: 'is sorry she doesnt tweet as often as others'
    Референс: ' :p but i do have a'
    Количество уникальных токенов в сэмпле: 16

  Сэмпл 2:
    Актуальная длина: 29 токенов
    Длина контекста (75%): 21 токенов
    Длина цели (25%): 8 токенов
    Полный текст: 'and now too many things have changed and i have nothing to say to them they left like the others they always'
    Контекст: 'and now too many things have changed and i have nothing to say to

## После чего создаем нашу модель

In [9]:
gpt2_model = GPT2LMHeadModel.from_pretrained("distilgpt2")

In [7]:
model = LSTMNextTokenPredictor(tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [8]:
try:
    model = torch.compile(model, mode='max-autotune')
    print("✅ Модель скомпилирована с torch.compile()")
except Exception as e:
    print(f"⚠️ torch.compile() не поддерживается: {e}")

✅ Модель скомпилирована с torch.compile()


### Примеры предсказаний пишутся в логи (train.log)

In [None]:
#trained_model = train_code_completion_model(model=model, train_loader=train_loader, val_loader=val_loader, tokenizer=tokenizer, n_epochs=10, lr=0.001, device="cuda" if torch.cuda.is_available() else "cpu")

Epoch 1/10 [Train]: 100%|██████████| 1000/1000 [00:33<00:00, 29.50it/s]
Epoch 1/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 83.56it/s]


Epoch 1/10 | Train Loss: 7.3415 | Val Loss: 7.2220



Epoch 2/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.46it/s]
Epoch 2/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.22it/s]
Epoch 2/10 [ROUGE]:   3%|▎         | 4/125 [00:00<00:13,  8.74it/s]


Epoch 2/10 | Train Loss: 7.1027 | Val Loss: 7.1411 | ROUGE-1: 0.0164 | ROUGE-2: 0.0000



Epoch 3/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.39it/s]
Epoch 3/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.75it/s]


Epoch 3/10 | Train Loss: 7.0653 | Val Loss: 7.1538



Epoch 4/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.34it/s]
Epoch 4/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.19it/s]
Epoch 4/10 [ROUGE]:   3%|▎         | 4/125 [00:00<00:14,  8.56it/s]


Epoch 4/10 | Train Loss: 7.0506 | Val Loss: 7.1714 | ROUGE-1: 0.0237 | ROUGE-2: 0.0000



Epoch 5/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.39it/s]
Epoch 5/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 79.64it/s]


Epoch 5/10 | Train Loss: 7.0382 | Val Loss: 7.1966



Epoch 6/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.22it/s]
Epoch 6/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.26it/s]
Epoch 6/10 [ROUGE]:   3%|▎         | 4/125 [00:00<00:13,  8.89it/s]


Epoch 6/10 | Train Loss: 7.0319 | Val Loss: 7.1857 | ROUGE-1: 0.0183 | ROUGE-2: 0.0000



Epoch 7/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.25it/s]
Epoch 7/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 79.24it/s]


Epoch 7/10 | Train Loss: 7.0229 | Val Loss: 7.1989



Epoch 8/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.38it/s]
Epoch 8/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.51it/s]
Epoch 8/10 [ROUGE]:   3%|▎         | 4/125 [00:00<00:13,  8.85it/s]


Epoch 8/10 | Train Loss: 7.0214 | Val Loss: 7.2020 | ROUGE-1: 0.0319 | ROUGE-2: 0.0000



Epoch 9/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.33it/s]
Epoch 9/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.37it/s]


Epoch 9/10 | Train Loss: 7.0161 | Val Loss: 7.2320



Epoch 10/10 [Train]: 100%|██████████| 1000/1000 [00:29<00:00, 34.32it/s]
Epoch 10/10 [Val]: 100%|██████████| 125/125 [00:01<00:00, 80.21it/s]
Epoch 10/10 [ROUGE]:   3%|▎         | 4/125 [00:00<00:13,  9.12it/s]


Epoch 10/10 | Train Loss: 7.0145 | Val Loss: 7.1972 | ROUGE-1: 0.0412 | ROUGE-2: 0.0000



In [9]:
trained_model = train_code_completion_model(model=model, train_loader=train_loader, val_loader=val_loader, tokenizer=tokenizer, n_epochs=10, lr=0.001, device="cuda" if torch.cuda.is_available() else "cpu")

Epoch 1/10 [Train]: 100%|██████████| 2500/2500 [01:06<00:00, 37.65it/s]
Epoch 1/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 103.91it/s]


Epoch 1/10 | Train Loss: 6.8676 | Val Loss: 6.7507



Epoch 2/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.77it/s]
Epoch 2/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.39it/s]
Epoch 2/10 [ROUGE]:   2%|▏         | 5/313 [00:00<00:34,  9.02it/s]


Epoch 2/10 | Train Loss: 6.7451 | Val Loss: 6.7469 | ROUGE-1: 0.0128 | ROUGE-2: 0.0000



Epoch 3/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.64it/s]
Epoch 3/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 102.17it/s]


Epoch 3/10 | Train Loss: 6.7321 | Val Loss: 6.7487



Epoch 4/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.70it/s]
Epoch 4/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.95it/s]
Epoch 4/10 [ROUGE]:   2%|▏         | 5/313 [00:00<00:31,  9.76it/s]


Epoch 4/10 | Train Loss: 6.7256 | Val Loss: 6.7510 | ROUGE-1: 0.0206 | ROUGE-2: 0.0000



Epoch 5/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.56it/s]
Epoch 5/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.32it/s]


Epoch 5/10 | Train Loss: 6.7202 | Val Loss: 6.7541



Epoch 6/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.64it/s]
Epoch 6/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.42it/s]
Epoch 6/10 [ROUGE]:   2%|▏         | 5/313 [00:00<00:31,  9.64it/s]


Epoch 6/10 | Train Loss: 6.7167 | Val Loss: 6.7554 | ROUGE-1: 0.0178 | ROUGE-2: 0.0000



Epoch 7/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.61it/s]
Epoch 7/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.62it/s]


Epoch 7/10 | Train Loss: 6.7150 | Val Loss: 6.7574



Epoch 8/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.59it/s]
Epoch 8/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.45it/s]
Epoch 8/10 [ROUGE]:   2%|▏         | 5/313 [00:00<00:31,  9.93it/s]


Epoch 8/10 | Train Loss: 6.7118 | Val Loss: 6.7571 | ROUGE-1: 0.0252 | ROUGE-2: 0.0000



Epoch 9/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.62it/s]
Epoch 9/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 101.51it/s]


Epoch 9/10 | Train Loss: 6.7097 | Val Loss: 6.7573



Epoch 10/10 [Train]: 100%|██████████| 2500/2500 [01:04<00:00, 38.63it/s]
Epoch 10/10 [Val]: 100%|██████████| 313/313 [00:03<00:00, 100.81it/s]
Epoch 10/10 [ROUGE]:   2%|▏         | 5/313 [00:00<00:31,  9.92it/s]


Epoch 10/10 | Train Loss: 6.7084 | Val Loss: 6.7578 | ROUGE-1: 0.0361 | ROUGE-2: 0.0000



In [11]:
torch.save(model.state_dict(), 'lstm_model_weights.pth')

In [None]:
#gpt_result = evaluate_distilgpt2_rouge(tokenizer=tokenizer, gpt_model=model_transformers, validation_dataloader=val_loader)

In [10]:
tokenizer.pad_token = tokenizer.eos_token

# Запуск валидации
validation_results = validate_pretrained_gpt2_model(
    tokenizer=tokenizer,
    model=gpt2_model,
    validation_dataloader=val_loader,
    device="cuda" if torch.cuda.is_available() else "cpu",
    num_prediction_samples=5,
    max_generation_length=50,
    calculate_rouge_metrics=True
)

# Доступ к результатам
print(f"\nИтоговые метрики:")
print(f"Validation Loss: {validation_results['validation_loss']:.4f}")
print(f"ROUGE-1: {validation_results['rouge1']:.4f}")
print(f"ROUGE-2: {validation_results['rouge2']:.4f}")

Validation:   0%|          | 0/313 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Validation: 100%|██████████| 313/313 [00:11<00:00, 26.47it/s, loss=5.5385]
ROUGE calculation:   0%|          | 1/313 [00:00<02:19,  2.24it/s, R1=0.0846, R2=0.0018]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
ROUGE calculation:   1%|          | 3/313 [00:00<01:20,  3.85it/s, R1=0.0847, R2=0.0081]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
ROUGE calculation:   1%|▏         | 4/313 [00:01<01:14,  4.18it/s, R1=0.0716, R2=0.0108]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing 


РЕЗУЛЬТАТЫ ВАЛИДАЦИИ ПРЕДОБУЧЕННОЙ МОДЕЛИ
Validation Loss: 6.3300
ROUGE-1: 0.0935
ROUGE-2: 0.0222


Итоговые метрики:
Validation Loss: 6.3300
ROUGE-1: 0.0935
ROUGE-2: 0.0222





## Выводы