In [1]:
import sys
import os
import torch
from torch import nn
import pandas as pd
import logging
from pathlib import Path
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModel
sys.path.append('.')
sys.path.append('src')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from data_preprocess import create_train_val_test_dataloaders_from_text_file
from common_utils import download_file, setup_logging
from lstm_model import LSTMNextTokenPredictor
from train import train_code_completion_model
from eval_transformer_pipeline import evaluate_distilgpt2_rouge

#### Все стадии логгируются в папку /logs

## Скачиваем датасет, а затем его делим 80% - 10% - 10% (train - val - test)

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained(
    "distilgpt2",
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)

tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

In [4]:
download_file("code.s3.yandex.net/deep-learning/tweets.txt", "./data/tweets.txt")

Файл ./data/tweets.txt уже существует, пропускаем загрузку


'./data/tweets.txt'

In [5]:
train_loader, val_loader, test_loader = create_train_val_test_dataloaders_from_text_file(
    file_path_to_text_data='data/tweets.txt',
    tokenizer=tokenizer,
    maximum_sequence_length=512,
    batch_size_for_training=8,
    batch_size_for_validation=16,
    batch_size_for_testing=16,
    train_split_ratio=0.8,
    validation_split_ratio=0.1,
    test_split_ratio=0.1,
    number_of_dataloader_workers=2,
    random_seed_for_split=42,
    shuffle_training_data=True
)


print("\n" + "="*50)
print(f"Всего батчей в train: {len(train_loader)}")
print(f"Всего батчей в val: {len(val_loader)}")
print(f"Всего батчей в test: {len(test_loader)}")

padding_side установлен в: 'left'
Всего строк в датасете: 1600498
Train samples: 1280398 (80.0%)
Validation samples: 160049 (10.0%)
Test samples: 160051 (10.0%)

Всего батчей в train: 160050
Всего батчей в val: 10004
Всего батчей в test: 10004


## После чего создаем нашу модель

In [6]:
model_transformers = GPT2LMHeadModel.from_pretrained("distilgpt2")

In [7]:
model = LSTMNextTokenPredictor(tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [8]:
gpt_result = evaluate_distilgpt2_rouge(tokenizer=tokenizer, gpt_model=model_transformers, validation_dataloader=val_loader)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated:
Reference:
end


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated:
Reference:
end


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated:
Reference:
end


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated:
Reference:
end


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated:
Reference:
end


KeyboardInterrupt: 