# Импорты и конфиги

In [4]:
from pathlib import Path

import torch
import torch.nn as nn
import yaml
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import pipeline

from src.data_utils import clean_string, create_vocabs, tokenize_text
from src.eval_transformer_pipeline import pretrained_model_rouge
from src.lstm_model import LSTMModel
from src.lstm_train import model_train
from src.next_token_dataset import TrainDataset, ValDataset, collate_fn

In [5]:
# Load the experiment configuration from the YAML file. Later cells read the
# 'lstm_model', 'lstm_train' and 'transformer' sections from it.
with open('configs/config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Seed PyTorch's global RNG once, before any model or loader is created, so
# that a "Restart Kernel -> Run All" is repeatable. An optional top-level
# `seed` key in the config overrides the default of 42.
# NOTE(review): this only seeds torch; if the project modules also draw from
# `random` or numpy, those would need seeding as well - confirm.
SEED = config.get('seed', 42)
torch.manual_seed(SEED);  # trailing ';' hides the returned Generator repr

# Сбор и подготовка данных

In [6]:
# Read the raw dataset: one text per line, with surrounding whitespace
# (including the trailing newline) stripped from every line.
with open('data/short_dataset.txt', 'r', encoding='utf-8') as dataset_file:
    texts = list(map(str.strip, dataset_file))

In [7]:
# Clean every text with clean_string.
# NOTE: `texts` is overwritten here, so re-running this cell applies
# clean_string to already cleaned text; re-run the loading cell first.
texts = [clean_string(text) for text in texts]
# Build the vocabularies (plus the vocabulary size) from the cleaned texts.
vocab, reverse_vocab, vocab_size = create_vocabs(texts)
# Tokenize the cleaned texts using the vocabulary.
indexed_texts = tokenize_text(texts, vocab)

In [8]:
# Persist the cleaned and tokenized dataset: one item of `indexed_texts` per
# line, each formatted through an f-string (i.e. its str() form).
with open('data/dataset_processed.csv', 'w', encoding='utf-8') as processed_file:
    processed_file.writelines(f'{tokens}\n' for tokens in indexed_texts)

In [9]:
# Split into training, validation and test subsets: 20% is held out first and
# then halved, giving roughly 80% / 10% / 10%. shuffle=False keeps the original
# order, so the split itself is deterministic.
train, temp = train_test_split(indexed_texts, test_size=.2, shuffle=False)
val, test = train_test_split(temp, test_size=.5, shuffle=False)

# Save every subset with the same routine (previously three copy-pasted
# blocks): one item per line, formatted through an f-string.
for split_path, split_rows in (
    ('data/train.csv', train),
    ('data/val.csv', val),
    ('data/test.csv', test),
):
    with open(split_path, 'w', encoding='utf-8') as file:
        file.writelines(f'{tokens}\n' for tokens in split_rows)

In [10]:
# Wrap each split in its dataset class; validation and test both use ValDataset.
train_dataset = TrainDataset(train)
val_dataset, test_dataset = ValDataset(val), ValDataset(test)

# Build the DataLoaders. All three share the configured batch size and the
# project's collate_fn; only the training loader shuffles.
batch_size = config['lstm_model']['batch_size']
loader_kwargs = {'batch_size': batch_size, 'collate_fn': collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

# Объявление модели

In [11]:
# Instantiate the LSTM model together with its loss function and optimizer.
model = LSTMModel(vocab_size)
# Targets equal to 0 do not contribute to the loss (presumably 0 is the
# padding index - confirm against collate_fn / create_vocabs).
criterion = nn.CrossEntropyLoss(ignore_index=0)
learning_rate = config['lstm_model']['lr']
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Тренировка модели

In [12]:
# Train the model for the configured number of epochs, then print every item
# of the report returned by model_train.
n_epochs = config['lstm_train']['n_epochs']
result = model_train(
    model,
    optimizer,
    criterion,
    vocab_size,
    train_loader,
    val_loader,
    reverse_vocab,
    n_epochs,
)
print('Результаты модели LSTM')
for entry in result:
    print(entry)

Результаты модели LSTM
Epoch 1, Train Loss: 7.0070, val_ROUGE-1: 0.0174, val_ROUGE-2: 0.0000
Epoch 2, Train Loss: 6.9347, val_ROUGE-1: 0.0000, val_ROUGE-2: 0.0000
Epoch 3, Train Loss: 6.9138, val_ROUGE-1: 0.0190, val_ROUGE-2: 0.0000
Matched cases on validation (generated, target):
Unmatched cases on validation (generated, target):
('how my double to like to wanted that break together so running had me first went the im of i', 'ciscos page shoot')
('another of writers how now hurts for twitterapplication are weak outside i shock siss still dogs delivers something right hes', 'gets btr 2day')
('cereal a in hemotologistgabbers but how to are oo the', 'left for us')
('fault i dang to in mirrors the some fighting office dont up this last time my am', 'in bed with me')
('and roo to tears', 'twitter updating')


In [13]:
# Save the trained model weights (state_dict only).
weights_path = Path('models/lstm_model_weights.pth')
# torch.save does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail right after training has finished.
weights_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), weights_path)

# Использование предобученного трансформера

In [14]:
# Baseline: run the pretrained distilgpt2 text-generation pipeline through
# pretrained_model_rouge on the validation loader and print every item of the
# returned report.
generator = pipeline('text-generation', model='distilgpt2')
transformer_cfg = config['transformer']
result = pretrained_model_rouge(
    generator,
    val_loader,
    reverse_vocab,
    top_k=transformer_cfg['top_k'],
    max_new_tokens=transformer_cfg['max_new_tokens'],
)
print('Результаты предобученного трансформера')
for entry in result:
    print(entry)

Device set to use cpu


Результаты предобученного трансформера
val_ROUGE-1: 0.2224, val_ROUGE-2: 0.0957
Matched cases on validation (generated, target):
Unmatched cases on validation (generated, target):
('sleep i am so happy i will not be able to sleep i had to sleep i had to sleep', 'whoops i got a little too happy doesnt work takes me to ciscos page shoot')
('3 y4yrs amp 2 mos hope u 4yr old 3 y4yrs amp 3 mos hope', 'wow kudos 2 u 4yrs amp 3 mos hope ur 4yr old gets btr 2day')
("issues with it. I have been using it since I was the 11th grade. It's a", 'there is a higher coptodonut ratio today so there arent any left for us')
('down on one of his legs. i just wanna be like to me, i just want to be', 'missing his babe i just want him to come back home and lie in bed with me')
("first game, I've been playing this game for 1 year and I've been playing it all season", 'im getting really lazy with my twitter updating')


# Формулирование выводов