In [21]:
import sys
sys.path.append("../src")

from transformers import GPT2LMHeadModel
from yt_encoder import YTEncoder
from tqdm import tqdm

import torch
import numpy as np
import pandas as pd

from pathlib import Path

# Project directories, expressed relative to the notebook's working directory.
PATH_TO_DATA = Path("..") / "data"      # where the generated CSV is written
PATH_TO_MODELS = Path("..") / "models"  # tokenizer file and model weights are read from here

In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Load the tokenizer from the models directory.
tokenizer = YTEncoder.from_pretrained(
    str(PATH_TO_MODELS / "yt.model")
)

# Load the pretrained GPT-2 language model, then move it to the selected device.
model = GPT2LMHeadModel.from_pretrained(
    str(PATH_TO_MODELS / "s_gpt_2/")
)
model = model.to(device)

### Functions for text generation

In [5]:
def choose_from_top(probs, n=5):
    """
    Sample the next token id from the n most probable candidates.

    The n largest entries of probs are renormalized so they sum to 1, and one
    of them is drawn at random according to those renormalized weights.

    Args:
        probs: 1-D numpy array of non-negative weights indexed by token id
            (the caller passes a softmax over the vocabulary).
        n: number of top candidates to sample from. Values larger than
            len(probs) are clamped to len(probs) instead of raising.

    Returns:
        The chosen token id as a plain Python int.

    Raises:
        ValueError: if n < 1, probs is empty, or the selected top weights do
            not have a positive, finite sum.
    """
    probs = np.asarray(probs)
    if n < 1:
        raise ValueError("n must be at least 1")
    if probs.size == 0:
        raise ValueError("probs must not be empty")

    # argpartition raises when asked for more elements than exist, so clamp.
    k = min(n, probs.shape[0])
    ind = np.argpartition(probs, -k)[-k:]

    # Normalize in float64 so that float32 inputs (e.g. a torch softmax) do not
    # depend on np.random.choice's sum-to-one tolerance.
    top_prob = probs[ind].astype(np.float64)
    total = top_prob.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("top probabilities must have a positive, finite sum")
    top_prob = top_prob / total

    choice = np.random.choice(k, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [None]:
def generate_some_text(input_str, text_len = 250, min_len=0, top_n=10, eos_token_id=3):
    """
    Make GPT-2 continue input_str by sampling up to text_len additional tokens.

    Each step runs the model on the whole sequence so far, takes the softmax
    over the logits at the last position and samples the next token from the
    top_n most probable ones via choose_from_top.

    Generation stops early when the end-of-text token is sampled. min_len is
    a soft lower bound: if the end token is sampled during the first min_len
    steps it is discarded and the model samples again, but the step counter
    still advances, so a model that keeps choosing the end token can produce
    fewer than min_len new tokens.

    Args:
        input_str: prompt text; encoded with the notebook-level tokenizer.
        text_len: maximum number of sampling steps.
        min_len: number of initial steps during which the end token is ignored.
        top_n: how many of the most probable tokens to sample from.
        eos_token_id: token id that ends generation (3 in this notebook's
            vocabulary).

    Returns:
        Whatever tokenizer.decode returns for the prompt ids followed by the
        sampled ids (the prompt is part of the decoded text).
    """

    cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to(device)

    model.eval()
    with torch.no_grad():

        for i in range(text_len):
            # Only the logits are needed here; passing labels would make the
            # model compute a loss on every step that is then thrown away.
            logits = model(cur_ids)[0]
            # First (only) batch element, distribution at the last position.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=top_n)
            if next_token_id == eos_token_id:
                # End token came too early: drop it and resample on the next
                # iteration. i still advances, so min_len is not a hard limit.
                if i < min_len:
                    continue
                break    # Stop generation
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)  # Append the sampled token

    # squeeze(0) drops only the batch axis, so a one-token sequence stays 1-D
    # (a bare squeeze() would collapse it to 0-d and break list()).
    output_list = list(cur_ids.squeeze(0).to('cpu').numpy())
    output_text = tokenizer.decode([output_list])
    return output_text


In [11]:
# Replace the pretrained weights with the fine-tuned checkpoint.
# map_location=device lets a checkpoint saved on GPU load on a CPU-only
# machine, matching the cpu fallback chosen when `device` was set.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(
    torch.load(
        str(PATH_TO_MODELS / "gpt2_epoch_9_autostart.pt"),
        map_location=device,
    )
)

<All keys matched successfully>

In [18]:
# Sample 10000 synopses from the prompt; the recorded run took a bit over three hours.
# An explicit loop is used so that the texts generated so far stay in
# `results` if the cell is interrupted.
results = []
for i in tqdm(range(10000)):
    full_text = generate_some_text(" СИНОПСИС: ")
    # Cut the prompt off the front of the decoded text.
    # NOTE(review): the prompt starts with a space but the stripped prefix does
    # not; this is only correct if decode drops that leading space. Confirm.
    synopsis = full_text[len("СИНОПСИС: "):]
    results.append(synopsis)

100%|██████████| 10000/10000 [3:11:56<00:00,  1.15s/it] 


In [None]:
# Collect the generated texts into a single-column table and display it.
output = pd.DataFrame.from_dict({"synopsis": results})
output

In [None]:
# Save the synopses to the data directory, without the row index column.
output.to_csv(
    path_or_buf=PATH_TO_DATA / "synops.csv",
    index=False,
)