# START


In [1]:
import math
import re
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
from datasets import load_dataset
from transformers import (
    TrainerCallback,
    GPT2Config,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AdamW,
    TrainingArguments,
    Trainer,
)

In [2]:
# Load the fine-tuned tokenizer and model from the local checkpoint directory.
finetuned_model_name = 'result_models/gpt2_ft_top_15_genres/movie-plot-generator'
config = AutoConfig.from_pretrained(finetuned_model_name)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
model = AutoModelForCausalLM.from_pretrained(finetuned_model_name, config=config)

# Decide on the device once so the model and the pipeline always agree.
# A hard-coded `device=0` for the pipeline fails on a machine without CUDA.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = model.to(device)

# The pipeline takes a device ordinal: 0 is the first GPU, -1 is the CPU.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if use_cuda else -1,
)

### Using `model.generate` directly

In [None]:
# Sample three candidate plots for one prompt by calling `model.generate` directly.
prompt = '<BOS> <action> The lost village <SEP>'

# The prompt tensor has to live on the same device as the model weights.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

# Top-k (k=50) combined with nucleus (p=0.95) sampling, 3 sequences per prompt.
# `sep_token_id` and `device` are not generation arguments (recent transformers
# releases reject unknown keyword arguments), so they are not passed.
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=512,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
    bos_token_id=tokenizer.bos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

for sample_output in sample_outputs:
    print()
    decoded = tokenizer.decode(sample_output, skip_special_tokens=False)
    # Drop literal '<PAD>' markers from the decoded text before printing.
    print(decoded.replace('<PAD>', ''))

### Using pipeline framework

In [11]:
# Ask the text-generation pipeline for four continuations of a single prompt,
# then print each one followed by a dashed divider.
divider = "\n\n\n------------------------\n"
stories = generator(
    "<BOS> <action> The lost village <SEP>",
    max_length=512,
    num_return_sequences=4,
)
formatted_stories = [story['generated_text'] + divider for story in stories]
print(*formatted_stories)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<BOS> <action> The lost village <SEP> A young girl visits a local beauty fair, and she decides that there are no beauty fairs in the country anyway. She meets a handsome stranger, a famous poet. She goes there to be a poet's son. Finally she meets a rich man. He pays her a hundred dollars to attend a flower show, and she falls for the rich man. In the long run, she will become a beautiful woman and he will marry her. The plot keeps repeating as the film progresses.  June Decisions - Decisions - June 7, 1994.  Film – A series of events at the park where the girls lived in the late 1970s during the Great Depression, during which they lived in a town near Fort Greene, Indiana. In the film, they meet up with their parents on a train from Los Angeles to Detroit. They end up in a hotel in Atlanta. In the end, they meet up with their parents again. It may be the last time that they will meet up with their parents. The story starts in Los Angeles and ends in Atlanta. When the film is released,

# Load test data

In [3]:
# Load the evaluation prompts from "test.txt" as a single Dataset.
# The "text" loader files a lone data file under the "train" split, so that
# split is requested directly instead of renaming it inside a DatasetDict
# and deleting the dict afterwards.
dataset = load_dataset("text", data_files="test.txt", split="train")
dataset

Using custom data configuration default-6da71f273e2fb6a7
Reusing dataset text (C:\Users\Anton\.cache\huggingface\datasets\text\default-6da71f273e2fb6a7\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Dataset({
    features: ['text'],
    num_rows: 1000
})

In [4]:
# Preview the first ten prompts of the test set.
preview_lines = [dataset[row]['text'] + '\n' for row in range(10)]
print(*preview_lines)

<BOS> <drama> <romantic drama> <romance film> <action> Days of Thunder <SEP> 
 <BOS> <romance film> Krishnagudiyil Oru Pranayakalathu <SEP> 
 <BOS> <drama> <comedy> <romance film> Annie Hall <SEP> 
 <BOS> <romance film> <comedy> <indie> Mighty Aphrodite <SEP> 
 <BOS> <thriller> <crime fiction> Phone Booth <SEP> 
 <BOS> <romantic drama> <romance film> <drama> Anna Lucasta <SEP> 
 <BOS> <drama> <comedy> Meet Me Tonight <SEP> 
 <BOS> <thriller> <short film> The Cat Piano <SEP> 
 <BOS> <short film> <family film> Hare-Abian Nights <SEP> 
 <BOS> <drama> Distant Thunder <SEP> 



In [5]:
# Report the CUDA setup. The per-device queries raise on a machine without a
# usable GPU, so they only run when CUDA is actually available.
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    print(torch.cuda.current_device())

    print(torch.cuda.device(0))

    print(torch.cuda.device_count())

    print(torch.cuda.get_device_name(0))
else:
    print('CUDA is not available; running on CPU.')

True
0
<torch.cuda.device object at 0x000001BD23FDC610>
1
GeForce RTX 2060 SUPER


In [4]:
def pipeline_generate_text(text_input):
    """Generate one plot for ``text_input`` with the text-generation pipeline.

    Args:
        text_input: Prompt string, e.g. ``'<BOS> <drama> Title <SEP>'``.

    Returns:
        The pipeline's generated text with the prompt text removed.
    """
    result = generator(text_input, max_length=512, num_return_sequences=1)[0]['generated_text']
    # The prompt is literal text, not a pattern: with `re.sub`, titles holding
    # regex metacharacters such as "(", "?" or "+" either raised `re.error`
    # or silently failed to match, leaving the prompt in the output.
    plot = result.replace(text_input, '')
    return plot
    
def generate_text(text_input):
    """Generate one plot for ``text_input`` by sampling from the model directly.

    Sampling combines top-k (k=50) with nucleus (p=0.95) filtering and is
    called with ``max_length=512``.

    Args:
        text_input: Prompt string, e.g. ``'<BOS> <drama> Title <SEP>'``.

    Returns:
        The decoded sample with the prompt text and ``<EOS>`` markers removed.
    """
    input_ids = tokenizer.encode(text_input, return_tensors='pt')
    input_ids = input_ids.to(device)

    # `sep_token_id` and `device` are not generation arguments (recent
    # transformers releases reject unknown keyword arguments), so they are
    # not passed; the input tensor is already on the target device.
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=512,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )[0]
    result = tokenizer.decode(sample_output, skip_special_tokens=False)
    # Remove the prompt as literal text: treating it as a regex pattern broke
    # on titles containing metacharacters such as "(", "?" or "+".
    plot = result.replace(text_input, '')
    plot = plot.replace('<EOS>', '')
    return plot


In [None]:
# Run every prompt of the test set through the selected generation helper and
# write the prompt/plot pairs to 'test_result.csv'.
use_pipeline = True

text_inputs, generated_plots = [], []

bar = tqdm(range(len(dataset)))

for i in bar:
    text_input = dataset[i]['text']
    if use_pipeline:
        plot = pipeline_generate_text(text_input)
    else:
        plot = generate_text(text_input)
    text_inputs.append(text_input)
    generated_plots.append(plot)

result = pd.DataFrame(
    {
        'input_text': text_inputs,
        'generated_plot': generated_plots,
    }
)
result.to_csv('test_result.csv')

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [6]:
# Print every generated plot, each followed by two newlines.
plots_with_spacing = [plot + '\n\n' for plot in result['generated_plot']]
print(*plots_with_spacing)

 is a series of comic strips featuring a recurring series of misbehavior, violent accidents, and sometimes murder. While many adults have experienced violence during childhood and adolescence, these serial killers also occasionally use gang violence to their advantage. The protagonists of the series are in their teens and twenties, while others are in their teens and twenties.  July July 2009

  is a married man, a married man, who is studying in a religious centre in Pune. During the marriage, he is approached by his stepfather, who refuses to consummate the marriage between his and his wife. This prompts her to give birth to a baby boy, Chandipathi. The infant boy is born in a hospital. He dies within days of birth. Chandipathi's parents are all widows. When the baby boy's parents go back and forth between the three families of his parents at night due to the sudden death of the child, they cannot give him any protection. During the same day, he is told to eat on the doorstep of his 