In [1]:
!pip install transformers # Installing the transformers library

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 16.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 57.3MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 57.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=0ce91f864758

In [2]:
import transformers # transformers library
import torch # PyTorch, we are using PyTorch as our library

In [3]:
# We are going to load in GPT-2 using the transformers library
gpt_tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-large')
# Loading in model now...
gpt_model = transformers.GPT2LMHeadModel.from_pretrained('gpt2-large')
# Takes a while to run...

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=764.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3247202234.0, style=ProgressStyle(descr…




In [4]:
## Making a function that will generate text for us ##
def gen_text(prompt_text, tokenizer, model, n_seqs=1, max_length=25):
  # n_seqs is the number of sentences to generate
  # max_length is the maximum length of the sentence
  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
  # We are encoding the text using the gpt tokenizer. The return tensors are of type "pt"
  # since we are using PyTorch, not tensorflow
  output_sequences = model.generate(
      input_ids=encoded_prompt,
      max_length=max_length+len(encoded_prompt), # The model has to generate something, 
      # so we add the length of the original sequence to max_length
      temperature=1.0,
      top_k=0,
      top_p=0.9,
      repetition_penalty=1.2, # To ensure that we dont get repeated phrases
      do_sample=True,
      num_return_sequences=n_seqs
  ) # We feed the encoded input into the model.
  ## Getting the output ##
  if len(output_sequences.shape) > 2:
    output_sequences.squeeze_() # the _ indicates that the operation will be done in-place
  generated_sequences = []
  for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence)
    total_sequence = (
        prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True, )) :]
    )
    generated_sequences.append(total_sequence)
  return generated_sequences

In [5]:
# Lots of syntax errors, but now we can test our model
## One important note: in our function, on line 5, make sure that
# return_tensor is return_tensors, otherwise you will get an error like
# this:
#####
# Another important note: on line 27 of the function, instead of
# clear_up_tokenization_spaces, write clean_up_tokenization_spaces
####
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",gpt_tokenizer,gpt_model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry and targeting the enemy forces.\n\n']

In [6]:
# Sequence length was too small, lets increase it
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",
         gpt_tokenizer,
         gpt_model,
         max_length=100)
# Will take some time......

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. Catching sight of two more giant battleaxes that had been flung into the darkness, Rohan fell back from his rapid advance to reload his double-barreled shotguns. The orc deadclawed them easily, but the other armored great berserker leapt at the beleaguered Merry and Pippin. His axe missed its mark, but the flailing arms seemed to surprise him; he']

In [7]:
# We can demostrate n_seqs here
gen_text("Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry",
         gpt_tokenizer,
         gpt_model,
         max_length=40,
         n_seqs=3) # Will take even longer....

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry.\n\n"Don\'t let your foolish gods tell you that being a part of the Night\'s Watch is easy',
 'Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. They gave them two clear choices: fight or die!\n\nThe orc captains raised their axes in surrendering',
 'Legolas and Gimli advanced on the orcs, raising their weapons with a harrowing war cry. As their final volley flew they were severely wounded by a small group of Riders - two mounted archers against two']

In [None]:
# There are now 3 different outputs
# thats it