In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Adjust tokenizer settings
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def generate_with_included_letters(model, tokenizer, letters, max_length=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    generated = ""
    input_ids = tokenizer.encode(tokenizer.bos_token, return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)

    # Process each letter from the hidden message
    for letter in letters:
        found = False
        attempts = 0
        while not found and attempts < 10:
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length + len(generated.split()),
                temperature=0.8,
                top_k=50,
                top_p=0.95,
                do_sample=True,
                num_return_sequences=1
            )

            if outputs.numel() == 0:
                print(f"No generation was produced for the letter '{letter}'.")
                generated += " " + letter.lower() + "..."
                break

            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_words = text.split()[len(generated.split()):]

            for word in generated_words:
                if letter.lower() in word.lower():
                    generated += " " + word
                    found = True
                    input_ids = tokenizer.encode(generated, return_tensors='pt').to(device)
                    attention_mask = torch.ones(input_ids.shape, device=device)
                    break
            attempts += 1

    # Complete the sentence naturally until a punctuation mark
    sentence_completed = False
    while not sentence_completed:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length + len(generated.split()),
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1
        )

        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        for i, char in enumerate(text[len(generated):]):
            generated += char
            if char in '.?!':
                sentence_completed = True
                break

        if not sentence_completed:
            input_ids = tokenizer.encode(generated, return_tensors='pt').to(device)
            attention_mask = torch.ones(input_ids.shape, device=device)

    # Ensure the first letter of the sentence is capitalized
    generated = generated.strip()
    if generated:
        generated = generated[0].upper() + generated[1:]

    return generated

In [13]:
# Example usage
letters = ['I', 'A', 'F']
sentence = generate_with_included_letters(model, tokenizer, letters)
print(sentence)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This about of course, is a pretty bad sign.
