In [24]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import pandas as pd
from tqdm import tqdm

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Ensure that the tokenizer uses padding on the left side
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Function to generate words starting with specific letters
def generate_with_start_letters(model, tokenizer, start_letters, max_length=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    generated = ""
    input_ids = tokenizer.encode(tokenizer.bos_token, return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)

    # Process each letter from the hidden message
    for letter in start_letters:
        found = False
        attempts = 0
        while not found and attempts < 10:
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length + len(generated.split()),
                temperature=0.8,
                top_k=50,
                top_p=0.95,
                do_sample=True,
                num_return_sequences=1
            )

            if outputs.numel() == 0:
                print(f"No generation was produced for the letter '{letter}'.")
                generated += " " + letter.lower() + "..."
                break

            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_words = text.split()[len(generated.split()):]
            
            for word in generated_words:
                if word.lower().startswith(letter.lower()):
                    generated += " " + word
                    found = True
                    input_ids = tokenizer.encode(generated, return_tensors='pt').to(device)
                    attention_mask = torch.ones(input_ids.shape, device=device)
                    break
            attempts += 1

    # After embedding the hidden message, generate naturally until a sentence terminator
    sentence_completed = False
    while not sentence_completed:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length + len(generated.split()),
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=1
        )

        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        for i, char in enumerate(text[len(generated):]):
            generated += char
            if char in '.?!':  # Check for end of sentence punctuation
                sentence_completed = True
                break

        # Update input_ids for continuation, if not finished
        if not sentence_completed:
            input_ids = tokenizer.encode(generated, return_tensors='pt').to(device)
            attention_mask = torch.ones(input_ids.shape, device=device)

    return generated.strip()


In [18]:
# Example usage
start_letters = ['I', 'A', 'F']
sentence = generate_with_start_letters(model, tokenizer, start_letters)
print(sentence)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I and found him to be a big part of my life.


In [26]:
def process_movies(csv_path, output_path):
    df = pd.read_csv(csv_path)
    results = []

    # Use tqdm for the progress bar, iterating over DataFrame rows
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Movies"):
        movie_title = row['title']  # Adjust column name as per your CSV
        start_letters = [char for char in movie_title if char.isalpha()]  # Get letters from movie title
        generated_text = generate_with_start_letters(model, tokenizer, start_letters)
        results.append({'Original Title': movie_title, 'Steganographic Sentence': generated_text})

    # Save results to a new CSV file
    output_df = pd.DataFrame(results)
    output_df.to_csv(output_path, index=False)


# Example usage
process_movies('Archive/TMDb_updated.CSV', '1st_letter.csv')

Processing Movies:   0%|          | 0/10000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Movies:   0%|          | 1/10000 [00:04<11:38:35,  4.19s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but r

KeyboardInterrupt: 