In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import pandas as pd
from tqdm import tqdm

# Fetch the pretrained GPT-2 tokenizer and language-model head by name
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Reuse the end-of-sequence token as the padding token and pad on the left
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [None]:
# Helpers shared by the letter-constrained phase and the completion phase
def _encode_prompt(tokenizer, text, device):
    """Encode ``text`` as a generation prompt on ``device``.

    An empty ``text`` is replaced by the tokenizer's BOS token so the model is
    never handed a zero-length prompt.

    Returns:
        Tuple ``(input_ids, attention_mask)``; the mask is all ones with the
        same shape as ``input_ids``.
    """
    prompt = text if text else tokenizer.bos_token
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


def _sample_text(model, tokenizer, input_ids, attention_mask, max_new_tokens):
    """Sample one continuation and decode the full sequence (prompt + new tokens).

    Returns:
        The decoded text with special tokens removed, or ``None`` when the
        model returned an empty tensor.
    """
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        temperature=0.9,
        top_k=10,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    if outputs.numel() == 0:
        return None
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Function to generate words starting with specific letters
def generate_with_start_letters(model, tokenizer, start_letters, max_new_tokens=50,
                                max_attempts=5, max_completion_rounds=10):
    """Build a sentence whose leading words start with ``start_letters``.

    For each letter, text is sampled from ``model`` and the first newly
    generated word starting with that letter (case-insensitive) is appended to
    the sentence. Once every letter has been handled, sampling continues until
    the first '.', '?' or '!' appears, and the sentence is returned.

    Args:
        model: Object exposing ``to(device)`` and ``generate(...)``.
        tokenizer: Object exposing ``encode``, ``decode``, ``bos_token`` and
            ``eos_token_id``.
        start_letters: Iterable of single-character strings, in message order.
        max_new_tokens: Number of new tokens requested per sampling call.
        max_attempts: Sampling attempts per letter before falling back to a
            placeholder.
        max_completion_rounds: Maximum sampling rounds spent looking for the
            closing punctuation. The prompt grows by up to ``max_new_tokens``
            tokens per round, so this cap also stops it growing without limit.

    Returns:
        The generated sentence, stripped of surrounding whitespace. A letter
        for which no matching word was sampled is represented by the
        placeholder ``"<letter>..."`` so that no letter of the message is
        dropped. If no closing punctuation is produced within
        ``max_completion_rounds`` rounds, the text generated so far is returned.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    generated = ""
    input_ids, attention_mask = _encode_prompt(tokenizer, generated, device)
    first_word = True  # Only the very first word of the sentence is capitalized

    # Phase 1: one word per letter of the hidden message
    for letter in start_letters:
        target = letter.lower()
        chosen = None
        for _ in range(max_attempts):
            text = _sample_text(model, tokenizer, input_ids, attention_mask, max_new_tokens)
            if text is None:
                print(f"No generation was produced for the letter '{letter}'.")
                break

            # Skip the words already accepted; only inspect the new ones
            new_words = text.split()[len(generated.split()):]
            chosen = next((word for word in new_words if word.lower().startswith(target)), None)
            if chosen is not None:
                break

        if chosen is None:
            # Keep the letter in the message even though no word was found
            chosen = target + "..."

        generated += " " + (chosen.capitalize() if first_word else chosen.lower())
        first_word = False
        input_ids, attention_mask = _encode_prompt(tokenizer, generated, device)

    # Phase 2: continue naturally until the sentence is closed
    for _ in range(max_completion_rounds):
        text = _sample_text(model, tokenizer, input_ids, attention_mask, max_new_tokens)
        if text is None:
            break

        continuation = text[len(generated):]
        for end, char in enumerate(continuation, start=1):
            if char in '.?!':  # Stop at the first end-of-sentence punctuation
                return (generated + continuation[:end]).strip()

        # No punctuation yet: extend the prompt with everything sampled so far
        generated += continuation
        input_ids, attention_mask = _encode_prompt(tokenizer, generated, device)

    return generated.strip()

In [None]:
# Demo: hide the letters T, b, e in the first words of a generated sentence
start_letters = list("Tbe")
steg_text = generate_with_start_letters(
    model=model,
    tokenizer=tokenizer,
    start_letters=start_letters,
)
print(steg_text)

In [None]:
def process_movies(csv_path, output_path, title_column='title'):
    """Generate one steganographic sentence per movie title and save them as CSV.

    Reads ``csv_path``, takes the alphabetic characters of each value in
    ``title_column`` as the hidden message, and calls
    ``generate_with_start_letters`` with the notebook-level ``model`` and
    ``tokenizer``. The output has one row per input row, with the columns
    'Original Title' and 'Steganographic Sentence'.

    Args:
        csv_path: Path of the input CSV file.
        output_path: Path the result CSV is written to (without the index).
        title_column: Name of the column holding the titles.

    Raises:
        KeyError: If ``title_column`` is not a column of the input CSV.
    """
    df = pd.read_csv(csv_path)
    if title_column not in df.columns:
        raise KeyError(
            f"Column '{title_column}' not found in {csv_path}; "
            f"available columns: {list(df.columns)}"
        )

    results = []

    # Use tqdm for the progress bar, iterating over the title column only
    for raw_title in tqdm(df[title_column], total=len(df), desc="Processing Movies"):
        if pd.isna(raw_title):
            # Missing title: keep the row for alignment but do not call the model
            results.append({'Original Title': raw_title, 'Steganographic Sentence': ''})
            continue

        # Titles that pandas parsed as numbers are not iterable; use their text form
        movie_title = str(raw_title)
        start_letters = [char for char in movie_title if char.isalpha()]
        generated_text = generate_with_start_letters(model, tokenizer, start_letters)
        results.append({'Original Title': raw_title, 'Steganographic Sentence': generated_text})

    # Fixed column list so the header is written even when there are no rows
    output_df = pd.DataFrame(results, columns=['Original Title', 'Steganographic Sentence'])
    output_df.to_csv(output_path, index=False)


# Run the full pipeline on the sample CSV and write the sentences to disk
process_movies(
    csv_path='Archive/snip.CSV',
    output_path='1st_letter.csv',
)