<a href="https://colab.research.google.com/github/Codesmith900/LLM/blob/master/storygen1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Short StoryGen**

**Creating dataset**

In [None]:
import csv
import random

# Some pirate-themed words and phrases
pirate_words = [
    "ship", "treasure", "captain", "crew", "ocean", "skull", "rum", "parrot",
    "cannon", "island", "pirate", "sword", "map", "blackbeard", "galleon",
    "plunder", "booty", "flag", "anchor", "sea", "storm", "voyage", "deck",
    "jolly roger", "buccaneer", "harbor", "loot", "compass", "corsair", "mast"
]

sentence_templates = [
    "The {noun} was lost during a fierce {event} at sea.",
    "Captain {name} and his {crew} sailed across the {noun}.",
    "A {noun} full of {loot} was hidden on a secret {place}.",
    "The {crew} gathered around the {noun} to plan the next raid.",
    "With a {noun} in hand, the pirate shouted orders to the {crew}.",
    "The {noun} swayed under the mighty {flag} of the {crew}.",
    "Legends say the {treasure} lies beneath the {noun}.",
    "A fierce {event} forced the crew to abandon the {noun}.",
    "The {pirate} marked the {place} on his {noun} with a red X.",
    "Rum flowed freely as the {crew} celebrated their {loot}."
]

names = [
    "Blackbeard", "Anne Bonny", "Calico Jack", "Bartholomew Roberts",
    "Mary Read", "Captain Kidd", "Henry Morgan", "Charles Vane"
]

places = ["island", "harbor", "cave", "shipwreck", "reef", "bay", "lagoon", "port"]

events = ["storm", "battle", "raid", "mutiny", "storm", "skirmish", "voyage"]

def random_word(word_list):
    return random.choice(word_list)

def generate_sentence():
    template = random.choice(sentence_templates)
    sentence = template.format(
        noun=random_word(pirate_words),
        event=random_word(events),
        name=random_word(names),
        crew=random_word(pirate_words),
        loot=random_word(pirate_words),
        place=random_word(places),
        pirate=random_word(pirate_words),
        treasure=random_word(pirate_words),
        flag=random_word(pirate_words)
    )
    return sentence

def generate_story(min_words=30):
    story = ''
    while len(story.split()) < min_words:
        sentence = generate_sentence()
        story += ' ' + sentence
    return story.strip()

def generate_title():
    # Titles are 2-4 pirate words capitalized
    return ' '.join(random.sample(pirate_words, random.randint(2, 4))).title()

def write_csv(filename='pirate_stories.csv', num_rows=50):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['title', 'story'])
        writer.writeheader()
        for _ in range(num_rows):
            title = generate_title()
            story = generate_story(30)
            writer.writerow({'title': title, 'story': story})

if __name__ == "__main__":
    write_csv()
    print("CSV file 'pirate_stories.csv' generated with 50 pirate-themed titles and stories.")


CSV file 'pirate_stories.csv' generated with 50 pirate-themed titles and stories.


In [None]:
# Step 2: Load and format the dataset
import pandas as pd
from datasets import Dataset

# Load your short story dataset
df = pd.read_csv("/content/pirate_stories.csv")

# Combine prompt and story into one input string
df["text"] = "Prompt: " + df["title"] + "\nStory:" + df["story"]

# Drop the old columns
df = df[["text"]]

# Convert to Hugging Face Dataset object
dataset = Dataset.from_pandas(df)

# Preview
print(dataset[0]["text"])



Prompt: Crew Ship Jolly Roger Cannon
Story:The voyage gathered around the parrot to plan the next raid. The captain was lost during a fierce battle at sea. The buccaneer marked the island on his blackbeard with a red X.


**Tokenization**

In [None]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a padding token by default, so set it to EOS token
tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize each example
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",  # pad to max_length
        truncation=True,
        max_length=512,        # adjust max length as needed
        return_tensors="pt"
    )

# Apply tokenizer to dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check tokenized example keys and shapes
print(tokenized_dataset[0])


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

{'text': 'Prompt: Crew Ship Jolly Roger Cannon\nStory:The voyage gathered around the parrot to plan the next raid. The captain was lost during a fierce battle at sea. The buccaneer marked the island on his blackbeard with a red X.', 'input_ids': [24129, 457, 25, 17652, 16656, 40276, 13637, 20585, 198, 11605, 25, 464, 31505, 9272, 1088, 262, 1582, 10599, 284, 1410, 262, 1306, 9513, 13, 383, 10654, 373, 2626, 1141, 257, 14800, 3344, 379, 5417, 13, 383, 809, 535, 1531, 263, 7498, 262, 7022, 319, 465, 2042, 39433, 351, 257, 2266, 1395, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 502

In [None]:
print(tokenized_dataset[0]["input_ids"])  # token IDs
print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))  # decoded text


[24129, 457, 25, 17652, 16656, 40276, 13637, 20585, 198, 11605, 25, 464, 31505, 9272, 1088, 262, 1582, 10599, 284, 1410, 262, 1306, 9513, 13, 383, 10654, 373, 2626, 1141, 257, 14800, 3344, 379, 5417, 13, 383, 809, 535, 1531, 263, 7498, 262, 7022, 319, 465, 2042, 39433, 351, 257, 2266, 1395, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5025

**Load GPT-2 Model and Train text**

In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load GPT-2 model for causal language modeling
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set padding token (required for GPT-2)
model.config.pad_token_id = tokenizer.eos_token_id

# Data collator for language modeling (no masking for causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-storygen",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    prediction_loss_only=True,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,

    # Disable wandb logging:
    report_to=[],
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()


Step,Training Loss
50,4.1952


TrainOutput(global_step=75, training_loss=3.4604233805338542, metrics={'train_runtime': 1447.0145, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.052, 'total_flos': 39193804800000.0, 'train_loss': 3.4604233805338542, 'epoch': 3.0})

In [None]:
trainer.save_model("gpt2-storygen-final")
tokenizer.save_pretrained("gpt2-storygen-final")


('gpt2-storygen-final/tokenizer_config.json',
 'gpt2-storygen-final/special_tokens_map.json',
 'gpt2-storygen-final/vocab.json',
 'gpt2-storygen-final/merges.txt',
 'gpt2-storygen-final/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Loading the fine-tuned model and tokenizer
model_path = "./gpt2-storygen-final"  # or wherever you saved it
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define a prompt to generate a story from
prompt = "Prompt: A boy discovers a hidden world\nStory:\n"

# Generate story continuation
outputs = generator(
    prompt,
    max_length=200,      # total tokens generated (prompt + continuation)
    num_return_sequences=1,
    do_sample=True,      # enable sampling to get creative results
    top_p=0.6,           # nucleus sampling
    temperature=0.5      # creativity control
)

# Print the generated story
print(outputs[0]['generated_text'])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: A boy discovers a hidden world
Story:
Story:Rum flowed freely as the pirate celebrated their loot. A fierce storm forced the crew to abandon the loot. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced the crew to abandon the crew. A fierce galleon full of loot forced 