# Fine-Tune GPT-2 for Story & Poem Generation

This notebook allows you to fine-tune a GPT-2 model on your own text data using a free GPU on Google Colab.

## 1. Install Dependencies

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers datasets accelerate torch huggingface_hub

## 2. Prepare Your Data

You need a plain-text training file named `train_data.txt`.
1. Click the **Folder icon** in Colab's left sidebar to open the Files panel.
2. Drag and drop your `train_data.txt` file into that panel.

If you don't have one, run the cell below to create a small dummy dataset for testing.

In [None]:
import os

# Create a tiny sample corpus only when no train_data.txt is present, so a
# file the user uploaded is never overwritten.
if not os.path.exists("train_data.txt"):
    print("Creating dummy train_data.txt...")
    # Write with an explicit UTF-8 encoding so the file contents do not depend
    # on the platform's default locale encoding.
    with open("train_data.txt", "w", encoding="utf-8") as f:
        f.write("Write a poem about Rain:\nThe rain falls gently on the roof, a rhythmic tapping, a soothing proof.\n\n")
        f.write("Story about The Sun:\nThe sun shone brightly over the valley. It was a day unlike any other, where the birds sang melodies of old.\n\n")
        f.write("Write a poem about Love:\nLove is a rose, a tender bloom, dispelling shadows, chasing gloom.\n\n")
        # Add more examples here to test!
    print("Created!")
else:
    print("train_data.txt already exists.")

## 3. Training Script

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

class _TokenBlockDataset(Dataset):
    """Map-style dataset that serves a token-id sequence as consecutive blocks.

    The sequence is cut into blocks of at most ``block_size`` ids. The final,
    shorter block is kept (as long as it holds at least two tokens), so a
    corpus smaller than ``block_size`` still produces a training example
    instead of an empty dataset.
    """

    # A causal-LM example needs at least one (input token, next token) pair.
    _MIN_TOKENS_PER_BLOCK = 2

    def __init__(self, token_ids, block_size):
        if block_size < self._MIN_TOKENS_PER_BLOCK:
            raise ValueError(
                f"block_size must be at least {self._MIN_TOKENS_PER_BLOCK}, got {block_size}"
            )
        blocks = (
            token_ids[start:start + block_size]
            for start in range(0, len(token_ids), block_size)
        )
        self.examples = [
            block for block in blocks if len(block) >= self._MIN_TOKENS_PER_BLOCK
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return torch.tensor(self.examples[index], dtype=torch.long)


def fine_tune_gpt2(
    model_name="gpt2",
    output_dir="./finetuned_model",
    train_file="train_data.txt",
    epochs=3,
    batch_size=4,
    block_size=128
):
    """Fine-tune a GPT-2 checkpoint on a plain-text file and save the result.

    Args:
        model_name: Name or path of the pretrained GPT-2 checkpoint to load.
        output_dir: Directory that receives training checkpoints and the
            final model and tokenizer files.
        train_file: Path to the UTF-8 text file used as the training corpus.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        block_size: Maximum number of tokens per training example.

    Raises:
        ValueError: If ``train_file`` does not exist, if ``block_size`` is
            smaller than 2, or if the file contains too few tokens to build
            a single training example.
    """
    # 1. Load Tokenizer and Model
    print(f"Loading {model_name}...")
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # The EOS token doubles as the padding token so that shorter examples
    # can be padded when they are batched by the data collator.
    tokenizer.pad_token = tokenizer.eos_token

    # 2. Prepare Dataset
    print(f"Loading dataset from {train_file}...")
    try:
        with open(train_file, encoding="utf-8") as corpus_file:
            corpus_text = corpus_file.read()
    except FileNotFoundError as exc:
        raise ValueError(f"Input file path {train_file} not found") from exc

    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(corpus_text))
    # Keep the trailing partial block: a small corpus (shorter than
    # block_size tokens) would otherwise yield zero examples and training
    # would fail on an empty dataset.
    train_dataset = _TokenBlockDataset(token_ids, block_size=block_size)
    if len(train_dataset) == 0:
        raise ValueError(
            f"{train_file} contains too little text to build a training example; "
            "add more text and try again."
        )
    print(f"Prepared {len(train_dataset)} training block(s).")

    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

    # 3. Training Arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=500,
        save_total_limit=2,
        prediction_loss_only=True,
        learning_rate=5e-5,
    )

    # 4. Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    # 5. Train
    print("Starting training...")
    trainer.train()

    # 6. Save Model locally
    print(f"Saving model to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Done!")

# Launch fine-tuning, overriding the function's default epochs and batch size.
fine_tune_gpt2(
    batch_size=8,
    epochs=5,
)

## 4. Test the Model

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the model directory saved by training.
generator = pipeline(task="text-generation", model="./finetuned_model")

# Generate a single continuation for a sample prompt and show its text.
prompt = "Write a poem about The Moon:"
output = generator(
    prompt,
    num_return_sequences=1,
    max_length=100,
)
first_sample = output[0]
print(first_sample["generated_text"])

## 5. Upload to Hugging Face (Optional)
To use this model in your deployed backend, you need to push it to the Hugging Face Hub.

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session with the Hugging Face Hub before pushing.
# NOTE(review): presumably prompts for an access token interactively — confirm
# against the huggingface_hub documentation.
notebook_login()

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the model and tokenizer from the local fine-tuned directory.
FINETUNED_DIR = "./finetuned_model"
model = GPT2LMHeadModel.from_pretrained(FINETUNED_DIR)
tokenizer = GPT2Tokenizer.from_pretrained(FINETUNED_DIR)

# Target repository on the Hub.
# Replace 'your-username/my-story-poem-gpt2' with your own '<username>/<model-name>'.
model_name_on_hub = "your-username/my-story-poem-gpt2"

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name_on_hub)

print("Model pushed to https://huggingface.co/" + model_name_on_hub)