In [1]:
import json
import random

def generate_fake_text_generation_dataset(num_samples=1000):
    """
    Generates a fake dataset for text generation fine-tuning.
    Each sample consists of a 'prompt' (a news headline) and a 'completion' (a short article).
    """
    dataset = []
    base_topics = [
        ("Tech Innovation Boosts Economy", "A groundbreaking new technology in renewable energy has been announced, promising to significantly boost the national economy and create thousands of jobs. Experts predict a rapid adoption rate across various industries."),
        ("Local Park Undergoes Major Renovation", "The beloved community park, Green Oasis, is currently undergoing extensive renovations. New playgrounds, walking trails, and picnic areas are being added, with a grand reopening expected next spring."),
        ("New Study Reveals Health Benefits of Sleep", "Researchers have published a new study highlighting the profound benefits of adequate sleep on cognitive function and overall health. They recommend at least 7-9 hours for adults."),
        ("Art Exhibition Draws Record Crowds", "The 'Modern Masterpieces' exhibition at the City Art Museum has seen unprecedented attendance since its opening. Visitors are praising the diverse collection and immersive installations."),
        ("Sustainable Farming Initiative Launched", "A new initiative promoting sustainable farming practices has been launched in rural areas. Farmers will receive support and training to adopt eco-friendly methods for crop cultivation."),
        ("City Council Approves New Public Library", "After months of debate, the City Council has approved the construction of a new state-of-the-art public library downtown. It will feature extensive digital resources and community spaces."),
        ("Breakthrough in Cancer Research Announced", "Scientists at the leading research institute have announced a significant breakthrough in cancer treatment, offering new hope for patients worldwide. Clinical trials are set to begin next year."),
        ("Sports Team Secures Championship Title", "The local basketball team, the City Eagles, clinched the national championship in a thrilling final match. Fans celebrated late into the night, praising the team's dedication and skill."),
        ("Economic Forecast Predicts Steady Growth", "Analysts are predicting a period of steady economic growth for the next quarter, citing strong consumer spending and stable market conditions. Inflation is expected to remain controlled."),
        ("Global Summit Addresses Climate Change", "Leaders from around the world gathered for a crucial summit to discuss urgent actions against climate change. Key agreements were reached on emissions reduction targets and renewable energy investments."),
    ]

    for i in range(num_samples):
        headline, article = random.choice(base_topics)

        # Add slight variations to make them unique
        prompt = f"{headline} (Generated Sample {i+1})"
        completion = f"{article} This update is part of the daily news brief. (Generated Sample {i+1})"

        dataset.append({
            "prompt": prompt,
            "completion": completion
        })

    return dataset

if __name__ == "__main__":
    fake_text_gen_data = generate_fake_text_generation_dataset(num_samples=1000)
    print(f"Generated {len(fake_text_gen_data)} samples.")

    # Save this to a JSONL file, which is preferred by OpenAI for fine-tuning
    # Each line is a separate JSON object
    with open("fake_text_gen_dataset.jsonl", "w", encoding="utf-8") as f:
        for entry in fake_text_gen_data:
            f.write(json.dumps(entry) + "\n")
    print("Dataset saved to fake_text_gen_dataset.jsonl")

    # Print a few samples to verify
    print("\n--- First 3 Samples ---")
    for j in range(min(3, len(fake_text_gen_data))):
        print(f"Sample {j+1}:")
        print(f"  Prompt: {fake_text_gen_data[j]['prompt']}")
        print(f"  Completion: {fake_text_gen_data[j]['completion']}")
        print("-" * 20)


Generated 1000 samples.
Dataset saved to fake_text_gen_dataset.jsonl

--- First 3 Samples ---
Sample 1:
  Prompt: Art Exhibition Draws Record Crowds (Generated Sample 1)
  Completion: The 'Modern Masterpieces' exhibition at the City Art Museum has seen unprecedented attendance since its opening. Visitors are praising the diverse collection and immersive installations. This update is part of the daily news brief. (Generated Sample 1)
--------------------
Sample 2:
  Prompt: Economic Forecast Predicts Steady Growth (Generated Sample 2)
  Completion: Analysts are predicting a period of steady economic growth for the next quarter, citing strong consumer spending and stable market conditions. Inflation is expected to remain controlled. This update is part of the daily news brief. (Generated Sample 2)
--------------------
Sample 3:
  Prompt: Economic Forecast Predicts Steady Growth (Generated Sample 3)
  Completion: Analysts are predicting a period of steady economic growth for the next quar

In [6]:
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import os

# Ensure datasets and transformers are installed:
# pip install datasets transformers torch

# --- 1. Load the dataset ---
def load_text_gen_dataset(file_path="fake_text_gen_dataset.jsonl"):
    """
    Loads the fake text generation dataset from a JSONL file.
    """
    data = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                data.append(json.loads(line))
        print(f"Successfully loaded dataset from {file_path}. Found {len(data)} samples.")
        return data
    except FileNotFoundError:
        print(f"Error: Dataset file not found at {file_path}. Please run the data generation script first.")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from a line in {file_path}. Check file format. Error: {e}")
        return None

# Load your dataset
raw_datasets = load_text_gen_dataset()

if raw_datasets is None:
    exit() # Exit if dataset loading failed

hf_dataset = Dataset.from_list(raw_datasets)
train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})
print(f"Dataset split into: {dataset_dict}")


# --- 2. Load pre-trained GPT-2 model and tokenizer ---
# Using 'gpt2' as the base model.
model_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

print(f"Loaded tokenizer and model: {model_checkpoint}")

# GPT-2 tokenizer does not have a padding token by default.
# For batch processing, a padding token is usually needed.
# We set it to the EOS token for causal language modeling, as suggested by Hugging Face.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    print(f"Set tokenizer.pad_token to tokenizer.eos_token ({tokenizer.eos_token_id})")

# --- 3. Preprocess the dataset ---
# For causal language modeling, we concatenate the prompt and completion.
# The model will then learn to generate the completion given the prompt.
max_length = 256 # Adjust max_length based on your average sequence length

def preprocess_function(examples):
    # Combine prompt and completion. Add EOS token to mark the end of a sequence.
    # The model will learn to generate text until it hits the EOS token.
    texts = [f"{p}{c}{tokenizer.eos_token}" for p, c in zip(examples["prompt"], examples["completion"])]
    return tokenizer(texts, truncation=True, max_length=max_length, padding="max_length")

print("Preprocessing training examples...")
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    num_proc=os.cpu_count(), # Use multiple processes for faster mapping
    remove_columns=["prompt", "completion"] # Remove original text columns
)
print("Preprocessing complete.")

# Data Collator for Language Modeling will handle batching and masking.
# For Causal Language Modeling (mlm=False), it shifts the labels for next token prediction.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # False for Causal Language Modeling
)


# --- 4. Define training arguments ---
output_dir = "./gpt2_fine_tuned_textgen"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch", # Evaluate every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4, # Adjust based on your GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./gpt2_logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=False,
    do_train=True,
    do_eval=True,
    # Gradient accumulation and checkpointing can be useful for larger models/smaller GPUs
    # gradient_accumulation_steps=2, # Effectively doubles batch size
    # gradient_checkpointing=True, # Saves memory but slows down training
)

# --- 5. Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # Use the data collator for language modeling
)

# --- 6. Train the model ---
print("Starting GPT-2 model training...")
trainer.train()
print("GPT-2 Training complete!")

# Save the fine-tuned model and tokenizer
model_save_path = "./fine_tuned_gpt2_textgen"
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Fine-tuned GPT-2 model and tokenizer saved to {model_save_path}")

print("\nYou can now load this model for inference and text generation:")
print(f"""
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline(
    'text-generation',
    model="{model_save_path}",
    tokenizer="{model_save_path}"
)

# Example generation
prompt = "Tech Innovation Boosts Economy (Generated Sample 1)"
generated_text = generator(prompt, max_length=100, num_return_sequences=1,
                           pad_token_id=tokenizer.eos_token_id)[0]['generated_text']
print(f"\\nPrompt: {{prompt}}")
print(f"\\nGenerated: {{generated_text}}")

# You might need to post-process the generated text to remove the prompt itself
# or any unwanted parts like repeated prompts/EOS tokens.
""")


Successfully loaded dataset from fake_text_gen_dataset.jsonl. Found 1000 samples.
Dataset split into: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded tokenizer and model: gpt2
Set tokenizer.pad_token to tokenizer.eos_token (50256)
Preprocessing training examples...


Map (num_proc=2):   0%|          | 0/800 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/200 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
  trainer = Trainer(


Preprocessing complete.
Starting GPT-2 model training...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makashsaini454545[0m ([33makashsaini454545-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.163,0.176269
2,0.1506,0.155068
3,0.1377,0.150793


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


GPT-2 Training complete!
Fine-tuned GPT-2 model and tokenizer saved to ./fine_tuned_gpt2_textgen

You can now load this model for inference and text generation:

from transformers import pipeline

# Load the fine-tuned model
generator = pipeline(
    'text-generation',
    model="./fine_tuned_gpt2_textgen",
    tokenizer="./fine_tuned_gpt2_textgen"
)

# Example generation
prompt = "Tech Innovation Boosts Economy (Generated Sample 1)"
generated_text = generator(prompt, max_length=100, num_return_sequences=1,
                           pad_token_id=tokenizer.eos_token_id)[0]['generated_text']
print(f"\nPrompt: {prompt}")
print(f"\nGenerated: {generated_text}")

# You might need to post-process the generated text to remove the prompt itself
# or any unwanted parts like repeated prompts/EOS tokens.



In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Define the path where your fine-tuned GPT-2 model and tokenizer are saved
model_path = "./fine_tuned_gpt2_textgen"

# --- 1. Load the fine-tuned model and tokenizer ---
print(f"Loading fine-tuned GPT-2 model and tokenizer from: {model_path}")
if not os.path.exists(model_path):
    print(f"Error: Model directory not found at {model_path}. "
          "Please ensure the fine-tuning script completed successfully "
          "and saved the model to this location.")
    exit()

try:
    # Load the tokenizer first to set its pad_token if it was set during training
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Ensure the tokenizer has a padding token defined, typically the EOS token for GPT-2
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
        print(f"Set tokenizer.pad_token to tokenizer.eos_token ({tokenizer.eos_token_id}) for prediction.")

    print("GPT-2 model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Failed to load GPT-2 model or tokenizer. Error: {e}")
    print("This might happen if the model was not saved correctly or if there's a version mismatch.")
    exit()

# --- 2. Create a text generation pipeline ---
# The pipeline handles tokenization, model inference, and decoding the generated tokens.
# You can specify the device if you have a GPU (e.g., device=0 for the first GPU)
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1 # Use GPU if available, else CPU
)

print(f"Text generation pipeline initialized. Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

# --- 3. Test with example prompts ---

print("\n--- Generating Text ---")

# Example 1: Use a prompt similar to what was in your training data (a headline)
prompt1 = "Tech Innovation Boosts Economy (Generated Sample 1)"
print(f"\nPrompt: {prompt1}")
# `max_length` controls how long the generated text can be (including the prompt)
# `num_return_sequences` generates multiple distinct outputs
# `pad_token_id` is important for generation with padding
# `do_sample=True` enables sampling, `top_k` and `temperature` add randomness/creativity
generated_text1 = generator(
    prompt1,
    max_length=100, # Total length of prompt + generated completion
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    temperature=0.7
)[0]['generated_text']
print(f"Generated Text 1:\n{generated_text1}")
print("-" * 30)


# Example 2: Another prompt
prompt2 = "New Study Reveals Health Benefits of Sleep (Generated Sample 2)"
print(f"\nPrompt: {prompt2}")
generated_text2 = generator(
    prompt2,
    max_length=120,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    temperature=0.8
)[0]['generated_text']
print(f"Generated Text 2:\n{generated_text2}")
print("-" * 30)

# Example 3: A prompt that requires the model to 'continue' a sentence
prompt3 = "The Amazon River, located in South America,"
print(f"\nPrompt: {prompt3}")
generated_text3 = generator(
    prompt3,
    max_length=80,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=50,
    temperature=0.9
)[0]['generated_text']
print(f"Generated Text 3:\n{generated_text3}")
print("-" * 30)

print("\nText generation process complete.")
print("\nTips for improving generation quality:")
print("- Experiment with `max_length`, `num_return_sequences`, `temperature`, `top_k`, `top_p` parameters.")
print("- For best results, the prompt should ideally end with the same separator used during training (if any).")
print("- You might need to post-process the generated text to remove the input prompt or any trailing special tokens like `EOS`.")



Loading fine-tuned GPT-2 model and tokenizer from: ./fine_tuned_gpt2_textgen


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


GPT-2 model and tokenizer loaded successfully!
Text generation pipeline initialized. Using device: CPU

--- Generating Text ---

Prompt: Tech Innovation Boosts Economy (Generated Sample 1)


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Text 1:
Tech Innovation Boosts Economy (Generated Sample 1)A groundbreaking new technology in renewable energy has been announced, promising to significantly boost the national economy and create thousands of jobs. Experts predict a rapid adoption rate across various industries. This update is part of the daily news brief. (Generated Sample 1)A groundbreaking new technology in renewable energy has been announced, promising to significantly boost the national economy and create thousands of jobs. Experts predict a rapid adoption rate across various industries. This update is part of the daily news brief. (Generated Sample 1)Study Reveals Health Benefits of Sleep (Generated Sample 451)Researchers have published a new study highlighting the profound benefits of adequate sleep on cognitive function and overall health. They recommend at least 7-9 hours for adults. This update is part of the daily news brief. (Generated Sample 451)Researchers have published a new study highlighting

Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Text 2:
New Study Reveals Health Benefits of Sleep (Generated Sample 2)Researchers have published a new study highlighting the profound benefits of adequate sleep on cognitive function and overall health. They recommend at least 7-9 hours for adults. This update is part of the daily news brief. (Generated Sample 2)Researchers have published a new study highlighting the profound benefits of adequate sleep on cognitive function and overall health. They recommend at least 7-9 hours for adults. This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news brief. (Generated Sample 2)This update is part of the daily news b