In [1]:
!pip install transformers datasets --quiet

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from datasets import load_dataset
import random



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


ModuleNotFoundError: No module named 'torch'

In [None]:
dataset = load_dataset('ag_news', split='train[:1000]')  # Small subset for demo

texts = [item['text'] for item in dataset]
with open("train.txt", "w", encoding="utf-8") as f:
    for line in texts:
        f.write(line + '\n')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def generate_few_shot(prompt, max_length=500):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    sample_output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

In [None]:
few_shot_prompt = f"""Headline: {texts[0]}
Headline: {texts[1]}
Headline: {texts[2]}
Headline:"""

# Test few-shot completion
print("==== FEW-SHOT (NO FINE-TUNING) ====")
print(generate_few_shot(few_shot_prompt))

==== FEW-SHOT (NO FINE-TUNING) ====
Headline: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Headline: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
Headline: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
Headline: The U.S. Economy (Reuters) Reuters - The U.S. economy is expected to be\stronger than expected in the second quarter of the year, according to a report by the Federal Reserve.
Headline: The U.S. Economy (Reuters) Reuters - The U.S. economy is expected to be\stronger than expecte

In [None]:
def load_dataset_for_lm(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset_for_lm("train.txt", tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [None]:
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    save_steps=500,
    logging_steps=100,
    do_train=True,
    do_eval=False,
    prediction_loss_only=True,
    fp16=False
)

finetune_model = GPT2LMHeadModel.from_pretrained('gpt2')

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
finetune_model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")



Step,Training Loss
100,3.9531
200,3.7586
300,3.3674
400,3.1957


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [None]:
# Load your fine-tuned model and tokenizer (do this once at the start after training)
from transformers import GPT2LMHeadModel, GPT2Tokenizer

ft_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
ft_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

def generate_finetuned(prompt, max_length=500):
    input_ids = ft_tokenizer.encode(prompt, return_tensors='pt')
    sample_output = ft_model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        pad_token_id=ft_tokenizer.eos_token_id
    )
    return ft_tokenizer.decode(sample_output[0], skip_special_tokens=True)


In [None]:
# ===================================
# Model Comparison (Context: use after previous notebook sections)
# ===================================

# Let's compare both models over several samples for a fair evaluation.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from IPython.display import display, Markdown

def get_bleu(reference, candidate):
    """
    Compute BLEU score between a reference string and a candidate string.
    """
    reference = [reference.split()]
    candidate = candidate.split()
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, candidate, smoothing_function=smoothie)

# Select several test indices
test_indices = [12, 34, 56, 78, 90]  # change as appropriate for your dataset size

# Store results for display
results = []
for idx in test_indices:
    # Prompt using N-1 context, or just single line
    prompt = "Headline: " + texts[idx] + "\nHeadline:"
    ground_truth = texts[idx + 1]
    few_shot_output = generate_few_shot(prompt)
    fine_tuned_output = generate_finetuned(prompt)
    few_shot_bleu = get_bleu(ground_truth, few_shot_output)
    fine_tuned_bleu = get_bleu(ground_truth, fine_tuned_output)
    results.append({
        'Prompt': prompt,
        'Ground Truth': ground_truth,
        'Few-Shot Output': few_shot_output,
        'Few-Shot BLEU': few_shot_bleu,
        'Fine-Tuned Output': fine_tuned_output,
        'Fine-Tuned BLEU': fine_tuned_bleu
    })

# Display results in a markdown-style table
def display_results_table(results):
    table = "| Index | Prompt | Ground Truth | Few-Shot Output | Few-Shot BLEU | Fine-Tuned Output | Fine-Tuned BLEU |\n"
    table += "|---|---|---|---|---|---|---|\n"
    for i, row in enumerate(results):
        # Show just first 60 chars of prompt/outputs for compactness
        table += f"| {i} | {row['Prompt'][:60]}... | {row['Ground Truth'][:60]}... | {row['Few-Shot Output'][:60]}... | {row['Few-Shot BLEU']:.2f} | {row['Fine-Tuned Output'][:60]}... | {row['Fine-Tuned BLEU']:.2f} |\n"
    display(Markdown(table))

display_results_table(results)


| Index | Prompt | Ground Truth | Few-Shot Output | Few-Shot BLEU | Fine-Tuned Output | Fine-Tuned BLEU |
|---|---|---|---|---|---|---|
| 0 | Headline: Non-OPEC Nations Should Up Output-Purnomo  JAKARTA... | Google IPO Auction Off to Rocky Start  WASHINGTON/NEW YORK (... | Headline: Non-OPEC Nations Should Up Output-Purnomo  JAKARTA... | 0.00 | Headline: Non-OPEC Nations Should Up Output-Purnomo  JAKARTA... | 0.00 |
| 1 | Headline: Indians fill rail skills shortage Network Rail fli... | Steady as they go BEDFORD -- Scientists at NitroMed Inc. hop... | Headline: Indians fill rail skills shortage Network Rail fli... | 0.00 | Headline: Indians fill rail skills shortage Network Rail fli... | 0.00 |
| 2 | Headline: Stoking the Steamroller No other recording artist ... | Coming to The Rescue Got a unique problem? Not to worry: you... | Headline: Stoking the Steamroller No other recording artist ... | 0.00 | Headline: Stoking the Steamroller No other recording artist ... | 0.00 |
| 3 | Headline: 'Madden,' 'ESPN' Football Score in Different Ways ... | Group to Propose New High-Speed Wireless Format (Reuters) Re... | Headline: 'Madden,' 'ESPN' Football Score in Different Ways ... | 0.00 | Headline: 'Madden,' 'ESPN' Football Score in Different Ways ... | 0.01 |
| 4 | Headline: Science, Politics Collide in Election Year (AP) AP... | Building Dedicated to Columbia Astronauts (AP) AP - A former... | Headline: Science, Politics Collide in Election Year (AP) AP... | 0.00 | Headline: Science, Politics Collide in Election Year (AP) AP... | 0.00 |
