In [1]:
!pip install transformers datasets accelerate numpy==1.26.4




In [3]:
import torch
from datasets import Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)

print("Has GPU?:", torch.cuda.is_available(), "| Device:",
      torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Has GPU?: False | Device: CPU


In [4]:
from datasets import Dataset

texts = [
    "Once upon a time, there was a kingdom of code.",
    "Fine-tuning GPT-2 helps in making custom generative models.",
    "Transformers are powerful and efficient for text generation.",
    "This is how we demonstrate training on small custom datasets.",
    "GPT-2 generates coherent and creative text when trained properly.",
]

dataset = Dataset.from_dict({"text": texts})

print(dataset)


Dataset({
    features: ['text'],
    num_rows: 5
})


In [5]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 requires this

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format(type="torch")

print(tokenized_dataset[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

{'input_ids': tensor([ 7454,  2402,   257,   640,    11,   612,   373,   257, 13239,   286,
         2438,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

In [6]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    save_steps=50,
    save_total_limit=1,
)


In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 is a causal LM, not masked LM
)


In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("gpt2-finetuned")
tokenizer.save_pretrained("gpt2-finetuned")
model = GPT2LMHeadModel.from_pretrained("gpt2-finetuned")



  trainer = Trainer(


Step,Training Loss


In [14]:
import os
print(os.listdir("gpt2-finetuned"))


['merges.txt', 'special_tokens_map.json', 'training_args.bin', 'generation_config.json', 'runs', 'checkpoint-2', 'model.safetensors', 'tokenizer_config.json', 'config.json', 'vocab.json']


In [15]:
from transformers import pipeline
import torch

# Load tokenizer (agar pehle se loaded nahi hai)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load fine-tuned model from saved directory
model = GPT2LMHeadModel.from_pretrained("gpt2-finetuned")

# Setup text-generation pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # GPU use karega agar available ho
)

# Aapka prompt
prompt = "Once upon a time"

# Text generate karo
output = generator(prompt, max_length=100, num_return_sequences=1)

print("\nGenerated Text:\n", output[0]["generated_text"])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Text:
 Once upon a time, there was only one man who could make a difference.

The first man in the world to do something for others was a soldier.

He was the legendary soldier of the Third Army. He was the hero of the first World War. His name was Yutaka.

The legend of Yutaka was one of the greatest battles of the war.

He was brought back from the dead when the Third Army was defeated.

The first time he fought on the battlefield, he was a hero.

He fought in the first World War. He was the first soldier who could save the world.

And he was the hero of the second World War.

He was the soldier who could save the world from evil.

The legend of Yutaka was one of the greatest battles of the war.

He was brought back from the dead when the Third Army was defeated.

The first time he fought on the battlefield, he was a hero.

He fought in the first World War. He was the hero of the second World War.

And he was the hero of the third World War.

And it was the hero of the fou