<a href="https://colab.research.google.com/github/ANUBHAV4646/PRODIGY_GAI_01/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets --quiet

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

# Toy fine-tuning corpus: ten short English sentences, each wrapped in a
# {"text": ...} record (the row format handed to Dataset.from_list).
texts = [
    {"text": sentence}
    for sentence in (
        "Hello, how are you doing today?",
        "I am fine, thank you for asking.",
        "My name is Anubhav, and I love coding.",
        "Today, we are learning how to fine-tune GPT-2.",
        "Let's generate some creative and coherent text.",
        "Machine learning is an exciting field.",
        "Natural language processing enables computers to understand text.",
        "Transformers have revolutionized language models.",
        "Fine-tuning helps adapt pre-trained models to specific tasks.",
        "Keep practicing to improve your AI skills every day.",
    )
]

# Load the pretrained checkpoint and its tokenizer by name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Use the end-of-sequence token as the padding token, and record the same
# id on the model config so tokenizer and model agree on what padding is.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Wrap the in-memory records in a Hugging Face Dataset.
dataset = Dataset.from_list(texts)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Every sequence is truncated or padded to exactly 64 tokens. Relies on
    the module-level ``tokenizer``.

    Args:
        examples: Mapping whose ``"text"`` value is passed to the tokenizer.

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Collator for language-model batches; mlm=False turns off masked-LM mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize the whole dataset, feeding examples to the function in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    save_steps=50,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    warmup_steps=10,
    weight_decay=0.01,
    # Logging; no reporting integrations are listed.
    logging_dir="./logs",
    logging_steps=10,
    report_to=[],
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)
trainer.train()

# Save the model and the tokenizer into the same directory so both can be
# loaded back from one path.
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

print("Training complete and model saved!")


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def generate_text(prompt, model_dir='./gpt2-finetuned', max_length=100, num_return_sequences=1):
    """Sample text continuations of ``prompt`` from a saved GPT-2 checkpoint.

    The tokenizer and model are loaded from ``model_dir`` on every call and
    the model is moved to CUDA when available, otherwise it runs on CPU.

    Args:
        prompt: Text the generated sequences start from.
        model_dir: Path or identifier handed to ``from_pretrained`` for both
            the tokenizer and the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Number of sampled sequences to return.

    Returns:
        A list of decoded strings (special tokens stripped), one per
        generated sequence.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # The tokenizer call below uses padding=True, which raises ValueError when
    # the loaded tokenizer has no pad token. Fall back to EOS, the same id
    # that is passed to generate() as pad_token_id, so checkpoints saved
    # without a pad token still work.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(model_dir)
    model.eval()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Inputs must live on the same device as the model.
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Sampling restricted by top-k and top-p, with repeated 2-grams blocked.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

if __name__ == "__main__":
    # Read a prompt interactively and print each sample, numbered from 1.
    prompt = input("Enter your prompt: ")
    generated_texts = generate_text(prompt, num_return_sequences=1)
    for number, sample in enumerate(generated_texts, start=1):
        print(f"\nGenerated text {number}:\n{sample}")
