<a href="https://colab.research.google.com/github/DebajyotiBindu/AI-text-Generator/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[torch] datasets accelerate -U

import os
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# Plain-text training corpus; the /content prefix suggests a Colab upload
# location (assumption -- confirm the file exists before running).
input_file = "/content/data.txt"
# Directory handed to TrainingArguments and used for the final model/tokenizer save.
output_dir = "./model_gpt2"

# Checkpoint identifier shared by the tokenizer and the model so the two stay matched.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token (presumably because this
# tokenizer defines no pad token of its own -- verify if the checkpoint changes).
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

def group_texts(examples, block_size=128):
    """Concatenate a batch of token sequences and re-split it into equal blocks.

    Every column of the batch is flattened into one long list, the tail that
    does not fill a whole block is dropped, and the remainder is cut into
    consecutive chunks of ``block_size`` items. A ``"labels"`` column is added
    as a copy of the chunked ``"input_ids"`` column.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            It must contain an ``"input_ids"`` column; all columns are assumed
            to have the same flattened length, since the length of the first
            column decides where every column is truncated.
        block_size: Number of items per output chunk. Defaults to 128, the
            value that was previously hard-coded.

    Returns:
        A dict with the same keys as ``examples`` plus ``"labels"``, where each
        value is a list of lists of length ``block_size``. The lists are empty
        when the batch holds fewer than ``block_size`` items.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the trailing remainder so every chunk has exactly block_size items.
    total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Shallow copy: a separate outer list whose chunks are shared with input_ids.
    result["labels"] = result["input_ids"].copy()
    return result

dataset = load_dataset("text", data_files={"train": input_file})

tokenized_dataset = dataset.map(
    lambda x: tokenizer(x["text"]),
    batched=True,
    remove_columns=["text"]
).filter(lambda x: len(x["input_ids"]) > 0)

lm_dataset = tokenized_dataset.map(group_texts, batched=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_total_limit=1,
    logging_steps=50,
    prediction_loss_only=True,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
!zip -r sherlock_gpt2_model.zip ./model_gpt2



📥 Loading Model...


Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

📖 Cleaning and Tokenizing Sherlock...


Map:   0%|          | 0/12306 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12306 [00:00<?, ? examples/s]

Map:   0%|          | 0/9628 [00:00<?, ? examples/s]

🚀 Starting Training on T4 GPU...


Step,Training Loss
50,3.847484
100,3.549266
150,3.478969
200,3.404456
250,3.417148
300,3.338213
350,3.166097
400,3.128868
450,3.134834
500,3.129991


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


zip error: Nothing to do! (try: zip -r sherlock_gpt2_model.zip . -i ./sherlock_gpt2)
✅ DONE! Download 'sherlock_gpt2_model.zip' from the sidebar.


In [None]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
!zip -r sherlock_gpt2_model.zip ./model_gpt2
print("✅ DONE! Download 'model_gpt2_model.zip' from the sidebar.")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  adding: model_gpt2/ (stored 0%)
  adding: model_gpt2/config.json (deflated 52%)
  adding: model_gpt2/checkpoint-861/ (stored 0%)
  adding: model_gpt2/checkpoint-861/config.json (deflated 52%)
  adding: model_gpt2/checkpoint-861/rng_state.pth (deflated 26%)
  adding: model_gpt2/checkpoint-861/tokenizer_config.json (deflated 48%)
  adding: model_gpt2/checkpoint-861/optimizer.pt (deflated 8%)
  adding: model_gpt2/checkpoint-861/training_args.bin (deflated 54%)
  adding: model_gpt2/checkpoint-861/generation_config.json (deflated 25%)
  adding: model_gpt2/checkpoint-861/tokenizer.json (deflated 82%)
  adding: model_gpt2/checkpoint-861/scheduler.pt (deflated 62%)
  adding: model_gpt2/checkpoint-861/trainer_state.json (deflated 70%)
  adding: model_gpt2/checkpoint-861/model.safetensors (deflated 7%)
  adding: model_gpt2/tokenizer_config.json (deflated 48%)
  adding: model_gpt2/training_args.bin (deflated 54%)
  adding: model_gpt2/generation_config.json (deflated 25%)
  adding: model_gpt2/to