In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# 1. PREPARE THE DATA
# Read the whole essay from the .txt file into one string.
essay_path = "NepalCricket.txt"
with open(essay_path, mode="r", encoding="utf-8") as essay_file:
    essay_text = essay_file.read()

In [5]:
# Wrap the essay in the Dataset format the training library works with.
# The single essay is duplicated into ten identical rows under the "text"
# column, so there are several examples available to fill training batches.
repeated_essays = [essay_text for _ in range(10)]
data = {"text": repeated_essays}
dataset = Dataset.from_dict(data)

In [6]:
# 2. INITIALIZE MODEL & TOKENIZER
model_name = "gpt2"

# Load the tokenizer and immediately give it a padding token: the checkpoint
# ships without one, so the 'end of text' token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained language-model weights that will be fine-tuned.
model = GPT2LMHeadModel.from_pretrained(model_name)

In [7]:
# 3. TOKENIZE THE ESSAY
def tokenize_function(examples, block_size=512):
    """Tokenize a batch of texts into fixed-length, right-padded training rows.

    Truncating to ``block_size`` would silently throw away everything after
    the first ``block_size`` tokens of a long essay. Instead, each text is
    split into consecutive ``block_size``-token chunks and the final chunk is
    padded, so the whole essay is used for training. A text that already fits
    in one block yields a single padded row.

    Args:
        examples: Batch dict passed by ``Dataset.map(batched=True)``;
            ``examples["text"]`` is a list of strings.
        block_size: Number of token ids per output row (default 512).

    Returns:
        Dict with "input_ids" and "attention_mask", each a list of
        ``block_size``-long lists. It may hold more rows than the input batch.
    """
    token_lists = tokenizer(examples["text"])["input_ids"]
    pad_id = tokenizer.pad_token_id

    input_ids, attention_mask = [], []
    for ids in token_lists:
        # max(..., 1) still emits one all-padding row for an empty text.
        for start in range(0, max(len(ids), 1), block_size):
            chunk = ids[start:start + block_size]
            n_pad = block_size - len(chunk)
            input_ids.append(chunk + [pad_id] * n_pad)
            # 1 marks real tokens, 0 marks padding.
            attention_mask.append([1] * len(chunk) + [0] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


# One text can expand into several rows, so the original columns are dropped:
# their length would no longer match the tokenized output.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [8]:
# 4. SET TRAINING ARGUMENTS
# The essay is tiny, so the epoch count is raised to let the model see it often.
training_options = {
    "output_dir": "./nepal_cricket_model",
    "overwrite_output_dir": True,
    "num_train_epochs": 20,              # many passes over a very small dataset
    "per_device_train_batch_size": 2,
    "save_steps": 100,
    "logging_steps": 10,
    "learning_rate": 5e-5,               # small step size for gentle fine-tuning
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_options)

In [9]:
# 5. THE TRAINER
# mlm=False selects causal (next-token) language modeling instead of masked LM.
# Per the transformers docs, the collator then builds labels as a copy of the
# inputs with padding ignored; the one-position label shift happens inside the
# model, not in the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Bundle the model, hyperparameters, tokenized data and collator for training.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [10]:
# 6. START FINE-TUNING
print("Starting the journey...")
# Keep the training summary as the cell's last expression so it is displayed.
train_output = trainer.train()
train_output

Starting the journey...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.7049
20,1.0618
30,0.2752
40,0.0727
50,0.0283
60,0.025
70,0.0131
80,0.0118
90,0.011
100,0.0129


TrainOutput(global_step=100, training_loss=0.42167734205722807, metrics={'train_runtime': 641.6679, 'train_samples_per_second': 0.312, 'train_steps_per_second': 0.156, 'total_flos': 52258406400000.0, 'train_loss': 0.42167734205722807, 'epoch': 20.0})

In [11]:
# 7. SAVE YOUR NEW MODEL
# The model and the tokenizer go into the same folder.
save_dir = "./nepal_cricket_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Model graduated!")

Model graduated!
