In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# PyGPT Training - All-in-One Cell
# Just run this entire cell to train your model!

# ============================================================
# SETUP
# ============================================================
import os
import sys
import time
import pickle
from datasets import load_dataset

from src.training.train import Trainer
from src.tokenizer.tokenizer_class import BPETokenizer

print("âœ“ Imports successful\n")

# ============================================================
# LOAD TOKENIZER
# ============================================================
print("Loading tokenizer...")
with open("artifacts/tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
    tokenizer._ensure_vocab()

print(f"âœ“ Tokenizer loaded (vocab size: {tokenizer.vocab_size})\n")

# ============================================================
# LOAD TRAINING DATA
# ============================================================
print("Loading training data...")
max_lines = 1000
dataset = load_dataset("tatsu-lab/alpaca")
train_data = dataset["train"].select(range(max_lines))

training_texts = []
with open("tokenizer_training_data/alpaca_sample_utf8.txt", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        training_texts.append(line.strip())

print(f"âœ“ Loaded {len(training_texts)} training samples\n")

# ============================================================
# INITIALIZE TRAINER
# ============================================================
print("Initializing trainer...")
trainer = Trainer(
    tokenizer=tokenizer,
    user_input=training_texts,
    lr=1e-4,
    num_blocks=4,  # Stack 4 transformer blocks
    num_heads=8    # 8 attention heads per block
)

print("\n" + "="*60)
print("MODEL SUMMARY")
trainer.print_model_summary()
print("="*60 + "\n")

# ============================================================
# TRAIN MODEL
# ============================================================
print("Starting training...\n")
train_time = time.time()

trainer.train(
    epochs=10,
    batch_size=100,
    checkpoint_path="artifacts/training_logs/jax_training_latest.pkl",
    save_every=10
)

end_train = time.time() - train_time
print(f"\nâœ“ Training complete! Time: {end_train:.2f}s\n")

# ============================================================
# TEST GENERATION
# ============================================================
print("Testing text generation...\n")
prompt = "What is 5+5?"
generated_text = trainer.generate(prompt, max_length=50)

print("="*60)
print(f"Prompt: {prompt}")
print("="*60)
print(f"Generated: {generated_text}")
print("="*60 + "\n")

# ============================================================
# GENERATE FROM MULTIPLE PROMPTS
# ============================================================
print("Generating from multiple prompts...\n")
prompts = [
    "Describe some of the benefits of a vegetarian diet.",
    "What is the capital of France?",
    "Explain machine learning in simple terms."
]

for prompt in prompts:
    print(f"Prompt: {prompt}")
    result = trainer.generate(
        prompt,
        max_length=50,
        temperature=0.7,
        top_k=40,
        repetition_penalty=1.5
    )
    print(f"Generated: {result}")
    print("-" * 60 + "\n")

print("\nðŸŽ‰ All done!")

  from .autonotebook import tqdm as notebook_tqdm


âœ“ Imports successful

Loading tokenizer...
âœ“ Tokenizer loaded (vocab size: 2501)

Loading training data...
âœ“ Loaded 17150 training samples

Initializing trainer...

MODEL SUMMARY
MODEL ARCHITECTURE SUMMARY
Vocabulary Size:      2,501
Embedding Dimension:  256
Max Sequence Length:  256
Number of Blocks:     4
Number of Heads:      8
FFN Hidden Dimension: 1024
PARAMETER COUNTS
Embedding Layer:           705,792 parameters
Attention Layers:        1,048,576 parameters
FeedForward Layers:      2,102,272 parameters
Layer Normalization:         4,096 parameters
Output Layer:              642,757 parameters
------------------------------------------------------------
TOTAL:                   4,503,493 parameters
Model Size (float32): ~17.18 MB

Starting training...

Checkpoints will be saved to: artifacts/training_logs/jax_training_latest_2025-11-12_11-27-56.pkl


                                                            



Training interrupted by user!
Saving checkpoint before exit...
Checkpoint saved to artifacts/training_logs/jax_training_latest_2025-11-12_11-27-56.pkl
Training stopped at epoch 1/10


KeyboardInterrupt: 