In [8]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoConfig,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from datasets import Dataset, load_dataset
from safetensors.torch import save_file
import json
import os
import shutil

from huggingface_hub import HfApi, Repository, upload_file, create_repo, upload_folder
from huggingface_hub import login as hf_login

In [None]:
# HuggingFace Authentication and Setup
def setup_huggingface_auth(token=None):
    """Authenticate this session against the HuggingFace Hub.

    The access token is never written into the notebook. It is resolved in
    this order:

    1. the ``token`` argument, if given;
    2. the ``HF_TOKEN`` environment variable;
    3. the ``HUGGING_FACE_HUB_TOKEN`` environment variable.

    Args:
        token: Optional access token (create one at
            https://huggingface.co/settings/tokens). Prefer the environment
            variables so the secret does not end up in a saved notebook.

    Returns:
        True if ``hf_login`` completed without raising; False if no token
        could be resolved or if ``hf_login`` raised.
    """
    resolved_token = (
        token
        or os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or ""
    ).strip()

    if not resolved_token:
        # Nothing to log in with: report it instead of sending a placeholder
        # string to the Hub.
        print("HuggingFace authentication failed: no token provided")
        print("Set the HF_TOKEN environment variable or pass token=... explicitly")
        return False

    try:
        hf_login(token=resolved_token)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with the False result.
        print(f"HuggingFace authentication failed: {e}")
        print("Please ensure you have a valid HuggingFace token")
        return False

    print("HuggingFace authentication successful!")
    return True

# Authenticate with HuggingFace and keep the helper's boolean result
# (False means the login attempt failed).
auth_success = setup_huggingface_auth()

In [None]:
model_name = "google/gemma-3-270m"

# NOTE(review): trust_remote_code=True allows code shipped in the Hub repo to
# run locally; confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pick a weight dtype that matches the training setup. The Trainer in this
# notebook is configured with fp16=False / bf16=True, so float16 weights would
# be trained without any loss scaling. Use bfloat16 where the GPU supports it
# and fall back to float32 everywhere else.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    model_dtype = torch.bfloat16
else:
    model_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation='eager'  # Explicitly request the eager attention path
)

# Padding needs a pad token; reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded with eager attention: {model.config.model_type}")

In [None]:
def prepare_dataset(max_length=512, split="train"):
    """Load WikiText-2 (raw) and tokenise it for causal-LM training.

    Uses the module-level ``tokenizer``. Rows whose text is empty after
    stripping whitespace are dropped. Every remaining row is truncated and
    padded to ``max_length`` tokens.

    Args:
        max_length: Sequence length each example is truncated/padded to.
        split: Name of the single dataset split to load (e.g. ``"train"``).

    Returns:
        The tokenised dataset, formatted as torch tensors, with the columns
        ``input_ids``, ``attention_mask`` and ``labels``.
    """
    # Load dataset
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split)

    # Filter out empty texts
    dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)

    def tokenize_function(examples):
        """Tokenise a batch of rows and derive loss labels from the token ids."""
        tokenised = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_attention_mask=True,
            return_tensors=None  # Return lists, not tensors
        )
        # Labels mirror input_ids, except that padded positions (attention
        # mask 0) are set to -100 so they do not contribute to the loss.
        # Masking by attention mask rather than by pad-token id keeps genuine
        # EOS tokens as targets even when pad_token was aliased to eos_token.
        tokenised["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokenised["input_ids"], tokenised["attention_mask"])
        ]
        return tokenised

    # Tokenise the dataset, dropping the raw text column(s)
    tokenised_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Tokenising dataset"
    )

    # Set format for PyTorch
    tokenised_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return tokenised_dataset

# Build the tokenised training set and report how many examples it holds.
train_dataset = prepare_dataset()
print(f"Training samples: {len(train_dataset)}")

In [21]:
output_dir = "./trained_model"
os.makedirs(output_dir, exist_ok=True)

# Only request bf16 mixed precision when the GPU can actually do it;
# otherwise train in full precision instead of failing on this setting.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step per device
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,  # Keep only the two most recent checkpoints
    prediction_loss_only=True,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    fp16=False,  # Disabled FP16 to avoid gradient scaling issues
    bf16=use_bf16,  # BF16 only when the hardware supports it
    gradient_checkpointing=True,
    max_grad_norm=1.0,  # Gradient clipping
    report_to=[],  # No external experiment trackers
    dataloader_num_workers=0
)

# Causal-LM collator (mlm=False): no masked-language-modelling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=None
)

In [None]:
# The tokenizer is passed as `processing_class`, which avoids the deprecation
# warning raised by the older `tokenizer` argument.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop, announcing the start and the end.
print("Starting training...")
trainer.train()
print("Training completed")

In [None]:
final_save_dir = "./gemma_3_270m_trained"
os.makedirs(final_save_dir, exist_ok=True)

# Save model weights (safetensors) and config.json.
# save_pretrained is used instead of safetensors' save_file on the raw
# state_dict: save_file refuses tensors that share memory, which is the case
# for models whose output head is tied to the input embeddings
# (NOTE(review): check model.config.tie_word_embeddings for this checkpoint).
model.save_pretrained(final_save_dir, safe_serialization=True)
safetensors_path = os.path.join(final_save_dir, "model.safetensors")

# Save generation_config.json (written after save_pretrained so these
# sampling defaults are the ones that end up on disk).
# Use an explicit None check: a pad token id of 0 is valid and must not be
# replaced by the EOS id.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

generation_config = {
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": pad_token_id,
    "do_sample": True,
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9
}
with open(os.path.join(final_save_dir, "generation_config.json"), 'w') as f:
    json.dump(generation_config, f, indent=2)

# Save tokenizer files (this creates all tokenizer-related files)
tokenizer.save_pretrained(final_save_dir)

print(f"Model saved to {final_save_dir}")
print("Files created:")
# Sorted so the listing is stable between runs
for file in sorted(os.listdir(final_save_dir)):
    file_path = os.path.join(final_save_dir, file)
    size = os.path.getsize(file_path)
    print(f"  {file}: {size:,} bytes")

In [None]:
# Files the export directory is expected to contain
required_files = [
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "model.safetensors",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer.model",
    "tokenizer_config.json",
]

# Collect every expected file that is absent from final_save_dir
missing_files = [
    name
    for name in required_files
    if not os.path.exists(os.path.join(final_save_dir, name))
]

if not missing_files:
    print("✅ All required files present")
else:
    print(f"Missing files: {missing_files}")