<a href="https://colab.research.google.com/github/Downforcedemon/AI/blob/main/Train_Daneel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install transformers accelerate peft datasets torch torch_xla -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/69.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/69.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━

In [2]:
# import required libraries
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch_xla.core.xla_model as xm
from datasets import load_dataset
import os
import json



🚀 Step 2: Load and Prepare the Daneel Dialogue Dataset
📌 What We Will Do in This Step

    Load daneel_dialogue_cleaned.json into memory.
    Flatten the structured format (convert the dataset into a simple input-output format).
    Prepare data for tokenization by combining dialogues into structured training samples.

In [4]:
# Path to the cleaned dialogue JSON file
dataset_path = "/content/daneel_dialogue_cleaned.json"

# Load the cleaned dataset
with open(dataset_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Flatten the dataset: every turn inside every entry's "dialogue" list becomes
# one prompt/response pair. The prompt ends with "Daneel: " so that the
# response text can be appended directly after it.
# (The per-entry "context" field is not part of the training sample, so it is
# not read here.)
training_data = [
    {
        "input": f"User: {dialogue['input']}\nDaneel: ",
        "output": dialogue["output"],
    }
    for entry in raw_data
    for dialogue in entry["dialogue"]
]

# Print dataset sample (guarded so an empty dataset reports 0 samples instead
# of failing with an IndexError)
print(f"Total training samples: {len(training_data)}")
if training_data:
    print("Sample entry:\n", training_data[0])


Total training samples: 11012
Sample entry:
 {'input': 'User: v1.0 formatting and some spellchecking isaac asimov the robots of dawn doubleday company, inc\nDaneel: ', 'output': 'garden city, new york by nightfall, inc'}


Step 3: TPU Strategy and Model Loading Setup

In [5]:
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

In [6]:
def get_tpu_strategy(tpu_cores=8, batch_size=1):
    """Acquire the XLA device and derive the batch-size settings for training.

    Args:
        tpu_cores: Number of TPU cores to plan for. This value is NOT detected
            from the hardware; it is a configuration choice (default 8).
        batch_size: Number of examples per core per step (default 1).

    Returns:
        A tuple ``(device, tpu_cores, batch_size, total_batch_size)`` where
        ``total_batch_size == batch_size * tpu_cores``.

    Raises:
        ValueError: If ``tpu_cores`` or ``batch_size`` is smaller than 1.
    """
    # Validate before touching the device so bad settings fail fast.
    if tpu_cores < 1 or batch_size < 1:
        raise ValueError(
            f"tpu_cores and batch_size must be >= 1 (got {tpu_cores}, {batch_size})"
        )

    # Ask torch_xla for the XLA device and report what was returned.
    device = xm.xla_device()
    print(f"XLA device type: {device}")

    total_batch_size = batch_size * tpu_cores
    return device, tpu_cores, batch_size, total_batch_size

In [9]:
def load_model_and_tokenizer(model_name="deepseek-ai/deepseek-coder-7b", device=None):
    """Load the tokenizer and the causal language model for ``model_name``.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
        device: Accepted for the caller's convenience but not used inside this
            function; placement is delegated to ``device_map='auto'``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        EOS token, and the model has gradient checkpointing and input gradients
        enabled.
    """
    print("Loading tokenizer...............")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model...............")
    load_options = {
        "torch_dtype": torch.bfloat16,
        "use_cache": False,
        "low_cpu_mem_usage": True,   # memory-efficient loading
        "device_map": "auto",        # let the library decide device placement
    }
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    # Memory optimizations applied after loading.
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    return model, tokenizer

# Acquire the TPU device first, then load the model and tokenizer. Any failure
# is reported and re-raised so the cell still stops with the original error.
print("\nInitializing TPU training setup...............")
try:
    device, tpu_cores, batch_size, total_batch_size = get_tpu_strategy()
    print(
        "TPU Configuration:\n"
        f"Cores: {tpu_cores}\n"
        f"Batch Size per Core: {batch_size}\n"
        f"Total Batch Size: {total_batch_size}"
    )

    model, tokenizer = load_model_and_tokenizer(device=device)
    print("Model and tokenizer loaded successfully!")

except Exception as exc:
    print(f"Error during TPU setup: {str(exc)}")
    raise


Initializing TPU training setup...............
XLA device type: xla:0
TPU Configuration:
Cores: 8
Batch Size per Core: 4
Total Batch Size: 32
Loading tokenizer...............
Loading model...............


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!


Step 4: Format Training Data for Model Input


1.   Convert text data into tokenized format
2.   Add proper padding and attention masks
3.   Ensure all inputs are the right length (512 tokens)
4.   Prepare data in PyTorch tensor format



In [10]:
# This step converts our training data into a format the model can understand
def format_training_data(training_data, tokenizer, max_length=512):
    """Tokenize prompt/response pairs into fixed-length model inputs.

    Args:
        training_data: Iterable of dicts with ``"input"`` and ``"output"``
            strings; the two are concatenated into one text per example.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` and returning a mapping
            with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every example is padded or truncated to
            (default 512).

    Returns:
        A list with one dict per example, holding ``"input_ids"`` and
        ``"attention_mask"`` with the leading axis removed.
    """
    formatted_data = []

    for item in training_data:
        # The prompt already ends with "Daneel: ", so the response is appended
        # directly to form the full training text.
        full_text = f"{item['input']}{item['output']}"

        encoded = tokenizer(
            full_text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop only the leading axis (presumably the size-1 batch axis added by
        # return_tensors="pt"). A bare .squeeze() would also collapse the
        # sequence axis whenever it has length 1.
        formatted_data.append({
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        })

    return formatted_data

# Tokenize the full training set once
print("Formatting training data for model input.....................")
formatted_training_data = format_training_data(training_data, tokenizer)
print(f"Formatted {len(formatted_training_data)} training examples")

# Sanity check: show both tensors of the first formatted example
print("\nSample formatted input:")
for label, key in (("Input IDs", "input_ids"), ("Attention Mask", "attention_mask")):
    print(f"{label}: {formatted_training_data[0][key]}")


Formatting training data for model input.....................
Formatted 11012 training examples

Sample formatted input:
Input IDs: tensor([32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014,
        32

## Step 5: Create DataLoader for TPU Training
In this step, we:
1. Create a custom Dataset class for our formatted data
2. Set up a DataLoader with TPU optimization
3. Configure proper batch handling for TPU

The DataLoader will:
- Handle batching of our training data
- Shuffle data for better training
- Ensure TPU-compatible data delivery
- Manage memory efficiently during training


In [11]:
from torch.utils.data import Dataset, DataLoader
import torch_xla.distributed.parallel_loader as pl

In [12]:
# Custom Dataset wrapper around the pre-tokenized examples
class DaneelDataset(Dataset):
    """Map-style dataset over a sequence of already-formatted examples.

    Each stored example is expected to be a mapping that contains at least
    the ``'input_ids'`` and ``'attention_mask'`` keys.
    """

    # Keys copied from a stored example into the returned item, in this order.
    _ITEM_KEYS = ("input_ids", "attention_mask")

    def __init__(self, formatted_data):
        """Keep a reference to ``formatted_data`` (no copy is made)."""
        self.data = formatted_data

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a new dict holding only the model inputs of example ``idx``."""
        example = self.data[idx]
        return {key: example[key] for key in self._ITEM_KEYS}

# Create dataset and host-side dataloader
print ("Creating DataLoader for TPU training...........")
train_dataset = DaneelDataset(formatted_training_data)

# The plain DataLoader gets its own name so that `train_loader` always refers
# to the device loader and is never bound to two different kinds of object.
host_loader = DataLoader(
    train_dataset,
    batch_size=total_batch_size,
    shuffle=True,
    num_workers=0,
    # Pinned host memory is a CUDA transfer optimization; batches here are
    # moved to the XLA device by MpDeviceLoader instead.
    pin_memory=False
)

# Create TPU-specific loader that hands each batch to `device`
train_loader = pl.MpDeviceLoader(host_loader, device)

print(f"Created DataLoader with {len(train_loader)} batches per device")

Creating DataLoader for TPU training...........
Created DataLoader with 345 batches per device


## Step 6: Training Configuration Setup
In this step, we:
1. Set up TrainingArguments for the model
2. Configure the optimizer with appropriate learning rate
3. Enable mixed precision training for TPU
4. Set up model checkpointing and logging

Key configurations:
- Using AdamW optimizer (better than standard Adam for transformers)
- Mixed precision training (fp16) for better TPU performance
- Regular model saving and logging for monitoring

In [13]:
from transformers import TrainingArguments
from torch.optim import AdamW

In [17]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/daneel_model",         # Where to save model
    num_train_epochs=3,                         # Number of training epochs
    # Reuse the per-core batch size returned by get_tpu_strategy() instead of
    # repeating the literal, so the two settings cannot drift apart.
    per_device_train_batch_size=batch_size,
    warmup_steps=500,                           # Number of warmup steps
    weight_decay=0.01,                          # Weight decay for regularization
    logging_steps=100,                          # Log every X steps
    save_steps=500,                             # Save model every X steps
    # NOTE(review): the model is loaded with torch_dtype=torch.bfloat16;
    # confirm that fp16 mixed precision is really what is wanted here.
    fp16=True,                                  # Use mixed precision training
)

# Initialize optimizer.
# weight_decay is taken from training_args so the value declared above is the
# one actually applied; previously it matched only because of AdamW's default.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,                # Learning rate
    betas=(0.9, 0.999),     # Adam optimizer parameters
    eps=1e-8,               # Small constant for numerical stability
    weight_decay=training_args.weight_decay,
)

print("Training configuration initialized")
print(f"Training for {training_args.num_train_epochs} epochs")
print(f"Saving model to {training_args.output_dir}")


Training configuration initialized
Training for 3 epochs
Saving model to /content/daneel_model


Step 7: Training loop


Handles batches of data
Computes loss
Updates model weights
Shows progress with tqdm
Saves regular checkpoints


Main training loop:

Runs for specified number of epochs
Saves model after each epoch
Handles interruptions gracefully
Tracks and displays training time


TPU-specific optimizations:

Uses xm.optimizer_step for TPU
Proper device placement
TPU-compatible model saving