In [1]:
# @title 1. Install Dependencies
!pip install -q torch transformers datasets accelerate huggingface_hub python-dotenv matplotlib liger-kernel 
!pip install -q git+https://github.com/KellerJordan/Muon # Install Muon Optimizer

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m269.6/269.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for muon-optimizer (setup.py) ... [?25l[?25hdone


In [None]:

import glob
import math
import os
import random
import re
import time
from contextlib import nullcontext
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from datasets import load_dataset
from huggingface_hub import HfApi, create_repo
from torch.utils.data import IterableDataset, DataLoader
from transformers import AutoTokenizer

# Optional accelerators: import Muon and Liger when they are installed, otherwise
# record their absence in HAS_MUON / HAS_LIGER so later code can fall back.
# The status messages previously contained mis-encoded (mojibake) characters;
# the intended emoji are restored here.
try:
    from muon import MuonWithAuxAdam
    HAS_MUON = True
    print("✅ Muon Optimizer enabled.")
except ImportError:
    HAS_MUON = False
    print("⚠️ Muon not found, using AdamW.")

try:
    from liger_kernel.transformers import LigerFusedLinearCrossEntropyLoss
    HAS_LIGER = True
    print("✅ Liger Kernel enabled.")
except ImportError:
    HAS_LIGER = False
    print("⚠️ Liger not found, using standard CrossEntropy.")



# ==========================================
# 1. CONFIGURATION
# ==========================================
@dataclass
class ModelArgs:
    """Hyper-parameters and run settings shared by the model, data pipeline and training loop."""

    # --- Architecture ---
    dim: int = 384                  # model (residual stream) width
    n_layers: int = 6               # number of transformer blocks
    n_heads: int = 6                # attention heads; head size is dim // n_heads
    n_kv_heads: int = 2             # NOTE(review): not read by the attention code in this notebook

    # --- MoE Config ---
    num_experts: int = 8            # routed experts per MoE layer
    num_shared_experts: int = 1     # always-active experts per MoE layer
    top_k: int = 2                  # routed experts selected per token
    expert_hidden_dim: int = 1024   # hidden width of each expert MLP

    # --- MLA Config ---
    kv_lora_rank: int = 64          # width of the compressed key/value latent
    q_lora_rank: int = 256          # NOTE(review): not read by the attention code in this notebook
    rope_theta: float = 10000.0
    norm_eps: float = 1e-6

    # --- Training ---
    vocab_size: int = 50257         # GPT-2 Vocab for TinyStories
    max_seq_len: int = 512          # Short context
    # NOTE(review): the run recorded in this notebook hit CUDA OOM on a ~15 GiB GPU
    # with these defaults; lower batch_size if memory is tight.
    batch_size: int = 16            # sequences per micro-batch
    gradient_accumulation_steps: int = 4

    # --- Optimization ---
    lr_decay_iters: int = 50000     # step at which the cosine schedule reaches min_lr
    warmup_iters: int = 1000        # linear warm-up steps
    max_lr: float = 6e-4
    min_lr: float = 6e-5
    weight_decay_optim: float = 0.1
    clip: float = 1.0               # max gradient norm
    total_iters: int = 50000        # optimizer steps to run
    eval_iters: int = 200
    save_checkpoint_iter: int = 1000
    dropout: float = 0.0
    aux_loss_coef: float = 0.01     # weight of the MoE auxiliary loss in the total loss

    # --- System ---
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    use_liger: bool = False # To debug negative loss
    dataset: str = "roneneldan/TinyStories"
    checkpoint_dir: str = "checkpoints_tinystories"
    # Read the token from the environment (evaluated once, when this class is
    # defined) so the secret never has to be typed into the notebook. An unset
    # variable yields "", which disables every Hugging Face upload/download path.
    hf_token: str = os.getenv("HF_TOKEN", "")
    hf_repo_id: str = "FusionCorp/DeepSeek-V3-TinyStories"
    gradient_checkpointing: bool = False

    # Periodic Validation
    val_interval: int = 500         # Run validation every 500 steps
    val_batches: int = 20           # Number of batches to check during validation



‚úÖ Muon Optimizer enabled.
‚úÖ Liger Kernel enabled.


In [7]:
# ==========================================
# 2. MODEL DEFINITION
# ==========================================
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable per-feature gain.

    No mean is subtracted and there is no bias term.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        # Mean of squares along the feature axis; eps keeps rsqrt finite for all-zero rows.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return (x * inv_rms) * self.weight

def apply_rotary_emb(xq, xk, freq_cis):
    """Rotate query and key tensors by the complex factors in ``freq_cis``.

    Adjacent pairs along the last dimension are treated as (real, imag) parts of
    a complex number and multiplied by the factor for their position. Dimension 1
    of ``xq`` is used as the sequence length: only the first ``xq.shape[1]`` rows
    of ``freq_cis`` are applied, broadcast over dimensions 0 and 2.

    Returns the rotated ``(xq, xk)`` with their original shapes and dtypes.
    """
    seq_len = xq.shape[1]
    rotation = freq_cis[:seq_len].view(1, seq_len, 1, -1)

    def _rotate(t):
        # Work in float32 complex space, then cast back to the caller's dtype.
        as_pairs = t.float().reshape(*t.shape[:-1], -1, 2)
        rotated = torch.view_as_complex(as_pairs) * rotation
        return torch.view_as_real(rotated).flatten(3).type_as(t)

    return _rotate(xq), _rotate(xk)

class MLA(nn.Module):
    """Causal self-attention whose keys and values are rebuilt from a low-rank latent.

    The input is projected down to ``kv_lora_rank`` features and back up to
    per-head keys and values; queries use a full-rank projection. Queries and
    keys are RMS-normalised per head and rotated with rotary position factors
    before scaled-dot-product attention.

    Reads from ``args``: ``dim``, ``n_heads``, ``kv_lora_rank``, ``max_seq_len``
    and ``rope_theta`` (falls back to 10000.0 when the attribute is absent).

    Raises:
        ValueError: if ``dim`` is not divisible by ``n_heads`` or the resulting
            head size is odd (rotary embedding pairs up adjacent features).
    """

    def __init__(self, args):
        super().__init__()
        self.dim = args.dim
        self.n_heads = args.n_heads
        self.head_dim = args.dim // args.n_heads
        self.kv_lora_rank = args.kv_lora_rank

        # Fail early with a clear message; otherwise these surface as reshape
        # errors deep inside forward().
        if self.head_dim * args.n_heads != args.dim:
            raise ValueError(f"dim ({args.dim}) must be divisible by n_heads ({args.n_heads})")
        if self.head_dim % 2 != 0:
            raise ValueError(f"head_dim ({self.head_dim}) must be even for rotary embeddings")

        inner_dim = args.n_heads * self.head_dim
        self.wq = nn.Linear(args.dim, inner_dim, bias=False)
        self.w_kv_down = nn.Linear(args.dim, args.kv_lora_rank, bias=False)
        # One up-projection produces keys and values together (hence the factor 2).
        self.w_kv_up = nn.Linear(args.kv_lora_rank, 2 * inner_dim, bias=False)
        self.wo = nn.Linear(inner_dim, args.dim, bias=False)

        self.q_norm = RMSNorm(self.head_dim)
        self.k_norm = RMSNorm(self.head_dim)

        # Honour the configured RoPE base instead of silently using the default.
        rope_theta = getattr(args, "rope_theta", 10000.0)
        self.register_buffer(
            "freqs_cis",
            self.precompute_freqs_cis(self.head_dim, args.max_seq_len, rope_theta),
        )

    def precompute_freqs_cis(self, dim: int, end: int, theta: float = 10000.0):
        """Return an ``(end, dim // 2)`` complex tensor of unit-magnitude rotary factors.

        Entry ``[t, j]`` is ``exp(i * t * theta ** (-2j / dim))``.
        """
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
        t = torch.arange(end, device=freqs.device)
        freqs = torch.outer(t, freqs).float()
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
        return freqs_cis

    def forward(self, x):
        """Apply causal attention to ``x`` of shape ``(B, T, C)`` and return the same shape."""
        B, T, C = x.shape
        xq = self.wq(x).view(B, T, self.n_heads, self.head_dim)
        latent_kv = self.w_kv_down(x)
        kv = self.w_kv_up(latent_kv).view(B, T, 2, self.n_heads, self.head_dim)
        xk, xv = kv.unbind(2)

        xq, xk = self.q_norm(xq), self.k_norm(xk)
        xq, xk = apply_rotary_emb(xq, xk, self.freqs_cis)

        # scaled_dot_product_attention expects (B, heads, T, head_dim).
        out = F.scaled_dot_product_attention(
            xq.transpose(1, 2), xk.transpose(1, 2), xv.transpose(1, 2), is_causal=True
        )
        return self.wo(out.transpose(1, 2).contiguous().view(B, T, C))

class DeepSeekMoE(nn.Module):
    """Mixture-of-experts feed-forward layer with shared and top-k routed experts.

    Every token passes through all shared experts. A linear gate scores the
    routed experts; each token is sent to its ``top_k`` highest-probability
    experts and their outputs are combined with the renormalised gate weights.

    ``forward`` returns ``(output, aux_loss)`` where ``output`` has the input's
    shape and ``aux_loss`` is a scalar load-balancing penalty.
    """

    def __init__(self, args):
        super().__init__()
        self.num_experts = args.num_experts
        self.top_k = args.top_k
        self.num_shared = args.num_shared_experts
        self.gate = nn.Linear(args.dim, args.num_experts, bias=False)

        def make_expert():
            return nn.Sequential(
                nn.Linear(args.dim, args.expert_hidden_dim, bias=False),
                nn.SiLU(),
                nn.Linear(args.expert_hidden_dim, args.dim, bias=False),
            )

        self.shared_experts = nn.ModuleList([make_expert() for _ in range(self.num_shared)])
        self.routed_experts = nn.ModuleList([make_expert() for _ in range(self.num_experts)])

        # Flag each expert's output projection; the model's weight initialiser
        # looks for this attribute to apply a smaller init std.
        for expert in (*self.shared_experts, *self.routed_experts):
            expert[2].res_scale = True

    def forward(self, x):
        original_shape = x.shape
        x_flat = x.reshape(-1, original_shape[-1])

        # 1. Shared experts (always active)
        shared_out = sum(expert(x_flat) for expert in self.shared_experts)

        # 2. Gating and top-k selection
        logits = self.gate(x_flat)
        probs = F.softmax(logits, dim=-1)
        top_k_weights, top_k_indices = torch.topk(probs, self.top_k, dim=-1)

        # Normalize the selected weights so they sum to 1 per token
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)

        # Load-balancing loss: num_experts * sum_i(load_i * mean_prob_i), where
        # load_i is the fraction of routing slots assigned to expert i. Both
        # factors are non-negative, so the loss is >= 0 and equals 1 when the
        # router probabilities are uniform. (The previous formula multiplied by
        # the mean raw logit, which is unbounded and can drive the loss negative.)
        dispatch = F.one_hot(top_k_indices, self.num_experts).sum(dim=1)
        load_fraction = dispatch.to(probs.dtype).mean(dim=0) / self.top_k
        aux_loss = self.num_experts * (load_fraction * probs.mean(dim=0)).sum()

        final_out = torch.zeros_like(x_flat)

        # 3. Run each routed expert once on the tokens that selected it
        for expert_id, expert in enumerate(self.routed_experts):
            slot_hits = top_k_indices == expert_id          # [num_tokens, top_k]
            token_mask = slot_hits.any(dim=-1)
            if not token_mask.any():
                continue

            # topk returns distinct indices per token, so at most one slot hits;
            # the masked sum therefore picks exactly that slot's weight.
            gate_weight = (top_k_weights * slot_hits).sum(dim=-1)[token_mask]
            expert_output = expert(x_flat[token_mask]) * gate_weight.unsqueeze(-1)
            final_out[token_mask] += expert_output.to(final_out.dtype)

        return (shared_out + final_out).view(*original_shape), aux_loss

class Block(nn.Module):
    """One pre-norm transformer layer: attention then MoE, each wrapped in a residual connection.

    ``forward`` returns ``(hidden_states, aux_loss)``; the auxiliary loss comes
    straight from the MoE sub-layer.
    """

    def __init__(self, args):
        super().__init__()
        self.attn_norm = RMSNorm(args.dim)
        self.attn = MLA(args)
        self.ffn_norm = RMSNorm(args.dim)
        self.moe = DeepSeekMoE(args)
        self.dropout = nn.Dropout(args.dropout)

    def forward(self, x):
        # Attention sub-layer with residual
        attn_out = self.attn(self.attn_norm(x))
        h = x + self.dropout(attn_out)

        # MoE sub-layer with residual
        moe_out, aux_loss = self.moe(self.ffn_norm(h))
        out = h + self.dropout(moe_out)
        return out, aux_loss

class DeepSeekV3(nn.Module):
    """Decoder-only language model: token embedding, a stack of ``Block`` layers, final norm, tied LM head.

    ``forward(input_ids)`` returns logits over the vocabulary. The one exception
    is the fused-loss path: when ``args.use_liger`` is set, Liger is installed
    and the module is in training mode, it returns the final hidden states so
    the caller can feed them to ``self.le_loss`` together with the LM-head weight.

    The summed MoE auxiliary loss of the latest forward pass is stored on
    ``self.last_aux_loss``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.embedding = nn.Embedding(args.vocab_size, args.dim)
        self.layers = nn.ModuleList([Block(args) for _ in range(args.n_layers)])
        self.norm = RMSNorm(args.dim)
        self.linear_layer = nn.Linear(args.dim, args.vocab_size, bias=False)
        self.embedding.weight = self.linear_layer.weight # Weight tying
        self.last_aux_loss = 0.0

        # The fused-loss path needs both the config flag and the library. Gating
        # forward() on this combined flag prevents returning hidden states when
        # no fused loss object exists to consume them.
        self.use_fused_loss = bool(args.use_liger and HAS_LIGER)
        if self.use_fused_loss:
            self.le_loss = LigerFusedLinearCrossEntropyLoss()

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding weights; zero biases.

        Modules carrying a truthy ``res_scale`` attribute use a std scaled down
        by ``1 / sqrt(2 * n_layers)``.
        """
        std = 0.02

        if getattr(module, 'res_scale', False):
            # Scale down by 1/sqrt(2 * n_layers)
            std *= (2 * self.args.n_layers) ** -0.5

        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)

    def forward(self, input_ids):
        x = self.embedding(input_ids)

        # Keep the running aux loss as a tensor so it stays on the right device.
        total_aux_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)

        use_checkpointing = self.training and self.args.gradient_checkpointing
        if use_checkpointing:
            # Import explicitly: `import torch` alone is not guaranteed to load this submodule.
            from torch.utils.checkpoint import checkpoint

        for layer in self.layers:
            if use_checkpointing:
                x, aux_loss = checkpoint(layer, x, use_reentrant=True)
            else:
                x, aux_loss = layer(x)

            total_aux_loss = total_aux_loss + aux_loss

        x = self.norm(x)
        self.last_aux_loss = total_aux_loss
        if self.use_fused_loss and self.training:
            return x
        return self.linear_layer(x)

# ==========================================
# 3. DATA PIPELINE
# ==========================================
def initialize_tokenizer(hf_token=None):
    """Load the GPT-2 tokenizer and make sure it has a pad token.

    Tries the ``"gpt2"`` id first (with ``hf_token`` if one is given) and, if
    that raises, retries anonymously with ``"openai-community/gpt2"``. When the
    tokenizer has no pad token, the EOS token is used for padding.
    """
    try:
        # An empty-string token is treated as "no token" rather than sent as a credential.
        tokenizer = AutoTokenizer.from_pretrained("gpt2", token=hf_token or None)
    except Exception as err:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f"Could not load 'gpt2' tokenizer ({err}); retrying with 'openai-community/gpt2'.")
        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

class TinyStoriesStreamDataset(IterableDataset):
    """Endless stream of fixed-length next-token-prediction samples from a streamed text dataset.

    Each story is tokenised, terminated with the EOS id and appended to a
    running buffer. The buffer is cut into non-overlapping windows of
    ``seq_len + 1`` tokens; each window yields ``input_ids`` (first ``seq_len``
    tokens) and ``labels`` (the same window shifted left by one). When the
    underlying dataset is exhausted it is restarted, so iteration only ends if
    the dataset yields no examples at all.
    """

    def __init__(self, split, tokenizer, seq_len, dataset_name, hf_token=None):
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        # Anything other than "train" selects the validation split.
        hf_split = "train" if split == "train" else "validation"
        self.dataset = load_dataset(dataset_name, split=hf_split, streaming=True, token=hf_token or None)

    def __iter__(self):
        window = self.seq_len + 1
        buffer = []
        while True:
            saw_example = False
            for example in self.dataset:
                saw_example = True
                tokens = self.tokenizer.encode(
                    example['text'],
                    add_special_tokens=True,
                    max_length=1_000_000,   # int, not float; to get rid of the HF warning
                    truncation=False,
                )
                tokens.append(self.tokenizer.eos_token_id)
                buffer.extend(tokens)

                while len(buffer) >= window:
                    chunk, buffer = buffer[:window], buffer[window:]
                    yield {'input_ids': torch.tensor(chunk[:-1]), 'labels': torch.tensor(chunk[1:])}

            # An empty dataset would otherwise restart forever without yielding anything.
            if not saw_example:
                return



In [8]:
# ==========================================
# 4. TRAINING UTILS
# ==========================================
def get_lr(it, args):
    """Learning rate for step ``it``: linear warm-up, cosine decay, then a constant floor.

    - ``it < warmup_iters``: ramps linearly up to ``max_lr``.
    - ``warmup_iters <= it < lr_decay_iters``: cosine decay from ``max_lr`` to ``min_lr``.
    - ``it >= lr_decay_iters``: ``min_lr``.
    """
    if it < args.warmup_iters:
        return args.max_lr * (it + 1) / args.warmup_iters
    # `>=` rather than `>`: the cosine already evaluates to min_lr at this step,
    # and stopping here avoids a 0/0 when warmup_iters == lr_decay_iters.
    if it >= args.lr_decay_iters:
        return args.min_lr
    decay_ratio = (it - args.warmup_iters) / (args.lr_decay_iters - args.warmup_iters)
    cosine = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return args.min_lr + cosine * (args.max_lr - args.min_lr)

def find_latest_checkpoint(checkpoint_dir):
    """Return ``(path, step)`` of the highest-numbered ``checkpoint_<step>.pt`` in ``checkpoint_dir``.

    Creates the directory if it does not exist. Returns ``(None, 0)`` when there
    is no numbered checkpoint. Files that match the glob but do not carry a
    purely numeric step (e.g. ``checkpoint_final.pt``) are ignored.
    """
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
        return None, 0

    # Match the file name only, with the dot escaped and the whole name anchored,
    # so digits in a parent directory name can never be mistaken for the step.
    name_pattern = re.compile(r'checkpoint_(\d+)\.pt')
    best_path, best_step = None, -1
    for candidate in glob.glob(os.path.join(checkpoint_dir, "checkpoint_*.pt")):
        match = name_pattern.fullmatch(os.path.basename(candidate))
        if match is None:
            continue
        step = int(match.group(1))
        if step > best_step:
            best_path, best_step = candidate, step

    if best_path is None:
        return None, 0
    return best_path, best_step


def log_metrics(path, step, loss, lr):
    """Append one ``step,loss,lr`` row to the CSV at ``path``; a new file gets the header row first."""
    lines = [f"{step},{loss},{lr}\n"]
    if not os.path.exists(path):
        lines.insert(0, "step,loss,lr\n")
    with open(path, "a") as handle:
        handle.writelines(lines)


def log_generated_story(path, step, story):
    """Append a generated sample to the Markdown log at ``path``.

    A new file first receives a title and a short intro. Each call then adds a
    ``## Step <step>`` section with the story in a fenced block and a local
    timestamp.
    """
    # Check before opening: mode "a" creates the file, which would hide that it is new.
    is_new_file = not os.path.exists(path)

    with open(path, "a", encoding="utf-8") as f:
        if is_new_file:
            # The title emoji was previously stored as mis-encoded bytes (mojibake).
            f.write("# 📖 DeepSeek-V3 TinyStories Generation Log\n\n")
            f.write("Tracking model progress over time.\n\n---\n\n")

        # Write the Step and Story in Markdown format
        f.write(f"## Step {step}\n")
        f.write(f"```\n{story}\n```\n")
        f.write(f"_Generated at: {time.strftime('%Y-%m-%d %H:%M:%S')}_\n\n---\n\n")

@torch.no_grad()
def estimate_loss(model, val_iterator, val_loader, args):
    """Average cross-entropy over ``args.val_batches`` validation batches.

    The model is put in eval mode for the measurement and always switched back
    to training mode afterwards, even if a batch raises.

    Args:
        model: callable returning logits for a batch of ``input_ids`` in eval mode.
        val_iterator: iterator over ``val_loader`` batches; restarted from
            ``val_loader`` when exhausted.
        val_loader: iterable used to rebuild ``val_iterator``.
        args: provides ``val_batches``, ``device`` and ``vocab_size``.

    Returns:
        ``(avg_loss, val_iterator)`` — the mean loss (``nan`` when
        ``val_batches`` is 0) and the possibly-restarted iterator, so the caller
        can continue from where validation stopped.
    """
    # Autocast must name the device the model actually runs on.
    device_type = torch.device(args.device).type
    losses = []

    model.eval()
    try:
        # We check 'val_batches' to get a stable average
        for _ in range(args.val_batches):
            try:
                batch = next(val_iterator)
            except StopIteration:
                # If we hit the end of the val stream, restart it
                val_iterator = iter(val_loader)
                batch = next(val_iterator)

            idx, targets = batch['input_ids'].to(args.device), batch['labels'].to(args.device)

            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                out = model(idx)
                # Use standard CE loss for validation
                loss = F.cross_entropy(out.view(-1, args.vocab_size), targets.view(-1))

            losses.append(loss.item())
    finally:
        model.train() # Switch back to training mode

    avg_loss = sum(losses) / len(losses) if losses else float("nan")
    return avg_loss, val_iterator


def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and request deterministic cuDNN.

    Relies on ``random`` and ``numpy`` (as ``np``) being imported at the top of
    the notebook; without those imports this function raises ``NameError``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

@torch.no_grad()
def generate_story(model, tokenizer, prompt, max_new_tokens=50):
    """Sample a continuation of ``prompt`` from ``model``, print it and return the decoded text.

    Tokens are drawn from the full softmax distribution (temperature 1, no
    top-k/top-p) one at a time until ``max_new_tokens`` is reached or the EOS
    token is produced. The context fed to the model is limited to the last
    ``model.args.max_seq_len`` tokens.

    The model is switched to eval mode for generation and restored to its
    previous mode afterwards, so the call is safe from inside a training loop.
    """
    # Determine device
    device = next(model.parameters()).device

    # Encode
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    input_ids = torch.tensor([tokens], dtype=torch.long, device=device)
    max_context = model.args.max_seq_len

    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # Keep only the most recent tokens that fit in the context window.
            input_cond = input_ids[:, -max_context:]

            logits = model(input_cond)[:, -1, :]

            # Multinomial sampling (not greedy): each call can give a different story.
            probs = F.softmax(logits.float(), dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat((input_ids, next_token), dim=1)

            if next_token.item() == tokenizer.eos_token_id:
                break
    finally:
        model.train(was_training)

    decoded = tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)
    print(f"\n📖 {decoded}\n")
    return decoded

In [9]:
def train():
    """Run the whole training job: setup, optional resume, optimisation loop, periodic eval and checkpoints.

    Side effects:
        - initialises a single-process ``torch.distributed`` group if none exists;
        - writes the metrics CSV, the story log and checkpoints under
          ``ModelArgs.checkpoint_dir``;
        - when both ``hf_repo_id`` and ``hf_token`` are set, creates/uses the
          Hugging Face repo, downloads existing logs from it and uploads
          checkpoints and logs to it.

    NOTE(review): on resume only model and optimizer state are restored; the
    data iterators are created fresh.
    """
    import gc
    from dataclasses import replace
    from huggingface_hub import hf_hub_download

    gc.collect()
    torch.cuda.empty_cache()

    args = ModelArgs()
    device_type = torch.device(args.device).type

    # Necessary for Muon to work
    if not dist.is_initialized():
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        # NCCL requires CUDA devices; use gloo so CPU runs do not fail here.
        backend = 'nccl' if device_type == 'cuda' else 'gloo'
        dist.init_process_group(backend=backend, rank=0, world_size=1)

    # The model returns hidden states (not logits) in training mode when
    # use_liger is set, so the flag must be cleared if the fused loss is missing.
    if args.use_liger and not HAS_LIGER:
        print("⚠️ use_liger is set but Liger is not installed; using standard CrossEntropy.")
        args.use_liger = False
    use_fused_loss = args.use_liger

    # Saving to HuggingFace
    hf_api = HfApi(token=args.hf_token)
    hf_enabled = bool(args.hf_repo_id and args.hf_token)

    if hf_enabled:
        try:
            create_repo(args.hf_repo_id, repo_type="model", exist_ok=True, token=args.hf_token)
            print(f"✅ Connected to Hugging Face repo: {args.hf_repo_id}")
        except Exception as e:
            print(f"⚠️ Could not create/connect to HF repo: {e}")

    # --- FILE SETUP ---
    csv_name = "training_log.csv"
    stories_name = "generated_samples.md"

    os.makedirs(args.checkpoint_dir, exist_ok=True)
    local_csv_path = os.path.join(args.checkpoint_dir, csv_name)
    local_stories_path = os.path.join(args.checkpoint_dir, stories_name)

    # --- DOWNLOAD EXISTING LOGS (Resume Capability) ---
    # Best effort: any failure (typically "file not in repo yet") means we start a fresh log.
    if hf_enabled:
        log_files = (
            (csv_name, "📈 Downloaded existing metrics log.", "📈 Starting new metrics log."),
            (stories_name, "📖 Downloaded existing story log.", "📖 Starting new story log."),
        )
        for filename, found_msg, fresh_msg in log_files:
            try:
                hf_hub_download(repo_id=args.hf_repo_id, filename=filename, local_dir=args.checkpoint_dir, token=args.hf_token)
                print(found_msg)
            except Exception:
                print(fresh_msg)

    tokenizer = initialize_tokenizer(args.hf_token)
    print(f"🚀 Device: {args.device} | Vocab: {len(tokenizer)}")

    model = DeepSeekV3(args).to(args.device)

    # Optimizer Groups: 2-D+ weights outside norms/embeddings go to Muon, the rest to Adam.
    hidden_params, other_params = [], []
    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue
        is_hidden_matrix = p.ndim >= 2 and "norm" not in n and "embedding" not in n
        (hidden_params if is_hidden_matrix else other_params).append(p)

    if HAS_MUON:
        optimizer = MuonWithAuxAdam([
            {'params': hidden_params, 'use_muon': True, 'lr': 0.02, 'weight_decay': 0.01},
            {'params': other_params, 'use_muon': False, 'lr': args.max_lr, 'weight_decay': args.weight_decay_optim}
        ])
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=args.max_lr, weight_decay=args.weight_decay_optim)

    # Resume Weights
    ckpt_path, start_step = find_latest_checkpoint(args.checkpoint_dir)
    if ckpt_path:
        print(f"⏩ Resuming from {start_step}")
        # Checkpoints written below contain a pickled ModelArgs, which the
        # weights-only loader rejects. weights_only=False unpickles arbitrary
        # objects, so only load checkpoints produced by this notebook.
        ckpt = torch.load(ckpt_path, map_location=args.device, weights_only=False)
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])

    # Data
    ds = TinyStoriesStreamDataset('train', tokenizer, args.max_seq_len, args.dataset, args.hf_token)
    loader = DataLoader(ds, batch_size=args.batch_size, num_workers=2, pin_memory=True)
    iterator = iter(loader)

    val_ds = TinyStoriesStreamDataset('validation', tokenizer, args.max_seq_len, args.dataset, args.hf_token)
    val_loader = DataLoader(val_ds, batch_size=args.batch_size, num_workers=2, pin_memory=True)
    val_iterator = iter(val_loader)

    # Training Loop
    model.train()
    # total/initial (rather than wrapping a range) keeps the bar correct when resuming.
    pbar = tqdm.tqdm(total=args.total_iters, initial=start_step)

    print("🚀 Initialization complete. Starting training loop...")

    for step in range(start_step, args.total_iters):
        # LR Schedule (applies to the non-Muon groups only; Muon keeps its own lr)
        lr = get_lr(step, args)
        for group in optimizer.param_groups:
            if not group.get('use_muon', False):
                group['lr'] = lr

        optimizer.zero_grad()
        accum_loss = 0.0
        for _ in range(args.gradient_accumulation_steps):
            try:
                batch = next(iterator)
            except StopIteration:
                # Only restart on exhaustion; real loader errors must surface.
                iterator = iter(loader)
                batch = next(iterator)

            idx, targets = batch['input_ids'].to(args.device), batch['labels'].to(args.device)

            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                out = model(idx)
                if use_fused_loss:
                    # Fused path: `out` holds hidden states; the loss applies the LM head itself.
                    loss = model.le_loss(model.linear_layer.weight, out.view(-1, args.dim), targets.view(-1))
                else:
                    loss = F.cross_entropy(out.view(-1, args.vocab_size), targets.view(-1))

                # Scale so the accumulated gradient is the mean over micro-batches.
                loss = (loss + args.aux_loss_coef * model.last_aux_loss) / args.gradient_accumulation_steps
            loss.backward()
            accum_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # --- VALIDATION & SAMPLING ---
        if step % args.val_interval == 0 and step > start_step:

            val_loss, val_iterator = estimate_loss(model, val_iterator, val_loader, args)
            print(f"\n🔍 Step {step} | Train Loss: {accum_loss:.4f} | Val Loss: {val_loss:.4f}")

            print(f"✨ Vibe Check:")
            # Generate in eval mode so dropout is off and the model returns logits.
            model.eval()
            try:
                # Generate
                story_text = generate_story(model, tokenizer, "Once upon a time", max_new_tokens=100)
                # Log to MD file
                log_generated_story(local_stories_path, step, story_text)
            except Exception as e:
                print(f"(Generation/Logging skipped: {e})")

            model.train()

        if step % 10 == 0:
            print(f"Step {step} | Loss: {accum_loss:.4f} | LR: {lr:.2e}")
            log_metrics(local_csv_path, step, accum_loss, lr)

        pbar.update(1)
        pbar.set_description(f"Loss: {accum_loss:.4f} | LR: {lr:.2e}")

        # --- SAVING & UPLOADING ---
        if step % args.save_checkpoint_iter == 0 and step > start_step:
            ckpt_name = f"checkpoint_{step}.pt"
            path = os.path.join(args.checkpoint_dir, ckpt_name)

            # Save Locally. The token is blanked in the stored args so the
            # checkpoint (which is uploaded below) never carries the credential.
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'args': replace(args, hf_token=""),
                },
                path,
            )
            print(f"\n💾 Saved locally to {path}")

            if hf_enabled:
                try:
                    # 1. Upload Checkpoint
                    hf_api.upload_file(
                        path_or_fileobj=path,
                        path_in_repo=f"checkpoints/{ckpt_name}",
                        repo_id=args.hf_repo_id
                    )
                    # 2. Upload CSV
                    hf_api.upload_file(
                        path_or_fileobj=local_csv_path,
                        path_in_repo=csv_name,
                        repo_id=args.hf_repo_id
                    )
                    # 3. Upload Stories Log (absent if every generation so far was skipped)
                    if os.path.exists(local_stories_path):
                        hf_api.upload_file(
                            path_or_fileobj=local_stories_path,
                            path_in_repo=stories_name,
                            repo_id=args.hf_repo_id
                        )
                    print(f"☁️ Synced Checkpoint, CSV, and Stories to Hugging Face")
                except Exception as e:
                    print(f"❌ HF Upload failed: {e}")

    pbar.close()

train()

‚úÖ Connected to Hugging Face repo: FusionCorp/DeepSeek-V3-TinyStories
üìà Starting new metrics log.
üìñ Starting new story log.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

üöÄ Device: cuda | Vocab: 50257


README.md: 0.00B [00:00, ?B/s]

Too many dataloader workers: 2 (max is dataset.num_shards=1). Stopping 1 dataloader workers.


üöÄ Initialization complete. Starting training loop...


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.02 GiB is free. Process 3184 has 11.71 GiB memory in use. Of the allocated memory 11.51 GiB is allocated by PyTorch, and 82.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)