In [1]:
!pip install huggingface_hub transformers datasets zstandard sentencepiece

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting zstandard
  Downloading zstandard-0.24.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.1 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading zstandard-0.24.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (5.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━

In [None]:
import json
from huggingface_hub import hf_hub_download
import shutil
import os

# replace with your repo id
repo_id = "dilip025/llama-2-7b"
filename = "step_final_6262.pt"
filename_tokenizer = "tokenizerllma.model"
# filename_texts = "texts.json"   # 👈 your uploaded JSON file

# download checkpoint + tokenizer
cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
cached_path_tokenizer = hf_hub_download(repo_id=repo_id, filename=filename_tokenizer)
# cached_path_texts = hf_hub_download(repo_id=repo_id, filename=filename_texts)  # 👈 download texts.json

# copy files locally
dest_path = f"./{filename}"
dest_path_tokenizer = f"./{filename_tokenizer}"
# dest_path_texts = f"./{filename_texts}"

shutil.copy(cached_path_tokenizer, dest_path_tokenizer)
shutil.copy(cached_path, dest_path)
# shutil.copy(cached_path_texts, dest_path_texts)

print("Copied tokenizer to:", dest_path_tokenizer)
print("Tokenizer exists?", os.path.exists(dest_path_tokenizer))

print("Copied checkpoint to:", dest_path)
print("Checkpoint exists?", os.path.exists(dest_path))

# print("Copied texts.json to:", dest_path_texts)
# print("Texts file exists?", os.path.exists(dest_path_texts))

# ✅ load the texts.json into Python variable
# with open(dest_path_texts, "r", encoding="utf-8") as f:
#     texts = json.load(f)

# print("Loaded texts:", len(texts))  # just print first 10 for sanity check


step_lightning_6100.pt:   0%|          | 0.00/3.27G [00:00<?, ?B/s]

tokenizerllma.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Copied tokenizer to: ./tokenizerllma.model
Tokenizer exists? True
Copied checkpoint to: ./step_lightning_6100.pt
Checkpoint exists? True


In [3]:
dest_path

'./step_lightning_4800.pt'

In [None]:
from huggingface_hub import HfApi, HfFolder

# paste your HF token here
token = ""

# save token locally
HfFolder.save_token(token)

api = HfApi(token=token)  # initialize with token


# your repo id
repo_id = "dilip025/llama-2-7b"

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import json
from tqdm import tqdm
from typing import Dict, Any, List
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader, Dataset
from datetime import datetime
from datasets import load_dataset
from transformers import LlamaTokenizer
from transformers import get_cosine_schedule_with_warmup
# from lion_pytorch import Lion

# --- 1. Satori "Akasha" Final Model Architecture ---

class RotaryPositionalEmbeddings(nn.Module):
    def __init__(self, dim, max_seq_len=8192, base=10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.max_seq_len = max_seq_len

    def forward(self, seq_len: int, device: torch.device):
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        return torch.cat((freqs, freqs), dim=-1).cos(), torch.cat((freqs, freqs), dim=-1).sin()

    def apply_rotary_emb(self, x, cos, sin):
        def rotate_half(x):
            x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
            return torch.cat((-x2, x1), dim=-1)
        
        # FIX 1: Simplified and robust RoPE application
        def apply_rotary(tensor, cos, sin):
            # Adjust shape for broadcasting based on tensor dimensions
            if tensor.ndim == 4: # For Q/K in attention (B, H, T, D_head)
                cos = cos.unsqueeze(1)
                sin = sin.unsqueeze(1)
            # No unsqueeze needed for chunk summaries (B, NumChunks, D_model)
            return (tensor * cos) + (rotate_half(tensor) * sin)

        if isinstance(x, tuple):
            q, k = x
            return apply_rotary(q, cos, sin), apply_rotary(k, cos, sin)
        
        return apply_rotary(x, cos, sin)

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
    def forward(self, x):
        return self._norm(x.float()).type_as(x) * self.weight

class SwiGLU(nn.Module):
    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.linear_gate = nn.Linear(in_features, out_features, bias=bias)
        self.linear_up = nn.Linear(in_features, out_features, bias=bias)
        self.act_fn = nn.SiLU()
    def forward(self, x):
        return self.act_fn(self.linear_gate(x)) * self.linear_up(x)

class VivekaMoE(nn.Module):
    def __init__(self, d_model, n_experts=8, top_k=2, ff_mult=2):
        super().__init__()
        self.n_experts = n_experts
        self.top_k = top_k
        self.router = nn.Linear(d_model, n_experts, bias=False)
        ff_dim = int(d_model * ff_mult * 4 / 3)
        self.experts = nn.ModuleList([
            nn.Sequential(
                SwiGLU(d_model, ff_dim),
                nn.Linear(ff_dim, d_model, bias=False)
            ) for _ in range(n_experts)
        ])
        self.aux_loss_coef = 0.01
    def forward(self, x):
        B, T, C = x.shape
        x_flat = x.view(-1, C)
        router_logits = self.router(x_flat)
        routing_weights, selected_experts = torch.topk(router_logits, self.top_k, dim=-1)
        routing_weights = F.softmax(routing_weights, dim=-1, dtype=torch.float32)
        p = F.softmax(router_logits, dim=-1, dtype=torch.float32)
        f = p.mean(dim=0)
        load_balancing_loss = self.aux_loss_coef * self.n_experts * torch.sum(f * f)
        final_output = torch.zeros_like(x_flat)
        for i in range(self.n_experts):
            token_mask = (selected_experts == i).any(dim=-1)
            if token_mask.any():
                expert_input = x_flat[token_mask]
                expert_output = self.experts[i](expert_input)
                routing_subset = routing_weights[token_mask]
                weight_mask = (selected_experts[token_mask] == i)
                final_output[token_mask] += (expert_output * routing_subset[weight_mask].unsqueeze(-1))
        return final_output.view(B, T, C), load_balancing_loss

# First, create a dedicated SwiGLU FeedForward module for clarity.
# This is the same structure used inside your MoE experts.
class SwiGLUFeedForward(nn.Module):
    def __init__(self, d_model, ff_mult=2):
        super().__init__()
        # The hidden dim is often a multiple of d_model, adjusted for SwiGLU best practices.
        ff_dim = int(d_model * ff_mult * 2 / 3)
        self.w1 = nn.Linear(d_model, ff_dim, bias=False)
        self.w3 = nn.Linear(d_model, ff_dim, bias=False)
        self.w2 = nn.Linear(ff_dim, d_model, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

class HierarchicalGatedPooling(nn.Module):
    def __init__(self, d_model, n_heads, dropout, chunk_size=256, num_summary_vectors=4):
        super().__init__()
        # ... (self.chunk_size, n_heads, d_head, num_summary_vectors are the same) ...
        self.chunk_size = chunk_size
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.num_summary_vectors = num_summary_vectors

        # --- MODIFICATION: Build a custom summarizer block ---
        self.summary_token = nn.Parameter(torch.randn(1, 1, d_model))
        
        # 1. Self-Attention Layer
        self.summarizer_attn = nn.MultiheadAttention(
            embed_dim=d_model, 
            num_heads=n_heads, 
            dropout=dropout, 
            batch_first=True
        )
        self.summarizer_norm1 = RMSNorm(d_model)

        # 2. SwiGLU Feed-Forward Layer
        self.summarizer_ffn = SwiGLUFeedForward(d_model)
        self.summarizer_norm2 = RMSNorm(d_model)
        # --- END OF MODIFICATION ---
        
        # ... (the rest of the __init__ is the same) ...
        self.query_proj = nn.Linear(d_model, d_model * num_summary_vectors, bias=False)
        self.summary_proj = nn.Linear(d_model * num_summary_vectors, d_model, bias=False)
        self.to_kv = nn.Linear(d_model, 2 * d_model, bias=False)
        self.to_g = nn.Linear(d_model, d_model, bias=True)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.norm = RMSNorm(d_model)
        
    def forward(self, x, rope_cos, rope_sin, rope_applier):
        # ... (padding and chunking logic is the same) ...
        B, T, C = x.shape
        padding_needed = (self.chunk_size - T % self.chunk_size) % self.chunk_size
        if padding_needed > 0:
            x = F.pad(x, (0, 0, 0, padding_needed))
        
        chunks = x.view(B, -1, self.chunk_size, C)
        B_chunks, num_chunks, chunk_len, C = chunks.shape
        
        # --- MODIFICATION: Use the custom summarizer block ---
        summary_token = self.summary_token.expand(B_chunks * num_chunks, -1, -1)
        chunks_flat = chunks.view(B_chunks * num_chunks, chunk_len, C)
        chunk_with_token = torch.cat([summary_token, chunks_flat], dim=1)

        # Custom Transformer Block Logic
        # 1. Pre-Norm and Self-Attention
        normed_chunk = self.summarizer_norm1(chunk_with_token)
        attn_output, _ = self.summarizer_attn(normed_chunk, normed_chunk, normed_chunk, need_weights=False)
        # Residual Connection
        post_attn = chunk_with_token + attn_output

        # 2. Pre-Norm and SwiGLU FFN
        normed_post_attn = self.summarizer_norm2(post_attn)
        ffn_output = self.summarizer_ffn(normed_post_attn)
        # Residual Connection
        summarizer_output = post_attn + ffn_output

        # The output corresponding to the first token is our learned summary
        learned_summary = summarizer_output[:, 0, :] 
        # --- END OF MODIFICATION ---

        # ... (The rest of the forward pass is exactly the same) ...
        chunk_content_rep = learned_summary.view(B_chunks, num_chunks, C)
        queries_unshaped = self.query_proj(chunk_content_rep)
        q = queries_unshaped.view(B * num_chunks, self.num_summary_vectors, self.n_heads, self.d_head).transpose(1, 2)
        k, v = self.to_kv(chunks).chunk(2, dim=-1)
        k = k.view(B * num_chunks, self.chunk_size, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(B * num_chunks, self.chunk_size, self.n_heads, self.d_head).transpose(1, 2)
        multi_vector_summaries = F.scaled_dot_product_attention(q, k, v)
        summaries_flat = multi_vector_summaries.transpose(1, 2).reshape(B, num_chunks, C * self.num_summary_vectors)
        chunk_summaries = self.summary_proj(summaries_flat)
        chunk_summaries_with_rope = rope_applier(chunk_summaries, rope_cos, rope_sin)
        gate_chunks = torch.sigmoid(self.to_g(chunk_summaries_with_rope))
        gated_v_chunks = chunk_summaries_with_rope * gate_chunks
        cumulative_sum = torch.cumsum(gated_v_chunks, dim=1)
        cumulative_gates = torch.cumsum(gate_chunks, dim=1)
        chunk_context = cumulative_sum / (cumulative_gates + 1e-9)
        zero_pad = torch.zeros(B, 1, C, device=x.device, dtype=x.dtype)
        padded_chunk_context = torch.cat([zero_pad, chunk_context], dim=1)
        prev_chunk_context = padded_chunk_context[:, :-1, :]
        chunk_context_expanded = prev_chunk_context.unsqueeze(2).expand(-1, -1, self.chunk_size, -1)
        token_context = chunk_context_expanded.reshape(B, -1, C)
        if padding_needed > 0:
            token_context = token_context[:, :T, :]
        normed_context = self.norm(token_context)
        output = self.out_proj(normed_context)
        return self.dropout(output)

class LocalSlidingWindowAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout, window_size=1024):
        super().__init__()
        self.n_heads = n_heads
        self.window_size = window_size
        self.qkv_proj = nn.Linear(d_model, 3 * d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout_p = dropout
    def forward(self, x, rope_cos, rope_sin, rope_applier):
        B, T, C = x.shape
        q, k, v = self.qkv_proj(x).chunk(3, dim=-1)
        
        # FIX 3: Correctly unpack the tensors instead of creating a generator.
        q = q.view(B, T, self.n_heads, C // self.n_heads)
        k = k.view(B, T, self.n_heads, C // self.n_heads)
        v = v.view(B, T, self.n_heads, C // self.n_heads)
        
        q, k = rope_applier((q, k), rope_cos, rope_sin)
        
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        
        mask = torch.full((T, T), float("-inf"), device=x.device, dtype=q.dtype)
        rows = torch.arange(T, device=x.device)[:, None]
        cols = torch.arange(T, device=x.device)[None, :]
        window_mask = (rows >= cols) & (rows - cols < self.window_size)
        mask.masked_fill_(window_mask, 0.0)

        attn_output = F.scaled_dot_product_attention(
            q, k, v, attn_mask=mask, dropout_p=self.dropout_p if self.training else 0.0)
        
        output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(output)

class AkashaBlock(nn.Module):
    def __init__(self, config: Dict[str, Any]):
        super().__init__()
        self.norm = RMSNorm(config['d_model'])
        self.purusha_path = HierarchicalGatedPooling(config['d_model'], config['n_heads'], config['dropout'], config['chunk_size'],config['num_summary_vectors'])
        self.prakriti_path = LocalSlidingWindowAttention(config['d_model'], config['n_heads'], config['dropout'], config['window_size'])
        self.fusion_gate = nn.Linear(config['d_model'], 2 * config['d_model'], bias=False)
        self.moe = VivekaMoE(config['d_model'], config['n_experts'], config['top_k'])
        self.moe_norm = RMSNorm(config['d_model'])
        self.residual_dropout = nn.Dropout(config['dropout'])
    def forward(self, x, token_rope, chunk_rope):
        normed_x = self.norm(x)
        purusha_out = self.purusha_path(normed_x, chunk_rope[0], chunk_rope[1], chunk_rope[2])
        prakriti_out = self.prakriti_path(normed_x, token_rope[0], token_rope[1], token_rope[2])
        gate1, gate2 = self.fusion_gate(normed_x).chunk(2, dim=-1)
        gate1, gate2 = torch.sigmoid(gate1), torch.sigmoid(gate2)
        fused_out = gate1 * purusha_out + gate2 * prakriti_out
        x = x + self.residual_dropout(fused_out)
        moe_out, moe_loss = self.moe(self.moe_norm(x))
        x = x + self.residual_dropout(moe_out)
        return x, moe_loss

class Satori_Akasha_Model(nn.Module):
    def __init__(self, config: Dict[str, Any]):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config['vocab_size'], config['d_model'])
        self.embedding_dropout = nn.Dropout(config['dropout'])
        self.token_rope = RotaryPositionalEmbeddings(config['d_model'] // config['n_heads'], config['block_size'])
        self.chunk_rope = RotaryPositionalEmbeddings(config['d_model'], config['max_chunks'])
        self.layers = nn.ModuleList([AkashaBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = RMSNorm(config['d_model'])
        self.output_layer = nn.Linear(config['d_model'], config['vocab_size'], bias=False)
        self.output_layer.weight = self.token_embedding.weight
        self.apply(self._init_weights)
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.xavier_normal_(module.weight)
    def forward(self, input_ids, labels=None):
        x = self.embedding_dropout(self.token_embedding(input_ids))
        token_rope_cos, token_rope_sin = self.token_rope(x.shape[1], x.device)
        num_chunks = (x.shape[1] + self.config['chunk_size'] - 1) // self.config['chunk_size']
        chunk_rope_cos, chunk_rope_sin = self.chunk_rope(num_chunks, x.device)
        token_rope_params = (token_rope_cos, token_rope_sin, self.token_rope.apply_rotary_emb)
        chunk_rope_params = (chunk_rope_cos, chunk_rope_sin, self.chunk_rope.apply_rotary_emb)
        total_moe_loss = torch.tensor(0.0, device=x.device)
        for layer in self.layers:
            x, moe_loss = layer(x, token_rope_params, chunk_rope_params)
            total_moe_loss += moe_loss
        logits = self.output_layer(self.final_norm(x))
        loss_dict = {}
        if labels is not None:
            task_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
            loss_dict = {"task_loss": task_loss, "moe_aux_loss": total_moe_loss / len(self.layers)}
        return logits, loss_dict


# --- 2. Training Configuration ---
def get_satori_akasha_config():
    block_size = 2048
    chunk_size = 256
    config = {
        "model_name": "Satori-Akasha-250M",
        "d_model": 768, "n_layers": 10, "n_heads": 8,
        "window_size": 256, "chunk_size": chunk_size,
        "max_chunks": 8192,
        "n_experts": 2, "top_k": 2, "dropout": 0.05,
        "vocab_size": 32000, "block_size": block_size,
        "num_summary_vectors": 4,
        "tokenizer_path": dest_path_tokenizer,
        "num_epochs": 1, "batch_size": 22, "grad_accum_steps": 35,
        "precision": "bf16", "max_grad_norm": 1.0,
        "optimizer": "AdamW", "learning_rate": 3e-4, "weight_decay": 0.02,
        "beta1": 0.9, "beta2": 0.95,
        "num_train_samples": 460000, "skip_train_samples": 130000+130000+120000+120000+120000+130000+125000+125000+130000+130000+130000+330000+430000,
        "lr_scheduler": "cosine_with_warmup", "warmup_steps": 100,
        "output_dir": "satori_akasha_checkpoints",
        "log_interval_steps": 10, "save_interval_steps": 600,
        "resume_training": False,
        "resume_from_checkpoint": True, 
        
        # 💡 This is our GLOBAL training plan. Never change this number mid-training.
        # "total_training_steps": 45200,
        "total_training_steps": 11000,
    }
    return config

# --- 3. Production-Ready Training Loop ---

# --- 6. Data Pipeline ---
def load_training_data(val_size=10000, skip_size=0):
    streamed_dataset = load_dataset("HuggingFaceFW/fineweb", split="train", streaming=True).skip(skip_size)
    # streamed_dataset = load_dataset("allenai/c4","en", split="train", streaming=True, trust_remote_code=True).skip(skip_size)
    texts = []
    print(f"Collecting {val_size:,} samples from pile (skipping first {skip_size:,})...")
    
    for ex in tqdm(streamed_dataset, total=val_size, desc="Downloading pajama"):
        texts.append(ex["text"])
        if len(texts) >= val_size:
            break

    print(f"📦 Loaded {len(texts):,} samples.")
    return texts

class TextDatasetForLM(Dataset):
    def __init__(self, texts: List[str], tokenizer, block_size: int, batch_size: int = 4048):
        all_token_ids = []
        self.eos_token_id = tokenizer.eos_token_id

        print(f"Tokenizing {len(texts):,} texts in batches of {batch_size}...")
        for i in tqdm(range(0, len(texts), batch_size), desc="Batch Tokenizing"):
            batch_texts = texts[i:i + batch_size]
            encodings = tokenizer(
                batch_texts,
                truncation=False,
                padding=False,
                add_special_tokens=False,
                return_attention_mask=False,
            )  # HF returns dict

            for input_ids in encodings["input_ids"]:
                if input_ids:
                    all_token_ids.extend(input_ids + [self.eos_token_id])  # Append EOS

        # Convert long stream of tokens into block-sized chunks
        self.examples = []
        for i in range(0, len(all_token_ids) - block_size + 1, block_size):
            self.examples.append(torch.tensor(all_token_ids[i:i + block_size], dtype=torch.long))

        print(f"✅ Prepared {len(self.examples):,} samples of size {block_size}.")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        x = self.examples[i]
        return x[:-1], x[1:]  # Input and target


def collate_batch(batch):
    inputs, labels = zip(*batch)
    return torch.stack(inputs), torch.stack(labels)


# No changes needed here, just for confirmation
def setup(config):
    """Sets up device, tokenizer, and directories."""
    config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    if config['precision'] == 'bf16' and not torch.cuda.is_bf16_supported():
        print("Warning: bfloat16 not supported on this device, falling back to fp32.")
        config['precision'] = 'fp32'

    try:
        tokenizer = LlamaTokenizer.from_pretrained(config['tokenizer_path'])
        config['vocab_size'] = tokenizer.vocab_size
        config['pad_token_id'] = tokenizer.pad_token_id

        print("vocab size", tokenizer.vocab_size)

        if config['pad_token_id'] is None:
            print("Info: No <pad> token defined, setting pad_token_id = eos_token_id.")
            config['pad_token_id'] = tokenizer.eos_token_id
    except Exception as e:
        print(f"Could not load tokenizer from {config['tokenizer_path']}. Error: {e}")
        tokenizer = None

    os.makedirs(config['output_dir'], exist_ok=True)
    
    return config, tokenizer

def create_model(config):
    """Creates and compiles the Satori-v9 model."""
    model = Satori_Akasha_Model(config)
    
    # Best Practice: Use torch.compile for significant speedup
    if torch.__version__ >= "2.0.0":
        print("Compiling the model with torch.compile()...")
        model = torch.compile(model, backend="aot_eager")
        
    return model.to(config['device'])

def create_optimizer_and_scheduler(model, config, num_training_steps):
    """
    Creates an AdamW optimizer and a standard cosine learning rate scheduler 
    from the Hugging Face transformers library.
    """
    # Don't apply weight decay to bias or LayerNorm/RMSNorm parameters
    param_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
    decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
    nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
    
    optim_groups = [
        {'params': decay_params, 'weight_decay': config['weight_decay']},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    
    # --- MODIFICATION: Use AdamW instead of Lion ---
    optimizer = torch.optim.AdamW(
        optim_groups,
        lr=config['learning_rate'],
        betas=(config.get('beta1', 0.9), config.get('beta2', 0.95)) # Your betas are good
    )
    # --- END OF MODIFICATION ---
    
    # The scheduler remains the same
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=num_training_steps
    )
    
    return optimizer, scheduler

def train(model, config, train_loader, optimizer, scheduler, resume_step):
    """The main training loop."""
    device = config['device']
    use_amp = config['precision'] in ['fp16', 'bf16']
    dtype = torch.bfloat16 if config['precision'] == 'bf16' else torch.float16
    scaler = GradScaler(enabled=(config['precision'] == 'fp16'))
    
    global_step = resume_step
    model.train()
    avg_loss = 0
    for epoch in range(config['num_epochs']):
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['num_epochs']}")
        
        for i, (input_ids, labels) in enumerate(pbar):
            input_ids, labels = input_ids.to(device), labels.to(device)
            with torch.autocast(device_type=device.type, dtype=dtype, enabled=use_amp):
                _, loss_dict = model(input_ids, labels)
                task_loss = loss_dict['task_loss']
                moe_loss = loss_dict['moe_aux_loss']
                total_loss = task_loss + moe_loss
                avg_loss +=total_loss.item()
                # Scale loss for gradient accumulation
                scaled_loss = total_loss / config['grad_accum_steps']

            # Backward pass
            scaler.scale(scaled_loss).backward()
            
            # Gradient accumulation step
            if (i + 1) % config['grad_accum_steps'] == 0:
                # Best Practice: Unscale gradients before clipping
                scaler.unscale_(optimizer)
                
                # Best Practice: Clip gradients to prevent explosion
                torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
                
                # Optimizer and scheduler step
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)
                
                global_step += 1
                
                # Logging
                if global_step % config['log_interval_steps'] == 0:
                    pbar.set_postfix({
                        "loss": f"{total_loss.item():.4f}",
                        "avg_loss":f"{avg_loss/(i+1)}",
                        "task": f"{task_loss.item():.4f}",
                        "moe": f"{moe_loss.item():.4f}",
                        "lr": f"{scheduler.get_last_lr()[0]:.6e}"
                    })
                    
                print({
                        "loss": f"{total_loss.item():.4f}",
                        "task": f"{task_loss.item():.4f}",
                        "avg_loss":f"{avg_loss/(i+1)}",
                        "moe": f"{moe_loss.item():.4f}",
                        "lr": f"{scheduler.get_last_lr()[0]:.6e}"
                    })

                # Checkpointing
                if global_step > 0 and global_step % config['save_interval_steps'] == 0:
                    checkpoint_path = os.path.join(config['output_dir'], f"step_{global_step}.pt")
                    torch.save({
                        "step": global_step,
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "scaler_state_dict": scaler.state_dict(),
                        "config": config
                    }, checkpoint_path)
                    print(f"\n✅ Saved checkpoint to {checkpoint_path}")
                    # api.upload_folder(
                    # folder_path="satori_akasha_checkpoints",
                    # repo_id=repo_id,
                    # repo_type="model"
                    # )
    print("Successfully uploaded checkpoints to Hugging Face Hub!")

    checkpoint_path = os.path.join(config['output_dir'], f"step_final_{global_step}.pt")
    torch.save({
                        "step": global_step,
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "scaler_state_dict": scaler.state_dict(),
                        "config": config
                    }, checkpoint_path)
    # api.upload_folder(
    # folder_path="satori_akasha_checkpoints",
    # repo_id=repo_id,
    # repo_type="model"
    # )
    print("Successfully uploaded checkpoints to Hugging Face Hub!")
    print(f"\n✅ Saved checkpoint to {checkpoint_path}")
            

    print("🎉 Training finished.")


if __name__ == '__main__': 
    # --- 1. Get Configuration ---
    config = get_satori_akasha_config()
    
    # --- 2. Setup Environment ---
    config, tokenizer = setup(config)
    print("--- Starting Training Run for Satori-v9 'Prakriti-Purusha' ---")
    print(json.dumps({k: str(v) if isinstance(v, torch.device) else v 
                  for k, v in config.items()}, indent=2))


    # # --- 3. Load Data ---
    # train_texts = load_training_data(val_size=config['num_train_samples'], skip_size=config['skip_train_samples'])
    # train_texts = texts
    # print(f"✅ Remaining texts after filtering: {len(train_texts):,}")
   
    
    

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


vocab size 32000
Info: No <pad> token defined, setting pad_token_id = eos_token_id.
--- Starting Training Run for Satori-v9 'Prakriti-Purusha' ---
{
  "model_name": "Satori-Akasha-250M",
  "d_model": 768,
  "n_layers": 10,
  "n_heads": 8,
  "window_size": 256,
  "chunk_size": 256,
  "max_chunks": 8192,
  "n_experts": 2,
  "top_k": 2,
  "dropout": 0.05,
  "vocab_size": 32000,
  "block_size": 2048,
  "num_summary_vectors": 4,
  "tokenizer_path": "./tokenizerllma.model",
  "num_epochs": 1,
  "batch_size": 22,
  "grad_accum_steps": 35,
  "precision": "bf16",
  "max_grad_norm": 1.0,
  "optimizer": "AdamW",
  "learning_rate": 0.0003,
  "weight_decay": 0.02,
  "beta1": 0.9,
  "beta2": 0.95,
  "num_train_samples": 460000,
  "skip_train_samples": 2150000,
  "lr_scheduler": "cosine_with_warmup",
  "warmup_steps": 100,
  "output_dir": "satori_akasha_checkpoints",
  "log_interval_steps": 10,
  "save_interval_steps": 600,
  "resume_training": false,
  "resume_from_checkpoint": "/kaggle/input/enhanc

In [5]:
from torch.utils.data import Subset, DataLoader

# Download file from Hub
local_path = hf_hub_download(repo_id=repo_id, filename="tokenized_train_datasetii.pt", repo_type="model")

# Load directly
train_dataset = torch.load(local_path, weights_only=False)

# DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate_batch,
    num_workers=4,
    pin_memory=True
)

tokenized_train_datasetii.pt:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

In [8]:
# --- 4. Create Model, Optimizer, and Scheduler ---
model = create_model(config)
checkpoint_path = dest_path # Assuming dest_path is defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoints = torch.load(checkpoint_path, map_location=device)

# --- This part is good, create a fresh optimizer and scheduler ---
# It correctly uses the TOTAL training steps for the LR curve
config['warmup_steps'] = int(config['total_training_steps'] * 0.03)
optimizer, scheduler = create_optimizer_and_scheduler(model, config, config['total_training_steps'])

steps_already_done = 0
if config['resume_from_checkpoint']:
    print(f"🔄 Resuming training from checkpoint: {config['resume_from_checkpoint']}")
    
    # 1. Load model weights (Correct)
    model.load_state_dict(checkpoints['model_state_dict'])
    
    # 2. DO NOT load optimizer state to reset momentum (Correct)
    # optimizer.load_state_dict(checkpoints['optimizer_state_dict'])
    
    # 3. Get the step count from the checkpoint (Correct)
    steps_already_done = checkpoints.get('step', 0)
    
    # 4. Manually fast-forward the NEW scheduler to the correct step.
    #    DO NOT load its state_dict.
    print(f"⏩ Fast-forwarding a fresh scheduler by {steps_already_done} steps...")
    for _ in range(steps_already_done):
        scheduler.step()
        
    print(f"Scheduler is now at step {steps_already_done}. LR before kick: {scheduler.get_last_lr()[0]:.6e}")

    # 5. NOW, apply the LR kick to the optimizer.
    #    The scheduler will now correctly proceed from this new, higher LR.
    print("🚀 Applying learning rate kick...")
    for param_group in optimizer.param_groups:
        param_group['lr'] = 1.5e-4
        
    print(f"✅ Resumed successfully. Kicked LR is now: {optimizer.param_groups[0]['lr']:.6e}")

print(f"Total Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M")

# --- 5. Start Training ---
# Pass the correct resume step to the training loop
train(model, config, train_loader, optimizer, scheduler, resume_step=steps_already_done)

Compiling the model with torch.compile()...


  scaler = GradScaler(enabled=(config['precision'] == 'fp16'))


🔄 Resuming training from checkpoint: /kaggle/input/enhanced-purusha-prakirti/satori_akasha_checkpoints/step_final_2692.pt
⏩ Fast-forwarding a fresh scheduler by 6100 steps...
Scheduler is now at step 6100. LR before kick: 1.045725e-04
🚀 Applying learning rate kick...
✅ Resumed successfully. Kicked LR is now: 1.500000e-04
Total Parameters: 272.40M


Epoch 1/1:   0%|                                                                      | 0/5703 [00:00<?, ?it/s]Epoch 1/1:   0%|                                                           | 1/5703 [00:13<21:36:29, 13.64s/it]Epoch 1/1:   0%|                                                           | 2/5703 [00:17<12:08:31,  7.67s/it]Epoch 1/1:   0%|                                                            | 3/5703 [00:20<8:52:08,  5.60s/it]Epoch 1/1:   0%|                                                            | 4/5703 [00:23<7:21:11,  4.64s/it]Epoch 1/1:   0%|                                                            | 5/5703 [00:26<6:32:55,  4.14s/it]Epoch 1/1:   0%|                                                            | 6/5703 [00:30<6:18:32,  3.99s/it]Epoch 1/1:   0%|                                                            | 7/5703 [00:33<5:50:00,  3.69s/it]Epoch 1/1:   0%|                                                            | 8/5703 [00:36<5:29:14,  3

{'loss': '3.1710', 'task': '3.1610', 'avg_loss': '3.093504238128662', 'moe': '0.0100', 'lr': '1.045262e-04'}


Epoch 1/1:   1%|▍                                                            | 36/5703 [00:53<59:58,  1.58it/s]Epoch 1/1:   1%|▍                                                            | 37/5703 [00:54<59:17,  1.59it/s]Epoch 1/1:   1%|▍                                                            | 38/5703 [00:54<58:46,  1.61it/s]Epoch 1/1:   1%|▍                                                            | 39/5703 [00:55<58:32,  1.61it/s]Epoch 1/1:   1%|▍                                                            | 40/5703 [00:56<58:19,  1.62it/s]Epoch 1/1:   1%|▍                                                            | 41/5703 [00:56<58:06,  1.62it/s]Epoch 1/1:   1%|▍                                                            | 42/5703 [00:57<58:05,  1.62it/s]Epoch 1/1:   1%|▍                                                            | 43/5703 [00:57<57:56,  1.63it/s]Epoch 1/1:   1%|▍                                                            | 44/5703 [00:58<57:52,  1

{'loss': '3.1413', 'task': '3.1312', 'avg_loss': '3.1154172795159476', 'moe': '0.0100', 'lr': '1.044799e-04'}


Epoch 1/1:   1%|▊                                                            | 71/5703 [01:15<57:19,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 72/5703 [01:15<57:18,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 73/5703 [01:16<57:22,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 74/5703 [01:16<57:19,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 75/5703 [01:17<57:17,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 76/5703 [01:18<57:15,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 77/5703 [01:18<57:12,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 78/5703 [01:19<57:10,  1.64it/s]Epoch 1/1:   1%|▊                                                            | 79/5703 [01:20<57:09,  1

{'loss': '3.1471', 'task': '3.1371', 'avg_loss': '3.118731255758376', 'moe': '0.0100', 'lr': '1.044336e-04'}


Epoch 1/1:   2%|█                                                           | 106/5703 [01:36<56:58,  1.64it/s]Epoch 1/1:   2%|█▏                                                          | 107/5703 [01:37<57:02,  1.64it/s]Epoch 1/1:   2%|█▏                                                          | 108/5703 [01:37<57:01,  1.64it/s]Epoch 1/1:   2%|█▏                                                          | 109/5703 [01:38<57:08,  1.63it/s]Epoch 1/1:   2%|█▏                                                          | 110/5703 [01:38<57:05,  1.63it/s]Epoch 1/1:   2%|█▏                                                          | 111/5703 [01:39<57:03,  1.63it/s]Epoch 1/1:   2%|█▏                                                          | 112/5703 [01:40<57:01,  1.63it/s]Epoch 1/1:   2%|█▏                                                          | 113/5703 [01:40<56:58,  1.64it/s]Epoch 1/1:   2%|█▏                                                          | 114/5703 [01:41<56:56,  1

{'loss': '3.1961', 'task': '3.1860', 'avg_loss': '3.1261438505990164', 'moe': '0.0100', 'lr': '1.043873e-04'}


Epoch 1/1:   2%|█▍                                                          | 141/5703 [01:57<57:11,  1.62it/s]Epoch 1/1:   2%|█▍                                                          | 142/5703 [01:58<57:00,  1.63it/s]Epoch 1/1:   3%|█▌                                                          | 143/5703 [01:59<56:57,  1.63it/s]Epoch 1/1:   3%|█▌                                                          | 144/5703 [01:59<56:56,  1.63it/s]Epoch 1/1:   3%|█▌                                                          | 145/5703 [02:00<57:29,  1.61it/s]Epoch 1/1:   3%|█▌                                                          | 146/5703 [02:01<57:12,  1.62it/s]Epoch 1/1:   3%|█▌                                                          | 147/5703 [02:01<57:00,  1.62it/s]Epoch 1/1:   3%|█▌                                                          | 148/5703 [02:02<56:51,  1.63it/s]Epoch 1/1:   3%|█▌                                                          | 149/5703 [02:02<56:44,  1

{'loss': '3.0456', 'task': '3.0356', 'avg_loss': '3.122650030681065', 'moe': '0.0100', 'lr': '1.043410e-04'}


Epoch 1/1:   3%|█▊                                                          | 176/5703 [02:19<56:26,  1.63it/s]Epoch 1/1:   3%|█▊                                                          | 177/5703 [02:20<56:24,  1.63it/s]Epoch 1/1:   3%|█▊                                                          | 178/5703 [02:20<56:29,  1.63it/s]Epoch 1/1:   3%|█▉                                                          | 179/5703 [02:21<56:23,  1.63it/s]Epoch 1/1:   3%|█▉                                                          | 180/5703 [02:21<56:23,  1.63it/s]Epoch 1/1:   3%|█▉                                                          | 181/5703 [02:22<56:20,  1.63it/s]Epoch 1/1:   3%|█▉                                                          | 182/5703 [02:23<56:18,  1.63it/s]Epoch 1/1:   3%|█▉                                                          | 183/5703 [02:23<56:15,  1.64it/s]Epoch 1/1:   3%|█▉                                                          | 184/5703 [02:24<56:15,  1

{'loss': '3.0557', 'task': '3.0457', 'avg_loss': '3.122121172859555', 'moe': '0.0100', 'lr': '1.042948e-04'}


Epoch 1/1:   4%|██▏                                                         | 211/5703 [02:40<55:50,  1.64it/s]Epoch 1/1:   4%|██▏                                                         | 212/5703 [02:41<55:49,  1.64it/s]Epoch 1/1:   4%|██▏                                                         | 213/5703 [02:42<55:48,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 214/5703 [02:42<55:46,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 215/5703 [02:43<55:45,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 216/5703 [02:43<55:50,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 217/5703 [02:44<55:54,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 218/5703 [02:45<55:50,  1.64it/s]Epoch 1/1:   4%|██▎                                                         | 219/5703 [02:45<55:47,  1

{'loss': '3.1975', 'task': '3.1874', 'avg_loss': '3.12181962363574', 'moe': '0.0100', 'lr': '1.042485e-04'}


Epoch 1/1:   4%|██▌                                                         | 246/5703 [03:02<55:25,  1.64it/s]Epoch 1/1:   4%|██▌                                                         | 247/5703 [03:02<55:24,  1.64it/s]Epoch 1/1:   4%|██▌                                                         | 248/5703 [03:03<55:25,  1.64it/s]Epoch 1/1:   4%|██▌                                                         | 249/5703 [03:04<55:23,  1.64it/s]Epoch 1/1:   4%|██▋                                                         | 250/5703 [03:04<55:23,  1.64it/s]Epoch 1/1:   4%|██▋                                                         | 251/5703 [03:05<55:23,  1.64it/s]Epoch 1/1:   4%|██▋                                                         | 252/5703 [03:05<55:22,  1.64it/s]Epoch 1/1:   4%|██▋                                                         | 253/5703 [03:06<55:21,  1.64it/s]Epoch 1/1:   4%|██▋                                                         | 254/5703 [03:07<55:20,  1

{'loss': '3.0431', 'task': '3.0331', 'avg_loss': '3.1222964312349046', 'moe': '0.0100', 'lr': '1.042022e-04'}


Epoch 1/1:   5%|██▉                                                         | 281/5703 [03:23<55:12,  1.64it/s]Epoch 1/1:   5%|██▉                                                         | 282/5703 [03:24<55:11,  1.64it/s]Epoch 1/1:   5%|██▉                                                         | 283/5703 [03:24<55:11,  1.64it/s]Epoch 1/1:   5%|██▉                                                         | 284/5703 [03:25<55:13,  1.64it/s]Epoch 1/1:   5%|██▉                                                         | 285/5703 [03:26<55:15,  1.63it/s]Epoch 1/1:   5%|███                                                         | 286/5703 [03:26<55:12,  1.64it/s]Epoch 1/1:   5%|███                                                         | 287/5703 [03:27<55:10,  1.64it/s]Epoch 1/1:   5%|███                                                         | 288/5703 [03:27<55:05,  1.64it/s]Epoch 1/1:   5%|███                                                         | 289/5703 [03:28<55:04,  1

{'loss': '3.1966', 'task': '3.1866', 'avg_loss': '3.1207522309015667', 'moe': '0.0100', 'lr': '1.041560e-04'}


Epoch 1/1:   6%|███▎                                                        | 316/5703 [03:44<54:43,  1.64it/s]Epoch 1/1:   6%|███▎                                                        | 317/5703 [03:45<54:42,  1.64it/s]Epoch 1/1:   6%|███▎                                                        | 318/5703 [03:46<54:42,  1.64it/s]Epoch 1/1:   6%|███▎                                                        | 319/5703 [03:46<54:41,  1.64it/s]Epoch 1/1:   6%|███▎                                                        | 320/5703 [03:47<54:41,  1.64it/s]Epoch 1/1:   6%|███▍                                                        | 321/5703 [03:48<54:40,  1.64it/s]Epoch 1/1:   6%|███▍                                                        | 322/5703 [03:48<54:40,  1.64it/s]Epoch 1/1:   6%|███▍                                                        | 323/5703 [03:49<54:40,  1.64it/s]Epoch 1/1:   6%|███▍                                                        | 324/5703 [03:49<54:39,  1

{'loss': '3.0651', 'task': '3.0550', 'avg_loss': '3.1204557732173375', 'moe': '0.0100', 'lr': '1.041097e-04'}


Epoch 1/1:   6%| | 351/5703 [04:06<54:31,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 352/5703 [04:06<54:38,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 353/5703 [04:07<54:50,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 354/5703 [04:08<54:27,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 355/5703 [04:08<54:24,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 356/5703 [04:09<54:21,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 357/5703 [04:10<54:19,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 358/5703 [04:10<54:18,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   6%| | 359/5703 [04:11<54:16,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1282', 'task': '3.1182', 'avg_loss': '3.1214656012398856', 'moe': '0.0100', 'lr': '1.040635e-04'}


Epoch 1/1:   7%| | 386/5703 [04:27<54:06,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 387/5703 [04:28<54:06,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 388/5703 [04:28<54:05,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 389/5703 [04:29<54:04,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 390/5703 [04:30<54:03,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 391/5703 [04:30<54:06,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 392/5703 [04:31<54:04,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 393/5703 [04:32<54:05,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 394/5703 [04:32<54:02,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1163', 'task': '3.1063', 'avg_loss': '3.1227561746324812', 'moe': '0.0100', 'lr': '1.040172e-04'}


Epoch 1/1:   7%| | 421/5703 [04:49<53:55,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 422/5703 [04:49<53:54,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 423/5703 [04:50<53:54,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 424/5703 [04:50<53:53,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 425/5703 [04:51<53:51,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 426/5703 [04:52<53:51,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   7%| | 427/5703 [04:52<53:46,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 428/5703 [04:53<53:45,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 429/5703 [04:54<53:44,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1147', 'task': '3.1047', 'avg_loss': '3.1230929442814417', 'moe': '0.0100', 'lr': '1.039710e-04'}


Epoch 1/1:   8%| | 456/5703 [05:10<53:20,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 457/5703 [05:11<53:18,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 458/5703 [05:11<53:18,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 459/5703 [05:12<53:16,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 460/5703 [05:12<53:17,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 461/5703 [05:13<53:15,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 462/5703 [05:14<53:12,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 463/5703 [05:14<53:12,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   8%| | 464/5703 [05:15<53:12,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.0601', 'task': '3.0500', 'avg_loss': '3.1230929953711373', 'moe': '0.0100', 'lr': '1.039248e-04'}


Epoch 1/1:   9%| | 491/5703 [05:31<53:07,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 492/5703 [05:32<53:07,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 493/5703 [05:33<53:07,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 494/5703 [05:33<53:07,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 495/5703 [05:34<53:06,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 496/5703 [05:34<53:02,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 497/5703 [05:35<52:59,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 498/5703 [05:36<52:59,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 499/5703 [05:36<52:59,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1912', 'task': '3.1812', 'avg_loss': '3.122466100511097', 'moe': '0.0100', 'lr': '1.038785e-04'}


Epoch 1/1:   9%| | 526/5703 [05:53<52:44,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 527/5703 [05:53<52:43,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 528/5703 [05:54<52:43,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 529/5703 [05:55<52:42,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 530/5703 [05:55<52:41,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 531/5703 [05:56<52:40,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 532/5703 [05:56<52:40,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 533/5703 [05:57<52:39,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:   9%| | 534/5703 [05:58<52:39,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.0778', 'task': '3.0678', 'avg_loss': '3.1229012825659344', 'moe': '0.0100', 'lr': '1.038323e-04'}


Epoch 1/1:  10%| | 561/5703 [06:14<52:14,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 562/5703 [06:15<52:13,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 563/5703 [06:15<52:16,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 564/5703 [06:16<52:17,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 565/5703 [06:17<52:17,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 566/5703 [06:17<52:18,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 567/5703 [06:18<52:18,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 568/5703 [06:18<52:20,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 569/5703 [06:19<52:22,  1.63it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1018', 'task': '3.0917', 'avg_loss': '3.1223856320902077', 'moe': '0.0100', 'lr': '1.037861e-04'}


Epoch 1/1:  10%| | 596/5703 [06:36<51:54,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 597/5703 [06:36<51:53,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  10%| | 598/5703 [06:37<51:50,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 599/5703 [06:37<51:49,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 600/5703 [06:38<51:53,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 601/5703 [06:39<51:50,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 602/5703 [06:39<51:48,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 603/5703 [06:40<51:46,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 604/5703 [06:40<51:44,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1453', 'task': '3.1352', 'avg_loss': '3.1224683731321305', 'moe': '0.0100', 'lr': '1.037399e-04'}


Epoch 1/1:  11%| | 631/5703 [06:57<51:29,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 632/5703 [06:58<51:28,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 633/5703 [06:58<51:28,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 634/5703 [06:59<51:28,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 635/5703 [06:59<51:27,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 636/5703 [07:00<51:27,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 637/5703 [07:01<51:26,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 638/5703 [07:01<51:25,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  11%| | 639/5703 [07:02<51:25,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1221', 'task': '3.1121', 'avg_loss': '3.123096996321714', 'moe': '0.0100', 'lr': '1.036937e-04'}


Epoch 1/1:  12%| | 666/5703 [07:18<51:06,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 667/5703 [07:19<51:09,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 668/5703 [07:19<51:10,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 669/5703 [07:20<51:10,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 670/5703 [07:21<51:07,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 671/5703 [07:21<51:06,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 672/5703 [07:22<51:04,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 673/5703 [07:23<51:04,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0550, moeEpoch 1/1:  12%| | 674/5703 [07:23<51:04,  1.64it/s, loss=3.0651, avg_loss=3.1204557732173375, task=3.0

{'loss': '3.1373', 'task': '3.1273', 'avg_loss': '3.123326061453138', 'moe': '0.0100', 'lr': '1.036475e-04'}


Epoch 1/1:  12%| | 701/5703 [07:40<50:51,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 702/5703 [07:40<50:50,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 703/5703 [07:41<50:50,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 704/5703 [07:41<50:50,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 705/5703 [07:42<50:49,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 706/5703 [07:43<50:48,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 707/5703 [07:43<50:45,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 708/5703 [07:44<50:45,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  12%| | 709/5703 [07:44<50:44,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.0042', 'task': '2.9942', 'avg_loss': '3.1234573711343363', 'moe': '0.0100', 'lr': '1.036012e-04'}


Epoch 1/1:  13%|▏| 736/5703 [08:01<50:30,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 737/5703 [08:02<50:31,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 738/5703 [08:02<50:30,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 739/5703 [08:03<50:29,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 740/5703 [08:03<50:28,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 741/5703 [08:04<50:25,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 742/5703 [08:05<50:24,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 743/5703 [08:05<50:21,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  13%|▏| 744/5703 [08:06<50:21,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.0519', 'task': '3.0419', 'avg_loss': '3.122163467283373', 'moe': '0.0100', 'lr': '1.035551e-04'}


Epoch 1/1:  14%|▏| 771/5703 [08:22<50:03,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 772/5703 [08:23<50:03,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 773/5703 [08:24<50:02,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 774/5703 [08:24<50:00,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 775/5703 [08:25<50:01,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 776/5703 [08:25<50:03,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 777/5703 [08:26<50:04,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 778/5703 [08:27<50:06,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 779/5703 [08:27<50:05,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.1799', 'task': '3.1699', 'avg_loss': '3.122678870443972', 'moe': '0.0100', 'lr': '1.035089e-04'}


Epoch 1/1:  14%|▏| 806/5703 [08:44<49:59,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 807/5703 [08:44<49:55,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 808/5703 [08:45<49:53,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 809/5703 [08:46<49:51,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 810/5703 [08:46<49:51,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 811/5703 [08:47<49:50,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 812/5703 [08:47<49:51,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 813/5703 [08:48<49:48,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  14%|▏| 814/5703 [08:49<49:46,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.0831', 'task': '3.0731', 'avg_loss': '3.1224947969118753', 'moe': '0.0100', 'lr': '1.034627e-04'}


Epoch 1/1:  15%|▏| 841/5703 [09:05<50:47,  1.60it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 842/5703 [09:06<50:20,  1.61it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 843/5703 [09:06<50:07,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 844/5703 [09:07<49:56,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 845/5703 [09:08<50:00,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 846/5703 [09:08<49:56,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 847/5703 [09:09<49:51,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 848/5703 [09:10<49:47,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 849/5703 [09:10<49:47,  1.62it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.0968', 'task': '3.0868', 'avg_loss': '3.121256261280605', 'moe': '0.0100', 'lr': '1.034165e-04'}


Epoch 1/1:  15%|▏| 876/5703 [09:27<49:11,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 877/5703 [09:27<49:10,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 878/5703 [09:28<49:08,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 879/5703 [09:29<49:06,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 880/5703 [09:29<49:04,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 881/5703 [09:30<49:03,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 882/5703 [09:30<49:02,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  15%|▏| 883/5703 [09:31<49:01,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 884/5703 [09:32<49:00,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.1812', 'task': '3.1712', 'avg_loss': '3.121350604885227', 'moe': '0.0100', 'lr': '1.033703e-04'}


Epoch 1/1:  16%|▏| 911/5703 [09:48<48:48,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 912/5703 [09:49<48:46,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 913/5703 [09:49<48:47,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 914/5703 [09:50<48:50,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 915/5703 [09:51<48:52,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 916/5703 [09:51<48:49,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 917/5703 [09:52<48:46,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 918/5703 [09:52<48:47,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  16%|▏| 919/5703 [09:53<48:55,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.1873', 'task': '3.1773', 'avg_loss': '3.1208599685991882', 'moe': '0.0100', 'lr': '1.033241e-04'}


Epoch 1/1:  17%|▏| 946/5703 [10:10<48:22,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 947/5703 [10:10<48:22,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 948/5703 [10:11<48:20,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 949/5703 [10:11<48:18,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 950/5703 [10:12<48:16,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 951/5703 [10:13<48:15,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 952/5703 [10:13<48:15,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 953/5703 [10:14<48:14,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 954/5703 [10:14<48:13,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.1859', 'task': '3.1759', 'avg_loss': '3.120624559509511', 'moe': '0.0100', 'lr': '1.032780e-04'}


Epoch 1/1:  17%|▏| 981/5703 [10:31<48:05,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 982/5703 [10:32<48:05,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 983/5703 [10:32<48:05,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 984/5703 [10:33<48:06,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 985/5703 [10:33<48:05,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 986/5703 [10:34<48:04,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 987/5703 [10:35<48:02,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 988/5703 [10:35<47:59,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moe=Epoch 1/1:  17%|▏| 989/5703 [10:36<48:02,  1.64it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.12

{'loss': '3.0734', 'task': '3.0634', 'avg_loss': '3.1202583531440773', 'moe': '0.0100', 'lr': '1.032318e-04'}


Epoch 1/1:  18%|▏| 1016/5703 [10:52<47:57,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1017/5703 [10:53<47:55,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1018/5703 [10:54<47:51,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1019/5703 [10:54<47:49,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1020/5703 [10:55<47:48,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1021/5703 [10:55<47:47,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1022/5703 [10:56<47:47,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1023/5703 [10:57<47:47,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1273, moeEpoch 1/1:  18%|▏| 1024/5703 [10:57<47:46,  1.63it/s, loss=3.1373, avg_loss=3.123326061453138, task=3.1

{'loss': '2.9988', 'task': '2.9888', 'avg_loss': '3.119605347769601', 'moe': '0.0100', 'lr': '1.031857e-04'}


Epoch 1/1:  18%|▏| 1051/5703 [11:14<47:17,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  18%|▏| 1052/5703 [11:14<47:16,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  18%|▏| 1053/5703 [11:15<47:15,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  18%|▏| 1054/5703 [11:16<47:16,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  18%|▏| 1055/5703 [11:16<47:16,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1056/5703 [11:17<47:15,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1057/5703 [11:17<47:16,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1058/5703 [11:18<47:13,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1059/5703 [11:19<47:13,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.0135', 'task': '3.0035', 'avg_loss': '3.119551332095801', 'moe': '0.0100', 'lr': '1.031395e-04'}


Epoch 1/1:  19%|▏| 1086/5703 [11:35<46:55,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1087/5703 [11:36<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1088/5703 [11:36<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1089/5703 [11:37<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1090/5703 [11:38<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1091/5703 [11:38<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1092/5703 [11:39<46:59,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1093/5703 [11:39<46:56,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  19%|▏| 1094/5703 [11:40<46:54,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.1973', 'task': '3.1873', 'avg_loss': '3.1191391563841275', 'moe': '0.0100', 'lr': '1.030934e-04'}


Epoch 1/1:  20%|▏| 1121/5703 [11:57<46:36,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1122/5703 [11:57<46:34,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1123/5703 [11:58<46:31,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1124/5703 [11:58<46:29,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1125/5703 [11:59<46:28,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1126/5703 [12:00<46:28,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1127/5703 [12:00<46:32,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1128/5703 [12:01<46:32,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1129/5703 [12:01<46:30,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.0525', 'task': '3.0425', 'avg_loss': '3.1192610598229744', 'moe': '0.0100', 'lr': '1.030472e-04'}


Epoch 1/1:  20%|▏| 1156/5703 [12:18<47:14,  1.60it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1157/5703 [12:19<47:02,  1.61it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1158/5703 [12:19<46:47,  1.62it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1159/5703 [12:20<46:35,  1.63it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1160/5703 [12:20<46:27,  1.63it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1161/5703 [12:21<46:20,  1.63it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1162/5703 [12:22<46:15,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1163/5703 [12:22<46:13,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  20%|▏| 1164/5703 [12:23<46:11,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.2344', 'task': '3.2243', 'avg_loss': '3.1190850043497167', 'moe': '0.0100', 'lr': '1.030011e-04'}


Epoch 1/1:  21%|▏| 1191/5703 [12:39<45:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1192/5703 [12:40<45:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1193/5703 [12:41<45:50,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1194/5703 [12:41<45:48,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1195/5703 [12:42<45:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1196/5703 [12:42<45:48,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1197/5703 [12:43<45:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1198/5703 [12:44<45:46,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  21%|▏| 1199/5703 [12:44<45:46,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.1156', 'task': '3.1055', 'avg_loss': '3.118221119082704', 'moe': '0.0100', 'lr': '1.029549e-04'}


Epoch 1/1:  21%|▏| 1226/5703 [13:01<45:34,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1227/5703 [13:01<45:31,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1228/5703 [13:02<45:30,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1229/5703 [13:03<45:28,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1230/5703 [13:03<45:26,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1231/5703 [13:04<45:25,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1232/5703 [13:04<45:24,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1233/5703 [13:05<45:24,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1234/5703 [13:06<45:23,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.1275', 'task': '3.1175', 'avg_loss': '3.1185267474916247', 'moe': '0.0100', 'lr': '1.029088e-04'}


Epoch 1/1:  22%|▏| 1261/5703 [13:22<45:07,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1262/5703 [13:23<45:08,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1263/5703 [13:23<45:08,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1264/5703 [13:24<45:08,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1265/5703 [13:24<45:06,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1266/5703 [13:25<45:05,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1267/5703 [13:26<45:20,  1.63it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1268/5703 [13:26<45:15,  1.63it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  22%|▏| 1269/5703 [13:27<45:10,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.2555', 'task': '3.2454', 'avg_loss': '3.117737272439316', 'moe': '0.0100', 'lr': '1.028627e-04'}


Epoch 1/1:  23%|▏| 1296/5703 [13:43<44:50,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1297/5703 [13:44<44:48,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1298/5703 [13:45<44:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1299/5703 [13:45<44:46,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1300/5703 [13:46<44:45,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1301/5703 [13:46<44:44,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1302/5703 [13:47<44:47,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1303/5703 [13:48<44:46,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1304/5703 [13:48<44:45,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.2373', 'task': '3.2273', 'avg_loss': '3.11762357044937', 'moe': '0.0100', 'lr': '1.028166e-04'}


Epoch 1/1:  23%|▏| 1331/5703 [14:05<44:25,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1332/5703 [14:05<44:25,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1333/5703 [14:06<44:23,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1334/5703 [14:07<44:22,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1335/5703 [14:07<44:21,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1336/5703 [14:08<44:20,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1337/5703 [14:08<44:20,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1338/5703 [14:09<44:20,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  23%|▏| 1339/5703 [14:10<44:19,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.0892', 'task': '3.0791', 'avg_loss': '3.117330317794185', 'moe': '0.0100', 'lr': '1.027705e-04'}


Epoch 1/1:  24%|▏| 1366/5703 [14:26<44:02,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1367/5703 [14:27<44:03,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1368/5703 [14:27<44:02,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1369/5703 [14:28<44:01,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1370/5703 [14:29<44:00,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1371/5703 [14:29<44:00,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1372/5703 [14:30<43:59,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1373/5703 [14:30<43:58,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9888, moeEpoch 1/1:  24%|▏| 1374/5703 [14:31<43:58,  1.64it/s, loss=2.9988, avg_loss=3.119605347769601, task=2.9

{'loss': '3.2981', 'task': '3.2880', 'avg_loss': '3.117859604528972', 'moe': '0.0100', 'lr': '1.027244e-04'}


Epoch 1/1:  25%|▏| 1401/5703 [14:48<43:41,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1402/5703 [14:48<43:40,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1403/5703 [14:49<43:38,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1404/5703 [14:49<43:38,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1405/5703 [14:50<43:37,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1406/5703 [14:51<43:36,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1407/5703 [14:51<43:35,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1408/5703 [14:52<43:35,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▏| 1409/5703 [14:52<43:39,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.1554', 'task': '3.1453', 'avg_loss': '3.117548296094356', 'moe': '0.0100', 'lr': '1.026783e-04'}


Epoch 1/1:  25%|▎| 1436/5703 [15:09<43:19,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1437/5703 [15:09<43:18,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1438/5703 [15:10<43:18,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1439/5703 [15:11<43:20,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1440/5703 [15:11<43:19,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1441/5703 [15:12<43:17,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1442/5703 [15:13<43:21,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1443/5703 [15:13<43:19,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  25%|▎| 1444/5703 [15:14<43:18,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.3235', 'task': '3.3134', 'avg_loss': '3.1174005922006103', 'moe': '0.0100', 'lr': '1.026322e-04'}


Epoch 1/1:  26%|▎| 1471/5703 [15:30<43:00,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1472/5703 [15:31<43:04,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1473/5703 [15:31<43:01,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1474/5703 [15:32<42:59,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1475/5703 [15:33<42:58,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1476/5703 [15:33<42:57,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1477/5703 [15:34<43:00,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1478/5703 [15:34<43:01,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1479/5703 [15:35<43:00,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.0492', 'task': '3.0392', 'avg_loss': '3.1172699557586365', 'moe': '0.0100', 'lr': '1.025861e-04'}


Epoch 1/1:  26%|▎| 1506/5703 [15:52<42:45,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1507/5703 [15:52<42:44,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1508/5703 [15:53<42:42,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1509/5703 [15:53<42:52,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1510/5703 [15:54<42:52,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  26%|▎| 1511/5703 [15:55<42:47,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1512/5703 [15:55<42:48,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1513/5703 [15:56<42:44,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1514/5703 [15:57<42:40,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.1532', 'task': '3.1432', 'avg_loss': '3.116931574530416', 'moe': '0.0100', 'lr': '1.025400e-04'}


Epoch 1/1:  27%|▎| 1541/5703 [16:13<42:27,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1542/5703 [16:14<42:23,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1543/5703 [16:14<42:22,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1544/5703 [16:15<42:22,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1545/5703 [16:15<42:22,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1546/5703 [16:16<42:23,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1547/5703 [16:17<42:26,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1548/5703 [16:17<42:24,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  27%|▎| 1549/5703 [16:18<42:23,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.1267', 'task': '3.1167', 'avg_loss': '3.116954976187812', 'moe': '0.0100', 'lr': '1.024939e-04'}


Epoch 1/1:  28%|▎| 1576/5703 [16:34<41:54,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1577/5703 [16:35<41:54,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1578/5703 [16:36<41:53,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1579/5703 [16:36<41:53,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1580/5703 [16:37<41:52,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1581/5703 [16:37<41:51,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1582/5703 [16:38<41:50,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1583/5703 [16:39<41:50,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1584/5703 [16:39<41:49,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.2131', 'task': '3.2031', 'avg_loss': '3.1172908691145618', 'moe': '0.0100', 'lr': '1.024478e-04'}


Epoch 1/1:  28%|▎| 1611/5703 [16:56<41:30,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1612/5703 [16:56<41:30,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1613/5703 [16:57<41:30,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1614/5703 [16:58<41:32,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1615/5703 [16:58<41:33,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1616/5703 [16:59<41:32,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1617/5703 [16:59<41:31,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1618/5703 [17:00<41:29,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  28%|▎| 1619/5703 [17:01<41:28,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.0356', 'task': '3.0256', 'avg_loss': '3.11697484178746', 'moe': '0.0100', 'lr': '1.024017e-04'}


Epoch 1/1:  29%|▎| 1646/5703 [17:17<41:09,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1647/5703 [17:18<41:08,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1648/5703 [17:19<41:07,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1649/5703 [17:19<41:06,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1650/5703 [17:20<41:05,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1651/5703 [17:21<41:04,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1652/5703 [17:21<41:05,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1653/5703 [17:22<41:04,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1654/5703 [17:22<41:03,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.1476', 'task': '3.1376', 'avg_loss': '3.116828746596972', 'moe': '0.0100', 'lr': '1.023557e-04'}


Epoch 1/1:  29%|▎| 1681/5703 [17:39<40:50,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  29%|▎| 1682/5703 [17:39<40:48,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1683/5703 [17:40<40:48,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1684/5703 [17:41<40:47,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1685/5703 [17:41<40:47,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1686/5703 [17:42<40:45,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1687/5703 [17:42<40:44,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1688/5703 [17:43<40:43,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1689/5703 [17:44<40:42,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.1430', 'task': '3.1330', 'avg_loss': '3.1168056618715516', 'moe': '0.0100', 'lr': '1.023096e-04'}


Epoch 1/1:  30%|▎| 1716/5703 [18:00<40:25,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1717/5703 [18:01<40:28,  1.64it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1718/5703 [18:01<40:38,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1719/5703 [18:02<40:53,  1.62it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1720/5703 [18:03<40:49,  1.63it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1721/5703 [18:03<40:53,  1.62it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1722/5703 [18:04<42:38,  1.56it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1723/5703 [18:05<42:18,  1.57it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2880, moeEpoch 1/1:  30%|▎| 1724/5703 [18:05<42:25,  1.56it/s, loss=3.2981, avg_loss=3.117859604528972, task=3.2

{'loss': '3.2234', 'task': '3.2134', 'avg_loss': '3.1167786763054983', 'moe': '0.0100', 'lr': '1.022636e-04'}

✅ Saved checkpoint to satori_akasha_checkpoints/step_lightning_6150.pt


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._checkpoints/step_lightning_6150.pt:   0%|          |  552kB / 3.27GB            

Epoch 1/1:  31%|▎| 1750/5703 [19:26<21:58:18, 20.01s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134,Epoch 1/1:  31%|▎| 1751/5703 [19:26<15:34:11, 14.18s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134,Epoch 1/1:  31%|▎| 1752/5703 [19:27<11:05:52, 10.11s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134,Epoch 1/1:  31%|▎| 1753/5703 [19:28<7:58:06,  7.26s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, Epoch 1/1:  31%|▎| 1754/5703 [19:28<5:46:40,  5.27s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, Epoch 1/1:  31%|▎| 1755/5703 [19:29<4:14:42,  3.87s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, Epoch 1/1:  31%|▎| 1756/5703 [19:29<3:10:21,  2.89s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, Epoch 1/1:  31%|▎| 1757/5703 [19:30<2:25:16,  2.21s/it, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, Epoch 1/1:  31%|▎| 1758/5703 [19:31<1:53:41,  1.73s/it, loss=3.2234, avg_loss=3.1167786763054983, task=

{'loss': '3.0562', 'task': '3.0462', 'avg_loss': '3.1164598131046244', 'moe': '0.0100', 'lr': '1.022175e-04'}


Epoch 1/1:  31%|▎| 1786/5703 [19:48<39:45,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1787/5703 [19:48<39:45,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1788/5703 [19:49<39:44,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1789/5703 [19:50<39:45,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1790/5703 [19:50<39:44,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1791/5703 [19:51<39:43,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1792/5703 [19:51<39:43,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1793/5703 [19:52<39:42,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  31%|▎| 1794/5703 [19:53<39:41,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.0749', 'task': '3.0649', 'avg_loss': '3.116265709059579', 'moe': '0.0100', 'lr': '1.021715e-04'}


Epoch 1/1:  32%|▎| 1821/5703 [20:09<39:28,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1822/5703 [20:10<39:27,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1823/5703 [20:10<39:25,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1824/5703 [20:11<39:25,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1825/5703 [20:12<39:24,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1826/5703 [20:12<39:23,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1827/5703 [20:13<39:22,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1828/5703 [20:13<39:21,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  32%|▎| 1829/5703 [20:14<39:21,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.1979', 'task': '3.1879', 'avg_loss': '3.1163409413031813', 'moe': '0.0100', 'lr': '1.021254e-04'}


Epoch 1/1:  33%|▎| 1856/5703 [20:30<39:06,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1857/5703 [20:31<39:05,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1858/5703 [20:32<39:05,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1859/5703 [20:32<39:04,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1860/5703 [20:33<39:03,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1861/5703 [20:34<39:03,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1862/5703 [20:34<39:03,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1863/5703 [20:35<39:03,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1864/5703 [20:35<39:02,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.1485', 'task': '3.1384', 'avg_loss': '3.1157133064572773', 'moe': '0.0100', 'lr': '1.020794e-04'}


Epoch 1/1:  33%|▎| 1891/5703 [20:52<38:52,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1892/5703 [20:52<38:51,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1893/5703 [20:53<38:50,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1894/5703 [20:54<38:50,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1895/5703 [20:54<38:50,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1896/5703 [20:55<38:49,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1897/5703 [20:56<38:48,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1898/5703 [20:56<38:48,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  33%|▎| 1899/5703 [20:57<38:47,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.0484', 'task': '3.0384', 'avg_loss': '3.1157114561502035', 'moe': '0.0100', 'lr': '1.020333e-04'}


Epoch 1/1:  34%|▎| 1926/5703 [21:13<38:33,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1927/5703 [21:14<38:32,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1928/5703 [21:15<38:31,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1929/5703 [21:15<38:30,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1930/5703 [21:16<38:29,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1931/5703 [21:16<38:30,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1932/5703 [21:17<38:28,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1933/5703 [21:18<38:27,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1934/5703 [21:18<38:26,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.0326', 'task': '3.0226', 'avg_loss': '3.115516655177486', 'moe': '0.0100', 'lr': '1.019873e-04'}


Epoch 1/1:  34%|▎| 1961/5703 [21:35<38:08,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1962/5703 [21:35<38:07,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1963/5703 [21:36<38:06,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1964/5703 [21:37<38:09,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1965/5703 [21:37<38:06,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1966/5703 [21:38<38:04,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  34%|▎| 1967/5703 [21:38<38:02,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 1968/5703 [21:39<38:01,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 1969/5703 [21:40<38:00,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.2969', 'task': '3.2869', 'avg_loss': '3.1155388649244955', 'moe': '0.0100', 'lr': '1.019413e-04'}


Epoch 1/1:  35%|▎| 1996/5703 [21:56<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 1997/5703 [21:57<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 1998/5703 [21:57<37:38,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 1999/5703 [21:58<37:40,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 2000/5703 [21:59<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 2001/5703 [21:59<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 2002/5703 [22:00<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 2003/5703 [22:00<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  35%|▎| 2004/5703 [22:01<37:39,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.0736', 'task': '3.0636', 'avg_loss': '3.115016354480988', 'moe': '0.0100', 'lr': '1.018953e-04'}


Epoch 1/1:  36%|▎| 2031/5703 [22:17<37:27,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2032/5703 [22:18<37:28,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2033/5703 [22:19<37:25,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2034/5703 [22:19<37:24,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2035/5703 [22:20<37:23,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2036/5703 [22:21<37:24,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2037/5703 [22:21<37:23,  1.63it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2038/5703 [22:22<37:21,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2039/5703 [22:22<37:19,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '2.9844', 'task': '2.9743', 'avg_loss': '3.1145321590848467', 'moe': '0.0100', 'lr': '1.018493e-04'}


Epoch 1/1:  36%|▎| 2066/5703 [22:39<36:59,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2067/5703 [22:39<36:57,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2068/5703 [22:40<36:56,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2069/5703 [22:41<36:55,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2070/5703 [22:41<36:54,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2071/5703 [22:42<36:54,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2072/5703 [22:43<36:52,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2073/5703 [22:43<36:54,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.2134, moEpoch 1/1:  36%|▎| 2074/5703 [22:44<36:53,  1.64it/s, loss=3.2234, avg_loss=3.1167786763054983, task=3.

{'loss': '3.1810', 'task': '3.1710', 'avg_loss': '3.1142201664334253', 'moe': '0.0100', 'lr': '1.018032e-04'}


Epoch 1/1:  37%|▎| 2101/5703 [23:00<36:38,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2102/5703 [23:01<36:36,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2103/5703 [23:01<36:38,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2104/5703 [23:02<36:37,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2105/5703 [23:03<36:35,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2106/5703 [23:03<36:34,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2107/5703 [23:04<36:34,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2108/5703 [23:05<36:33,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2109/5703 [23:05<36:32,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0806', 'task': '3.0706', 'avg_loss': '3.1141329783187257', 'moe': '0.0100', 'lr': '1.017572e-04'}


Epoch 1/1:  37%|▎| 2136/5703 [23:22<36:21,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2137/5703 [23:22<36:20,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  37%|▎| 2138/5703 [23:23<36:18,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2139/5703 [23:24<36:18,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2140/5703 [23:24<36:22,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2141/5703 [23:25<36:17,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2142/5703 [23:25<36:15,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2143/5703 [23:26<36:14,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2144/5703 [23:27<36:13,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.1286', 'task': '3.1185', 'avg_loss': '3.1139134082926034', 'moe': '0.0100', 'lr': '1.017112e-04'}


Epoch 1/1:  38%|▍| 2171/5703 [23:43<35:57,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2172/5703 [23:44<35:55,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2173/5703 [23:44<35:56,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2174/5703 [23:45<35:54,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2175/5703 [23:46<35:53,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2176/5703 [23:46<35:53,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2177/5703 [23:47<35:52,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2178/5703 [23:47<35:52,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  38%|▍| 2179/5703 [23:48<35:51,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0732', 'task': '3.0632', 'avg_loss': '3.1134613737768055', 'moe': '0.0100', 'lr': '1.016653e-04'}


Epoch 1/1:  39%|▍| 2206/5703 [24:04<35:37,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2207/5703 [24:05<35:35,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2208/5703 [24:06<35:34,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2209/5703 [24:06<35:33,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2210/5703 [24:07<35:33,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2211/5703 [24:08<35:32,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2212/5703 [24:08<35:31,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2213/5703 [24:09<35:30,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2214/5703 [24:09<35:29,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0831', 'task': '3.0731', 'avg_loss': '3.1130497581192427', 'moe': '0.0100', 'lr': '1.016193e-04'}


Epoch 1/1:  39%|▍| 2241/5703 [24:26<35:13,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2242/5703 [24:26<35:14,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2243/5703 [24:27<35:13,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2244/5703 [24:28<35:14,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2245/5703 [24:28<35:13,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2246/5703 [24:29<35:11,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2247/5703 [24:30<35:12,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2248/5703 [24:30<35:11,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  39%|▍| 2249/5703 [24:31<35:12,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0199', 'task': '3.0099', 'avg_loss': '3.1129367670122083', 'moe': '0.0100', 'lr': '1.015733e-04'}


Epoch 1/1:  40%|▍| 2276/5703 [24:47<35:00,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2277/5703 [24:48<34:56,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2278/5703 [24:49<34:54,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2279/5703 [24:49<34:53,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2280/5703 [24:50<34:52,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2281/5703 [24:50<34:53,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2282/5703 [24:51<34:52,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2283/5703 [24:52<34:52,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  40%|▍| 2284/5703 [24:52<34:50,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0738', 'task': '3.0638', 'avg_loss': '3.113039263089498', 'moe': '0.0100', 'lr': '1.015273e-04'}


Epoch 1/1:  41%|▍| 2311/5703 [25:09<34:34,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2312/5703 [25:09<34:33,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2313/5703 [25:10<34:31,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2314/5703 [25:11<34:30,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2315/5703 [25:11<34:30,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2316/5703 [25:12<34:30,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2317/5703 [25:12<34:29,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2318/5703 [25:13<34:29,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2319/5703 [25:14<34:31,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.1522', 'task': '3.1421', 'avg_loss': '3.113213604803024', 'moe': '0.0100', 'lr': '1.014813e-04'}


Epoch 1/1:  41%|▍| 2346/5703 [25:30<34:13,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2347/5703 [25:31<34:11,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2348/5703 [25:31<34:10,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2349/5703 [25:32<34:08,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2350/5703 [25:33<34:07,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2351/5703 [25:33<34:12,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2352/5703 [25:34<34:09,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2353/5703 [25:34<34:13,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  41%|▍| 2354/5703 [25:35<34:09,  1.63it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0940', 'task': '3.0840', 'avg_loss': '3.11324527854679', 'moe': '0.0100', 'lr': '1.014354e-04'}


Epoch 1/1:  42%|▍| 2381/5703 [25:52<33:49,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2382/5703 [25:52<33:47,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2383/5703 [25:53<33:46,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2384/5703 [25:53<33:45,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2385/5703 [25:54<33:44,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2386/5703 [25:55<33:46,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2387/5703 [25:55<33:47,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2388/5703 [25:56<33:45,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2389/5703 [25:56<33:44,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.0349', 'task': '3.0249', 'avg_loss': '3.1133397778615697', 'moe': '0.0100', 'lr': '1.013894e-04'}


Epoch 1/1:  42%|▍| 2416/5703 [26:13<33:26,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2417/5703 [26:14<33:28,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2418/5703 [26:14<33:26,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2419/5703 [26:15<33:25,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2420/5703 [26:15<33:24,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2421/5703 [26:16<33:26,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2422/5703 [26:17<33:25,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  42%|▍| 2423/5703 [26:17<33:24,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.1710, moEpoch 1/1:  43%|▍| 2424/5703 [26:18<33:22,  1.64it/s, loss=3.1810, avg_loss=3.1142201664334253, task=3.

{'loss': '3.1515', 'task': '3.1415', 'avg_loss': '3.1133532248711098', 'moe': '0.0100', 'lr': '1.013434e-04'}


Epoch 1/1:  43%|▍| 2451/5703 [26:34<33:06,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2452/5703 [26:35<33:07,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2453/5703 [26:36<33:05,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2454/5703 [26:36<33:03,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2455/5703 [26:37<33:02,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2456/5703 [26:37<33:01,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2457/5703 [26:38<33:00,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2458/5703 [26:39<32:59,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  43%|▍| 2459/5703 [26:39<32:59,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1000', 'task': '3.0899', 'avg_loss': '3.1134992004640147', 'moe': '0.0100', 'lr': '1.012975e-04'}


Epoch 1/1:  44%|▍| 2486/5703 [26:56<32:45,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2487/5703 [26:56<32:45,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2488/5703 [26:57<32:45,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2489/5703 [26:58<32:44,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2490/5703 [26:58<32:49,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2491/5703 [26:59<32:45,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2492/5703 [26:59<32:43,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2493/5703 [27:00<32:40,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2494/5703 [27:01<32:38,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1821', 'task': '3.1720', 'avg_loss': '3.1136806295031594', 'moe': '0.0100', 'lr': '1.012516e-04'}


Epoch 1/1:  44%|▍| 2521/5703 [27:17<32:22,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2522/5703 [27:18<32:21,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2523/5703 [27:18<32:21,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2524/5703 [27:19<32:20,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2525/5703 [27:20<32:19,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2526/5703 [27:20<32:18,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2527/5703 [27:21<32:18,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2528/5703 [27:21<32:17,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  44%|▍| 2529/5703 [27:22<32:17,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.2191', 'task': '3.2091', 'avg_loss': '3.1135005759632985', 'moe': '0.0100', 'lr': '1.012056e-04'}


Epoch 1/1:  45%|▍| 2556/5703 [27:39<32:07,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2557/5703 [27:39<32:05,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2558/5703 [27:40<32:02,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2559/5703 [27:40<32:00,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2560/5703 [27:41<31:59,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2561/5703 [27:42<31:58,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2562/5703 [27:42<31:57,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2563/5703 [27:43<31:56,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2564/5703 [27:43<31:56,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1751', 'task': '3.1651', 'avg_loss': '3.1134923264787004', 'moe': '0.0100', 'lr': '1.011597e-04'}


Epoch 1/1:  45%|▍| 2591/5703 [28:00<31:45,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2592/5703 [28:01<31:45,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2593/5703 [28:01<31:47,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  45%|▍| 2594/5703 [28:02<31:44,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2595/5703 [28:02<31:42,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2596/5703 [28:03<31:42,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2597/5703 [28:04<31:39,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2598/5703 [28:04<31:39,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2599/5703 [28:05<31:39,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1223', 'task': '3.1123', 'avg_loss': '3.1135781784965877', 'moe': '0.0100', 'lr': '1.011137e-04'}


Epoch 1/1:  46%|▍| 2626/5703 [28:21<31:22,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2627/5703 [28:22<31:23,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2628/5703 [28:23<31:27,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2629/5703 [28:23<31:24,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2630/5703 [28:24<31:20,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2631/5703 [28:24<31:19,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2632/5703 [28:25<31:16,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2633/5703 [28:26<31:15,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  46%|▍| 2634/5703 [28:26<31:14,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1827', 'task': '3.1727', 'avg_loss': '3.113636548895585', 'moe': '0.0100', 'lr': '1.010678e-04'}


Epoch 1/1:  47%|▍| 2661/5703 [28:43<30:58,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2662/5703 [28:43<30:56,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2663/5703 [28:44<30:56,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2664/5703 [28:45<30:57,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2665/5703 [28:45<30:56,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2666/5703 [28:46<30:57,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2667/5703 [28:46<30:57,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2668/5703 [28:47<30:55,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2669/5703 [28:48<30:53,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.0288', 'task': '3.0188', 'avg_loss': '3.1134266645435056', 'moe': '0.0100', 'lr': '1.010219e-04'}


Epoch 1/1:  47%|▍| 2696/5703 [29:04<30:45,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2697/5703 [29:05<30:46,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2698/5703 [29:05<30:44,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2699/5703 [29:06<30:48,  1.62it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2700/5703 [29:07<30:39,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2701/5703 [29:07<30:38,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2702/5703 [29:08<30:36,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2703/5703 [29:08<30:35,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  47%|▍| 2704/5703 [29:09<30:33,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.1264', 'task': '3.1163', 'avg_loss': '3.113257970303406', 'moe': '0.0100', 'lr': '1.009760e-04'}


Epoch 1/1:  48%|▍| 2731/5703 [29:26<30:16,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2732/5703 [29:26<30:15,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2733/5703 [29:27<30:13,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2734/5703 [29:27<30:12,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2735/5703 [29:28<30:11,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2736/5703 [29:29<30:13,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2737/5703 [29:29<30:11,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2738/5703 [29:30<30:11,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  48%|▍| 2739/5703 [29:30<30:10,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.0137', 'task': '3.0037', 'avg_loss': '3.1129870669104522', 'moe': '0.0100', 'lr': '1.009301e-04'}


Epoch 1/1:  49%|▍| 2766/5703 [29:47<29:56,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2767/5703 [29:48<29:54,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2768/5703 [29:48<29:55,  1.63it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2769/5703 [29:49<29:53,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2770/5703 [29:49<29:51,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2771/5703 [29:50<29:52,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2772/5703 [29:51<29:52,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2773/5703 [29:51<29:52,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.1415, moEpoch 1/1:  49%|▍| 2774/5703 [29:52<29:50,  1.64it/s, loss=3.1515, avg_loss=3.1133532248711098, task=3.

{'loss': '3.2360', 'task': '3.2259', 'avg_loss': '3.113008128745215', 'moe': '0.0100', 'lr': '1.008842e-04'}


Epoch 1/1:  49%|▍| 2801/5703 [30:08<29:34,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2802/5703 [30:09<29:34,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2803/5703 [30:10<29:32,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2804/5703 [30:10<29:33,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2805/5703 [30:11<29:38,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2806/5703 [30:11<29:32,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2807/5703 [30:12<29:30,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2808/5703 [30:13<29:31,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  49%|▍| 2809/5703 [30:13<29:35,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.1088', 'task': '3.0988', 'avg_loss': '3.1128184580929066', 'moe': '0.0100', 'lr': '1.008383e-04'}


Epoch 1/1:  50%|▍| 2836/5703 [30:30<29:12,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2837/5703 [30:30<29:10,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2838/5703 [30:31<29:09,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2839/5703 [30:32<29:08,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2840/5703 [30:32<29:08,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2841/5703 [30:33<29:08,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2842/5703 [30:33<29:07,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2843/5703 [30:34<29:06,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▍| 2844/5703 [30:35<29:05,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0176', 'task': '3.0075', 'avg_loss': '3.1123284369810946', 'moe': '0.0100', 'lr': '1.007924e-04'}


Epoch 1/1:  50%|▌| 2871/5703 [30:51<28:48,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2872/5703 [30:52<28:47,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2873/5703 [30:52<28:47,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2874/5703 [30:53<28:46,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2875/5703 [30:54<28:45,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2876/5703 [30:54<28:44,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2877/5703 [30:55<28:45,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2878/5703 [30:55<28:44,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  50%|▌| 2879/5703 [30:56<28:43,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0109', 'task': '3.0009', 'avg_loss': '3.1123254119426575', 'moe': '0.0100', 'lr': '1.007465e-04'}


Epoch 1/1:  51%|▌| 2906/5703 [31:13<28:25,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2907/5703 [31:13<28:25,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2908/5703 [31:14<28:25,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2909/5703 [31:14<28:24,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2910/5703 [31:15<28:23,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2911/5703 [31:16<28:23,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2912/5703 [31:16<28:22,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2913/5703 [31:17<28:21,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  51%|▌| 2914/5703 [31:17<28:20,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0820', 'task': '3.0720', 'avg_loss': '3.112276511451825', 'moe': '0.0100', 'lr': '1.007006e-04'}


Epoch 1/1:  52%|▌| 2941/5703 [31:34<28:05,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2942/5703 [31:35<28:04,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2943/5703 [31:35<28:03,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2944/5703 [31:36<28:02,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2945/5703 [31:36<28:01,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2946/5703 [31:37<28:01,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2947/5703 [31:38<28:02,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2948/5703 [31:38<28:02,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2949/5703 [31:39<28:05,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0881', 'task': '3.0781', 'avg_loss': '3.1122775700513055', 'moe': '0.0100', 'lr': '1.006547e-04'}


Epoch 1/1:  52%|▌| 2976/5703 [31:55<27:42,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2977/5703 [31:56<27:41,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2978/5703 [31:57<27:40,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2979/5703 [31:57<27:40,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2980/5703 [31:58<27:40,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2981/5703 [31:58<27:40,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2982/5703 [31:59<27:39,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2983/5703 [32:00<27:38,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  52%|▌| 2984/5703 [32:00<27:37,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0054', 'task': '2.9954', 'avg_loss': '3.112086161980993', 'moe': '0.0100', 'lr': '1.006088e-04'}


Epoch 1/1:  53%|▌| 3011/5703 [32:17<27:23,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3012/5703 [32:17<27:22,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3013/5703 [32:18<27:21,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3014/5703 [32:18<27:21,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3015/5703 [32:19<27:21,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3016/5703 [32:20<27:20,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3017/5703 [32:20<27:20,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3018/5703 [32:21<27:19,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3019/5703 [32:22<27:18,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.0425', 'task': '3.0324', 'avg_loss': '3.1120157338911287', 'moe': '0.0100', 'lr': '1.005630e-04'}


Epoch 1/1:  53%|▌| 3046/5703 [32:38<27:00,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3047/5703 [32:39<26:59,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3048/5703 [32:39<26:59,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3049/5703 [32:40<27:00,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3050/5703 [32:40<26:59,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  53%|▌| 3051/5703 [32:41<26:57,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3052/5703 [32:42<26:59,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3053/5703 [32:42<26:59,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3054/5703 [32:43<26:58,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.1254', 'task': '3.1154', 'avg_loss': '3.1119710058360903', 'moe': '0.0100', 'lr': '1.005171e-04'}


Epoch 1/1:  54%|▌| 3081/5703 [32:59<26:46,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3082/5703 [33:00<26:46,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3083/5703 [33:01<26:45,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3084/5703 [33:01<26:45,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3085/5703 [33:02<26:43,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3086/5703 [33:03<26:41,  1.63it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3087/5703 [33:03<26:38,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3088/5703 [33:04<26:36,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  54%|▌| 3089/5703 [33:04<26:35,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.1511', 'task': '3.1411', 'avg_loss': '3.1117665129335506', 'moe': '0.0100', 'lr': '1.004712e-04'}


Epoch 1/1:  55%|▌| 3116/5703 [33:21<26:17,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3117/5703 [33:21<26:16,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3118/5703 [33:22<26:17,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3119/5703 [33:23<26:16,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3120/5703 [33:23<26:15,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3121/5703 [33:24<26:14,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3122/5703 [33:25<26:13,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3123/5703 [33:25<26:12,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2259, moeEpoch 1/1:  55%|▌| 3124/5703 [33:26<26:12,  1.64it/s, loss=3.2360, avg_loss=3.113008128745215, task=3.2

{'loss': '3.1922', 'task': '3.1822', 'avg_loss': '3.1116401130434066', 'moe': '0.0100', 'lr': '1.004254e-04'}


Epoch 1/1:  55%|▌| 3151/5703 [33:42<25:57,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3152/5703 [33:43<25:59,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3153/5703 [33:43<26:00,  1.63it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3154/5703 [33:44<25:58,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3155/5703 [33:45<25:58,  1.63it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3156/5703 [33:45<25:57,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3157/5703 [33:46<25:55,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3158/5703 [33:47<25:55,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  55%|▌| 3159/5703 [33:47<25:54,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.0519', 'task': '3.0418', 'avg_loss': '3.1113898614886417', 'moe': '0.0100', 'lr': '1.003795e-04'}


Epoch 1/1:  56%|▌| 3186/5703 [34:04<25:36,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3187/5703 [34:04<25:35,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3188/5703 [34:05<25:34,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3189/5703 [34:05<25:34,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3190/5703 [34:06<25:33,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3191/5703 [34:07<25:33,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3192/5703 [34:07<25:34,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3193/5703 [34:08<25:33,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3194/5703 [34:09<25:32,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.0856', 'task': '3.0755', 'avg_loss': '3.111274612023964', 'moe': '0.0100', 'lr': '1.003337e-04'}


Epoch 1/1:  56%|▌| 3221/5703 [34:25<25:14,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  56%|▌| 3222/5703 [34:26<25:13,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3223/5703 [34:26<25:12,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3224/5703 [34:27<25:12,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3225/5703 [34:27<25:11,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3226/5703 [34:28<25:10,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3227/5703 [34:29<25:10,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3228/5703 [34:29<25:09,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3229/5703 [34:30<25:09,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.0372', 'task': '3.0271', 'avg_loss': '3.111430630119898', 'moe': '0.0100', 'lr': '1.002879e-04'}


Epoch 1/1:  57%|▌| 3256/5703 [34:46<24:52,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3257/5703 [34:47<24:51,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3258/5703 [34:48<24:51,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3259/5703 [34:48<24:50,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3260/5703 [34:49<24:49,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3261/5703 [34:49<24:48,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3262/5703 [34:50<24:48,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3263/5703 [34:51<24:47,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  57%|▌| 3264/5703 [34:51<24:46,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.0179', 'task': '3.0079', 'avg_loss': '3.111232889894294', 'moe': '0.0100', 'lr': '1.002420e-04'}


Epoch 1/1:  58%|▌| 3291/5703 [35:08<24:30,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3292/5703 [35:08<24:30,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3293/5703 [35:09<24:29,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3294/5703 [35:10<24:28,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3295/5703 [35:10<24:28,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3296/5703 [35:11<24:27,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3297/5703 [35:11<24:26,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3298/5703 [35:12<24:26,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3299/5703 [35:13<24:25,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.2100', 'task': '3.2000', 'avg_loss': '3.1112097676356036', 'moe': '0.0100', 'lr': '1.001962e-04'}


Epoch 1/1:  58%|▌| 3326/5703 [35:29<24:09,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3327/5703 [35:30<24:08,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3328/5703 [35:30<24:07,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3329/5703 [35:31<24:06,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3330/5703 [35:32<24:06,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3331/5703 [35:32<24:05,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3332/5703 [35:33<24:07,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3333/5703 [35:33<24:06,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  58%|▌| 3334/5703 [35:34<24:05,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.1502', 'task': '3.1402', 'avg_loss': '3.1114679223724773', 'moe': '0.0100', 'lr': '1.001504e-04'}


Epoch 1/1:  59%|▌| 3361/5703 [35:50<23:48,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3362/5703 [35:51<23:47,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3363/5703 [35:52<23:47,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3364/5703 [35:52<23:46,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3365/5703 [35:53<23:46,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3366/5703 [35:53<23:45,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3367/5703 [35:54<23:44,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3368/5703 [35:55<23:43,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  59%|▌| 3369/5703 [35:55<23:43,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.1744', 'task': '3.1644', 'avg_loss': '3.1113417088195394', 'moe': '0.0100', 'lr': '1.001046e-04'}


Epoch 1/1:  60%|▌| 3396/5703 [36:12<23:28,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3397/5703 [36:12<23:27,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3398/5703 [36:13<23:26,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3399/5703 [36:14<23:25,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3400/5703 [36:14<23:25,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3401/5703 [36:15<23:24,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3402/5703 [36:15<23:23,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3403/5703 [36:16<23:23,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3404/5703 [36:17<23:24,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.0221', 'task': '3.0121', 'avg_loss': '3.111129811901393', 'moe': '0.0100', 'lr': '1.000587e-04'}


Epoch 1/1:  60%|▌| 3431/5703 [36:33<23:04,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3432/5703 [36:34<23:04,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3433/5703 [36:34<23:03,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3434/5703 [36:35<23:03,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3435/5703 [36:36<23:02,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3436/5703 [36:36<23:02,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3437/5703 [36:37<23:01,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3438/5703 [36:37<23:01,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  60%|▌| 3439/5703 [36:38<23:00,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.1349', 'task': '3.1249', 'avg_loss': '3.111077632807722', 'moe': '0.0100', 'lr': '1.000129e-04'}


Epoch 1/1:  61%|▌| 3466/5703 [36:55<22:43,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3467/5703 [36:55<22:42,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3468/5703 [36:56<22:42,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3469/5703 [36:56<22:41,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3470/5703 [36:57<22:41,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3471/5703 [36:58<22:40,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3472/5703 [36:58<22:39,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3473/5703 [36:59<22:39,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.1822, moEpoch 1/1:  61%|▌| 3474/5703 [36:59<22:38,  1.64it/s, loss=3.1922, avg_loss=3.1116401130434066, task=3.

{'loss': '3.1633', 'task': '3.1533', 'avg_loss': '3.111122304848262', 'moe': '0.0100', 'lr': '9.996714e-05'}

✅ Saved checkpoint to satori_akasha_checkpoints/step_lightning_6200.pt


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._checkpoints/step_lightning_6200.pt:   0%|          |  552kB / 3.27GB            

Epoch 1/1:  61%|▌| 3500/5703 [38:11<10:36:24, 17.33s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, Epoch 1/1:  61%|▌| 3501/5703 [38:12<7:31:51, 12.31s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3502/5703 [38:12<5:22:49,  8.80s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3503/5703 [38:13<3:52:36,  6.34s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3504/5703 [38:13<2:49:29,  4.62s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3505/5703 [38:14<2:05:19,  3.42s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3506/5703 [38:15<1:34:25,  2.58s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  61%|▌| 3507/5703 [38:15<1:12:47,  1.99s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, mEpoch 1/1:  62%|▌| 3508/5703 [38:16<57:37,  1.58s/it, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.0095', 'task': '2.9995', 'avg_loss': '3.1109054691552105', 'moe': '0.0100', 'lr': '9.992134e-05'}


Epoch 1/1:  62%|▌| 3536/5703 [38:33<22:02,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3537/5703 [38:34<22:01,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3538/5703 [38:34<22:01,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3539/5703 [38:35<22:00,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3540/5703 [38:35<21:59,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3541/5703 [38:36<21:58,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3542/5703 [38:37<21:58,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3543/5703 [38:37<21:57,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  62%|▌| 3544/5703 [38:38<21:57,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1508', 'task': '3.1408', 'avg_loss': '3.110935464359465', 'moe': '0.0100', 'lr': '9.987555e-05'}


Epoch 1/1:  63%|▋| 3571/5703 [38:54<21:40,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3572/5703 [38:55<21:39,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3573/5703 [38:56<21:39,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3574/5703 [38:56<21:39,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3575/5703 [38:57<21:38,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3576/5703 [38:57<21:37,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3577/5703 [38:58<21:35,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3578/5703 [38:59<21:38,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3579/5703 [38:59<21:37,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1269', 'task': '3.1169', 'avg_loss': '3.1111666348704685', 'moe': '0.0100', 'lr': '9.982976e-05'}


Epoch 1/1:  63%|▋| 3606/5703 [39:16<21:25,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3607/5703 [39:16<21:26,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3608/5703 [39:17<21:24,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3609/5703 [39:18<21:24,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3610/5703 [39:18<21:23,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3611/5703 [39:19<21:22,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3612/5703 [39:19<21:21,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3613/5703 [39:20<21:20,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  63%|▋| 3614/5703 [39:21<21:19,  1.63it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '2.9792', 'task': '2.9691', 'avg_loss': '3.110967662177243', 'moe': '0.0100', 'lr': '9.978398e-05'}


Epoch 1/1:  64%|▋| 3641/5703 [39:37<21:00,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3642/5703 [39:38<20:59,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3643/5703 [39:38<20:58,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3644/5703 [39:39<20:57,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3645/5703 [39:40<20:56,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3646/5703 [39:40<20:56,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3647/5703 [39:41<20:55,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3648/5703 [39:41<20:54,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3649/5703 [39:42<20:52,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1381', 'task': '3.1281', 'avg_loss': '3.111047205243792', 'moe': '0.0100', 'lr': '9.973821e-05'}


Epoch 1/1:  64%|▋| 3676/5703 [39:59<20:33,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3677/5703 [39:59<20:33,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  64%|▋| 3678/5703 [40:00<20:33,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3679/5703 [40:00<20:32,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3680/5703 [40:01<20:33,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3681/5703 [40:02<20:32,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3682/5703 [40:02<20:31,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3683/5703 [40:03<20:30,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3684/5703 [40:03<20:29,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.0704', 'task': '3.0604', 'avg_loss': '3.1110685480894102', 'moe': '0.0100', 'lr': '9.969243e-05'}


Epoch 1/1:  65%|▋| 3711/5703 [40:20<20:11,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3712/5703 [40:20<20:10,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3713/5703 [40:21<20:10,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3714/5703 [40:22<20:10,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3715/5703 [40:22<20:09,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3716/5703 [40:23<20:09,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3717/5703 [40:24<20:08,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3718/5703 [40:24<20:07,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  65%|▋| 3719/5703 [40:25<20:06,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1395', 'task': '3.1295', 'avg_loss': '3.111085489945354', 'moe': '0.0100', 'lr': '9.964667e-05'}


Epoch 1/1:  66%|▋| 3746/5703 [40:41<19:53,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3747/5703 [40:42<19:52,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3748/5703 [40:42<19:50,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3749/5703 [40:43<19:50,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3750/5703 [40:44<19:49,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3751/5703 [40:44<19:48,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3752/5703 [40:45<19:47,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3753/5703 [40:45<19:47,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3754/5703 [40:46<19:47,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1617', 'task': '3.1517', 'avg_loss': '3.111038940293448', 'moe': '0.0100', 'lr': '9.960091e-05'}


Epoch 1/1:  66%|▋| 3781/5703 [41:03<19:31,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3782/5703 [41:03<19:30,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3783/5703 [41:04<19:30,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3784/5703 [41:04<19:30,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3785/5703 [41:05<19:29,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3786/5703 [41:06<19:28,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3787/5703 [41:06<19:27,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3788/5703 [41:07<19:26,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  66%|▋| 3789/5703 [41:07<19:25,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.1486', 'task': '3.1386', 'avg_loss': '3.1109012295659215', 'moe': '0.0100', 'lr': '9.955516e-05'}


Epoch 1/1:  67%|▋| 3816/5703 [41:24<19:09,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3817/5703 [41:24<19:08,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3818/5703 [41:25<19:07,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3819/5703 [41:26<19:06,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3820/5703 [41:26<19:05,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3821/5703 [41:27<19:05,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3822/5703 [41:28<19:04,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3823/5703 [41:28<19:03,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1533, moeEpoch 1/1:  67%|▋| 3824/5703 [41:29<19:03,  1.64it/s, loss=3.1633, avg_loss=3.111122304848262, task=3.1

{'loss': '3.0953', 'task': '3.0853', 'avg_loss': '3.110937819418969', 'moe': '0.0100', 'lr': '9.950941e-05'}


Epoch 1/1:  68%|▋| 3851/5703 [41:45<18:51,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3852/5703 [41:46<18:50,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3853/5703 [41:47<18:50,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3854/5703 [41:47<18:49,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3855/5703 [41:48<18:49,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3856/5703 [41:49<18:49,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3857/5703 [41:49<18:48,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3858/5703 [41:50<18:48,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3859/5703 [41:50<18:47,  1.63it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0998', 'task': '3.0897', 'avg_loss': '3.110967868144601', 'moe': '0.0100', 'lr': '9.946366e-05'}


Epoch 1/1:  68%|▋| 3886/5703 [42:07<18:25,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3887/5703 [42:07<18:25,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3888/5703 [42:08<18:24,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3889/5703 [42:09<18:24,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3890/5703 [42:09<18:24,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3891/5703 [42:10<18:23,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3892/5703 [42:11<18:23,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3893/5703 [42:11<18:22,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  68%|▋| 3894/5703 [42:12<18:21,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0740', 'task': '3.0639', 'avg_loss': '3.1111160264331468', 'moe': '0.0100', 'lr': '9.941792e-05'}


Epoch 1/1:  69%|▋| 3921/5703 [42:28<18:06,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3922/5703 [42:29<18:06,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3923/5703 [42:29<18:04,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3924/5703 [42:30<18:03,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3925/5703 [42:31<18:03,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3926/5703 [42:31<18:04,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3927/5703 [42:32<18:04,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3928/5703 [42:32<18:03,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3929/5703 [42:33<18:02,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0092', 'task': '2.9991', 'avg_loss': '3.111080329728639', 'moe': '0.0100', 'lr': '9.937219e-05'}


Epoch 1/1:  69%|▋| 3956/5703 [42:50<17:45,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3957/5703 [42:50<17:45,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3958/5703 [42:51<17:44,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3959/5703 [42:51<17:44,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3960/5703 [42:52<17:43,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3961/5703 [42:53<17:43,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3962/5703 [42:53<17:42,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  69%|▋| 3963/5703 [42:54<17:42,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3964/5703 [42:54<17:41,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0186', 'task': '3.0086', 'avg_loss': '3.1112144178615178', 'moe': '0.0100', 'lr': '9.932646e-05'}


Epoch 1/1:  70%|▋| 3991/5703 [43:11<17:26,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3992/5703 [43:12<17:26,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3993/5703 [43:12<17:24,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3994/5703 [43:13<17:23,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3995/5703 [43:13<17:23,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3996/5703 [43:14<17:22,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3997/5703 [43:15<17:22,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3998/5703 [43:15<17:21,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  70%|▋| 3999/5703 [43:16<17:20,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.1064', 'task': '3.0964', 'avg_loss': '3.111172031912004', 'moe': '0.0100', 'lr': '9.928074e-05'}


Epoch 1/1:  71%|▋| 4026/5703 [43:32<17:02,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4027/5703 [43:33<17:01,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4028/5703 [43:34<17:01,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4029/5703 [43:34<17:00,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4030/5703 [43:35<17:00,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4031/5703 [43:35<16:59,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4032/5703 [43:36<16:59,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4033/5703 [43:37<16:58,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4034/5703 [43:37<16:58,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0735', 'task': '3.0635', 'avg_loss': '3.1109918687143936', 'moe': '0.0100', 'lr': '9.923502e-05'}


Epoch 1/1:  71%|▋| 4061/5703 [43:54<16:41,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4062/5703 [43:54<16:40,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4063/5703 [43:55<16:39,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4064/5703 [43:56<16:39,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4065/5703 [43:56<16:38,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4066/5703 [43:57<16:38,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4067/5703 [43:57<16:38,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4068/5703 [43:58<16:37,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  71%|▋| 4069/5703 [43:59<16:36,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.2244', 'task': '3.2143', 'avg_loss': '3.110846039838407', 'moe': '0.0100', 'lr': '9.918931e-05'}


Epoch 1/1:  72%|▋| 4096/5703 [44:15<16:19,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4097/5703 [44:16<16:19,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4098/5703 [44:16<16:18,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4099/5703 [44:17<16:18,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4100/5703 [44:18<16:17,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4101/5703 [44:18<16:17,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4102/5703 [44:19<16:16,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4103/5703 [44:19<16:16,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4104/5703 [44:20<16:15,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.1215', 'task': '3.1114', 'avg_loss': '3.1109123970636734', 'moe': '0.0100', 'lr': '9.914361e-05'}


Epoch 1/1:  72%|▋| 4131/5703 [44:36<15:58,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4132/5703 [44:37<15:58,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4133/5703 [44:38<15:57,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  72%|▋| 4134/5703 [44:38<15:56,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4135/5703 [44:39<15:56,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4136/5703 [44:39<15:55,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4137/5703 [44:40<15:55,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4138/5703 [44:41<15:54,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4139/5703 [44:41<15:54,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.1930', 'task': '3.1830', 'avg_loss': '3.1108693923698323', 'moe': '0.0100', 'lr': '9.909790e-05'}


Epoch 1/1:  73%|▋| 4166/5703 [44:58<15:37,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4167/5703 [44:58<15:36,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4168/5703 [44:59<15:36,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4169/5703 [45:00<15:36,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4170/5703 [45:00<15:36,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4171/5703 [45:01<15:35,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4172/5703 [45:01<15:34,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4173/5703 [45:02<15:34,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0853, moeEpoch 1/1:  73%|▋| 4174/5703 [45:03<15:33,  1.64it/s, loss=3.0953, avg_loss=3.110937819418969, task=3.0

{'loss': '3.0180', 'task': '3.0080', 'avg_loss': '3.110769446123214', 'moe': '0.0100', 'lr': '9.905221e-05'}


Epoch 1/1:  74%|▋| 4201/5703 [45:19<15:18,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4202/5703 [45:20<15:17,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4203/5703 [45:20<15:16,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4204/5703 [45:21<15:15,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4205/5703 [45:22<15:14,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4206/5703 [45:22<15:14,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4207/5703 [45:23<15:13,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4208/5703 [45:23<15:12,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4209/5703 [45:24<15:12,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.1644', 'task': '3.1543', 'avg_loss': '3.110725148143566', 'moe': '0.0100', 'lr': '9.900652e-05'}


Epoch 1/1:  74%|▋| 4236/5703 [45:41<14:55,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4237/5703 [45:41<14:54,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4238/5703 [45:42<14:53,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4239/5703 [45:42<14:52,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4240/5703 [45:43<14:52,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4241/5703 [45:44<14:51,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4242/5703 [45:44<14:50,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4243/5703 [45:45<14:49,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  74%|▋| 4244/5703 [45:45<14:49,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.0496', 'task': '3.0395', 'avg_loss': '3.1105758857392036', 'moe': '0.0100', 'lr': '9.896083e-05'}


Epoch 1/1:  75%|▋| 4271/5703 [46:02<14:32,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4272/5703 [46:03<14:32,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4273/5703 [46:03<14:32,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4274/5703 [46:04<14:31,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4275/5703 [46:04<14:30,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4276/5703 [46:05<14:30,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▋| 4277/5703 [46:06<14:29,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▊| 4278/5703 [46:06<14:29,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  75%|▊| 4279/5703 [46:07<14:28,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '2.9979', 'task': '2.9879', 'avg_loss': '3.110557962266964', 'moe': '0.0100', 'lr': '9.891515e-05'}


Epoch 1/1:  76%|▊| 4306/5703 [46:23<14:12,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4307/5703 [46:24<14:11,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4308/5703 [46:24<14:11,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4309/5703 [46:25<14:10,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4310/5703 [46:26<14:10,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4311/5703 [46:26<14:09,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4312/5703 [46:27<14:09,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4313/5703 [46:28<14:08,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4314/5703 [46:28<14:07,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.1066', 'task': '3.0966', 'avg_loss': '3.1105202739689206', 'moe': '0.0100', 'lr': '9.886948e-05'}


Epoch 1/1:  76%|▊| 4341/5703 [46:45<13:51,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4342/5703 [46:45<13:50,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4343/5703 [46:46<13:49,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4344/5703 [46:46<13:49,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4345/5703 [46:47<13:48,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4346/5703 [46:48<13:47,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4347/5703 [46:48<13:47,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4348/5703 [46:49<13:46,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  76%|▊| 4349/5703 [46:50<13:46,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.0296', 'task': '3.0196', 'avg_loss': '3.110461384146554', 'moe': '0.0100', 'lr': '9.882381e-05'}


Epoch 1/1:  77%|▊| 4376/5703 [47:06<13:29,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4377/5703 [47:07<13:29,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4378/5703 [47:07<13:28,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4379/5703 [47:08<13:27,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4380/5703 [47:08<13:27,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4381/5703 [47:09<13:26,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4382/5703 [47:10<13:25,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4383/5703 [47:10<13:25,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4384/5703 [47:11<13:24,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.1791', 'task': '3.1690', 'avg_loss': '3.1104341458301157', 'moe': '0.0100', 'lr': '9.877815e-05'}


Epoch 1/1:  77%|▊| 4411/5703 [47:27<13:07,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4412/5703 [47:28<13:07,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4413/5703 [47:29<13:06,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4414/5703 [47:29<13:06,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4415/5703 [47:30<13:05,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4416/5703 [47:30<13:04,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4417/5703 [47:31<13:04,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4418/5703 [47:32<13:03,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  77%|▊| 4419/5703 [47:32<13:02,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.1425', 'task': '3.1325', 'avg_loss': '3.1103543497714203', 'moe': '0.0100', 'lr': '9.873249e-05'}


Epoch 1/1:  78%|▊| 4446/5703 [47:49<12:48,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4447/5703 [47:49<12:47,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4448/5703 [47:50<12:46,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4449/5703 [47:51<12:46,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4450/5703 [47:51<12:45,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4451/5703 [47:52<12:45,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4452/5703 [47:52<12:44,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4453/5703 [47:53<12:44,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  78%|▊| 4454/5703 [47:54<12:43,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.0692', 'task': '3.0592', 'avg_loss': '3.1102840628474953', 'moe': '0.0100', 'lr': '9.868683e-05'}


Epoch 1/1:  79%|▊| 4481/5703 [48:10<12:25,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4482/5703 [48:11<12:24,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4483/5703 [48:11<12:24,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4484/5703 [48:12<12:23,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4485/5703 [48:13<12:22,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4486/5703 [48:13<12:22,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4487/5703 [48:14<12:21,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4488/5703 [48:14<12:21,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4489/5703 [48:15<12:20,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.0795', 'task': '3.0695', 'avg_loss': '3.1100908695529332', 'moe': '0.0100', 'lr': '9.864119e-05'}


Epoch 1/1:  79%|▊| 4516/5703 [48:32<12:06,  1.63it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4517/5703 [48:32<12:05,  1.63it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4518/5703 [48:33<12:05,  1.63it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4519/5703 [48:33<12:04,  1.63it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4520/5703 [48:34<12:03,  1.63it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4521/5703 [48:35<12:02,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4522/5703 [48:35<12:01,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4523/5703 [48:36<12:00,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0080, moeEpoch 1/1:  79%|▊| 4524/5703 [48:36<12:00,  1.64it/s, loss=3.0180, avg_loss=3.110769446123214, task=3.0

{'loss': '3.0772', 'task': '3.0672', 'avg_loss': '3.1102344512939455', 'moe': '0.0100', 'lr': '9.859554e-05'}


Epoch 1/1:  80%|▊| 4551/5703 [48:53<11:44,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4552/5703 [48:54<11:43,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4553/5703 [48:54<11:42,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4554/5703 [48:55<11:41,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4555/5703 [48:55<11:41,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4556/5703 [48:56<11:40,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4557/5703 [48:57<11:39,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4558/5703 [48:57<11:38,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4559/5703 [48:58<11:37,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.1798', 'task': '3.1697', 'avg_loss': '3.110302473440555', 'moe': '0.0100', 'lr': '9.854991e-05'}


Epoch 1/1:  80%|▊| 4586/5703 [49:14<11:21,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4587/5703 [49:15<11:20,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4588/5703 [49:16<11:19,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4589/5703 [49:16<11:19,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  80%|▊| 4590/5703 [49:17<11:19,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4591/5703 [49:17<11:19,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4592/5703 [49:18<11:19,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4593/5703 [49:19<11:18,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4594/5703 [49:19<11:17,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.1181', 'task': '3.1081', 'avg_loss': '3.1101910681435556', 'moe': '0.0100', 'lr': '9.850428e-05'}


Epoch 1/1:  81%|▊| 4621/5703 [49:36<11:01,  1.63it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4622/5703 [49:36<11:00,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4623/5703 [49:37<10:59,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4624/5703 [49:38<10:58,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4625/5703 [49:38<10:57,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4626/5703 [49:39<10:56,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4627/5703 [49:39<10:55,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4628/5703 [49:40<10:54,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  81%|▊| 4629/5703 [49:41<10:54,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.0001', 'task': '2.9901', 'avg_loss': '3.1101093799291952', 'moe': '0.0100', 'lr': '9.845865e-05'}


Epoch 1/1:  82%|▊| 4656/5703 [49:57<10:36,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4657/5703 [49:58<10:36,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4658/5703 [49:58<10:36,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4659/5703 [49:59<10:35,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4660/5703 [49:59<10:35,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4661/5703 [50:00<10:34,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4662/5703 [50:01<10:34,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4663/5703 [50:01<10:33,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4664/5703 [50:02<10:32,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.1035', 'task': '3.0935', 'avg_loss': '3.109860497293696', 'moe': '0.0100', 'lr': '9.841303e-05'}


Epoch 1/1:  82%|▊| 4691/5703 [50:18<10:17,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4692/5703 [50:19<10:16,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4693/5703 [50:20<10:15,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4694/5703 [50:20<10:14,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4695/5703 [50:21<10:14,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4696/5703 [50:21<10:13,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4697/5703 [50:22<10:13,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4698/5703 [50:23<10:12,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  82%|▊| 4699/5703 [50:23<10:12,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.0691', 'task': '3.0591', 'avg_loss': '3.1098000459822397', 'moe': '0.0100', 'lr': '9.836741e-05'}


Epoch 1/1:  83%|▊| 4726/5703 [50:40<09:55,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4727/5703 [50:40<09:55,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4728/5703 [50:41<09:54,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4729/5703 [50:42<09:54,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4730/5703 [50:42<09:53,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4731/5703 [50:43<09:52,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4732/5703 [50:43<09:52,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4733/5703 [50:44<09:51,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4734/5703 [50:45<09:51,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.1027', 'task': '3.0927', 'avg_loss': '3.1098432377106002', 'moe': '0.0100', 'lr': '9.832180e-05'}


Epoch 1/1:  83%|▊| 4761/5703 [51:01<09:35,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  83%|▊| 4762/5703 [51:02<09:34,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4763/5703 [51:02<09:34,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4764/5703 [51:03<09:33,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4765/5703 [51:04<09:33,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4766/5703 [51:04<09:33,  1.63it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4767/5703 [51:05<09:32,  1.63it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4768/5703 [51:05<09:32,  1.63it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4769/5703 [51:06<09:31,  1.63it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.2189', 'task': '3.2089', 'avg_loss': '3.10986046094964', 'moe': '0.0100', 'lr': '9.827620e-05'}


Epoch 1/1:  84%|▊| 4796/5703 [51:22<09:13,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4797/5703 [51:23<09:12,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4798/5703 [51:24<09:11,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4799/5703 [51:24<09:11,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4800/5703 [51:25<09:10,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4801/5703 [51:26<09:10,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4802/5703 [51:26<09:10,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4803/5703 [51:27<09:09,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  84%|▊| 4804/5703 [51:27<09:08,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.0823', 'task': '3.0722', 'avg_loss': '3.109863282928309', 'moe': '0.0100', 'lr': '9.823060e-05'}


Epoch 1/1:  85%|▊| 4831/5703 [51:44<08:51,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4832/5703 [51:44<08:50,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4833/5703 [51:45<08:49,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4834/5703 [51:46<08:49,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4835/5703 [51:46<08:48,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4836/5703 [51:47<08:48,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4837/5703 [51:47<08:47,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4838/5703 [51:48<08:46,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4839/5703 [51:49<08:46,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.0967', 'task': '3.0867', 'avg_loss': '3.110005263115862', 'moe': '0.0100', 'lr': '9.818501e-05'}


Epoch 1/1:  85%|▊| 4866/5703 [52:05<08:30,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4867/5703 [52:06<08:29,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4868/5703 [52:06<08:28,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4869/5703 [52:07<08:27,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4870/5703 [52:08<08:27,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4871/5703 [52:08<08:26,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4872/5703 [52:09<08:26,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4873/5703 [52:09<08:26,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.0672, moEpoch 1/1:  85%|▊| 4874/5703 [52:10<08:25,  1.64it/s, loss=3.0772, avg_loss=3.1102344512939455, task=3.

{'loss': '3.1987', 'task': '3.1887', 'avg_loss': '3.1099310923595818', 'moe': '0.0100', 'lr': '9.813942e-05'}


Epoch 1/1:  86%|▊| 4901/5703 [52:27<08:08,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4902/5703 [52:27<08:08,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4903/5703 [52:28<08:07,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4904/5703 [52:28<08:06,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4905/5703 [52:29<08:06,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4906/5703 [52:30<08:05,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4907/5703 [52:30<08:04,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4908/5703 [52:31<08:04,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  86%|▊| 4909/5703 [52:31<08:06,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.2648', 'task': '3.2547', 'avg_loss': '3.109903363108997', 'moe': '0.0100', 'lr': '9.809384e-05'}


Epoch 1/1:  87%|▊| 4936/5703 [52:48<07:48,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4937/5703 [52:49<07:47,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4938/5703 [52:49<07:47,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4939/5703 [52:50<07:46,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4940/5703 [52:50<07:45,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4941/5703 [52:51<07:44,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4942/5703 [52:52<07:44,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4943/5703 [52:52<07:44,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4944/5703 [52:53<07:43,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.1790', 'task': '3.1690', 'avg_loss': '3.1098637247949057', 'moe': '0.0100', 'lr': '9.804826e-05'}


Epoch 1/1:  87%|▊| 4971/5703 [53:10<07:33,  1.61it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4972/5703 [53:10<07:30,  1.62it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4973/5703 [53:11<07:29,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4974/5703 [53:11<07:27,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4975/5703 [53:12<07:25,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4976/5703 [53:13<07:24,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4977/5703 [53:13<07:23,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4978/5703 [53:14<07:22,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  87%|▊| 4979/5703 [53:14<07:21,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.1617', 'task': '3.1517', 'avg_loss': '3.1098090901122344', 'moe': '0.0100', 'lr': '9.800269e-05'}


Epoch 1/1:  88%|▉| 5006/5703 [53:31<07:05,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5007/5703 [53:32<07:04,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5008/5703 [53:32<07:04,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5009/5703 [53:33<07:03,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5010/5703 [53:33<07:02,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5011/5703 [53:34<07:02,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5012/5703 [53:35<07:01,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5013/5703 [53:35<07:00,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5014/5703 [53:36<07:00,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.3066', 'task': '3.2966', 'avg_loss': '3.1097509733741244', 'moe': '0.0100', 'lr': '9.795712e-05'}


Epoch 1/1:  88%|▉| 5041/5703 [53:52<06:43,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5042/5703 [53:53<06:42,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5043/5703 [53:54<06:42,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5044/5703 [53:54<06:41,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5045/5703 [53:55<06:40,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5046/5703 [53:55<06:40,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  88%|▉| 5047/5703 [53:56<06:39,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5048/5703 [53:57<06:38,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5049/5703 [53:57<06:38,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.2471', 'task': '3.2371', 'avg_loss': '3.1096542048336837', 'moe': '0.0100', 'lr': '9.791156e-05'}


Epoch 1/1:  89%|▉| 5076/5703 [54:14<06:21,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5077/5703 [54:14<06:20,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5078/5703 [54:15<06:20,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5079/5703 [54:16<06:19,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5080/5703 [54:16<06:19,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5081/5703 [54:17<06:19,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5082/5703 [54:17<06:19,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5083/5703 [54:18<06:18,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  89%|▉| 5084/5703 [54:19<06:17,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.1067', 'task': '3.0967', 'avg_loss': '3.10960507075614', 'moe': '0.0100', 'lr': '9.786600e-05'}


Epoch 1/1:  90%|▉| 5111/5703 [54:35<06:00,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5112/5703 [54:36<06:01,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5113/5703 [54:36<06:01,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5114/5703 [54:37<06:00,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5115/5703 [54:38<06:00,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5116/5703 [54:38<05:59,  1.63it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5117/5703 [54:39<05:58,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5118/5703 [54:39<05:57,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5119/5703 [54:40<05:56,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.0093', 'task': '2.9993', 'avg_loss': '3.109605391773932', 'moe': '0.0100', 'lr': '9.782045e-05'}


Epoch 1/1:  90%|▉| 5146/5703 [54:56<05:39,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5147/5703 [54:57<05:38,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5148/5703 [54:58<05:38,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5149/5703 [54:58<05:37,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5150/5703 [54:59<05:36,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5151/5703 [54:59<05:36,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5152/5703 [55:00<05:35,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5153/5703 [55:01<05:34,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  90%|▉| 5154/5703 [55:01<05:34,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.1461', 'task': '3.1361', 'avg_loss': '3.1095803403946425', 'moe': '0.0100', 'lr': '9.777491e-05'}


Epoch 1/1:  91%|▉| 5181/5703 [55:18<05:17,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5182/5703 [55:18<05:17,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5183/5703 [55:19<05:16,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5184/5703 [55:20<05:16,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5185/5703 [55:20<05:15,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5186/5703 [55:21<05:14,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5187/5703 [55:21<05:14,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5188/5703 [55:22<05:13,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5189/5703 [55:23<05:12,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.2202', 'task': '3.2102', 'avg_loss': '3.1097541259096317', 'moe': '0.0100', 'lr': '9.772937e-05'}


Epoch 1/1:  91%|▉| 5216/5703 [55:39<04:56,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5217/5703 [55:40<04:55,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  91%|▉| 5218/5703 [55:40<04:55,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5219/5703 [55:41<04:54,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5220/5703 [55:42<04:54,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5221/5703 [55:42<04:53,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5222/5703 [55:43<04:52,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5223/5703 [55:43<04:52,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.1887, moEpoch 1/1:  92%|▉| 5224/5703 [55:44<04:51,  1.64it/s, loss=3.1987, avg_loss=3.1099310923595818, task=3.

{'loss': '3.0473', 'task': '3.0373', 'avg_loss': '3.109728075345357', 'moe': '0.0100', 'lr': '9.768384e-05'}

✅ Saved checkpoint to satori_akasha_checkpoints/step_lightning_6250.pt


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._checkpoints/step_lightning_6250.pt:   0%|          |  552kB / 3.27GB            

Epoch 1/1:  92%|▉| 5250/5703 [56:52<2:02:41, 16.25s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  92%|▉| 5251/5703 [56:53<1:27:00, 11.55s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  92%|▉| 5252/5703 [56:53<1:02:09,  8.27s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  92%|▉| 5253/5703 [56:54<44:47,  5.97s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  92%|▉| 5254/5703 [56:54<32:39,  4.36s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  92%|▉| 5255/5703 [56:55<24:11,  3.24s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  92%|▉| 5256/5703 [56:56<18:15,  2.45s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  92%|▉| 5257/5703 [56:56<14:06,  1.90s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  92%|▉| 5258/5703 [56:57<11:12,  1.51s/it, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.0969', 'task': '3.0869', 'avg_loss': '3.1096324390133483', 'moe': '0.0100', 'lr': '9.763831e-05'}


Epoch 1/1:  93%|▉| 5286/5703 [57:14<04:14,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5287/5703 [57:14<04:13,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5288/5703 [57:15<04:13,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5289/5703 [57:16<04:12,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5290/5703 [57:16<04:11,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5291/5703 [57:17<04:11,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5292/5703 [57:18<04:10,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5293/5703 [57:18<04:10,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5294/5703 [57:19<04:09,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '2.9924', 'task': '2.9824', 'avg_loss': '3.1094908913723507', 'moe': '0.0100', 'lr': '9.759279e-05'}


Epoch 1/1:  93%|▉| 5321/5703 [57:35<03:53,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5322/5703 [57:36<03:52,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5323/5703 [57:36<03:52,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5324/5703 [57:37<03:51,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5325/5703 [57:38<03:50,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5326/5703 [57:38<03:49,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5327/5703 [57:39<03:49,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5328/5703 [57:40<03:48,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  93%|▉| 5329/5703 [57:40<03:47,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.1234', 'task': '3.1134', 'avg_loss': '3.109525767583696', 'moe': '0.0100', 'lr': '9.754727e-05'}


Epoch 1/1:  94%|▉| 5356/5703 [57:57<03:31,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5357/5703 [57:57<03:30,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5358/5703 [57:58<03:30,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5359/5703 [57:58<03:29,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5360/5703 [57:59<03:28,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5361/5703 [58:00<03:28,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5362/5703 [58:00<03:27,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5363/5703 [58:01<03:27,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  94%|▉| 5364/5703 [58:01<03:26,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.2416', 'task': '3.2316', 'avg_loss': '3.109571090213443', 'moe': '0.0100', 'lr': '9.750176e-05'}


Epoch 1/1:  95%|▉| 5391/5703 [58:18<03:09,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5392/5703 [58:19<03:09,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5393/5703 [58:19<03:08,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5394/5703 [58:20<03:08,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5395/5703 [58:20<03:07,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5396/5703 [58:21<03:06,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5397/5703 [58:22<03:06,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5398/5703 [58:22<03:05,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5399/5703 [58:23<03:05,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.0184', 'task': '3.0084', 'avg_loss': '3.109470262527466', 'moe': '0.0100', 'lr': '9.745625e-05'}


Epoch 1/1:  95%|▉| 5426/5703 [58:39<02:48,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5427/5703 [58:40<02:48,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5428/5703 [58:40<02:47,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5429/5703 [58:41<02:46,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5430/5703 [58:42<02:46,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5431/5703 [58:42<02:45,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5432/5703 [58:43<02:45,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5433/5703 [58:44<02:44,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  95%|▉| 5434/5703 [58:44<02:43,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.1070', 'task': '3.0970', 'avg_loss': '3.1093988169680586', 'moe': '0.0100', 'lr': '9.741075e-05'}


Epoch 1/1:  96%|▉| 5461/5703 [59:01<02:27,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5462/5703 [59:01<02:26,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5463/5703 [59:02<02:26,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5464/5703 [59:02<02:25,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5465/5703 [59:03<02:25,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5466/5703 [59:04<02:24,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5467/5703 [59:04<02:23,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5468/5703 [59:05<02:23,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5469/5703 [59:05<02:22,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.0489', 'task': '3.0389', 'avg_loss': '3.109472906426802', 'moe': '0.0100', 'lr': '9.736526e-05'}


Epoch 1/1:  96%|▉| 5496/5703 [59:22<02:06,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5497/5703 [59:23<02:05,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5498/5703 [59:23<02:05,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5499/5703 [59:24<02:04,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5500/5703 [59:24<02:03,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5501/5703 [59:25<02:03,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5502/5703 [59:26<02:02,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  96%|▉| 5503/5703 [59:26<02:01,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5504/5703 [59:27<02:01,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '2.8068', 'task': '2.7967', 'avg_loss': '3.1093763994696344', 'moe': '0.0100', 'lr': '9.731977e-05'}


Epoch 1/1:  97%|▉| 5531/5703 [59:43<01:44,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5532/5703 [59:44<01:44,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5533/5703 [59:44<01:43,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5534/5703 [59:45<01:42,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5535/5703 [59:46<01:42,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5536/5703 [59:46<01:41,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5537/5703 [59:47<01:41,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5538/5703 [59:48<01:40,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, moeEpoch 1/1:  97%|▉| 5539/5703 [59:48<01:39,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0

{'loss': '3.0695', 'task': '3.0594', 'avg_loss': '3.109399451464013', 'moe': '0.0100', 'lr': '9.727428e-05'}


Epoch 1/1:  98%|▉| 5566/5703 [1:00:05<01:23,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5567/5703 [1:00:05<01:22,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5568/5703 [1:00:06<01:22,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5569/5703 [1:00:06<01:21,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5570/5703 [1:00:07<01:21,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5571/5703 [1:00:08<01:20,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5572/5703 [1:00:08<01:19,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5573/5703 [1:00:09<01:19,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3.0373, mEpoch 1/1:  98%|▉| 5574/5703 [1:00:10<01:18,  1.64it/s, loss=3.0473, avg_loss=3.109728075345357, task=3

{'loss': '3.1498', 'task': '3.1397', 'avg_loss': '3.1094732904008455', 'moe': '0.0100', 'lr': '9.722881e-05'}


Epoch 1/1:  98%|▉| 5601/5703 [1:00:26<01:02,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5602/5703 [1:00:27<01:01,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5603/5703 [1:00:27<01:01,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5604/5703 [1:00:28<01:00,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5605/5703 [1:00:28<00:59,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5606/5703 [1:00:29<00:59,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5607/5703 [1:00:30<00:58,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5608/5703 [1:00:30<00:57,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  98%|▉| 5609/5703 [1:00:31<00:57,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=

{'loss': '3.1292', 'task': '3.1192', 'avg_loss': '3.10946319177284', 'moe': '0.0100', 'lr': '9.718333e-05'}


Epoch 1/1:  99%|▉| 5636/5703 [1:00:47<00:40,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5637/5703 [1:00:48<00:40,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5638/5703 [1:00:49<00:39,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5639/5703 [1:00:49<00:39,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5640/5703 [1:00:50<00:38,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5641/5703 [1:00:50<00:37,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5642/5703 [1:00:51<00:37,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5643/5703 [1:00:52<00:36,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5644/5703 [1:00:52<00:36,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=

{'loss': '3.1794', 'task': '3.1694', 'avg_loss': '3.109414932084462', 'moe': '0.0100', 'lr': '9.713787e-05'}


Epoch 1/1:  99%|▉| 5671/5703 [1:01:09<00:19,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5672/5703 [1:01:09<00:18,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5673/5703 [1:01:10<00:18,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1:  99%|▉| 5674/5703 [1:01:11<00:17,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1: 100%|▉| 5675/5703 [1:01:11<00:17,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1: 100%|▉| 5676/5703 [1:01:12<00:16,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1: 100%|▉| 5677/5703 [1:01:12<00:15,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1: 100%|▉| 5678/5703 [1:01:13<00:15,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=3.1397, Epoch 1/1: 100%|▉| 5679/5703 [1:01:14<00:14,  1.64it/s, loss=3.1498, avg_loss=3.1094732904008455, task=

Successfully uploaded checkpoints to Hugging Face Hub!


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._checkpoints/step_lightning_6250.pt:   3%|2         | 92.3MB / 3.27GB            

  ...asha_checkpoints/step_final_6262.pt:   0%|          |  552kB / 3.27GB            

Successfully uploaded checkpoints to Hugging Face Hub!

✅ Saved checkpoint to satori_akasha_checkpoints/step_final_6262.pt
🎉 Training finished.


In [9]:
api.upload_folder(
    folder_path="satori_akasha_checkpoints",
    repo_id=repo_id,
    repo_type="model"
)
print("Successfully uploaded checkpoints to Hugging Face Hub!")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...asha_checkpoints/step_final_6262.pt:   3%|3         |  101MB / 3.27GB            

  ..._checkpoints/step_lightning_6250.pt:   3%|3         |  101MB / 3.27GB            

No files have been modified since last commit. Skipping to prevent empty commit.


Successfully uploaded checkpoints to Hugging Face Hub!


In [None]:
# def train(model, config, train_loader, optimizer, scheduler, resume_step):
#     """The main training loop."""
#     device = config['device']
#     use_amp = config['precision'] in ['fp16', 'bf16']
#     dtype = torch.bfloat16 if config['precision'] == 'bf16' else torch.float16
#     scaler = GradScaler(enabled=(config['precision'] == 'fp16'))
    
#     global_step = resume_step
#     model.train()
#     avg_loss = 0
#     for epoch in range(config['num_epochs']):
#         pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['num_epochs']}")
        
#         for i, (input_ids, labels) in enumerate(pbar):
#             input_ids, labels = input_ids.to(device), labels.to(device)
#             with torch.autocast(device_type=device.type, dtype=dtype, enabled=use_amp):
#                 _, loss_dict = model(input_ids, labels)
#                 task_loss = loss_dict['task_loss']
#                 moe_loss = loss_dict['moe_aux_loss']
#                 total_loss = task_loss + moe_loss
#                 avg_loss +=total_loss.item()
#                 # Scale loss for gradient accumulation
#                 scaled_loss = total_loss / config['grad_accum_steps']

#             # Backward pass
#             scaler.scale(scaled_loss).backward()
            
#             # Gradient accumulation step
#             if (i + 1) % config['grad_accum_steps'] == 0:
#                 # Best Practice: Unscale gradients before clipping
#                 scaler.unscale_(optimizer)
                
#                 # Best Practice: Clip gradients to prevent explosion
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
                
#                 # Optimizer and scheduler step
#                 scaler.step(optimizer)
#                 scaler.update()
#                 scheduler.step()
#                 optimizer.zero_grad(set_to_none=True)
                
#                 global_step += 1
                
#                 # Logging
#                 if global_step % config['log_interval_steps'] == 0:
#                     pbar.set_postfix({
#                         "loss": f"{total_loss.item():.4f}",
#                         "avg_loss":f"{avg_loss/(i+1)}",
#                         "task": f"{task_loss.item():.4f}",
#                         "moe": f"{moe_loss.item():.4f}",
#                         "lr": f"{scheduler.get_last_lr()[0]:.6e}"
#                     })
                    
#                 print({
#                         "loss": f"{total_loss.item():.4f}",
#                         "task": f"{task_loss.item():.4f}",
#                         "avg_loss":f"{avg_loss/(i+1)}",
#                         "moe": f"{moe_loss.item():.4f}",
#                         "lr": f"{scheduler.get_last_lr()[0]:.6e}"
#                     })

#                 # Checkpointing
#                 if global_step > 0 and global_step % 50 == 0:
#                     checkpoint_path = os.path.join(config['output_dir'], f"step_lightning_{global_step}.pt")
#                     torch.save({
#                         "step": global_step,
#                         "model_state_dict": model.state_dict(),
#                         "optimizer_state_dict": optimizer.state_dict(),
#                         "scheduler_state_dict": scheduler.state_dict(),
#                         "scaler_state_dict": scaler.state_dict(),
#                         "config": config
#                     }, checkpoint_path)
#                     print(f"\n✅ Saved checkpoint to {checkpoint_path}")
#                     api.upload_folder(
#                     folder_path="satori_akasha_checkpoints",
#                     repo_id=repo_id,
#                     repo_type="model"
#                     )
#     print("Successfully uploaded checkpoints to Hugging Face Hub!")

#     checkpoint_path = os.path.join(config['output_dir'], f"step_final_{global_step}.pt")
#     torch.save({
#                         "step": global_step,
#                         "model_state_dict": model.state_dict(),
#                         "optimizer_state_dict": optimizer.state_dict(),
#                         "scheduler_state_dict": scheduler.state_dict(),
#                         "scaler_state_dict": scaler.state_dict(),
#                         "config": config
#                     }, checkpoint_path)
#     api.upload_folder(
#     folder_path="satori_akasha_checkpoints",
#     repo_id=repo_id,
#     repo_type="model"
#     )
#     print("Successfully uploaded checkpoints to Hugging Face Hub!")
#     print(f"\n✅ Saved checkpoint to {checkpoint_path}")
            

#     print("🎉 Training finished.")

In [9]:
dest_path

'./step_final_4769.pt'