In [1]:
import torch

print("="*80)
print("CUDA DIAGNOSTICS")
print("="*80)

# Check CUDA availability
print(f"\nCUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    
    for i in range(torch.cuda.device_count()):
        print(f"\n--- GPU {i} ---")
        print(f"Name: {torch.cuda.get_device_name(i)}")
        print(f"Capability: {torch.cuda.get_device_capability(i)}")
        
        # Memory info
        props = torch.cuda.get_device_properties(i)
        print(f"Total Memory: {props.total_memory / 1024**3:.2f} GB")
        print(f"Available Memory: {torch.cuda.mem_get_info(i)[0] / 1024**3:.2f} GB")
        print(f"Allocated Memory: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        
        # Test tensor creation
        try:
            test_tensor = torch.randn(100, 100).cuda(i)
            print(f"‚úì Can create tensors on GPU {i}")
        except Exception as e:
            print(f"‚úó Error creating tensor: {e}")
else:
    print("\n‚úó CUDA is NOT available!")


CUDA DIAGNOSTICS

CUDA Available: True
CUDA Version: 12.1
PyTorch Version: 2.5.1+cu121
Number of GPUs: 1

--- GPU 0 ---
Name: NVIDIA RTX A1000
Capability: (8, 6)
Total Memory: 8.00 GB
Available Memory: 7.03 GB
Allocated Memory: 0.00 GB
‚úì Can create tensors on GPU 0


In [1]:
import torch
import gc

# Kill all CUDA processes
torch.cuda.empty_cache()
gc.collect()

# Reset peak memory stats
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

print(f"Available Memory: {torch.cuda.mem_get_info(0)[0] / 1024**3:.2f} GB")

Available Memory: 7.03 GB


In [2]:
# Core dependencies
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

!pip install torchaudio transformers einops tqdm descript-audio-codec
#audiotools



In [4]:
!pip install --retries 10 --timeout 30 descript-audio-codec
!pip install --retries 10 --timeout 30 git+https://github.com/descriptinc/audiotools

Collecting descript-audio-codec
  Using cached descript_audio_codec-1.0.0-py3-none-any.whl.metadata (7.8 kB)
Collecting argbind>=0.3.7 (from descript-audio-codec)
  Downloading argbind-0.3.9.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting descript-audiotools>=0.7.2 (from descript-audio-codec)
  Downloading descript_audiotools-0.7.2-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting docstring-parser (from argbind>=0.3.7->descript-audio-codec)
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pyloudnorm (from descript-audiotools>=0.7.2->descript-audio-codec)
  Downloading pyloudnorm-0.1.1-py3-none-any.whl.metadata (5.6 kB)
Collecting importlib-resources

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
databricks-sdk 0.70.0 requires protobuf<7.0,>=4.21.0, but you have protobuf 3.19.6 which is incompatible.
opentelemetry-proto 1.38.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.19.6 which is incompatible.
ray 2.52.1 requires protobuf>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tensorflow 2.20.0 requires protobuf>=5.28.0, but you have protobuf 3.19.6 which is incompatible.


Collecting git+https://github.com/descriptinc/audiotools
  Cloning https://github.com/descriptinc/audiotools to c:\users\user\appdata\local\temp\pip-req-build-5_d7i_qg


  Running command git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools 'C:\Users\user\AppData\Local\Temp\pip-req-build-5_d7i_qg'
  fatal: unable to access 'https://github.com/descriptinc/audiotools/': Could not resolve host: github.com
  error: subprocess-exited-with-error
  
  git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools 'C:\Users\user\AppData\Local\Temp\pip-req-build-5_d7i_qg' did not run successfully.
  exit code: 128
  
  No available output.
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
ERROR: Failed to build 'git+https://github.com/descriptinc/audiotools' when git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools 'c:\users\user\appdata\local\temp\pip-req-build-5_d7i_qg'


In [6]:
# In notebook:
!pip install "C:/Users/user/.cache/dac/protobuf-5.28.3-cp310-abi3-win_amd64.whl" --force-reinstal

Processing c:\users\user\.cache\dac\protobuf-5.28.3-cp310-abi3-win_amd64.whl
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.28.3
    Uninstalling protobuf-5.28.3:
      Successfully uninstalled protobuf-5.28.3
Successfully installed protobuf-5.28.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
descript-audiotools 0.7.2 requires protobuf<3.20,>=3.9.2, but you have protobuf 5.28.3 which is incompatible.


In [1]:
import os

dac_model_path = "C:/Users/user/.cache/dac/weights_44khz_16kbps.pth"

if os.path.exists(dac_model_path):
    print("‚úÖ DAC model found! You can train!")
    print(f"   Location: {dac_model_path}")
    file_size = os.path.getsize(dac_model_path) / (1024 * 1024)
    print(f"   Size: {file_size:.2f} MB")
else:
    print("‚ùå DAC model NOT found!")
    print(f"   Expected location: {dac_model_path}")
    print("\nüì• You need to download it first!")

‚úÖ DAC model found! You can train!
   Location: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
   Size: 245.08 MB


In [1]:
"""
Production-Ready Audio Effect Generator using DAC-VAE (No AudioTools)
Path B: High-quality audio generation with stable training

This implementation uses Descript Audio Codec (DAC) with VAE for:
- Clean latent space manipulation
- Stable gradient flow
- Production-quality audio synthesis

NO AUDIOTOOLS DEPENDENCY - Uses DAC encoder/decoder directly!
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from einops import rearrange
import matplotlib.pyplot as plt
import json
import numpy as np
from tqdm import tqdm

# DAC import (NO audiotools needed!)
try:
    import dac
    print("‚úì DAC library imported successfully")
except ImportError:
    print("‚ùå DAC not installed. Run: pip install descript-audio-codec")
    exit(1)

#############################################
#                 CONFIG
#############################################

class CFG:
    # Paths - KEEP YOUR PATHS
    csv_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/25000_datapoints.csv"
    base_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA"
    checkpoint_path = f"{base_path}/result_DAC/model.pt"
    best_model_path = f"{base_path}/result_DAC/model_best.pt"
    plot_path = f"{base_path}/result_DAC/training_curves.png"
    
    # Create result directory if needed
    os.makedirs(f"{base_path}/result_DAC", exist_ok=True)
    
    # Columns - SAME AS YOURS
    audio_col_in = "input_audio_path"
    audio_col_out = "output_audio_path"
    text_col = "prompt"
    
    # Audio settings
    sample_rate = 44100  # DAC uses 44.1kHz (better quality than 24kHz)
    max_audio_length = 5 * 44100  # 5 seconds at 44.1kHz
    
    # DAC Model settings
    dac_model_path = "44khz"  # Options: "16khz", "24khz", "44khz"
    
    # Training - OPTIMIZED FOR DAC
    batch_size = 2  # Start small due to 44kHz
    accumulation_steps = 4  # Effective batch = 8
    epochs = 40  # More epochs for production quality
    
    # Learning rates - TUNED FOR DAC
    lr_unet = 1e-5  # UNet learning rate
    lr_text = 5e-7  # Text encoder learning rate (frozen mostly)
    weight_decay = 0.01
    grad_clip = 1.0
    
    # Loss weights
    audio_loss_weight = 1.0  # Waveform reconstruction
    latent_loss_weight = 0.1  # Latent space matching
    
    # Mixed precision
    use_amp = True
    
    # Logging
    log_interval = 100  # Print every 100 steps
    
    # UNet architecture - OPTIMIZED FOR DAC LATENTS
    unet_channels = [64, 128, 256, 512]  # Deeper for better quality
    text_dim = 768  # BERT hidden size
    
    # Data splits
    train_ratio = 0.7
    val_ratio = 0.15
    test_ratio = 0.15
    
    # Freezing options
    freeze_text_encoder = True  # Set False to fine-tune BERT
    freeze_dac = True  # ALWAYS keep True (don't touch DAC)
    
    # Device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Num workers
    num_workers = 0  # Windows compatibility

cfg = CFG()

print("="*60)
print("DAC-VAE AUDIO EFFECT GENERATOR (No AudioTools)")
print("="*60)
print(f"Device: {cfg.device}")
print(f"Sample Rate: {cfg.sample_rate} Hz")
print(f"Max Length: {cfg.max_audio_length / cfg.sample_rate:.1f} seconds")
print(f"Batch Size: {cfg.batch_size} x {cfg.accumulation_steps} = {cfg.batch_size * cfg.accumulation_steps}")
print("="*60 + "\n")

#############################################
#          LOAD DAC MODEL
#############################################

print("Loading DAC model...")

# Load from manually downloaded file (for offline use)
dac_model_path = "C:/Users/user/.cache/dac/weights_44khz_16kbps.pth"

if not os.path.exists(dac_model_path):
    print(f"\n‚ùå DAC model not found at: {dac_model_path}")
    print("\n" + "="*60)
    print("MANUAL DOWNLOAD REQUIRED")
    print("="*60)
    print("\nüì• Download Instructions:")
    print("\n1. Go to: https://github.com/descriptinc/descript-audio-codec/releases/tag/1.0.0")
    print("2. Download: weights_44khz_16kbps.pth (245 MB)")
    print(f"3. Save to: {dac_model_path}")
    print("\nüí° TIP: Use mobile hotspot if you have network/DNS issues!")
    print("="*60 + "\n")
    exit(1)

print(f"‚úì Loading from: {dac_model_path}")
dac_model = dac.DAC.load(dac_model_path)
dac_model = dac_model.to(cfg.device)
dac_model.eval()

# Freeze DAC encoder and decoder (we only train UNet)
for param in dac_model.parameters():
    param.requires_grad = False

print("‚úì DAC model loaded and frozen")

# Get DAC latent dimensions
with torch.no_grad():
    dummy_audio = torch.randn(1, 1, cfg.sample_rate).to(cfg.device)
    # Use encoder directly (no audiotools needed!)
    z = dac_model.encoder(dummy_audio)
    latent_channels = z.shape[1]
    latent_time_reduction = dummy_audio.shape[-1] // z.shape[-1]
    
print(f"‚úì DAC Latent Channels: {latent_channels}")
print(f"‚úì Time Reduction Factor: {latent_time_reduction}x")
print()

#############################################
#      DATASET LOADING & PREPARATION
#############################################

print("="*60)
print("LOADING DATASET")
print("="*60)

df = pd.read_csv(cfg.csv_path)
print(f"Original dataset: {len(df)} samples")

# Fix paths - SAME AS YOUR CODE
for col in [cfg.audio_col_in, cfg.audio_col_out]:
    df[col] = df[col].apply(lambda p: os.path.join(cfg.base_path, str(p).replace('\\', '/')))

# Validate files exist
print("\nValidating files...")
valid_indices = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Validating"):
    if os.path.exists(row[cfg.audio_col_in]) and os.path.exists(row[cfg.audio_col_out]):
        valid_indices.append(idx)

df = df.iloc[valid_indices].reset_index(drop=True)
print(f"‚úì Valid samples: {len(df)}")

# Split dataset - SAME AS YOURS
train_df, temp_df = train_test_split(df, test_size=(cfg.val_ratio + cfg.test_ratio), random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=cfg.test_ratio/(cfg.val_ratio + cfg.test_ratio), random_state=42)

print(f"\nDataset splits:")
print(f"  Train:      {len(train_df)} samples ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Validation: {len(val_df)} samples ({len(val_df)/len(df)*100:.1f}%)")
print(f"  Test:       {len(test_df)} samples ({len(test_df)/len(df)*100:.1f}%)")
print()

#############################################
#      TOKENIZER
#############################################

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("‚úì Tokenizer loaded\n")

#############################################
#      DATASET CLASS
#############################################

class AudioEffectDataset(Dataset):
    """Dataset for audio effect generation using DAC (no audiotools)"""
    
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    
    def __len__(self):
        return len(self.df)
    
    def _load_and_process(self, path):
        """Load audio - returns torch tensor directly"""
        # Load audio
        wav, sr = torchaudio.load(path)
        
        # Resample if needed
        if sr != cfg.sample_rate:
            wav = torchaudio.functional.resample(wav, sr, cfg.sample_rate)
        
        # Convert to mono
        if wav.size(0) > 1:
            wav = wav.mean(dim=0, keepdim=True)
        
        # Trim or pad to max_length
        if wav.size(1) > cfg.max_audio_length:
            wav = wav[:, :cfg.max_audio_length]
        elif wav.size(1) < cfg.max_audio_length:
            wav = F.pad(wav, (0, cfg.max_audio_length - wav.size(1)))
        
        return wav
    
    def __getitem__(self, idx):
        try:
            row = self.df.iloc[idx]
            wav_in = self._load_and_process(row[cfg.audio_col_in])
            wav_out = self._load_and_process(row[cfg.audio_col_out])
            text = row[cfg.text_col]
            
            return wav_in, wav_out, text
        
        except Exception as e:
            print(f"Error loading sample {idx}: {e}")
            # Return zeros as fallback
            return (
                torch.zeros(1, cfg.max_audio_length),
                torch.zeros(1, cfg.max_audio_length),
                "error loading audio"
            )

def collate_fn(batch):
    """Collate function for batching"""
    wav_in, wav_out, texts = zip(*batch)
    
    # Stack waveforms (already same length from dataset)
    wav_in = torch.stack(wav_in)
    wav_out = torch.stack(wav_out)
    
    # Tokenize texts
    tokens = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    return wav_in, wav_out, tokens.input_ids, tokens.attention_mask

#############################################
#      CREATE DATALOADERS
#############################################

print("="*60)
print("CREATING DATALOADERS")
print("="*60)

train_ds = AudioEffectDataset(train_df)
val_ds = AudioEffectDataset(val_df)
test_ds = AudioEffectDataset(test_df)

train_dl = DataLoader(
    train_ds,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=cfg.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)

val_dl = DataLoader(
    val_ds,
    batch_size=cfg.batch_size,
    shuffle=False,
    num_workers=cfg.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)

test_dl = DataLoader(
    test_ds,
    batch_size=cfg.batch_size,
    shuffle=False,
    num_workers=cfg.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)

print(f"Batches per epoch:")
print(f"  Train: {len(train_dl)} batches")
print(f"  Val:   {len(val_dl)} batches")
print(f"  Test:  {len(test_dl)} batches")
print()

#############################################
#      MODEL ARCHITECTURE
#############################################

class CrossAttention(nn.Module):
    """Cross-attention between audio latents and text embeddings"""
    
    def __init__(self, audio_dim, text_dim, n_heads=8):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (audio_dim // n_heads) ** -0.5
        
        self.to_q = nn.Linear(audio_dim, audio_dim)
        self.to_k = nn.Linear(text_dim, audio_dim)
        self.to_v = nn.Linear(text_dim, audio_dim)
        self.to_out = nn.Linear(audio_dim, audio_dim)
        
    def forward(self, x, context):
        """
        x: (B, C, T) - audio features
        context: (B, S, D) - text embeddings
        """
        B, C, T = x.shape
        x_flat = rearrange(x, 'b c t -> b t c')
        
        q = self.to_q(x_flat)
        k = self.to_k(context)
        v = self.to_v(context)
        
        q = rearrange(q, 'b t (h d) -> b h t d', h=self.n_heads)
        k = rearrange(k, 'b s (h d) -> b h s d', h=self.n_heads)
        v = rearrange(v, 'b s (h d) -> b h s d', h=self.n_heads)
        
        attn = torch.einsum('bhqd,bhkd->bhqk', q, k) * self.scale
        attn = F.softmax(attn, dim=-1)
        
        out = torch.einsum('bhqk,bhvd->bhqd', attn, v)
        out = rearrange(out, 'b h t d -> b t (h d)')
        out = self.to_out(out)
        
        return rearrange(out, 'b t c -> b c t')

class ResidualBlock(nn.Module):
    """Residual block with group normalization"""
    
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv1d(channels, channels, 3, padding=1)
        self.conv2 = nn.Conv1d(channels, channels, 3, padding=1)
        self.norm1 = nn.GroupNorm(8, channels)
        self.norm2 = nn.GroupNorm(8, channels)
        self.act = nn.SiLU()
        
    def forward(self, x):
        residual = x
        x = self.act(self.norm1(self.conv1(x)))
        x = self.act(self.norm2(self.conv2(x)))
        return x + residual

class DownBlock(nn.Module):
    """Downsampling block with optional cross-attention"""
    
    def __init__(self, in_c, out_c, text_dim=768, use_attn=False):
        super().__init__()
        self.use_attn = use_attn
        
        self.conv = nn.Conv1d(in_c, out_c, 3, padding=1)
        self.res1 = ResidualBlock(out_c)
        self.res2 = ResidualBlock(out_c)
        
        if use_attn:
            self.attn = CrossAttention(out_c, text_dim)
        
        self.downsample = nn.Conv1d(out_c, out_c, 4, stride=2, padding=1)
        
    def forward(self, x, text_emb=None):
        x = self.conv(x)
        x = self.res1(x)
        x = self.res2(x)
        
        if self.use_attn and text_emb is not None:
            x = x + self.attn(x, text_emb)
        
        skip = x
        x = self.downsample(x)
        return x, skip

class UpBlock(nn.Module):
    """Upsampling block with skip connections and optional cross-attention"""
    
    def __init__(self, in_c, out_c, skip_c, text_dim=768, use_attn=False):
        super().__init__()
        self.use_attn = use_attn
        
        self.upsample = nn.ConvTranspose1d(in_c, out_c, 4, stride=2, padding=1)
        self.conv = nn.Conv1d(out_c + skip_c, out_c, 3, padding=1)
        self.res1 = ResidualBlock(out_c)
        self.res2 = ResidualBlock(out_c)
        
        if use_attn:
            self.attn = CrossAttention(out_c, text_dim)
        
    def forward(self, x, skip, text_emb=None):
        x = self.upsample(x)
        
        # Match temporal dimensions
        if x.size(-1) != skip.size(-1):
            x = F.interpolate(x, size=skip.size(-1), mode='linear', align_corners=False)
        
        x = torch.cat([x, skip], dim=1)
        x = self.conv(x)
        x = self.res1(x)
        x = self.res2(x)
        
        if self.use_attn and text_emb is not None:
            x = x + self.attn(x, text_emb)
        
        return x

class LatentUNet(nn.Module):
    """UNet for manipulating DAC latent space"""
    
    def __init__(self, latent_channels, channels, text_dim=768):
        super().__init__()
        
        # Input projection
        self.input_conv = nn.Conv1d(latent_channels, channels[0], 7, padding=3)
        
        # Encoder
        self.down_blocks = nn.ModuleList()
        for i in range(len(channels) - 1):
            use_attn = i >= 2  # Add attention in deeper layers
            self.down_blocks.append(
                DownBlock(channels[i], channels[i+1], text_dim, use_attn)
            )
        
        # Bottleneck
        self.mid_block1 = ResidualBlock(channels[-1])
        self.mid_attn = CrossAttention(channels[-1], text_dim)
        self.mid_block2 = ResidualBlock(channels[-1])
        
        # Decoder
        self.up_blocks = nn.ModuleList()
        for i in range(len(channels) - 1, 0, -1):
            use_attn = i >= 2
            self.up_blocks.append(
                UpBlock(
                    in_c=channels[i],
                    out_c=channels[i-1],
                    skip_c=channels[i],
                    text_dim=text_dim,
                    use_attn=use_attn
                )
            )
        
        # Output projection
        self.output_conv = nn.Conv1d(channels[0], latent_channels, 7, padding=3)
        
    def forward(self, z, text_emb):
        """
        z: (B, latent_channels, T) - DAC latents
        text_emb: (B, S, text_dim) - text embeddings
        """
        original_length = z.size(-1)
        
        x = self.input_conv(z)
        
        # Encoder path
        skips = []
        for down in self.down_blocks:
            x, skip = down(x, text_emb)
            skips.append(skip)
        
        # Bottleneck
        x = self.mid_block1(x)
        x = x + self.mid_attn(x, text_emb)
        x = self.mid_block2(x)
        
        # Decoder path
        for up in self.up_blocks:
            skip = skips.pop()
            x = up(x, skip, text_emb)
        
        # Output
        x = self.output_conv(x)
        
        # Ensure output matches input length
        if x.size(-1) != original_length:
            x = F.interpolate(x, size=original_length, mode='linear', align_corners=False)
        
        return x

class AudioEffectModel(nn.Module):
    """Complete model: Text Encoder + UNet + DAC (no audiotools!)"""
    
    def __init__(self, dac_model, latent_channels, unet_channels, text_dim):
        super().__init__()
        
        # Text encoder (BERT)
        self.text_encoder = AutoModel.from_pretrained("bert-base-uncased")
        
        # Freeze/unfreeze text encoder based on config
        if cfg.freeze_text_encoder:
            for param in self.text_encoder.parameters():
                param.requires_grad = False
            print("Text encoder: FROZEN ‚ùÑÔ∏è")
        else:
            for param in self.text_encoder.parameters():
                param.requires_grad = True
            print("Text encoder: TRAINABLE üî• (fine-tuning enabled)")
        
        # DAC model (frozen)
        self.dac = dac_model
        
        # UNet (trainable)
        self.unet = LatentUNet(latent_channels, unet_channels, text_dim)
        
    def forward(self, wav_in, wav_out, input_ids, attention_mask):
        """
        Forward pass with input validation (NO AUDIOTOOLS!)
        
        Returns:
            wav_pred: Predicted output waveform
            z_pred: Predicted latent
            z_target: Target latent
        """
        # Check for NaN in inputs
        if torch.isnan(wav_in).any() or torch.isnan(wav_out).any():
            return None, None, None
        
        # Encode text
        text_output = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_emb = text_output.last_hidden_state  # (B, S, 768)
        
        # Encode audio to latents using DAC (NO AUDIOTOOLS - direct encoder call!)
        with torch.no_grad():
            z_in = self.dac.encoder(wav_in)
            z_target = self.dac.encoder(wav_out)
        
        # Check for NaN in latents
        if torch.isnan(z_in).any() or torch.isnan(z_target).any():
            print("‚ö†Ô∏è NaN detected in DAC encoding")
            return None, None, None
        
        # Process with UNet
        z_pred = self.unet(z_in, text_emb)
        
        # Check for NaN in prediction
        if torch.isnan(z_pred).any():
            print("‚ö†Ô∏è NaN detected in UNet output")
            return None, None, None
        
        # Decode latents to waveform (NO AUDIOTOOLS - direct decoder call!)
        with torch.no_grad():
            # Decode directly - decoder only needs the latents!
            wav_pred = self.dac.decoder(z_pred)
        
        # Check for NaN in decoded audio
        if torch.isnan(wav_pred).any():
            print("‚ö†Ô∏è NaN detected in DAC decoding")
            return None, None, None
        
        return wav_pred, z_pred, z_target

def init_weights(m):
    """Initialize weights with small values for stability"""
    if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        m.weight.data *= 0.1  # Scale down for stability
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight, gain=0.02)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.GroupNorm):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)

#############################################
#     MODEL INITIALIZATION
#############################################

print("="*60)
print("INITIALIZING MODEL")
print("="*60)

model = AudioEffectModel(
    dac_model=dac_model,
    latent_channels=latent_channels,
    unet_channels=cfg.unet_channels,
    text_dim=cfg.text_dim
).to(cfg.device)

# Initialize UNet weights
model.unet.apply(init_weights)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Frozen parameters: {total_params - trainable_params:,}")
print(f"UNet channels: {cfg.unet_channels}")
print(f"Latent channels: {latent_channels}")
print()

#############################################
#     OPTIMIZER & LOSS
#############################################

# Optimizer based on freeze_text_encoder setting
if cfg.freeze_text_encoder:
    # Only optimize UNet
    optimizer = torch.optim.AdamW(
        model.unet.parameters(),
        lr=cfg.lr_unet,
        weight_decay=cfg.weight_decay
    )
else:
    # Optimize UNet + Text Encoder
    optimizer = torch.optim.AdamW([
        {"params": model.unet.parameters(), "lr": cfg.lr_unet},
        {"params": model.text_encoder.parameters(), "lr": cfg.lr_text},
    ], weight_decay=cfg.weight_decay)

# Cosine annealing scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=cfg.epochs * len(train_dl),
    eta_min=cfg.lr_unet * 0.1
)

# Loss functions
criterion_audio = nn.L1Loss()
criterion_latent = nn.MSELoss()

# Mixed precision scaler
scaler = torch.amp.GradScaler('cuda', enabled=cfg.use_amp)

print("="*60)
print("TRAINING SETUP")
print("="*60)
print(f"Optimizer: AdamW")
print(f"Learning rate: {cfg.lr_unet}")
print(f"Scheduler: CosineAnnealingLR")
print(f"Loss: L1 (audio) + MSE (latent)")
print(f"Mixed precision: {cfg.use_amp}")
print()

#############################################
#     TRAINING & VALIDATION FUNCTIONS
#############################################

def train_epoch(model, dataloader, optimizer, scheduler, scaler, epoch):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    total_audio_loss = 0
    total_latent_loss = 0
    nan_count = 0
    
    optimizer.zero_grad()
    
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{cfg.epochs}")
    
    for step, (wav_in, wav_out, ids, mask) in enumerate(pbar):
        wav_in = wav_in.to(cfg.device)
        wav_out = wav_out.to(cfg.device)
        ids = ids.to(cfg.device)
        mask = mask.to(cfg.device)
        
        # Check input
        if torch.isnan(wav_in).any() or torch.isnan(wav_out).any():
            print(f"‚ö†Ô∏è NaN in input at step {step}, skipping...")
            nan_count += 1
            continue
        
        with torch.amp.autocast('cuda', enabled=cfg.use_amp):
            # Forward pass
            wav_pred, z_pred, z_target = model(wav_in, wav_out, ids, mask)
            
            # Check for None (indicates NaN in forward pass)
            if wav_pred is None:
                nan_count += 1
                continue
            
            # Match lengths
            if wav_pred.size(-1) != wav_out.size(-1):
                min_len = min(wav_pred.size(-1), wav_out.size(-1))
                wav_pred = wav_pred[..., :min_len]
                wav_out = wav_out[..., :min_len]
            
            if z_pred.size(-1) != z_target.size(-1):
                min_len = min(z_pred.size(-1), z_target.size(-1))
                z_pred = z_pred[..., :min_len]
                z_target = z_target[..., :min_len]
            
            # Compute losses
            audio_loss = criterion_audio(wav_pred, wav_out)
            latent_loss = criterion_latent(z_pred, z_target)
            
            loss = (cfg.audio_loss_weight * audio_loss + 
                   cfg.latent_loss_weight * latent_loss)
            
            # Scale for gradient accumulation
            loss = loss / cfg.accumulation_steps
        
        # Check loss
        if torch.isnan(loss) or torch.isinf(loss):
            print(f"‚ö†Ô∏è NaN/Inf loss at step {step}, skipping...")
            nan_count += 1
            continue
        
        # Backward pass
        scaler.scale(loss).backward()
        
        # Optimizer step (with gradient accumulation)
        if (step + 1) % cfg.accumulation_steps == 0:
            # Unscale gradients
            scaler.unscale_(optimizer)
            
            # Clip gradients
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(),
                cfg.grad_clip
            )
            
            # Check gradient norm
            if torch.isnan(grad_norm) or torch.isinf(grad_norm) or grad_norm > 100:
                print(f"‚ö†Ô∏è Bad gradient (norm={grad_norm:.2f}) at step {step}, skipping...")
                optimizer.zero_grad()
                scaler.update()
                nan_count += 1
                continue
            
            # Update weights
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        
        # Accumulate losses
        total_loss += loss.item() * cfg.accumulation_steps
        total_audio_loss += audio_loss.item()
        total_latent_loss += latent_loss.item()
        
        # Update progress bar
        pbar.set_postfix({
            'loss': f'{loss.item() * cfg.accumulation_steps:.4f}',
            'audio': f'{audio_loss.item():.4f}',
            'latent': f'{latent_loss.item():.4f}',
            'nans': nan_count
        })
        
        # Log every N steps
        if (step + 1) % cfg.log_interval == 0:
            avg_loss = total_loss / (step + 1)
            print(f"\n  Step {step+1}/{len(dataloader)} | "
                  f"Loss: {avg_loss:.6f} | "
                  f"Audio: {total_audio_loss/(step+1):.6f} | "
                  f"Latent: {total_latent_loss/(step+1):.6f} | "
                  f"NaNs: {nan_count}")
    
    if nan_count > 0:
        print(f"\n‚ö†Ô∏è Epoch had {nan_count} NaN occurrences")
    
    avg_loss = total_loss / len(dataloader)
    avg_audio_loss = total_audio_loss / len(dataloader)
    avg_latent_loss = total_latent_loss / len(dataloader)
    
    return avg_loss, avg_audio_loss, avg_latent_loss

@torch.no_grad()
def validate_epoch(model, dataloader, epoch):
    """Validate for one epoch"""
    model.eval()
    total_loss = 0
    total_audio_loss = 0
    total_latent_loss = 0
    
    pbar = tqdm(dataloader, desc=f"Validation {epoch+1}/{cfg.epochs}")
    
    for wav_in, wav_out, ids, mask in pbar:
        wav_in = wav_in.to(cfg.device)
        wav_out = wav_out.to(cfg.device)
        ids = ids.to(cfg.device)
        mask = mask.to(cfg.device)
        
        with torch.amp.autocast('cuda', enabled=cfg.use_amp):
            wav_pred, z_pred, z_target = model(wav_in, wav_out, ids, mask)
            
            if wav_pred is None:
                continue
            
            # Match lengths
            if wav_pred.size(-1) != wav_out.size(-1):
                min_len = min(wav_pred.size(-1), wav_out.size(-1))
                wav_pred = wav_pred[..., :min_len]
                wav_out = wav_out[..., :min_len]
            
            if z_pred.size(-1) != z_target.size(-1):
                min_len = min(z_pred.size(-1), z_target.size(-1))
                z_pred = z_pred[..., :min_len]
                z_target = z_target[..., :min_len]
            
            audio_loss = criterion_audio(wav_pred, wav_out)
            latent_loss = criterion_latent(z_pred, z_target)
            
            loss = (cfg.audio_loss_weight * audio_loss + 
                   cfg.latent_loss_weight * latent_loss)
        
        total_loss += loss.item()
        total_audio_loss += audio_loss.item()
        total_latent_loss += latent_loss.item()
        
        pbar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'audio': f'{audio_loss.item():.4f}',
            'latent': f'{latent_loss.item():.4f}'
        })
    
    avg_loss = total_loss / len(dataloader)
    avg_audio_loss = total_audio_loss / len(dataloader)
    avg_latent_loss = total_latent_loss / len(dataloader)
    
    return avg_loss, avg_audio_loss, avg_latent_loss

#############################################
#     TRAINING LOOP
#############################################

print("="*60)
print("STARTING TRAINING")
print("="*60)
print(f"Total epochs: {cfg.epochs}")
print(f"Steps per epoch: {len(train_dl)}")
print(f"Validation every epoch")
print("="*60 + "\n")

# Training history
train_losses = []
val_losses = []
train_audio_losses = []
train_latent_losses = []
val_audio_losses = []
val_latent_losses = []

best_val_loss = float('inf')
start_epoch = 0

# Resume from checkpoint if exists
if os.path.exists(cfg.checkpoint_path):
    print("Loading checkpoint...")
    ckpt = torch.load(cfg.checkpoint_path, map_location=cfg.device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    scaler.load_state_dict(ckpt['scaler'])
    start_epoch = ckpt['epoch'] + 1
    train_losses = ckpt.get('train_losses', [])
    val_losses = ckpt.get('val_losses', [])
    train_audio_losses = ckpt.get('train_audio_losses', [])
    train_latent_losses = ckpt.get('train_latent_losses', [])
    val_audio_losses = ckpt.get('val_audio_losses', [])
    val_latent_losses = ckpt.get('val_latent_losses', [])
    best_val_loss = ckpt.get('best_val_loss', float('inf'))
    print(f"‚úì Resumed from epoch {start_epoch}")
    print()

# Training loop
for epoch in range(start_epoch, cfg.epochs):
    print(f"\n{'='*60}")
    print(f"EPOCH {epoch+1}/{cfg.epochs}")
    print(f"{'='*60}\n")
    
    # Train
    train_loss, train_audio, train_latent = train_epoch(
        model, train_dl, optimizer, scheduler, scaler, epoch
    )
    
    # Validate
    val_loss, val_audio, val_latent = validate_epoch(
        model, val_dl, epoch
    )
    
    # Store losses
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_audio_losses.append(train_audio)
    train_latent_losses.append(train_latent)
    val_audio_losses.append(val_audio)
    val_latent_losses.append(val_latent)
    
    # Print summary
    print(f"\n{'='*60}")
    print(f"EPOCH {epoch+1}/{cfg.epochs} SUMMARY")
    print(f"{'='*60}")
    print(f"Train Loss:  {train_loss:.6f} (Audio: {train_audio:.6f}, Latent: {train_latent:.6f})")
    print(f"Val Loss:    {val_loss:.6f} (Audio: {val_audio:.6f}, Latent: {val_latent:.6f})")
    print(f"Learning Rate: {scheduler.get_last_lr()[0]:.2e}")
    print(f"{'='*60}\n")
    
    # Save checkpoint
    checkpoint = {
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'scaler': scaler.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_audio_losses': train_audio_losses,
        'train_latent_losses': train_latent_losses,
        'val_audio_losses': val_audio_losses,
        'val_latent_losses': val_latent_losses,
        'best_val_loss': best_val_loss,
        'config': {
            'latent_channels': latent_channels,
            'unet_channels': cfg.unet_channels,
            'text_dim': cfg.text_dim,
            'sample_rate': cfg.sample_rate
        }
    }
    torch.save(checkpoint, cfg.checkpoint_path)
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model': model.state_dict(),
            'val_loss': best_val_loss,
            'config': checkpoint['config']
        }, cfg.best_model_path)
        print(f"‚úÖ NEW BEST MODEL! Val Loss: {best_val_loss:.6f}\n")

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60 + "\n")

#############################################
#     TEST SET EVALUATION
#############################################

print("="*60)
print("TESTING BEST MODEL")
print("="*60 + "\n")

# Load best model
best_ckpt = torch.load(cfg.best_model_path, map_location=cfg.device)
model.load_state_dict(best_ckpt['model'])
print(f"Loaded best model from epoch {best_ckpt['epoch']}")

# Test
test_loss, test_audio, test_latent = validate_epoch(model, test_dl, cfg.epochs)

print(f"\n{'='*60}")
print("FINAL TEST RESULTS")
print(f"{'='*60}")
print(f"Test Loss:  {test_loss:.6f}")
print(f"  Audio Loss:  {test_audio:.6f}")
print(f"  Latent Loss: {test_latent:.6f}")
print(f"{'='*60}\n")

#############################################
#     PLOT TRAINING CURVES
#############################################

print("Generating training curves...")

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Total loss
ax = axes[0, 0]
epochs_range = range(len(train_losses))
ax.plot(epochs_range, train_losses, 'b-', label='Train', linewidth=2, marker='o', markersize=4)
ax.plot(epochs_range, val_losses, 'r-', label='Val', linewidth=2, marker='s', markersize=4)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Total Loss', fontsize=12)
ax.set_title('Total Loss (Audio + Latent)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

# Plot 2: Audio loss
ax = axes[0, 1]
ax.plot(epochs_range, train_audio_losses, 'b-', label='Train', linewidth=2, marker='o', markersize=4)
ax.plot(epochs_range, val_audio_losses, 'r-', label='Val', linewidth=2, marker='s', markersize=4)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Audio Loss (L1)', fontsize=12)
ax.set_title('Audio Reconstruction Loss', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

# Plot 3: Latent loss
ax = axes[1, 0]
ax.plot(epochs_range, train_latent_losses, 'b-', label='Train', linewidth=2, marker='o', markersize=4)
ax.plot(epochs_range, val_latent_losses, 'r-', label='Val', linewidth=2, marker='s', markersize=4)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Latent Loss (MSE)', fontsize=12)
ax.set_title('Latent Space Loss', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

# Plot 4: Generalization gap
ax = axes[1, 1]
gap = [v - t for t, v in zip(train_losses, val_losses)]
ax.plot(epochs_range, gap, 'g-', label='Val - Train', linewidth=2, marker='d', markersize=4)
ax.axhline(y=0, color='k', linestyle='--', alpha=0.3)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Loss Gap', fontsize=12)
ax.set_title('Generalization Gap (Val - Train)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(cfg.plot_path, dpi=300, bbox_inches='tight')
print(f"‚úì Plot saved to: {cfg.plot_path}")
plt.close()

#############################################
#     SAVE SUMMARY
#############################################

summary = {
    'dataset': {
        'total_samples': len(df),
        'train_samples': len(train_df),
        'val_samples': len(val_df),
        'test_samples': len(test_df)
    },
    'training': {
        'epochs': cfg.epochs,
        'batch_size': cfg.batch_size,
        'accumulation_steps': cfg.accumulation_steps,
        'effective_batch_size': cfg.batch_size * cfg.accumulation_steps
    },
    'model': {
        'total_parameters': total_params,
        'trainable_parameters': trainable_params,
        'latent_channels': latent_channels,
        'unet_channels': cfg.unet_channels
    },
    'results': {
        'best_train_loss': float(min(train_losses)),
        'best_val_loss': float(best_val_loss),
        'test_loss': float(test_loss),
        'test_audio_loss': float(test_audio),
        'test_latent_loss': float(test_latent)
    },
    'config': {
        'sample_rate': cfg.sample_rate,
        'max_audio_length': cfg.max_audio_length,
        'lr_unet': cfg.lr_unet,
        'audio_loss_weight': cfg.audio_loss_weight,
        'latent_loss_weight': cfg.latent_loss_weight
    }
}

summary_path = f"{cfg.base_path}/result_DAC/training_summary.json"
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"‚úì Summary saved to: {summary_path}")

print("\n" + "="*60)
print("ALL FILES SAVED")
print("="*60)
print(f"‚úì Best model: {cfg.best_model_path}")
print(f"‚úì Checkpoint: {cfg.checkpoint_path}")
print(f"‚úì Training curves: {cfg.plot_path}")
print(f"‚úì Summary: {summary_path}")
print("="*60)

print("\nüéâ TRAINING PIPELINE COMPLETE! üéâ\n")
print("Next steps:")
print("1. Check training curves for convergence")
print("2. Use inference script to test on new audio")
print("3. Fine-tune hyperparameters if needed")
print("\nGood luck with your production model! üöÄ")

  from .autonotebook import tqdm as notebook_tqdm


‚úì DAC library imported successfully
DAC-VAE AUDIO EFFECT GENERATOR (No AudioTools)
Device: cuda
Sample Rate: 44100 Hz
Max Length: 5.0 seconds
Batch Size: 2 x 4 = 8

Loading DAC model...
‚úì Loading from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth


  model_dict = torch.load(location, "cpu")
  WeightNorm.apply(module, name, dim)


‚úì DAC model loaded and frozen
‚úì DAC Latent Channels: 128
‚úì Time Reduction Factor: 512x

LOADING DATASET
Original dataset: 25000 samples

Validating files...


Validating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25000/25000 [00:06<00:00, 4115.12it/s]


‚úì Valid samples: 25000

Dataset splits:
  Train:      17500 samples (70.0%)
  Validation: 3750 samples (15.0%)
  Test:       3750 samples (15.0%)

Loading tokenizer...


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000168F8A8B5F0>: Failed to resolve \'huggingface.co\' ([Errno 11002] getaddrinfo failed)"))'), '(Request ID: 1eaf8a62-4700-493b-bf8d-92991e403feb)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000168F8B01460>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: ed94bf2f-a9ce-46cf-844e-b4de4eb102b1)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/toke

‚úì Tokenizer loaded

CREATING DATALOADERS
Batches per epoch:
  Train: 8750 batches
  Val:   1875 batches
  Test:  1875 batches

INITIALIZING MODEL


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000168F8B2A990>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 60d11b4c-5c67-4be4-af26-b682444cb9f1)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000168F8B29FD0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 61e1e76d-02ff-4a1f-bb09-6737f949d356)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 4s [Retry 

Text encoder: FROZEN ‚ùÑÔ∏è
Total parameters: 188,877,378
Trainable parameters: 15,181,888
Frozen parameters: 173,695,490
UNet channels: [64, 128, 256, 512]
Latent channels: 128

TRAINING SETUP
Optimizer: AdamW
Learning rate: 1e-05
Scheduler: CosineAnnealingLR
Loss: L1 (audio) + MSE (latent)
Mixed precision: True

STARTING TRAINING
Total epochs: 40
Steps per epoch: 8750
Validation every epoch

Loading checkpoint...


  ckpt = torch.load(cfg.checkpoint_path, map_location=cfg.device)


‚úì Resumed from epoch 38


EPOCH 39/40



Epoch 39/40:   1%|‚ñè             | 100/8750 [01:08<1:37:01,  1.49it/s, loss=0.7671, audio=0.1776, latent=5.8943, nans=0]


  Step 100/8750 | Loss: 0.655026 | Audio: 0.106148 | Latent: 5.488788 | NaNs: 0


Epoch 39/40:   2%|‚ñé             | 200/8750 [02:10<1:27:16,  1.63it/s, loss=0.5594, audio=0.0421, latent=5.1729, nans=0]


  Step 200/8750 | Loss: 0.656523 | Audio: 0.105389 | Latent: 5.511334 | NaNs: 0


Epoch 39/40:   3%|‚ñç             | 300/8750 [03:11<1:29:13,  1.58it/s, loss=0.6196, audio=0.0283, latent=5.9131, nans=0]


  Step 300/8750 | Loss: 0.657436 | Audio: 0.104366 | Latent: 5.530697 | NaNs: 0


Epoch 39/40:   5%|‚ñã             | 400/8750 [04:14<1:24:09,  1.65it/s, loss=0.5588, audio=0.0700, latent=4.8876, nans=0]


  Step 400/8750 | Loss: 0.652667 | Audio: 0.101991 | Latent: 5.506758 | NaNs: 0


Epoch 39/40:   6%|‚ñä             | 500/8750 [05:14<1:26:21,  1.59it/s, loss=0.5925, audio=0.0515, latent=5.4102, nans=0]


  Step 500/8750 | Loss: 0.652828 | Audio: 0.101911 | Latent: 5.509167 | NaNs: 0


Epoch 39/40:   7%|‚ñâ             | 600/8750 [06:14<1:19:28,  1.71it/s, loss=0.7269, audio=0.1790, latent=5.4783, nans=0]


  Step 600/8750 | Loss: 0.652258 | Audio: 0.101260 | Latent: 5.509978 | NaNs: 0


Epoch 39/40:   8%|‚ñà             | 700/8750 [07:13<1:19:58,  1.68it/s, loss=0.5393, audio=0.0296, latent=5.0973, nans=0]


  Step 700/8750 | Loss: 0.652127 | Audio: 0.100551 | Latent: 5.515763 | NaNs: 0


Epoch 39/40:   9%|‚ñà‚ñé            | 800/8750 [08:13<1:17:39,  1.71it/s, loss=0.7872, audio=0.1783, latent=6.0891, nans=0]


  Step 800/8750 | Loss: 0.651316 | Audio: 0.100004 | Latent: 5.513127 | NaNs: 0


Epoch 39/40:  10%|‚ñà‚ñç            | 900/8750 [09:12<1:17:43,  1.68it/s, loss=0.7988, audio=0.1784, latent=6.2033, nans=0]


  Step 900/8750 | Loss: 0.653674 | Audio: 0.102161 | Latent: 5.515129 | NaNs: 0


Epoch 39/40:  11%|‚ñà‚ñç           | 1000/8750 [10:11<1:15:51,  1.70it/s, loss=0.6036, audio=0.0526, latent=5.5102, nans=0]


  Step 1000/8750 | Loss: 0.654521 | Audio: 0.102428 | Latent: 5.520933 | NaNs: 0


Epoch 39/40:  13%|‚ñà‚ñã           | 1100/8750 [11:11<1:18:39,  1.62it/s, loss=0.5451, audio=0.0489, latent=4.9621, nans=0]


  Step 1100/8750 | Loss: 0.653945 | Audio: 0.101856 | Latent: 5.520893 | NaNs: 0


Epoch 39/40:  14%|‚ñà‚ñä           | 1200/8750 [12:11<1:15:51,  1.66it/s, loss=0.7470, audio=0.1975, latent=5.4954, nans=0]


  Step 1200/8750 | Loss: 0.653423 | Audio: 0.101453 | Latent: 5.519700 | NaNs: 0


Epoch 39/40:  15%|‚ñà‚ñâ           | 1300/8750 [13:11<1:14:43,  1.66it/s, loss=0.7169, audio=0.1818, latent=5.3508, nans=0]


  Step 1300/8750 | Loss: 0.653155 | Audio: 0.100954 | Latent: 5.522008 | NaNs: 0


Epoch 39/40:  16%|‚ñà‚ñà           | 1400/8750 [14:10<1:15:27,  1.62it/s, loss=0.9020, audio=0.3248, latent=5.7717, nans=0]


  Step 1400/8750 | Loss: 0.653110 | Audio: 0.101203 | Latent: 5.519077 | NaNs: 0


Epoch 39/40:  17%|‚ñà‚ñà‚ñè          | 1500/8750 [15:10<1:10:29,  1.71it/s, loss=0.7754, audio=0.1832, latent=5.9219, nans=0]


  Step 1500/8750 | Loss: 0.652877 | Audio: 0.100768 | Latent: 5.521086 | NaNs: 0


Epoch 39/40:  18%|‚ñà‚ñà‚ñç          | 1600/8750 [16:09<1:11:02,  1.68it/s, loss=0.5148, audio=0.0420, latent=4.7275, nans=0]


  Step 1600/8750 | Loss: 0.652647 | Audio: 0.100801 | Latent: 5.518460 | NaNs: 0


Epoch 39/40:  19%|‚ñà‚ñà‚ñå          | 1700/8750 [17:08<1:09:00,  1.70it/s, loss=0.5169, audio=0.0459, latent=4.7099, nans=0]


  Step 1700/8750 | Loss: 0.652809 | Audio: 0.101159 | Latent: 5.516495 | NaNs: 0


Epoch 39/40:  21%|‚ñà‚ñà‚ñã          | 1800/8750 [18:08<1:10:37,  1.64it/s, loss=0.5215, audio=0.0679, latent=4.5352, nans=0]


  Step 1800/8750 | Loss: 0.652968 | Audio: 0.101363 | Latent: 5.516049 | NaNs: 0


Epoch 39/40:  22%|‚ñà‚ñà‚ñä          | 1900/8750 [19:07<1:09:23,  1.65it/s, loss=0.5877, audio=0.0565, latent=5.3121, nans=0]


  Step 1900/8750 | Loss: 0.652446 | Audio: 0.100841 | Latent: 5.516050 | NaNs: 0


Epoch 39/40:  23%|‚ñà‚ñà‚ñâ          | 2000/8750 [20:06<1:09:02,  1.63it/s, loss=0.5228, audio=0.0677, latent=4.5513, nans=0]


  Step 2000/8750 | Loss: 0.652670 | Audio: 0.101080 | Latent: 5.515900 | NaNs: 0


Epoch 39/40:  24%|‚ñà‚ñà‚ñà          | 2100/8750 [21:04<1:03:39,  1.74it/s, loss=0.5676, audio=0.0252, latent=5.4244, nans=0]


  Step 2100/8750 | Loss: 0.652782 | Audio: 0.101024 | Latent: 5.517577 | NaNs: 0


Epoch 39/40:  25%|‚ñà‚ñà‚ñà‚ñé         | 2200/8750 [22:03<1:04:08,  1.70it/s, loss=0.7712, audio=0.1790, latent=5.9223, nans=0]


  Step 2200/8750 | Loss: 0.652988 | Audio: 0.100948 | Latent: 5.520391 | NaNs: 0


Epoch 39/40:  26%|‚ñà‚ñà‚ñà‚ñç         | 2300/8750 [23:02<1:04:15,  1.67it/s, loss=0.7361, audio=0.1987, latent=5.3741, nans=0]


  Step 2300/8750 | Loss: 0.653831 | Audio: 0.101343 | Latent: 5.524871 | NaNs: 0


Epoch 39/40:  27%|‚ñà‚ñà‚ñà‚ñå         | 2400/8750 [24:00<1:03:36,  1.66it/s, loss=0.7120, audio=0.1842, latent=5.2775, nans=0]


  Step 2400/8750 | Loss: 0.653901 | Audio: 0.101469 | Latent: 5.524324 | NaNs: 0


Epoch 39/40:  29%|‚ñà‚ñà‚ñà‚ñà‚ñé          | 2500/8750 [24:59<59:53,  1.74it/s, loss=0.7327, audio=0.1774, latent=5.5526, nans=0]


  Step 2500/8750 | Loss: 0.654104 | Audio: 0.101858 | Latent: 5.522460 | NaNs: 0


Epoch 39/40:  30%|‚ñà‚ñà‚ñà‚ñä         | 2600/8750 [25:58<1:01:07,  1.68it/s, loss=0.6703, audio=0.1780, latent=4.9227, nans=0]


  Step 2600/8750 | Loss: 0.654196 | Audio: 0.101872 | Latent: 5.523232 | NaNs: 0


Epoch 39/40:  31%|‚ñà‚ñà‚ñà‚ñà‚ñã          | 2700/8750 [26:56<58:37,  1.72it/s, loss=0.6076, audio=0.0332, latent=5.7443, nans=0]


  Step 2700/8750 | Loss: 0.654321 | Audio: 0.102231 | Latent: 5.520900 | NaNs: 0


Epoch 39/40:  32%|‚ñà‚ñà‚ñà‚ñà‚ñä          | 2800/8750 [27:55<59:26,  1.67it/s, loss=0.7176, audio=0.1764, latent=5.4124, nans=0]


  Step 2800/8750 | Loss: 0.653927 | Audio: 0.101985 | Latent: 5.519415 | NaNs: 0


Epoch 39/40:  33%|‚ñà‚ñà‚ñà‚ñà‚ñâ          | 2900/8750 [28:54<56:12,  1.73it/s, loss=0.7521, audio=0.1873, latent=5.6485, nans=0]


  Step 2900/8750 | Loss: 0.654036 | Audio: 0.102323 | Latent: 5.517124 | NaNs: 0


Epoch 39/40:  34%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè         | 3000/8750 [29:53<56:14,  1.70it/s, loss=0.5797, audio=0.0514, latent=5.2838, nans=0]


  Step 3000/8750 | Loss: 0.653022 | Audio: 0.101614 | Latent: 5.514079 | NaNs: 0


Epoch 39/40:  35%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé         | 3100/8750 [30:53<58:13,  1.62it/s, loss=0.8975, audio=0.3238, latent=5.7372, nans=0]


  Step 3100/8750 | Loss: 0.652935 | Audio: 0.101606 | Latent: 5.513285 | NaNs: 0


Epoch 39/40:  37%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç         | 3200/8750 [31:52<55:08,  1.68it/s, loss=0.5995, audio=0.0498, latent=5.4970, nans=0]


  Step 3200/8750 | Loss: 0.653315 | Audio: 0.101935 | Latent: 5.513801 | NaNs: 0


Epoch 39/40:  38%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã         | 3300/8750 [32:51<52:06,  1.74it/s, loss=0.5811, audio=0.0561, latent=5.2500, nans=0]


  Step 3300/8750 | Loss: 0.652990 | Audio: 0.101667 | Latent: 5.513232 | NaNs: 0


Epoch 39/40:  39%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä         | 3400/8750 [33:50<52:00,  1.71it/s, loss=0.5316, audio=0.0425, latent=4.8912, nans=0]


  Step 3400/8750 | Loss: 0.653040 | Audio: 0.101699 | Latent: 5.513408 | NaNs: 0


Epoch 39/40:  40%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà         | 3500/8750 [34:49<51:50,  1.69it/s, loss=0.5593, audio=0.0387, latent=5.2067, nans=0]


  Step 3500/8750 | Loss: 0.653262 | Audio: 0.102081 | Latent: 5.511813 | NaNs: 0


Epoch 39/40:  41%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè        | 3600/8750 [35:47<49:14,  1.74it/s, loss=0.5744, audio=0.0312, latent=5.4316, nans=0]


  Step 3600/8750 | Loss: 0.653498 | Audio: 0.102290 | Latent: 5.512084 | NaNs: 0


Epoch 39/40:  42%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé        | 3700/8750 [36:47<48:44,  1.73it/s, loss=0.5672, audio=0.0473, latent=5.1986, nans=0]


  Step 3700/8750 | Loss: 0.653478 | Audio: 0.102208 | Latent: 5.512696 | NaNs: 0


Epoch 39/40:  43%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå        | 3800/8750 [37:45<47:33,  1.73it/s, loss=0.7452, audio=0.1958, latent=5.4934, nans=0]


  Step 3800/8750 | Loss: 0.653112 | Audio: 0.102156 | Latent: 5.509560 | NaNs: 0


Epoch 39/40:  45%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã        | 3900/8750 [38:44<48:19,  1.67it/s, loss=0.7642, audio=0.1832, latent=5.8106, nans=0]


  Step 3900/8750 | Loss: 0.653315 | Audio: 0.102317 | Latent: 5.509985 | NaNs: 0


Epoch 39/40:  46%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä        | 4000/8750 [39:43<47:02,  1.68it/s, loss=0.5490, audio=0.0581, latent=4.9092, nans=0]


  Step 4000/8750 | Loss: 0.653214 | Audio: 0.102249 | Latent: 5.509643 | NaNs: 0


Epoch 39/40:  47%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà        | 4100/8750 [40:42<45:40,  1.70it/s, loss=0.7184, audio=0.1761, latent=5.4234, nans=0]


  Step 4100/8750 | Loss: 0.653237 | Audio: 0.102384 | Latent: 5.508531 | NaNs: 0


Epoch 39/40:  48%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè       | 4200/8750 [41:42<45:14,  1.68it/s, loss=0.5691, audio=0.0360, latent=5.3309, nans=0]


  Step 4200/8750 | Loss: 0.652895 | Audio: 0.102184 | Latent: 5.507107 | NaNs: 0


Epoch 39/40:  49%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé       | 4300/8750 [42:41<43:42,  1.70it/s, loss=0.5718, audio=0.0381, latent=5.3369, nans=0]


  Step 4300/8750 | Loss: 0.652961 | Audio: 0.102235 | Latent: 5.507266 | NaNs: 0


Epoch 39/40:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå       | 4400/8750 [43:40<44:27,  1.63it/s, loss=0.8870, audio=0.3220, latent=5.6507, nans=0]


  Step 4400/8750 | Loss: 0.652639 | Audio: 0.102097 | Latent: 5.505414 | NaNs: 0


Epoch 39/40:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã       | 4500/8750 [44:39<42:03,  1.68it/s, loss=0.5399, audio=0.0437, latent=4.9617, nans=0]


  Step 4500/8750 | Loss: 0.652413 | Audio: 0.101764 | Latent: 5.506485 | NaNs: 0


Epoch 39/40:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ       | 4600/8750 [45:38<41:27,  1.67it/s, loss=0.9546, audio=0.3244, latent=6.3024, nans=0]


  Step 4600/8750 | Loss: 0.652529 | Audio: 0.101769 | Latent: 5.507606 | NaNs: 0


Epoch 39/40:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà       | 4700/8750 [46:38<38:16,  1.76it/s, loss=0.5280, audio=0.0482, latent=4.7978, nans=0]


  Step 4700/8750 | Loss: 0.652616 | Audio: 0.101918 | Latent: 5.506973 | NaNs: 0


Epoch 39/40:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè      | 4800/8750 [47:36<38:48,  1.70it/s, loss=0.9574, audio=0.3295, latent=6.2793, nans=0]


  Step 4800/8750 | Loss: 0.652638 | Audio: 0.101930 | Latent: 5.507077 | NaNs: 0


Epoch 39/40:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç      | 4900/8750 [48:34<37:15,  1.72it/s, loss=0.6062, audio=0.0274, latent=5.7884, nans=0]


  Step 4900/8750 | Loss: 0.652792 | Audio: 0.102100 | Latent: 5.506915 | NaNs: 0


Epoch 39/40:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå      | 5000/8750 [49:33<36:32,  1.71it/s, loss=0.7182, audio=0.1958, latent=5.2241, nans=0]


  Step 5000/8750 | Loss: 0.652889 | Audio: 0.101910 | Latent: 5.509787 | NaNs: 0


Epoch 39/40:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã      | 5100/8750 [50:32<36:52,  1.65it/s, loss=0.7356, audio=0.1786, latent=5.5704, nans=0]


  Step 5100/8750 | Loss: 0.652646 | Audio: 0.101751 | Latent: 5.508952 | NaNs: 0


Epoch 39/40:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ      | 5200/8750 [51:30<34:47,  1.70it/s, loss=0.5614, audio=0.0450, latent=5.1643, nans=0]


  Step 5200/8750 | Loss: 0.652553 | Audio: 0.101741 | Latent: 5.508117 | NaNs: 0


Epoch 39/40:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà      | 5300/8750 [52:28<33:28,  1.72it/s, loss=0.5814, audio=0.0612, latent=5.2018, nans=0]


  Step 5300/8750 | Loss: 0.652203 | Audio: 0.101526 | Latent: 5.506768 | NaNs: 0


Epoch 39/40:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé     | 5400/8750 [53:27<32:18,  1.73it/s, loss=0.5793, audio=0.0468, latent=5.3246, nans=0]


  Step 5400/8750 | Loss: 0.651905 | Audio: 0.101373 | Latent: 5.505326 | NaNs: 0


Epoch 39/40:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç     | 5500/8750 [54:25<32:54,  1.65it/s, loss=0.5598, audio=0.0503, latent=5.0958, nans=0]


  Step 5500/8750 | Loss: 0.652012 | Audio: 0.101501 | Latent: 5.505115 | NaNs: 0


Epoch 39/40:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå     | 5600/8750 [55:25<31:44,  1.65it/s, loss=0.6501, audio=0.0642, latent=5.8586, nans=0]


  Step 5600/8750 | Loss: 0.652263 | Audio: 0.101794 | Latent: 5.504692 | NaNs: 0


Epoch 39/40:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä     | 5700/8750 [56:24<30:00,  1.69it/s, loss=0.6931, audio=0.0596, latent=6.3346, nans=0]


  Step 5700/8750 | Loss: 0.652315 | Audio: 0.101914 | Latent: 5.504004 | NaNs: 0


Epoch 39/40:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ     | 5800/8750 [57:23<29:48,  1.65it/s, loss=0.7033, audio=0.1977, latent=5.0564, nans=0]


  Step 5800/8750 | Loss: 0.652535 | Audio: 0.102189 | Latent: 5.503468 | NaNs: 0


Epoch 39/40:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà     | 5900/8750 [58:22<27:53,  1.70it/s, loss=0.7358, audio=0.2005, latent=5.3527, nans=0]


  Step 5900/8750 | Loss: 0.652367 | Audio: 0.102040 | Latent: 5.503263 | NaNs: 0


Epoch 39/40:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 6000/8750 [59:21<26:23,  1.74it/s, loss=0.5787, audio=0.0229, latent=5.5573, nans=0]


  Step 6000/8750 | Loss: 0.652278 | Audio: 0.102077 | Latent: 5.502009 | NaNs: 0


Epoch 39/40:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 6100/8750 [1:00:20<27:00,  1.64it/s, loss=0.5289, audio=0.0545, latent=4.7441, nans=0]


  Step 6100/8750 | Loss: 0.652181 | Audio: 0.101975 | Latent: 5.502059 | NaNs: 0


Epoch 39/40:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 6200/8750 [1:01:20<24:57,  1.70it/s, loss=0.7106, audio=0.1747, latent=5.3593, nans=0]


  Step 6200/8750 | Loss: 0.651888 | Audio: 0.101791 | Latent: 5.500974 | NaNs: 0


Epoch 39/40:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 6300/8750 [1:02:18<24:05,  1.69it/s, loss=0.5397, audio=0.0454, latent=4.9423, nans=0]


  Step 6300/8750 | Loss: 0.652101 | Audio: 0.101961 | Latent: 5.501400 | NaNs: 0


Epoch 39/40:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 6400/8750 [1:03:16<22:25,  1.75it/s, loss=0.6301, audio=0.0622, latent=5.6789, nans=0]


  Step 6400/8750 | Loss: 0.651835 | Audio: 0.101850 | Latent: 5.499856 | NaNs: 0


Epoch 39/40:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 6500/8750 [1:04:15<22:11,  1.69it/s, loss=0.5533, audio=0.0604, latent=4.9290, nans=0]


  Step 6500/8750 | Loss: 0.651704 | Audio: 0.101737 | Latent: 5.499673 | NaNs: 0


Epoch 39/40:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 6600/8750 [1:05:14<21:05,  1.70it/s, loss=0.7153, audio=0.1686, latent=5.4670, nans=0]


  Step 6600/8750 | Loss: 0.651767 | Audio: 0.101877 | Latent: 5.498901 | NaNs: 0


Epoch 39/40:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 6700/8750 [1:06:13<20:04,  1.70it/s, loss=0.6127, audio=0.0527, latent=5.5993, nans=0]


  Step 6700/8750 | Loss: 0.651827 | Audio: 0.102009 | Latent: 5.498177 | NaNs: 0


Epoch 39/40:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 6800/8750 [1:07:12<19:46,  1.64it/s, loss=0.7358, audio=0.1724, latent=5.6337, nans=0]


  Step 6800/8750 | Loss: 0.651728 | Audio: 0.101948 | Latent: 5.497803 | NaNs: 0


Epoch 39/40:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 6900/8750 [1:08:10<18:09,  1.70it/s, loss=0.5600, audio=0.0497, latent=5.1032, nans=0]


  Step 6900/8750 | Loss: 0.651657 | Audio: 0.101934 | Latent: 5.497222 | NaNs: 0


Epoch 39/40:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 7000/8750 [1:09:09<17:14,  1.69it/s, loss=0.8211, audio=0.2054, latent=6.1576, nans=0]


  Step 7000/8750 | Loss: 0.651798 | Audio: 0.101945 | Latent: 5.498526 | NaNs: 0


Epoch 39/40:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 7100/8750 [1:10:09<16:57,  1.62it/s, loss=0.6028, audio=0.0553, latent=5.4748, nans=0]


  Step 7100/8750 | Loss: 0.651731 | Audio: 0.101828 | Latent: 5.499027 | NaNs: 0


Epoch 39/40:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 7200/8750 [1:11:08<15:38,  1.65it/s, loss=0.8741, audio=0.3186, latent=5.5550, nans=0]


  Step 7200/8750 | Loss: 0.651572 | Audio: 0.101748 | Latent: 5.498243 | NaNs: 0


Epoch 39/40:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7300/8750 [1:12:07<13:49,  1.75it/s, loss=0.5000, audio=0.0380, latent=4.6198, nans=0]


  Step 7300/8750 | Loss: 0.651473 | Audio: 0.101769 | Latent: 5.497045 | NaNs: 0


Epoch 39/40:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 7400/8750 [1:13:06<13:25,  1.68it/s, loss=0.5715, audio=0.0502, latent=5.2132, nans=0]


  Step 7400/8750 | Loss: 0.651414 | Audio: 0.101699 | Latent: 5.497146 | NaNs: 0


Epoch 39/40:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 7500/8750 [1:14:05<12:13,  1.70it/s, loss=0.6028, audio=0.0526, latent=5.5013, nans=0]


  Step 7500/8750 | Loss: 0.651715 | Audio: 0.101929 | Latent: 5.497863 | NaNs: 0


Epoch 39/40:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 7600/8750 [1:15:04<11:37,  1.65it/s, loss=0.5796, audio=0.0331, latent=5.4646, nans=0]


  Step 7600/8750 | Loss: 0.651525 | Audio: 0.101824 | Latent: 5.497003 | NaNs: 0


Epoch 39/40:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 7700/8750 [1:16:04<10:47,  1.62it/s, loss=0.7307, audio=0.1782, latent=5.5246, nans=0]


  Step 7700/8750 | Loss: 0.651410 | Audio: 0.101831 | Latent: 5.495794 | NaNs: 0


Epoch 39/40:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 7800/8750 [1:17:02<09:22,  1.69it/s, loss=0.5690, audio=0.0610, latent=5.0804, nans=0]


  Step 7800/8750 | Loss: 0.651346 | Audio: 0.101773 | Latent: 5.495735 | NaNs: 0


Epoch 39/40:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 7900/8750 [1:18:01<08:14,  1.72it/s, loss=0.5588, audio=0.0475, latent=5.1137, nans=0]


  Step 7900/8750 | Loss: 0.651219 | Audio: 0.101733 | Latent: 5.494860 | NaNs: 0


Epoch 39/40:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8000/8750 [1:19:00<07:28,  1.67it/s, loss=0.5878, audio=0.0624, latent=5.2541, nans=0]


  Step 8000/8750 | Loss: 0.651167 | Audio: 0.101656 | Latent: 5.495109 | NaNs: 0


Epoch 39/40:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 8100/8750 [1:19:58<06:34,  1.65it/s, loss=0.5549, audio=0.0270, latent=5.2787, nans=0]


  Step 8100/8750 | Loss: 0.651131 | Audio: 0.101676 | Latent: 5.494555 | NaNs: 0


Epoch 39/40:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 8200/8750 [1:20:57<05:38,  1.63it/s, loss=0.5545, audio=0.0596, latent=4.9490, nans=0]


  Step 8200/8750 | Loss: 0.651235 | Audio: 0.101843 | Latent: 5.493918 | NaNs: 0


Epoch 39/40:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 8300/8750 [1:21:55<04:21,  1.72it/s, loss=0.8987, audio=0.3211, latent=5.7763, nans=0]


  Step 8300/8750 | Loss: 0.651072 | Audio: 0.101771 | Latent: 5.493010 | NaNs: 0


Epoch 39/40:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 8400/8750 [1:22:54<03:21,  1.74it/s, loss=0.5483, audio=0.0591, latent=4.8925, nans=0]


  Step 8400/8750 | Loss: 0.650930 | Audio: 0.101750 | Latent: 5.491792 | NaNs: 0


Epoch 39/40:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 8500/8750 [1:23:53<02:25,  1.72it/s, loss=0.7595, audio=0.1782, latent=5.8126, nans=0]


  Step 8500/8750 | Loss: 0.650823 | Audio: 0.101605 | Latent: 5.492177 | NaNs: 0


Epoch 39/40:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 8600/8750 [1:24:52<01:29,  1.68it/s, loss=0.8053, audio=0.2110, latent=5.9424, nans=0]


  Step 8600/8750 | Loss: 0.650627 | Audio: 0.101554 | Latent: 5.490725 | NaNs: 0


Epoch 39/40:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 8700/8750 [1:25:51<00:30,  1.66it/s, loss=0.5681, audio=0.0490, latent=5.1908, nans=0]


  Step 8700/8750 | Loss: 0.650490 | Audio: 0.101520 | Latent: 5.489699 | NaNs: 0


Epoch 39/40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8750/8750 [1:26:21<00:00,  1.69it/s, loss=0.5986, audio=0.0396, latent=5.5898, nans=0]
Validation 39/40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1875/1875 [16:47<00:00,  1.86it/s, loss=0.5529, audio=0.0460, latent=5.0691]



EPOCH 39/40 SUMMARY
Train Loss:  0.650385 (Audio: 0.101473, Latent: 5.489111)
Val Loss:    0.643559 (Audio: 0.097770, Latent: 5.457897)
Learning Rate: 8.75e-06

‚úÖ NEW BEST MODEL! Val Loss: 0.643559


EPOCH 40/40



Epoch 40/40:   1%|‚ñè             | 100/8750 [00:57<1:25:30,  1.69it/s, loss=0.7062, audio=0.1866, latent=5.1965, nans=0]


  Step 100/8750 | Loss: 0.649064 | Audio: 0.101185 | Latent: 5.478794 | NaNs: 0


Epoch 40/40:   2%|‚ñé             | 200/8750 [01:58<1:24:01,  1.70it/s, loss=0.5297, audio=0.0359, latent=4.9375, nans=0]


  Step 200/8750 | Loss: 0.641960 | Audio: 0.096821 | Latent: 5.451392 | NaNs: 0


Epoch 40/40:   3%|‚ñç             | 300/8750 [03:03<1:35:52,  1.47it/s, loss=0.7228, audio=0.1771, latent=5.4571, nans=0]


  Step 300/8750 | Loss: 0.639453 | Audio: 0.095745 | Latent: 5.437082 | NaNs: 0


Epoch 40/40:   5%|‚ñã             | 400/8750 [04:01<1:20:54,  1.72it/s, loss=0.6113, audio=0.0257, latent=5.8562, nans=0]


  Step 400/8750 | Loss: 0.641338 | Audio: 0.096941 | Latent: 5.443973 | NaNs: 0


Epoch 40/40:   6%|‚ñä             | 500/8750 [05:00<1:17:41,  1.77it/s, loss=0.5995, audio=0.0280, latent=5.7150, nans=0]


  Step 500/8750 | Loss: 0.642157 | Audio: 0.097914 | Latent: 5.442430 | NaNs: 0


Epoch 40/40:   7%|‚ñâ             | 600/8750 [05:59<1:20:53,  1.68it/s, loss=0.5298, audio=0.0415, latent=4.8825, nans=0]


  Step 600/8750 | Loss: 0.640996 | Audio: 0.097496 | Latent: 5.434998 | NaNs: 0


Epoch 40/40:   8%|‚ñà             | 700/8750 [06:58<1:16:01,  1.76it/s, loss=0.5738, audio=0.0385, latent=5.3535, nans=0]


  Step 700/8750 | Loss: 0.644864 | Audio: 0.100261 | Latent: 5.446033 | NaNs: 0


Epoch 40/40:   9%|‚ñà‚ñé            | 800/8750 [07:56<1:18:01,  1.70it/s, loss=0.5024, audio=0.0295, latent=4.7292, nans=0]


  Step 800/8750 | Loss: 0.645266 | Audio: 0.101273 | Latent: 5.439930 | NaNs: 0


Epoch 40/40:  10%|‚ñà‚ñç            | 900/8750 [08:53<1:14:19,  1.76it/s, loss=0.5812, audio=0.0201, latent=5.6114, nans=0]


  Step 900/8750 | Loss: 0.646832 | Audio: 0.102601 | Latent: 5.442305 | NaNs: 0


Epoch 40/40:  11%|‚ñà‚ñç           | 1000/8750 [09:51<1:19:11,  1.63it/s, loss=0.5976, audio=0.0544, latent=5.4321, nans=0]


  Step 1000/8750 | Loss: 0.645366 | Audio: 0.101114 | Latent: 5.442514 | NaNs: 0


Epoch 40/40:  12%|‚ñà‚ñå           | 1040/8750 [10:14<1:14:14,  1.73it/s, loss=0.4805, audio=0.0393, latent=4.4115, nans=0]

‚ö†Ô∏è Bad gradient (norm=inf) at step 1039, skipping...


Epoch 40/40:  13%|‚ñà‚ñã           | 1100/8750 [10:49<1:13:59,  1.72it/s, loss=0.4866, audio=0.0489, latent=4.3766, nans=1]


  Step 1100/8750 | Loss: 0.650426 | Audio: 0.101399 | Latent: 5.490264 | NaNs: 1


Epoch 40/40:  14%|‚ñà‚ñä           | 1200/8750 [12:13<2:06:39,  1.01s/it, loss=0.5814, audio=0.0349, latent=5.4646, nans=1]


  Step 1200/8750 | Loss: 0.649992 | Audio: 0.101241 | Latent: 5.487503 | NaNs: 1


Epoch 40/40:  15%|‚ñà‚ñâ           | 1300/8750 [13:51<1:53:31,  1.09it/s, loss=0.9052, audio=0.3232, latent=5.8205, nans=1]


  Step 1300/8750 | Loss: 0.649253 | Audio: 0.100377 | Latent: 5.488765 | NaNs: 1


Epoch 40/40:  16%|‚ñà‚ñà           | 1400/8750 [15:31<2:03:44,  1.01s/it, loss=0.7773, audio=0.1845, latent=5.9281, nans=1]


  Step 1400/8750 | Loss: 0.648717 | Audio: 0.100598 | Latent: 5.481191 | NaNs: 1


Epoch 40/40:  17%|‚ñà‚ñà‚ñè          | 1500/8750 [17:12<2:02:15,  1.01s/it, loss=0.7519, audio=0.1837, latent=5.6826, nans=1]


  Step 1500/8750 | Loss: 0.648452 | Audio: 0.100709 | Latent: 5.477429 | NaNs: 1


Epoch 40/40:  18%|‚ñà‚ñà‚ñç          | 1600/8750 [18:53<2:01:22,  1.02s/it, loss=0.5548, audio=0.0408, latent=5.1401, nans=1]


  Step 1600/8750 | Loss: 0.648931 | Audio: 0.101533 | Latent: 5.473980 | NaNs: 1


Epoch 40/40:  19%|‚ñà‚ñà‚ñå          | 1700/8750 [20:34<1:59:41,  1.02s/it, loss=0.6010, audio=0.0345, latent=5.6649, nans=1]


  Step 1700/8750 | Loss: 0.649303 | Audio: 0.101541 | Latent: 5.477622 | NaNs: 1


Epoch 40/40:  21%|‚ñà‚ñà‚ñã          | 1800/8750 [22:15<1:57:56,  1.02s/it, loss=0.7162, audio=0.1818, latent=5.3438, nans=1]


  Step 1800/8750 | Loss: 0.648685 | Audio: 0.101094 | Latent: 5.475916 | NaNs: 1


Epoch 40/40:  22%|‚ñà‚ñà‚ñä          | 1900/8750 [23:56<1:54:14,  1.00s/it, loss=0.7390, audio=0.1844, latent=5.5459, nans=1]


  Step 1900/8750 | Loss: 0.648435 | Audio: 0.100999 | Latent: 5.474361 | NaNs: 1


Epoch 40/40:  23%|‚ñà‚ñà‚ñâ          | 2000/8750 [25:36<1:54:28,  1.02s/it, loss=0.6292, audio=0.0372, latent=5.9202, nans=1]


  Step 2000/8750 | Loss: 0.647979 | Audio: 0.100828 | Latent: 5.471507 | NaNs: 1


Epoch 40/40:  24%|‚ñà‚ñà‚ñà          | 2100/8750 [27:17<1:52:20,  1.01s/it, loss=0.5996, audio=0.0233, latent=5.7627, nans=1]


  Step 2100/8750 | Loss: 0.647789 | Audio: 0.100762 | Latent: 5.470273 | NaNs: 1


Epoch 40/40:  25%|‚ñà‚ñà‚ñà‚ñé         | 2200/8750 [28:58<1:50:30,  1.01s/it, loss=0.6324, audio=0.0435, latent=5.8894, nans=1]


  Step 2200/8750 | Loss: 0.647373 | Audio: 0.100600 | Latent: 5.467727 | NaNs: 1


Epoch 40/40:  26%|‚ñà‚ñà‚ñà‚ñç         | 2300/8750 [30:39<1:50:35,  1.03s/it, loss=0.5306, audio=0.0702, latent=4.6036, nans=1]


  Step 2300/8750 | Loss: 0.646879 | Audio: 0.100362 | Latent: 5.465175 | NaNs: 1


Epoch 40/40:  27%|‚ñà‚ñà‚ñà‚ñå         | 2400/8750 [32:19<1:46:29,  1.01s/it, loss=0.6586, audio=0.1805, latent=4.7804, nans=1]


  Step 2400/8750 | Loss: 0.647409 | Audio: 0.100744 | Latent: 5.466654 | NaNs: 1


Epoch 40/40:  29%|‚ñà‚ñà‚ñà‚ñã         | 2500/8750 [34:00<1:47:02,  1.03s/it, loss=0.6889, audio=0.1757, latent=5.1318, nans=1]


  Step 2500/8750 | Loss: 0.647055 | Audio: 0.100790 | Latent: 5.462653 | NaNs: 1


Epoch 40/40:  30%|‚ñà‚ñà‚ñà‚ñä         | 2600/8750 [35:41<1:44:26,  1.02s/it, loss=0.5689, audio=0.0496, latent=5.1933, nans=1]


  Step 2600/8750 | Loss: 0.647510 | Audio: 0.101133 | Latent: 5.463772 | NaNs: 1


Epoch 40/40:  31%|‚ñà‚ñà‚ñà‚ñà         | 2700/8750 [37:21<1:41:38,  1.01s/it, loss=0.5941, audio=0.0287, latent=5.6541, nans=1]


  Step 2700/8750 | Loss: 0.647418 | Audio: 0.101426 | Latent: 5.459913 | NaNs: 1


Epoch 40/40:  32%|‚ñà‚ñà‚ñà‚ñà‚ñè        | 2800/8750 [39:02<1:41:22,  1.02s/it, loss=0.5602, audio=0.0588, latent=5.0142, nans=1]


  Step 2800/8750 | Loss: 0.647459 | Audio: 0.101558 | Latent: 5.459011 | NaNs: 1


Epoch 40/40:  33%|‚ñà‚ñà‚ñà‚ñà‚ñé        | 2900/8750 [40:43<1:37:54,  1.00s/it, loss=0.5935, audio=0.0640, latent=5.2944, nans=1]


  Step 2900/8750 | Loss: 0.647235 | Audio: 0.101486 | Latent: 5.457493 | NaNs: 1


Epoch 40/40:  34%|‚ñà‚ñà‚ñà‚ñà‚ñç        | 3000/8750 [42:24<1:35:26,  1.00it/s, loss=0.5925, audio=0.0659, latent=5.2665, nans=1]


  Step 3000/8750 | Loss: 0.646861 | Audio: 0.101073 | Latent: 5.457874 | NaNs: 1


Epoch 40/40:  35%|‚ñà‚ñà‚ñà‚ñà‚ñå        | 3100/8750 [44:05<1:34:47,  1.01s/it, loss=0.7040, audio=0.1779, latent=5.2609, nans=1]


  Step 3100/8750 | Loss: 0.646545 | Audio: 0.100779 | Latent: 5.457660 | NaNs: 1


Epoch 40/40:  37%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç         | 3200/8750 [45:33<53:56,  1.72it/s, loss=0.5395, audio=0.0316, latent=5.0795, nans=1]


  Step 3200/8750 | Loss: 0.646533 | Audio: 0.100883 | Latent: 5.456492 | NaNs: 1


Epoch 40/40:  38%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã         | 3300/8750 [46:32<53:54,  1.68it/s, loss=0.7960, audio=0.1928, latent=6.0318, nans=1]


  Step 3300/8750 | Loss: 0.646622 | Audio: 0.101256 | Latent: 5.453666 | NaNs: 1


Epoch 40/40:  39%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä         | 3400/8750 [47:31<52:37,  1.69it/s, loss=0.7345, audio=0.1867, latent=5.4778, nans=1]


  Step 3400/8750 | Loss: 0.646382 | Audio: 0.101295 | Latent: 5.450878 | NaNs: 1


Epoch 40/40:  40%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà         | 3500/8750 [48:30<51:13,  1.71it/s, loss=0.5113, audio=0.0409, latent=4.7036, nans=1]


  Step 3500/8750 | Loss: 0.645955 | Audio: 0.101131 | Latent: 5.448240 | NaNs: 1


Epoch 40/40:  41%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè        | 3600/8750 [49:29<52:06,  1.65it/s, loss=0.5336, audio=0.0480, latent=4.8558, nans=1]


  Step 3600/8750 | Loss: 0.645863 | Audio: 0.101189 | Latent: 5.446738 | NaNs: 1


Epoch 40/40:  42%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé        | 3700/8750 [50:28<51:00,  1.65it/s, loss=0.5307, audio=0.0426, latent=4.8817, nans=1]


  Step 3700/8750 | Loss: 0.646049 | Audio: 0.101503 | Latent: 5.445468 | NaNs: 1


Epoch 40/40:  43%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå        | 3800/8750 [51:27<49:26,  1.67it/s, loss=0.5095, audio=0.0294, latent=4.8005, nans=1]


  Step 3800/8750 | Loss: 0.645660 | Audio: 0.101299 | Latent: 5.443606 | NaNs: 1


Epoch 40/40:  45%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã        | 3900/8750 [52:27<47:25,  1.70it/s, loss=0.6144, audio=0.0517, latent=5.6264, nans=1]


  Step 3900/8750 | Loss: 0.645271 | Audio: 0.101074 | Latent: 5.441977 | NaNs: 1


Epoch 40/40:  46%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä        | 4000/8750 [53:27<47:23,  1.67it/s, loss=0.7490, audio=0.1999, latent=5.4916, nans=1]


  Step 4000/8750 | Loss: 0.645115 | Audio: 0.101039 | Latent: 5.440765 | NaNs: 1


Epoch 40/40:  47%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà        | 4100/8750 [54:26<45:49,  1.69it/s, loss=0.7194, audio=0.1743, latent=5.4502, nans=1]


  Step 4100/8750 | Loss: 0.645012 | Audio: 0.100991 | Latent: 5.440210 | NaNs: 1


Epoch 40/40:  48%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè       | 4200/8750 [55:25<44:36,  1.70it/s, loss=0.5615, audio=0.0209, latent=5.4060, nans=1]


  Step 4200/8750 | Loss: 0.645404 | Audio: 0.101007 | Latent: 5.443973 | NaNs: 1


Epoch 40/40:  49%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé       | 4300/8750 [56:25<44:17,  1.67it/s, loss=0.5840, audio=0.0317, latent=5.5237, nans=1]


  Step 4300/8750 | Loss: 0.645458 | Audio: 0.101032 | Latent: 5.444258 | NaNs: 1


Epoch 40/40:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ       | 4600/8750 [59:23<40:13,  1.72it/s, loss=0.5664, audio=0.0630, latent=5.0337, nans=1]


  Step 4600/8750 | Loss: 0.646055 | Audio: 0.101536 | Latent: 5.445185 | NaNs: 1


Epoch 40/40:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ      | 4700/8750 [1:00:22<39:37,  1.70it/s, loss=0.5333, audio=0.0487, latent=4.8457, nans=1]


  Step 4700/8750 | Loss: 0.645741 | Audio: 0.101342 | Latent: 5.443994 | NaNs: 1


Epoch 40/40:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè     | 4800/8750 [1:01:21<38:44,  1.70it/s, loss=0.5870, audio=0.0254, latent=5.6168, nans=1]


  Step 4800/8750 | Loss: 0.645679 | Audio: 0.101336 | Latent: 5.443428 | NaNs: 1


Epoch 40/40:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé     | 4900/8750 [1:02:20<37:53,  1.69it/s, loss=0.5459, audio=0.0442, latent=5.0175, nans=1]


  Step 4900/8750 | Loss: 0.645365 | Audio: 0.101155 | Latent: 5.442099 | NaNs: 1


Epoch 40/40:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç     | 5000/8750 [1:03:21<43:14,  1.45it/s, loss=0.5569, audio=0.0327, latent=5.2416, nans=1]


  Step 5000/8750 | Loss: 0.645054 | Audio: 0.100950 | Latent: 5.441041 | NaNs: 1


Epoch 40/40:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå     | 5100/8750 [1:04:22<36:49,  1.65it/s, loss=0.7266, audio=0.1763, latent=5.5030, nans=1]


  Step 5100/8750 | Loss: 0.644843 | Audio: 0.100872 | Latent: 5.439707 | NaNs: 1


Epoch 40/40:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã     | 5200/8750 [1:05:22<35:02,  1.69it/s, loss=0.5702, audio=0.0345, latent=5.3566, nans=1]


  Step 5200/8750 | Loss: 0.644974 | Audio: 0.101015 | Latent: 5.439591 | NaNs: 1


Epoch 40/40:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä     | 5300/8750 [1:06:20<33:29,  1.72it/s, loss=0.7029, audio=0.1791, latent=5.2384, nans=1]


  Step 5300/8750 | Loss: 0.645203 | Audio: 0.101103 | Latent: 5.441003 | NaNs: 1


Epoch 40/40:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà     | 5400/8750 [1:07:20<32:44,  1.71it/s, loss=0.6435, audio=0.0513, latent=5.9221, nans=1]


  Step 5400/8750 | Loss: 0.645063 | Audio: 0.101121 | Latent: 5.439417 | NaNs: 1


Epoch 40/40:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 5500/8750 [1:08:18<31:53,  1.70it/s, loss=0.7167, audio=0.1980, latent=5.1865, nans=1]


  Step 5500/8750 | Loss: 0.645151 | Audio: 0.101243 | Latent: 5.439083 | NaNs: 1


Epoch 40/40:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 5600/8750 [1:09:18<32:29,  1.62it/s, loss=0.5827, audio=0.0299, latent=5.5284, nans=1]


  Step 5600/8750 | Loss: 0.644956 | Audio: 0.101265 | Latent: 5.436915 | NaNs: 1


Epoch 40/40:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 5700/8750 [1:10:19<32:07,  1.58it/s, loss=0.5911, audio=0.0275, latent=5.6357, nans=1]


  Step 5700/8750 | Loss: 0.645006 | Audio: 0.101377 | Latent: 5.436291 | NaNs: 1


Epoch 40/40:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 5800/8750 [1:11:22<31:14,  1.57it/s, loss=0.5721, audio=0.0470, latent=5.2515, nans=1]


  Step 5800/8750 | Loss: 0.645019 | Audio: 0.101301 | Latent: 5.437176 | NaNs: 1


Epoch 40/40:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 5900/8750 [1:12:25<29:03,  1.63it/s, loss=0.7662, audio=0.1776, latent=5.8851, nans=1]


  Step 5900/8750 | Loss: 0.645186 | Audio: 0.101503 | Latent: 5.436830 | NaNs: 1


Epoch 40/40:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 6000/8750 [1:13:27<28:56,  1.58it/s, loss=0.9053, audio=0.3276, latent=5.7767, nans=1]


  Step 6000/8750 | Loss: 0.645194 | Audio: 0.101593 | Latent: 5.436006 | NaNs: 1


Epoch 40/40:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 6002/8750 [1:13:29<28:22,  1.61it/s, loss=0.8985, audio=0.3234, latent=5.7513, nans=1]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 40/40:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 6700/8750 [1:20:45<21:25,  1.60it/s, loss=0.5778, audio=0.0390, latent=5.3877, nans=1]


  Step 6700/8750 | Loss: 0.644873 | Audio: 0.101602 | Latent: 5.432713 | NaNs: 1


Epoch 40/40:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 6800/8750 [1:21:48<20:04,  1.62it/s, loss=0.6000, audio=0.0314, latent=5.6859, nans=1]


  Step 6800/8750 | Loss: 0.645081 | Audio: 0.101768 | Latent: 5.433124 | NaNs: 1


Epoch 40/40:  79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 6900/8750 [1:22:53<19:11,  1.61it/s, loss=0.7290, audio=0.1961, latent=5.3286, nans=1]


  Step 6900/8750 | Loss: 0.645149 | Audio: 0.101817 | Latent: 5.433317 | NaNs: 1


Epoch 40/40:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 7000/8750 [1:23:56<18:35,  1.57it/s, loss=0.6898, audio=0.1921, latent=4.9767, nans=1]


  Step 7000/8750 | Loss: 0.645310 | Audio: 0.102033 | Latent: 5.432767 | NaNs: 1


Epoch 40/40:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 7100/8750 [1:24:59<17:14,  1.60it/s, loss=0.5567, audio=0.0739, latent=4.8286, nans=1]


  Step 7100/8750 | Loss: 0.645250 | Audio: 0.102030 | Latent: 5.432207 | NaNs: 1


Epoch 40/40:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 7200/8750 [1:26:01<16:58,  1.52it/s, loss=0.5359, audio=0.0412, latent=4.9477, nans=1]


  Step 7200/8750 | Loss: 0.645233 | Audio: 0.102075 | Latent: 5.431579 | NaNs: 1


Epoch 40/40:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7300/8750 [1:27:06<15:18,  1.58it/s, loss=0.6219, audio=0.0314, latent=5.9048, nans=1]


  Step 7300/8750 | Loss: 0.645048 | Audio: 0.101961 | Latent: 5.430876 | NaNs: 1


Epoch 40/40:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 7400/8750 [1:28:08<14:13,  1.58it/s, loss=0.5079, audio=0.0309, latent=4.7703, nans=1]


  Step 7400/8750 | Loss: 0.645075 | Audio: 0.101976 | Latent: 5.430983 | NaNs: 1


Epoch 40/40:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 7500/8750 [1:29:16<13:47,  1.51it/s, loss=0.9130, audio=0.3254, latent=5.8760, nans=1]


  Step 7500/8750 | Loss: 0.645034 | Audio: 0.102022 | Latent: 5.430113 | NaNs: 1


Epoch 40/40:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 7600/8750 [1:30:22<12:47,  1.50it/s, loss=0.7530, audio=0.1702, latent=5.8276, nans=1]


  Step 7600/8750 | Loss: 0.645134 | Audio: 0.102109 | Latent: 5.430246 | NaNs: 1


Epoch 40/40:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 7700/8750 [1:31:29<11:35,  1.51it/s, loss=0.5890, audio=0.0424, latent=5.4661, nans=1]


  Step 7700/8750 | Loss: 0.645029 | Audio: 0.102056 | Latent: 5.429729 | NaNs: 1


Epoch 40/40:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 7800/8750 [1:32:34<09:40,  1.64it/s, loss=0.6835, audio=0.0405, latent=6.4294, nans=1]


  Step 7800/8750 | Loss: 0.644789 | Audio: 0.101932 | Latent: 5.428569 | NaNs: 1


Epoch 40/40:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 7900/8750 [1:33:37<09:08,  1.55it/s, loss=0.5813, audio=0.0329, latent=5.4835, nans=1]


  Step 7900/8750 | Loss: 0.644651 | Audio: 0.101875 | Latent: 5.427762 | NaNs: 1


Epoch 40/40:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8000/8750 [1:34:41<08:26,  1.48it/s, loss=0.6361, audio=0.0325, latent=6.0358, nans=1]


  Step 8000/8750 | Loss: 0.644677 | Audio: 0.101856 | Latent: 5.428205 | NaNs: 1


Epoch 40/40:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 8100/8750 [1:35:45<06:55,  1.57it/s, loss=0.7690, audio=0.1800, latent=5.8896, nans=1]


  Step 8100/8750 | Loss: 0.644582 | Audio: 0.101720 | Latent: 5.428622 | NaNs: 1


Epoch 40/40:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 8200/8750 [1:36:48<05:48,  1.58it/s, loss=0.5264, audio=0.0467, latent=4.7976, nans=1]


  Step 8200/8750 | Loss: 0.644406 | Audio: 0.101609 | Latent: 5.427969 | NaNs: 1


Epoch 40/40:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 8300/8750 [1:37:52<05:06,  1.47it/s, loss=0.5063, audio=0.0376, latent=4.6864, nans=1]


  Step 8300/8750 | Loss: 0.644214 | Audio: 0.101498 | Latent: 5.427162 | NaNs: 1


Epoch 40/40:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 8400/8750 [1:38:56<03:41,  1.58it/s, loss=0.7041, audio=0.1773, latent=5.2682, nans=1]


  Step 8400/8750 | Loss: 0.643983 | Audio: 0.101367 | Latent: 5.426151 | NaNs: 1


Epoch 40/40:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 8500/8750 [1:40:01<02:53,  1.44it/s, loss=0.6526, audio=0.0327, latent=6.1990, nans=1]


  Step 8500/8750 | Loss: 0.643859 | Audio: 0.101192 | Latent: 5.426672 | NaNs: 1


Epoch 40/40:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 8600/8750 [1:41:11<01:37,  1.54it/s, loss=0.6048, audio=0.0402, latent=5.6462, nans=1]


  Step 8600/8750 | Loss: 0.643757 | Audio: 0.101194 | Latent: 5.425629 | NaNs: 1


Epoch 40/40:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 8700/8750 [1:42:14<00:31,  1.60it/s, loss=0.7553, audio=0.1813, latent=5.7406, nans=1]


  Step 8700/8750 | Loss: 0.643716 | Audio: 0.101219 | Latent: 5.424962 | NaNs: 1


Epoch 40/40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8750/8750 [1:42:47<00:00,  1.42it/s, loss=0.5556, audio=0.0494, latent=5.0619, nans=1]



‚ö†Ô∏è Epoch had 1 NaN occurrences


Validation 40/40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1875/1875 [17:27<00:00,  1.79it/s, loss=0.5464, audio=0.0466, latent=4.9972]



EPOCH 40/40 SUMMARY
Train Loss:  0.643765 (Audio: 0.101314, Latent: 5.424511)
Val Loss:    0.636827 (Audio: 0.098961, Latent: 5.378655)
Learning Rate: 8.68e-06

‚úÖ NEW BEST MODEL! Val Loss: 0.636827


TRAINING COMPLETE!

TESTING BEST MODEL



  best_ckpt = torch.load(cfg.best_model_path, map_location=cfg.device)


Loaded best model from epoch 39


Validation 41/40: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1875/1875 [17:21<00:00,  1.80it/s, loss=0.5522, audio=0.0314, latent=5.2080]



FINAL TEST RESULTS
Test Loss:  0.639296
  Audio Loss:  0.101333
  Latent Loss: 5.379628

Generating training curves...
‚úì Plot saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/training_curves.png
‚úì Summary saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/training_summary.json

ALL FILES SAVED
‚úì Best model: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt
‚úì Checkpoint: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model.pt
‚úì Training curves: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/training_curves.png
‚úì Summary: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/training_summary.json

üéâ TRAINING PIPELINE COMPLETE! üéâ

Next steps:
1. Check training curves for convergence
2. Use inference script to test on new audio
3. Fine-tune hyperparameters if needed

Good luck with your production model! üöÄ


# Inference

In [16]:
"""
Enhanced Inference Script for DAC-VAE Audio Effect Generator
Handles variable-length audio and all common formats

FEATURES:
- No audiotools dependency
- Handles any audio length (not limited to 5 seconds)
- Supports all common formats (.wav, .mp3, .flac, .ogg, .m4a, etc.)
- Uses soundfile for robust cross-platform compatibility
- Chunk-based processing for long audio files
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import soundfile as sf
import numpy as np
import os
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
import argparse
from tqdm import tqdm

# DAC import
try:
    import dac
    print("‚úì DAC library imported successfully")
except ImportError:
    print("‚ùå DAC not installed. Run: pip install descript-audio-codec")
    exit(1)

#############################################
#     MODEL ARCHITECTURE (SAME AS TRAINING)
#############################################

class CrossAttention(nn.Module):
    """Cross-attention between audio latents and text embeddings"""
    
    def __init__(self, audio_dim, text_dim, n_heads=8):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (audio_dim // n_heads) ** -0.5
        self.to_q = nn.Linear(audio_dim, audio_dim)
        self.to_k = nn.Linear(text_dim, audio_dim)
        self.to_v = nn.Linear(text_dim, audio_dim)
        self.to_out = nn.Linear(audio_dim, audio_dim)
        
    def forward(self, x, context):
        B, C, T = x.shape
        x_flat = rearrange(x, 'b c t -> b t c')
        q = self.to_q(x_flat)
        k = self.to_k(context)
        v = self.to_v(context)
        q = rearrange(q, 'b t (h d) -> b h t d', h=self.n_heads)
        k = rearrange(k, 'b s (h d) -> b h s d', h=self.n_heads)
        v = rearrange(v, 'b s (h d) -> b h s d', h=self.n_heads)
        attn = torch.einsum('bhqd,bhkd->bhqk', q, k) * self.scale
        attn = F.softmax(attn, dim=-1)
        out = torch.einsum('bhqk,bhvd->bhqd', attn, v)
        out = rearrange(out, 'b h t d -> b t (h d)')
        out = self.to_out(out)
        return rearrange(out, 'b t c -> b c t')

class ResidualBlock(nn.Module):
    """Residual block with group normalization"""
    
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv1d(channels, channels, 3, padding=1)
        self.conv2 = nn.Conv1d(channels, channels, 3, padding=1)
        self.norm1 = nn.GroupNorm(8, channels)
        self.norm2 = nn.GroupNorm(8, channels)
        self.act = nn.SiLU()
        
    def forward(self, x):
        residual = x
        x = self.act(self.norm1(self.conv1(x)))
        x = self.act(self.norm2(self.conv2(x)))
        return x + residual

class DownBlock(nn.Module):
    """Downsampling block with optional cross-attention"""
    
    def __init__(self, in_c, out_c, text_dim=768, use_attn=False):
        super().__init__()
        self.use_attn = use_attn
        self.conv = nn.Conv1d(in_c, out_c, 3, padding=1)
        self.res1 = ResidualBlock(out_c)
        self.res2 = ResidualBlock(out_c)
        if use_attn:
            self.attn = CrossAttention(out_c, text_dim)
        self.downsample = nn.Conv1d(out_c, out_c, 4, stride=2, padding=1)
        
    def forward(self, x, text_emb=None):
        x = self.conv(x)
        x = self.res1(x)
        x = self.res2(x)
        if self.use_attn and text_emb is not None:
            x = x + self.attn(x, text_emb)
        skip = x
        x = self.downsample(x)
        return x, skip

class UpBlock(nn.Module):
    """Upsampling block with skip connections and optional cross-attention"""
    
    def __init__(self, in_c, out_c, skip_c, text_dim=768, use_attn=False):
        super().__init__()
        self.use_attn = use_attn
        self.upsample = nn.ConvTranspose1d(in_c, out_c, 4, stride=2, padding=1)
        self.conv = nn.Conv1d(out_c + skip_c, out_c, 3, padding=1)
        self.res1 = ResidualBlock(out_c)
        self.res2 = ResidualBlock(out_c)
        if use_attn:
            self.attn = CrossAttention(out_c, text_dim)
        
    def forward(self, x, skip, text_emb=None):
        x = self.upsample(x)
        if x.size(-1) != skip.size(-1):
            x = F.interpolate(x, size=skip.size(-1), mode='linear', align_corners=False)
        x = torch.cat([x, skip], dim=1)
        x = self.conv(x)
        x = self.res1(x)
        x = self.res2(x)
        if self.use_attn and text_emb is not None:
            x = x + self.attn(x, text_emb)
        return x

class LatentUNet(nn.Module):
    """UNet for manipulating DAC latent space"""
    
    def __init__(self, latent_channels, channels, text_dim=768):
        super().__init__()
        self.input_conv = nn.Conv1d(latent_channels, channels[0], 7, padding=3)
        
        self.down_blocks = nn.ModuleList()
        for i in range(len(channels) - 1):
            use_attn = i >= 2
            self.down_blocks.append(DownBlock(channels[i], channels[i+1], text_dim, use_attn))
        
        self.mid_block1 = ResidualBlock(channels[-1])
        self.mid_attn = CrossAttention(channels[-1], text_dim)
        self.mid_block2 = ResidualBlock(channels[-1])
        
        self.up_blocks = nn.ModuleList()
        for i in range(len(channels) - 1, 0, -1):
            use_attn = i >= 2
            self.up_blocks.append(
                UpBlock(channels[i], channels[i-1], channels[i], text_dim, use_attn)
            )
        
        self.output_conv = nn.Conv1d(channels[0], latent_channels, 7, padding=3)
        
    def forward(self, z, text_emb):
        original_length = z.size(-1)
        x = self.input_conv(z)
        
        skips = []
        for down in self.down_blocks:
            x, skip = down(x, text_emb)
            skips.append(skip)
        
        x = self.mid_block1(x)
        x = x + self.mid_attn(x, text_emb)
        x = self.mid_block2(x)
        
        for up in self.up_blocks:
            skip = skips.pop()
            x = up(x, skip, text_emb)
        
        x = self.output_conv(x)
        
        if x.size(-1) != original_length:
            x = F.interpolate(x, size=original_length, mode='linear', align_corners=False)
        
        return x

class AudioEffectModel(nn.Module):
    """Complete model: Text Encoder + UNet + DAC"""
    
    def __init__(self, dac_model, latent_channels, unet_channels, text_dim):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained("bert-base-uncased")
        self.dac = dac_model
        self.unet = LatentUNet(latent_channels, unet_channels, text_dim)
        
    @torch.no_grad()
    def generate(self, wav_in, prompt, sample_rate):
        """Generate audio with effect applied"""
        self.eval()
        
        # Ensure correct shape
        if wav_in.dim() == 2:
            wav_in = wav_in.unsqueeze(1)
        
        # Tokenize prompt
        tokens = tokenizer(
            [prompt],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(wav_in.device)
        
        # Encode text
        text_output = self.text_encoder(
            input_ids=tokens.input_ids,
            attention_mask=tokens.attention_mask
        )
        text_emb = text_output.last_hidden_state
        
        # Encode audio to latents
        z_in = self.dac.encoder(wav_in)
        
        # Process with UNet
        z_out = self.unet(z_in, text_emb)
        
        # Decode to waveform
        wav_out = self.dac.decoder(z_out)
        
        return wav_out

#############################################
#     ENHANCED INFERENCE CLASS
#############################################
class AudioEffectInference:
    def __init__(self, model_path, dac_model_path=None, device='cuda'):
        """Initialize inference pipeline"""
        self.device = device if torch.cuda.is_available() else 'cpu'
        
        print("="*60)
        print("LOADING MODEL FOR INFERENCE (NO AUDIOTOOLS)")
        print("="*60)
        
        # Load checkpoint
        print(f"Loading checkpoint from: {model_path}")
        ckpt = torch.load(model_path, map_location=self.device)
        
        # Get config
        config = ckpt['config']
        self.sample_rate = config['sample_rate']
        latent_channels = config['latent_channels']
        unet_channels = config['unet_channels']
        text_dim = config['text_dim']
        
        print(f"‚úì Sample rate: {self.sample_rate} Hz")
        print(f"‚úì Latent channels: {latent_channels}")
        print(f"‚úì UNet channels: {unet_channels}")
        
        # Load DAC model
        if dac_model_path is None:
            dac_model_path = "C:/Users/user/.cache/dac/weights_44khz_16kbps.pth"
        
        print(f"Loading DAC model from: {dac_model_path}")
        
        if not os.path.exists(dac_model_path):
            print(f"\n‚ùå DAC model not found at: {dac_model_path}")
            print("Please download it first!")
            exit(1)
        
        self.dac_model = dac.DAC.load(dac_model_path)
        self.dac_model = self.dac_model.to(self.device)
        self.dac_model.eval()
        print("‚úì DAC model loaded")
        
        # Create model
        self.model = AudioEffectModel(
            dac_model=self.dac_model,
            latent_channels=latent_channels,
            unet_channels=unet_channels,
            text_dim=text_dim
        ).to(self.device)
        
        # Load weights
        self.model.load_state_dict(ckpt['model'])
        self.model.eval()
        print("‚úì Model weights loaded")
        
        # Load tokenizer
        global tokenizer
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        print("‚úì Tokenizer loaded")
        
        print(f"‚úì Device: {self.device}")
        print("="*60 + "\n")
    
    def process_audio(self, input_path, output_path, prompt):
        """
        Process audio file with effect (Simple version - works for most cases)
        """
        import soundfile as sf
        
        print(f"Processing: {input_path}")
        print(f"Effect: '{prompt}'")
        
        # Load audio using soundfile (handles all formats)
        wav, sr = sf.read(input_path)
        wav = torch.from_numpy(wav).float()
        
        # Ensure correct shape: (channels, samples)
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        elif wav.dim() == 2 and wav.size(0) > wav.size(1):
            wav = wav.t()
        
        # Resample if needed
        if sr != self.sample_rate:
            print(f"Resampling from {sr} Hz to {self.sample_rate} Hz")
            wav = torchaudio.functional.resample(wav, sr, self.sample_rate)
        
        # Convert to mono
        if wav.size(0) > 1:
            print("Converting to mono")
            wav = wav.mean(dim=0, keepdim=True)
        
        # Add batch dimension and move to device
        wav = wav.unsqueeze(0).to(self.device)
        
        print(f"Input shape: {wav.shape}")
        print("Generating...")
        
        # Generate
        with torch.no_grad():
            wav_out = self.model.generate(wav, prompt, self.sample_rate)
        
        # Move to CPU and remove batch dimension
        wav_out = wav_out.squeeze(0).cpu()
        
        # Match original length
        current_length = wav_out.size(-1)
        target_length = wav.squeeze(0).size(-1)
        
        if current_length != target_length:
            print(f"Adjusting length: {current_length} -> {target_length}")
            if current_length > target_length:
                wav_out = wav_out[..., :target_length]
            else:
                wav_out = F.pad(wav_out, (0, target_length - current_length))
        
        print(f"Output shape: {wav_out.shape}")
        
        # Save using soundfile
        wav_out_np = wav_out.squeeze(0).numpy()
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
        sf.write(output_path, wav_out_np, self.sample_rate)
        
        print(f"‚úì Saved to: {output_path}\n")
    
    def batch_process(self, input_dir, output_dir, prompt):
        """Process all audio files in a directory"""
        os.makedirs(output_dir, exist_ok=True)
        
        # ‚úÖ Support all common audio formats
        audio_extensions = ['.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac', '.wma', '.aiff']
        audio_files = [
            f for f in os.listdir(input_dir)
            if os.path.splitext(f)[1].lower() in audio_extensions
        ]
        
        if not audio_files:
            print(f"‚ùå No audio files found in {input_dir}")
            return
        
        print(f"Found {len(audio_files)} audio files")
        print(f"Effect: '{prompt}'\n")
        
        for i, filename in enumerate(audio_files, 1):
            print(f"[{i}/{len(audio_files)}] Processing: {filename}")
            
            input_path = os.path.join(input_dir, filename)
            name, ext = os.path.splitext(filename)
            output_filename = f"{name}_processed.wav"  # ‚úÖ Always save as WAV
            output_path = os.path.join(output_dir, output_filename)
            
            try:
                self.process_audio(input_path, output_path, prompt)
            except Exception as e:
                print(f"‚ùå Error processing {filename}: {e}\n")
        
        print(f"\n‚úÖ Batch processing complete!")
        print(f"   Processed: {len(audio_files)} files")
        print(f"   Output directory: {output_dir}")
    


#############################################
#     MAIN FUNCTION
#############################################

def main():
    parser = argparse.ArgumentParser(
        description='Enhanced Audio Effect Generator Inference',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Single file (any length, any format)
  python inference_enhanced.py --model model_best.pt --input song.mp3 --output song_rain.wav --prompt "add rain sounds"
  
  # Batch processing
  python inference_enhanced.py --model model_best.pt --input audio_folder/ --output results/ --prompt "add birds chirping"
        """
    )
    
    parser.add_argument('--model', type=str, required=True,
                       help='Path to trained model checkpoint (.pt file)')
    parser.add_argument('--input', type=str, required=True,
                       help='Input audio file or directory')
    parser.add_argument('--output', type=str, required=True,
                       help='Output audio file or directory')
    parser.add_argument('--prompt', type=str, required=True,
                       help='Effect description (e.g., "add rain sounds")')
    parser.add_argument('--dac-model', type=str, default=None,
                       help='Path to DAC model weights')
    parser.add_argument('--device', type=str, default='cuda',
                       help='Device to use (cuda or cpu)')
    
    args = parser.parse_args()
    
    # Initialize inference
    try:
        inference = AudioEffectInference(args.model, args.dac_model, args.device)
    except Exception as e:
        print(f"‚ùå Failed to load model: {e}")
        return
    
    # Process
    if os.path.isfile(args.input):
        inference.process_audio(args.input, args.output, args.prompt)
    elif os.path.isdir(args.input):
        inference.batch_process(args.input, args.output, args.prompt)
    else:
        print(f"‚ùå Error: {args.input} is not a valid file or directory")

if __name__ == '__main__':
    main()

‚úì DAC library imported successfully


usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2

In [10]:
if __name__ == '__main__':
    # Example usage (can be run directly or via command line)
    
    # For direct usage in script:
    
    model_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt"
    input_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/22.wav"
    output_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/22_dogs.wav"
    effect_prompt = "add dogs sounds"
    
    inference = AudioEffectInference(model_path, device='cuda')
    inference.process_audio(input_audio, output_audio, effect_prompt)
    
    # For command line usage:
    main()

ENHANCED AUDIO EFFECT INFERENCE
Loading checkpoint from: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt


  ckpt = torch.load(model_path, map_location=self.device)


‚úì Sample rate: 44100 Hz
‚úì Max chunk length: 5.0s
‚úì Latent channels: 128
‚úì UNet channels: [64, 128, 256, 512]
Loading DAC model from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
‚úì DAC model loaded
‚úì Model weights loaded
‚úì Tokenizer loaded
‚úì Device: cuda

Processing: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/22.wav
Effect: 'add dogs sounds'
  Original: 22050 Hz, 5.00s
  Resampling: 22050 Hz ‚Üí 44100 Hz
  Input shape: torch.Size([1, 220500])
  Output shape: torch.Size([1, 220500])
‚úì Saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/22_dogs.wav



usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2

In [23]:
if __name__ == '__main__':
    # Example usage (can be run directly or via command line)
    
    # For direct usage in script:
    
    model_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt"
    input_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/zahra.wav"
    output_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/zahra_dogs.wav"
    effect_prompt = "add dogs sounds"
    
    inference = AudioEffectInference(model_path, device='cuda')
    inference.process_audio(input_audio, output_audio, effect_prompt)
    
    # For command line usage:
    main()

LOADING MODEL FOR INFERENCE (NO AUDIOTOOLS)
Loading checkpoint from: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt


  ckpt = torch.load(model_path, map_location=self.device)


‚úì Sample rate: 44100 Hz
‚úì Latent channels: 128
‚úì UNet channels: [64, 128, 256, 512]
Loading DAC model from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
‚úì DAC model loaded
‚úì Model weights loaded
‚úì Tokenizer loaded
‚úì Device: cuda

Processing: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/zahra.wav
Effect: 'add dogs sounds'
Resampling from 16000 Hz to 44100 Hz
Input shape: torch.Size([1, 1, 239808])
Generating...
Adjusting length: 239616 -> 239808
Output shape: torch.Size([1, 239808])
‚úì Saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/zahra_dogs.wav



usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2

In [27]:
if __name__ == '__main__':
    # Example usage (can be run directly or via command line)
    
    # For direct usage in script:
    
    model_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt"
    input_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/arabic_XBmfzfHL.wav"
    output_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/arabic_XBmfzfHL_dogs.wav"
    effect_prompt = "add dogs sounds"
    
    inference = AudioEffectInference(model_path, device='cuda')
    inference.process_audio(input_audio, output_audio, effect_prompt)
    
    # For command line usage:
    main()

LOADING MODEL FOR INFERENCE (NO AUDIOTOOLS)
Loading checkpoint from: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt


  ckpt = torch.load(model_path, map_location=self.device)


‚úì Sample rate: 44100 Hz
‚úì Latent channels: 128
‚úì UNet channels: [64, 128, 256, 512]
Loading DAC model from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
‚úì DAC model loaded


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000016909E4EDB0>: Failed to resolve \'huggingface.co\' ([Errno 11002] getaddrinfo failed)"))'), '(Request ID: dac7b205-75cf-439c-9a97-736d5ad35abd)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000016A1E199280>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: d396ccd6-ccbd-4248-92ae-5b4e3a76aa0c)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 2s [Retry 

‚úì Model weights loaded
‚úì Tokenizer loaded
‚úì Device: cuda

Processing: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/arabic_XBmfzfHL.wav
Effect: 'add dogs sounds'
Resampling from 48000 Hz to 44100 Hz
Converting to mono
Input shape: torch.Size([1, 1, 219618])
Generating...
Adjusting length: 219136 -> 219618
Output shape: torch.Size([1, 219618])
‚úì Saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/arabic_XBmfzfHL_dogs.wav



usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2

In [33]:
if __name__ == '__main__':
    # Example usage (can be run directly or via command line)
    
    # For direct usage in script:
    
    model_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt"
    input_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/english_OJCIvTNk.wav"
    output_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/english_OJCIvTNk_dogs.wav"
    effect_prompt = "add dogs sounds"
    
    inference = AudioEffectInference(model_path, device='cuda')
    inference.process_audio(input_audio, output_audio, effect_prompt)
    
    # For command line usage:
    main()

LOADING MODEL FOR INFERENCE (NO AUDIOTOOLS)
Loading checkpoint from: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt


  ckpt = torch.load(model_path, map_location=self.device)


‚úì Sample rate: 44100 Hz
‚úì Latent channels: 128
‚úì UNet channels: [64, 128, 256, 512]
Loading DAC model from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
‚úì DAC model loaded
‚úì Model weights loaded
‚úì Tokenizer loaded
‚úì Device: cuda

Processing: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/english_OJCIvTNk.wav
Effect: 'add dogs sounds'
Resampling from 48000 Hz to 44100 Hz
Converting to mono
Input shape: torch.Size([1, 1, 213003])
Generating...
Adjusting length: 212992 -> 213003
Output shape: torch.Size([1, 213003])
‚úì Saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/english_OJCIvTNk_dogs.wav



usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2

In [40]:
if __name__ == '__main__':
    # Example usage (can be run directly or via command line)
    
    # For direct usage in script:
    
    model_path = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt"
    input_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/french.wav"
    output_audio = "C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/french_dogs.wav"
    effect_prompt = "add dogs sounds"
    
    inference = AudioEffectInference(model_path, device='cuda')
    inference.process_audio(input_audio, output_audio, effect_prompt)
    
    # For command line usage:
    main()

LOADING MODEL FOR INFERENCE (NO AUDIOTOOLS)
Loading checkpoint from: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/model_best.pt


  ckpt = torch.load(model_path, map_location=self.device)


‚úì Sample rate: 44100 Hz
‚úì Latent channels: 128
‚úì UNet channels: [64, 128, 256, 512]
Loading DAC model from: C:/Users/user/.cache/dac/weights_44khz_16kbps.pth
‚úì DAC model loaded
‚úì Model weights loaded
‚úì Tokenizer loaded
‚úì Device: cuda

Processing: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/french.wav
Effect: 'add dogs sounds'
Resampling from 16000 Hz to 44100 Hz
Input shape: torch.Size([1, 1, 233634])
Generating...
Adjusting length: 233472 -> 233634
Output shape: torch.Size([1, 233634])
‚úì Saved to: C:/Users/user/Desktop/yassine/EchoMind/data/NEW_DATA/result_DAC/inference_40/french_dogs.wav



usage: ipykernel_launcher.py [-h] --model MODEL --input INPUT --output OUTPUT --prompt PROMPT [--dac-model DAC_MODEL]
                             [--device DEVICE]
ipykernel_launcher.py: error: the following arguments are required: --model, --input, --output, --prompt


SystemExit: 2