In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import defaultdict
import time
import warnings
import pandas as pd
import re
import math
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION - SCALABLE
# ============================================================================
# Global hyper-parameter table read by the models, data loader and training loop below.
CONFIG = {
    # Scalable Architecture Parameters
    'd_model': 256,   # width of token/position embeddings and of every transformer layer
    'n_heads': 8,     # attention heads; DialecticalAttention asserts divisibility by n_minds
    'n_layers': 4,    # number of stacked blocks (Standard and Dialectical models)
    'n_minds': 2,     # head groups in DialecticalAttention / personas in PersonaTransformer
    'd_ff': 512,      # hidden width of the feed-forward sub-layers
    'dropout': 0.1,
    
    # Dialectical Parameters
    # Compressed "thought" width is int(d_model * bottleneck_ratio)
    'bottleneck_ratio': 0.25,
    
    # Training
    'vocab_size': 5000,     # tokenizer vocabulary cap and number of embedding rows
    'max_seq_len': 128,     # encode() pads/truncates to this; also the position table size
    'batch_size': 8,
    'num_epochs': 20,       # also passed to CosineAnnealingLR as its period
    'learning_rate': 0.0005,
    
    # Data
    # Upper bounds only: load_data() takes the minimum of these and what is available
    'train_samples': 600,
    'test_samples': 150,
    
    # Device
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def print_separator(title="", char="=", width=70):
    """Print a horizontal rule, optionally with a centred title.

    Without a title, a single line of ``width`` copies of ``char`` is printed.
    With a title, a blank line is emitted first and the title is framed by two
    equal runs of ``char``; each run is ``(width - len(title) - 2) // 2`` long,
    so it collapses to nothing when the title is wider than ``width``.
    """
    if not title:
        print(char * width)
        return

    side = char * ((width - len(title) - 2) // 2)
    print(f"\n{side} {title} {side}")

def print_config():
    """Dump every CONFIG entry as an aligned ``key: value`` line between two rules."""
    print_separator("SCALABLE DIALECTICAL TRANSFORMER CONFIG")
    for name in CONFIG:
        # Keys are left-aligned in a 25-character column
        print(f"  {name:25s}: {CONFIG[name]}")
    print_separator()

def count_parameters(model):
    """Return ``(total, trainable)`` element counts over ``model.parameters()``.

    ``trainable`` only includes parameters whose ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable

# ============================================================================
# TOKENIZER
# ============================================================================

class SimpleTokenizer:
    """Whitespace/word-level tokenizer with a frequency-capped vocabulary.

    Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<START>`` and ``<END>``.
    Text is lower-cased and every character outside ``[a-z0-9]`` and
    whitespace is treated as a separator.
    """
    def __init__(self, vocab_size=5000):
        """Create a tokenizer that will never hand out ids >= ``vocab_size``."""
        self.vocab_size = vocab_size
        self.word2idx = {'<PAD>': 0, '<UNK>': 1, '<START>': 2, '<END>': 3}
        self.idx2word = {0: '<PAD>', 1: '<UNK>', 2: '<START>', 3: '<END>'}
        self.next_idx = 4
        
    def fit(self, texts):
        """Add the most frequent words of ``texts`` to the vocabulary.

        Words are added in descending frequency (ties keep first-seen order)
        until ``vocab_size`` ids are in use. The cap is enforced against the
        ids already assigned, so calling ``fit`` repeatedly cannot produce an
        id that falls outside an embedding table of ``vocab_size`` rows.
        Entries that are not strings (e.g. ``None`` or NaN cells) are skipped.
        """
        word_counts = defaultdict(int)
        for text in texts:
            if not isinstance(text, str):
                continue
            for word in self._tokenize(text):
                word_counts[word] += 1
        
        # sorted() is stable, so equally frequent words keep insertion order
        ranked = sorted(word_counts.items(), key=lambda item: -item[1])
        for word, _ in ranked:
            if self.next_idx >= self.vocab_size:
                break
            if word not in self.word2idx:
                self.word2idx[word] = self.next_idx
                self.idx2word[self.next_idx] = word
                self.next_idx += 1
                
    def _tokenize(self, text):
        """Lower-case ``text``, blank out non-alphanumerics and split on whitespace."""
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        return text.split()
    
    def encode(self, text, max_len):
        """Return exactly ``max_len`` ids for ``text``.

        Unknown words map to 1 (``<UNK>``); short inputs are right-padded with
        0 (``<PAD>``) and long inputs are truncated.
        """
        indices = [self.word2idx.get(w, 1) for w in self._tokenize(text)][:max_len]
        indices += [0] * (max_len - len(indices))
        return indices

# ============================================================================
# DIALECTICAL ATTENTION - CORE INNOVATION
# ============================================================================

class DialecticalAttention(nn.Module):
    """
    Multi-head attention split into "minds" that debate.
    
    Standard: All heads work together
    Dialectical: Heads split into minds -> debate -> merge

    Each mind owns ``n_heads // n_minds`` heads and its own QKV projection.
    After attending independently, every mind's output is compressed into a
    low-dimensional "thought"; the thoughts of all minds are concatenated and
    expanded into one shared insight, which is blended back into each mind's
    output through a learned sigmoid gate before the final output projection.

    The bottleneck width is read from the module-level ``CONFIG`` as
    ``int(d_model * CONFIG['bottleneck_ratio'])``.
    """
    def __init__(self, d_model, n_heads, n_minds=2, dropout=0.1):
        super().__init__()
        assert n_heads % n_minds == 0, "n_heads must be divisible by n_minds"
        
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_minds = n_minds
        self.heads_per_mind = n_heads // n_minds
        # NOTE(review): d_model is not checked for divisibility by n_heads; the
        # reshape in forward() presumably fails otherwise — confirm.
        self.d_head = d_model // n_heads
        
        # Each mind has Q, K, V projections
        # (one fused linear per mind producing 3 * d_model // n_minds features)
        self.mind_qkv = nn.ModuleList([
            nn.Linear(d_model, 3 * d_model // n_minds, bias=False)
            for _ in range(n_minds)
        ])
        
        # Output projection
        self.out_proj = nn.Linear(d_model, d_model, bias=False)
        
        # Mind-specific biases (creates perspectives)
        # One learned d_model vector per mind, added to the input in forward()
        self.mind_bias = nn.Parameter(torch.randn(n_minds, d_model) * 0.02)
        
        # Cross-mind debate
        # thought_compress is shared by all minds; thought_expand consumes the
        # concatenation of all minds' thoughts and returns one mind-sized vector
        bottleneck_dim = int(d_model * CONFIG['bottleneck_ratio'])
        self.thought_compress = nn.Linear(d_model // n_minds, bottleneck_dim)
        self.thought_expand = nn.Linear(bottleneck_dim * n_minds, d_model // n_minds)
        
        # Debate gate
        # Input is [mind output, shared insight]; output is a per-feature weight in (0, 1)
        self.debate_gate = nn.Sequential(
            nn.Linear(d_model // n_minds * 2, d_model // n_minds),
            nn.Sigmoid()
        )
        
        self.dropout = nn.Dropout(dropout)
        # Scaled dot-product factor 1 / sqrt(d_head)
        self.scale = self.d_head ** -0.5
        
    def forward(self, x, mask=None, return_mind_outputs=False):
        """Run the three-phase dialectical attention.

        Args:
            x: 3-D input unpacked as (batch, seq, features).
            mask: optional tensor; attention scores are set to -inf wherever
                ``mask == 0``. It has to broadcast against scores of shape
                (batch, heads_per_mind, seq, seq).
            return_mind_outputs: when True, also return the per-mind outputs
                (before the debate) and the compressed thoughts.

        Returns:
            ``output`` alone, or ``(output, mind_outputs, mind_thoughts)`` when
            ``return_mind_outputs`` is True; the two lists hold one tensor per mind.
        """
        batch_size, seq_len, _ = x.shape
        
        mind_outputs = []
        mind_thoughts = []
        
        # Phase 1: Each mind processes independently
        for mind_idx in range(self.n_minds):
            # Shift the input by this mind's learned bias, broadcast over batch and sequence
            x_mind = x + self.mind_bias[mind_idx].unsqueeze(0).unsqueeze(0)
            
            qkv = self.mind_qkv[mind_idx](x_mind)
            qkv = qkv.reshape(batch_size, seq_len, 3, self.heads_per_mind, self.d_head)
            # -> (3, batch, heads_per_mind, seq, d_head)
            qkv = qkv.permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]
            
            # Scores: (batch, heads_per_mind, seq, seq)
            attn = torch.matmul(q, k.transpose(-2, -1)) * self.scale
            
            if mask is not None:
                attn = attn.masked_fill(mask == 0, float('-inf'))
            
            attn = F.softmax(attn, dim=-1)
            attn = self.dropout(attn)
            
            out = torch.matmul(attn, v)
            # Merge this mind's heads back: (batch, seq, heads_per_mind * d_head)
            out = out.transpose(1, 2).reshape(batch_size, seq_len, -1)
            mind_outputs.append(out)
            
            thought = self.thought_compress(out)
            mind_thoughts.append(thought)
        
        # Phase 2: Cross-mind debate
        # All thoughts are concatenated on the feature axis and expanded into a
        # single insight tensor that every mind receives
        all_thoughts = torch.cat(mind_thoughts, dim=-1)
        shared_insight = self.thought_expand(all_thoughts)
        
        # Phase 3: Incorporate shared insight
        debated_outputs = []
        for mind_idx, out in enumerate(mind_outputs):
            gate_input = torch.cat([out, shared_insight], dim=-1)
            gate = self.debate_gate(gate_input)
            # Convex blend: gate -> 1 favours the shared insight, gate -> 0 the mind's own output
            debated = out * (1 - gate) + shared_insight * gate
            debated_outputs.append(debated)
        
        # Combine
        # Concatenating the minds on the feature axis feeds out_proj, which expects d_model features
        combined = torch.cat(debated_outputs, dim=-1)
        output = self.out_proj(combined)
        
        if return_mind_outputs:
            return output, mind_outputs, mind_thoughts
        return output

class DialecticalTransformerBlock(nn.Module):
    """Pre-norm residual block: dialectical attention followed by a GELU feed-forward net."""
    def __init__(self, d_model, n_heads, d_ff, n_minds=2, dropout=0.1):
        super().__init__()

        # LayerNorm is applied to the input of each sub-layer, not to its output
        self.attn_norm = nn.LayerNorm(d_model)
        self.ffn_norm = nn.LayerNorm(d_model)

        self.attention = DialecticalAttention(d_model, n_heads, n_minds, dropout)

        feed_forward = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*feed_forward)

    def forward(self, x, mask=None, return_details=False):
        """Apply both residual sub-layers to ``x``.

        Returns ``x`` alone, or ``(x, mind_outputs, mind_thoughts)`` when
        ``return_details`` is True, forwarding what the attention module reports.
        """
        attn_in = self.attn_norm(x)

        mind_outputs = mind_thoughts = None
        if return_details:
            attn_out, mind_outputs, mind_thoughts = self.attention(
                attn_in, mask, return_mind_outputs=True
            )
        else:
            attn_out = self.attention(attn_in, mask)

        # Residual connection around attention, then around the feed-forward net
        x = x + attn_out
        x = x + self.ffn(self.ffn_norm(x))

        if not return_details:
            return x
        return x, mind_outputs, mind_thoughts

# ============================================================================
# FULL DIALECTICAL TRANSFORMER
# ============================================================================

class DialecticalTransformer(nn.Module):
    """Stack of dialectical blocks with a mean-pooled scalar regression head.

    All sizes are taken from the module-level ``CONFIG``.
    """
    def __init__(self):
        super().__init__()
        self.name = "DialecticalTransformer"

        width = CONFIG['d_model']
        drop_p = CONFIG['dropout']

        self.token_embed = nn.Embedding(CONFIG['vocab_size'], width)
        self.pos_embed = nn.Embedding(CONFIG['max_seq_len'], width)

        blocks = []
        for _ in range(CONFIG['n_layers']):
            blocks.append(
                DialecticalTransformerBlock(
                    width, CONFIG['n_heads'], CONFIG['d_ff'], CONFIG['n_minds'], drop_p
                )
            )
        self.layers = nn.ModuleList(blocks)

        self.final_norm = nn.LayerNorm(width)

        self.head = nn.Sequential(
            nn.Linear(width, width),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(width, 1),
        )

        # Re-initialise every Linear / Embedding created above
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal(0, 0.02) weights for Linear and Embedding modules; zero Linear biases."""
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, x, return_details=False):
        """Map a 2-D tensor of token ids to one scalar prediction per row.

        Returns ``(output, [])`` by default, or
        ``(output, all_mind_outputs, all_mind_thoughts)`` with one list entry
        per layer when ``return_details`` is True.
        """
        _, seq_len = x.shape

        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.token_embed(x) + self.pos_embed(positions)

        # Lower-triangular (causal) mask with two leading broadcast axes
        mask = torch.ones(seq_len, seq_len, device=x.device).tril()[None, None]

        per_layer_outputs = []
        per_layer_thoughts = []

        for block in self.layers:
            if not return_details:
                h = block(h, mask)
                continue
            h, outs, thoughts = block(h, mask, return_details=True)
            per_layer_outputs.append(outs)
            per_layer_thoughts.append(thoughts)

        # Normalise, average over the sequence axis, then regress
        pooled = self.final_norm(h).mean(dim=1)
        output = self.head(pooled)

        if not return_details:
            return output, []
        return output, per_layer_outputs, per_layer_thoughts

# ============================================================================
# BASELINE MODELS
# ============================================================================

class StandardTransformer(nn.Module):
    """Baseline built on ``nn.TransformerEncoder`` with the same head as the other models.

    All sizes are taken from the module-level ``CONFIG``.
    """
    def __init__(self):
        super().__init__()
        self.name = "StandardTransformer"

        width = CONFIG['d_model']
        drop_p = CONFIG['dropout']

        self.token_embed = nn.Embedding(CONFIG['vocab_size'], width)
        self.pos_embed = nn.Embedding(CONFIG['max_seq_len'], width)

        layer_kwargs = dict(
            d_model=width,
            nhead=CONFIG['n_heads'],
            dim_feedforward=CONFIG['d_ff'],
            dropout=drop_p,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=CONFIG['n_layers'],
        )

        self.final_norm = nn.LayerNorm(width)

        self.head = nn.Sequential(
            nn.Linear(width, width),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(width, 1),
        )

    def forward(self, x, return_details=False):
        """Return ``(output, [])``; ``return_details`` is accepted but not used."""
        _, seq_len = x.shape

        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.token_embed(x) + self.pos_embed(positions)

        # Causal mask produced by the torch helper
        causal = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)

        h = self.final_norm(self.encoder(h, mask=causal))

        # Average over the sequence axis before the regression head
        return self.head(h.mean(dim=1)), []

class PersonaTransformer(nn.Module):
    """Baseline that re-applies one shared encoder layer once per learned persona.

    For each persona the sequence is shifted by a gated persona vector, passed
    position-wise through a GRU cell and then through the shared encoder layer.
    All sizes are taken from the module-level ``CONFIG``.
    """
    def __init__(self):
        super().__init__()
        self.name = "PersonaTransformer"

        width = CONFIG['d_model']
        n_personas = CONFIG['n_minds']
        drop_p = CONFIG['dropout']

        self.token_embed = nn.Embedding(CONFIG['vocab_size'], width)
        self.pos_embed = nn.Embedding(CONFIG['max_seq_len'], width)

        self.shared_layer = nn.TransformerEncoderLayer(
            d_model=width,
            nhead=CONFIG['n_heads'],
            dim_feedforward=CONFIG['d_ff'],
            dropout=drop_p,
            batch_first=True,
        )

        # Random start, immediately overwritten by an orthogonal initialisation
        self.personas = nn.Parameter(torch.randn(n_personas, width) * 0.1)
        nn.init.orthogonal_(self.personas)

        # Raw strengths; squashed with a sigmoid in forward()
        self.persona_strength = nn.Parameter(torch.ones(n_personas) * 0.3)
        self.hidden_update = nn.GRUCell(width, width)

        self.final_norm = nn.LayerNorm(width)

        self.head = nn.Sequential(
            nn.Linear(width, width),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(width, 1),
        )

    def forward(self, x, return_details=False):
        """Return ``(output, thoughts)``.

        ``thoughts`` holds one detached, sequence-averaged tensor per persona
        pass. ``return_details`` is accepted but not used.
        """
        batch_size, seq_len = x.shape
        width = CONFIG['d_model']

        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.token_embed(x) + self.pos_embed(positions)

        # One GRU state per (batch, position) pair
        state = torch.zeros(batch_size * seq_len, width, device=x.device)

        thoughts = []

        for idx in range(CONFIG['n_minds']):
            gated_persona = self.personas[idx] * torch.sigmoid(self.persona_strength[idx])
            shifted = h + gated_persona[None, None, :]

            state = self.hidden_update(shifted.reshape(-1, width), state)

            h = self.shared_layer(state.reshape(batch_size, seq_len, width))

            thoughts.append(h.mean(dim=1).detach())

            # The encoder output seeds the GRU state of the next persona pass
            state = h.reshape(-1, width)

        pooled = self.final_norm(h).mean(dim=1)
        return self.head(pooled), thoughts

# ============================================================================
# METRICS
# ============================================================================

def compute_mind_diversity(mind_outputs):
    """Return one minus the mean pairwise cosine similarity between minds.

    ``mind_outputs`` is a sequence with one entry per layer; each entry is a
    sequence of per-mind tensors or ``None`` (skipped). Every tensor is
    flattened to ``(size(0), -1)`` before comparison. Returns ``0.0`` when
    there is nothing to compare.
    """
    if mind_outputs is None or len(mind_outputs) == 0:
        return 0.0

    similarity_sum = 0
    n_pairs = 0

    for minds in mind_outputs:
        if minds is None:
            continue
        # Visit each unordered pair (i < j) once
        for i, left in enumerate(minds):
            left_flat = left.reshape(left.size(0), -1)
            for right in minds[i + 1:]:
                right_flat = right.reshape(right.size(0), -1)
                similarity_sum += F.cosine_similarity(left_flat, right_flat).mean().item()
                n_pairs += 1

    if not n_pairs:
        return 0.0

    return 1 - (similarity_sum / n_pairs)

def compute_convergence(mind_thoughts):
    """Return the change in mean pairwise mind similarity from first to last layer.

    ``mind_thoughts`` holds one sequence of per-mind tensors per layer. The
    result is positive when the minds are more alike (cosine similarity of the
    flattened tensors) in the last layer than in the first. Returns ``0.0``
    for fewer than two layers, a ``None`` first/last layer, or fewer than two minds.
    """
    if mind_thoughts is None or len(mind_thoughts) < 2:
        return 0.0

    early, late = mind_thoughts[0], mind_thoughts[-1]
    if early is None or late is None:
        return 0.0

    def flat_similarity(a, b):
        # Mean cosine similarity after flattening everything but axis 0
        a_flat = a.reshape(a.size(0), -1)
        b_flat = b.reshape(b.size(0), -1)
        return F.cosine_similarity(a_flat, b_flat).mean().item()

    early_total = 0
    late_total = 0
    n_pairs = 0
    n_minds = len(early)

    for i in range(n_minds):
        for j in range(i + 1, n_minds):
            early_total += flat_similarity(early[i], early[j])
            late_total += flat_similarity(late[i], late[j])
            n_pairs += 1

    if not n_pairs:
        return 0.0

    return (late_total - early_total) / n_pairs

def compute_metrics(pred, target, mind_outputs, mind_thoughts):
    """Bundle regression errors and mind statistics for one batch.

    Returns a dict with keys ``mse``, ``mae``, ``diversity`` and
    ``convergence``, all plain Python floats.
    """
    return {
        'mse': F.mse_loss(pred, target).item(),
        'mae': F.l1_loss(pred, target).item(),
        'diversity': compute_mind_diversity(mind_outputs),
        'convergence': compute_convergence(mind_thoughts),
    }

# ============================================================================
# DATA LOADING
# ============================================================================

def load_data():
    """Load the arithmetic prompt dataset, or fall back to synthetic data.

    The spreadsheet columns ``prompt_format_one`` / ``prompt_one_answer`` are
    tokenised and converted to float targets; rows missing either value are
    dropped, and answers that cannot be parsed as a float are given the
    target 0.0. Targets are standardised with the mean/std of all kept rows.

    If anything in the real-data path fails (missing file, missing column,
    no usable rows, ...), the error is printed and random token sequences are
    generated instead, with the standardised mean token id as target.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` tensors on ``CONFIG['device']``;
        X is integer token ids, Y is float32 with a trailing axis of size 1.
    """
    print_separator("LOADING DATA")
    device = CONFIG['device']
    
    try:
        df = pd.read_excel('/kaggle/input/chain-of-thoughts-for-arithmetic-prompts/cot_arithmatic_data.xlsx')
        print(f"  Loaded {len(df)} samples")
        
        problems = df['prompt_format_one'].tolist()
        answers = df['prompt_one_answer'].tolist()
        
        # Fit only on prompts that are present: a NaN cell is a float and would
        # otherwise raise inside the tokenizer and silently trigger the
        # synthetic fallback below.
        tokenizer = SimpleTokenizer(vocab_size=CONFIG['vocab_size'])
        tokenizer.fit([str(prob) for prob in problems if pd.notna(prob)])
        
        X, Y = [], []
        for prob, ans in zip(problems, answers):
            if not (pd.notna(prob) and pd.notna(ans)):
                continue
            try:
                target = float(ans)
            except (TypeError, ValueError):
                # Non-numeric answer: keep the row with a neutral target
                target = 0.0
            X.append(tokenizer.encode(str(prob), CONFIG['max_seq_len']))
            Y.append(target)
        
        if not X:
            # Handled by the fallback below rather than producing empty/NaN tensors
            raise ValueError("no rows with both a prompt and an answer")
        
        X = torch.tensor(X, dtype=torch.long)
        Y = torch.tensor(Y, dtype=torch.float32)
        Y = (Y - Y.mean()) / (Y.std() + 1e-8)
        Y = Y.unsqueeze(1)
        
        # Sequential split: at most 80% of the rows for training, the rest
        # (capped by test_samples) for testing
        n_train = min(CONFIG['train_samples'], int(len(X) * 0.8))
        n_test = min(CONFIG['test_samples'], len(X) - n_train)
        
        print(f"  Train: {n_train}, Test: {n_test}")
        
        return (X[:n_train].to(device), 
                Y[:n_train].to(device),
                X[n_train:n_train+n_test].to(device), 
                Y[n_train:n_train+n_test].to(device))
        
    except Exception as e:
        # Deliberate best-effort: the script must stay runnable without the dataset
        print(f"  Error: {e}, using synthetic data")
        n_train = CONFIG['train_samples']
        total = n_train + CONFIG['test_samples']
        X = torch.randint(1, CONFIG['vocab_size'], (total, CONFIG['max_seq_len']))
        Y = X.float().mean(dim=1, keepdim=True)
        # Same epsilon as the real-data branch guards against a zero std
        Y = (Y - Y.mean()) / (Y.std() + 1e-8)
        
        return (X[:n_train].to(device),
                Y[:n_train].to(device),
                X[n_train:].to(device),
                Y[n_train:].to(device))

# ============================================================================
# TRAINING - FIXED
# ============================================================================

def train_epoch(model, optimizer, X, Y):
    """Run one pass over ``(X, Y)`` in order and return the mean batch MSE.

    Mini-batches of ``CONFIG['batch_size']`` consecutive rows are used; a
    trailing batch with fewer than two rows is skipped. Gradients are clipped
    to a total norm of 1.0 before every optimizer step.
    """
    model.train()
    step = CONFIG['batch_size']
    running_loss = 0
    batches_seen = 0

    for start in range(0, len(X), step):
        stop = start + step
        batch_x, batch_y = X[start:stop], Y[start:stop]

        if len(batch_x) < 2:
            continue

        optimizer.zero_grad()

        # Models return a tuple; only the prediction is needed for the loss
        pred, _ = model(batch_x, return_details=False)
        loss = F.mse_loss(pred, batch_y)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    return running_loss / max(batches_seen, 1)

def evaluate(model, X, Y):
    """Evaluate ``model`` on ``(X, Y)`` and return the mean of per-batch metrics.

    Batches of ``CONFIG['batch_size']`` consecutive rows are scored with
    ``compute_metrics``; a trailing batch with fewer than two rows is skipped.
    Models exposing ``layers[0].attention`` are asked for their per-mind
    details so diversity/convergence can be computed; for all other models
    those two metrics are computed from ``None``.

    Returns:
        dict mapping each metric name to the unweighted mean over batches
        (empty if no batch was evaluated).
    """
    model.eval()
    all_metrics = defaultdict(list)
    batch_size = CONFIG['batch_size']
    
    # Decide once whether the model can report per-mind details
    layers = getattr(model, 'layers', None)
    has_minds = (
        layers is not None
        and len(layers) > 0
        and hasattr(layers[0], 'attention')
    )
    
    with torch.no_grad():
        for i in range(0, len(X), batch_size):
            batch_x = X[i:i+batch_size]
            batch_y = Y[i:i+batch_size]
            
            if len(batch_x) < 2:
                continue
            
            if has_minds:
                # One forward pass yields both the prediction and the details;
                # in eval mode a second pass would return the same prediction.
                pred, mind_outputs, mind_thoughts = model(batch_x, return_details=True)
            else:
                pred, _ = model(batch_x, return_details=False)
                mind_outputs, mind_thoughts = None, None
            
            metrics = compute_metrics(pred, batch_y, mind_outputs, mind_thoughts)
            for k, v in metrics.items():
                all_metrics[k].append(v)
    
    return {k: np.mean(v) for k, v in all_metrics.items()}

# ============================================================================
# MAIN ARENA
# ============================================================================

def run_scalable_arena():
    """Train and compare the three models, printing a full report.

    Loads the data, builds the Standard, Persona and Dialectical models,
    trains each for ``CONFIG['num_epochs']`` epochs with AdamW and a cosine
    learning-rate schedule (stepped once per epoch), evaluates on the test
    split after every epoch, then prints a result table, the relative
    improvement over the Standard model and a text learning curve.

    Returns:
        dict keyed by model name with ``best_mse`` (lowest test MSE seen),
        ``history`` (one entry per epoch) and ``final`` (metrics of the last
        epoch, or an empty dict if no epoch ran).
    """
    device = CONFIG['device']
    
    print_separator("SCALABLE DIALECTICAL TRANSFORMER ARENA", "=", 70)
    print_config()
    
    X_train, Y_train, X_test, Y_test = load_data()
    
    print_separator("MODEL INITIALIZATION")
    models = {
        'Standard': StandardTransformer(),
        'Persona': PersonaTransformer(),
        'Dialectical': DialecticalTransformer()
    }
    
    for name, model in models.items():
        # nn.Module.to() moves the parameters in place, so once is enough
        model.to(device)
        params, _ = count_parameters(model)
        print(f"  {name}: {params:,} params")
    
    all_results = {}
    
    for model_name, model in models.items():
        print_separator(f"TRAINING: {model_name}", "-", 70)
        
        optimizer = optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=0.01)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, CONFIG['num_epochs'])
        
        best_mse = float('inf')
        history = []
        
        for epoch in range(CONFIG['num_epochs']):
            train_loss = train_epoch(model, optimizer, X_train, Y_train)
            test_metrics = evaluate(model, X_test, Y_test)
            
            scheduler.step()  # Step once per epoch, not per batch
            
            if test_metrics['mse'] < best_mse:
                best_mse = test_metrics['mse']
            
            history.append({'epoch': epoch + 1, 'train_loss': train_loss, 'test': test_metrics})
            
            div = test_metrics.get('diversity', 0)
            conv = test_metrics.get('convergence', 0)
            conv_str = "✓" if conv > 0 else "✗"
            
            print(f"  Epoch {epoch+1:2d} | Loss: {train_loss:.4f} | "
                  f"MSE: {test_metrics['mse']:.4f} | "
                  f"Div: {div:.3f} | Conv: {conv:.3f} {conv_str}")
        
        all_results[model_name] = {
            'best_mse': best_mse,
            'history': history,
            # Taken from history so a zero-epoch run does not reference an
            # unset (or another model's) test_metrics
            'final': history[-1]['test'] if history else {}
        }
    
    print_separator("FINAL RESULTS", "=", 70)
    
    print(f"\n{'Model':<20} {'Best MSE':>12} {'Diversity':>12} {'Convergence':>12}")
    print("-" * 60)
    
    for name, res in all_results.items():
        div = res['final'].get('diversity', 0)
        conv = res['final'].get('convergence', 0)
        print(f"{name:<20} {res['best_mse']:>12.6f} {div:>12.4f} {conv:>12.4f}")
    
    print_separator("ANALYSIS", "-", 70)
    
    best_model = min(all_results.keys(), key=lambda k: all_results[k]['best_mse'])
    baseline_mse = all_results['Standard']['best_mse']
    
    print(f"\n  🏆 WINNER: {best_model}")
    print(f"     MSE: {all_results[best_model]['best_mse']:.6f}")
    
    print(f"\n  Improvements over Standard Transformer:")
    for name, res in all_results.items():
        if name == 'Standard':
            continue
        if baseline_mse == 0:
            # A relative improvement is undefined against a zero baseline
            print(f"    {name}: n/a (baseline MSE is 0)")
            continue
        imp = (baseline_mse - res['best_mse']) / baseline_mse * 100
        print(f"    {name}: {'+' if imp > 0 else ''}{imp:.1f}%")
    
    print_separator("LEARNING CURVES", "-", 70)
    
    for name, res in all_results.items():
        mses = [h['test']['mse'] for h in res['history']]
        if not mses:
            continue
        min_mse, max_mse = min(mses), max(mses)
        range_mse = max_mse - min_mse if max_mse > min_mse else 1
        
        print(f"\n  {name}:")
        for i, mse in enumerate(mses[:10]):  # Show first 10 epochs
            # Longer bar = lower MSE, scaled to the model's own min/max
            norm = (mse - min_mse) / range_mse
            bar_len = int((1 - norm) * 25)
            bar = "█" * bar_len + "░" * (25 - bar_len)
            print(f"    E{i+1:2d}: {bar} {mse:.6f}")
    
    print_separator("SCALING GUIDE", "=", 70)
    print("""
  🚀 To scale this architecture to LLM size:
  
  Current (Proof of Concept):
    d_model=256, n_layers=4, n_heads=8, n_minds=2
    ~3.7M params
  
  Medium (GPT-2 scale):
    d_model=768, n_layers=12, n_heads=12, n_minds=2
    ~117M params
  
  Large (GPT-3 Small scale):
    d_model=1024, n_layers=24, n_heads=16, n_minds=2
    ~350M params
  
  XL (GPT-3 scale):
    d_model=4096, n_layers=32, n_heads=32, n_minds=4
    ~1.3B params
  
  💡 Key: Dialectical overhead is only ~10%
     You get "thinking" almost for free!
    """)
    
    print_separator("COMPLETE", "=", 70)
    
    return all_results

# ============================================================================
# RUN
# ============================================================================

# Script entry point: runs the full comparison when executed directly.
if __name__ == "__main__":
    # Seed the torch and NumPy global RNGs before any model or data is created
    torch.manual_seed(42)
    np.random.seed(42)
    
    results = run_scalable_arena()



  d_model                  : 256
  n_heads                  : 8
  n_layers                 : 4
  n_minds                  : 2
  d_ff                     : 512
  dropout                  : 0.1
  bottleneck_ratio         : 0.25
  vocab_size               : 5000
  max_seq_len              : 128
  batch_size               : 8
  num_epochs               : 20
  learning_rate            : 0.0005
  train_samples            : 600
  test_samples             : 150
  device                   : cuda

  Loaded 100 samples
  Train: 68, Test: 18

  Standard: 3,487,745 params
  Persona: 2,301,699 params
  Dialectical: 3,716,353 params

------------------------- TRAINING: Standard -------------------------
  Epoch  1 | Loss: 1.3704 | MSE: 0.0010 | Div: 0.000 | Conv: 0.000 ✗
  Epoch  2 | Loss: 1.1941 | MSE: 0.0099 | Div: 0.000 | Conv: 0.000 ✗
  Epoch  3 | Loss: 1.1741 | MSE: 0.0067 | Div: 0.000 | Conv: 0.000 ✗
  Epoch  4 | Loss: 1.1180 | MSE: 0.0059 | Div: 0.000 | Conv: 0.000 ✗
  Epoch  5 | Los