# M-RMS (Multi-Agent Risk Management System) Training - Google Colab

This notebook trains the M-RMS agent ensemble consisting of:
- **PositionSizer**: Determines optimal position sizes based on risk
- **StopLossAgent**: Sets dynamic stop-loss levels
- **TakeProfitAgent**: Optimizes take-profit targets
- **RiskCoordinator**: Ensemble coordinator for unified risk decisions

Optimized for Google Colab Pro with GPU support and 24-hour runtime.

## 1. Environment Setup & Imports

In [None]:
# Mount Google Drive and setup environment
import sys
import os
from pathlib import Path

# Detect the runtime: the google.colab package can only be imported inside Colab.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
    print("💻 Running locally")
else:
    IN_COLAB = True
    print("🚀 Running in Google Colab")

# Pick the project root. In Colab it lives on the mounted Drive; locally it is
# two directory levels above the current working directory.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_PATH = Path('/content/drive/MyDrive/AlgoSpace-8')
else:
    PROJECT_PATH = Path.cwd().parent.parent

# Put the project root first on sys.path so project packages can be imported.
sys.path.insert(0, str(PROJECT_PATH))

In [None]:
# Install required packages
# Only runs in Colab (IN_COLAB is set by the environment-detection cell).
# The `!` lines are notebook shell escapes, so this cell is not plain Python.
# PyTorch is pulled from the CUDA 11.8 wheel index; `-q` keeps pip output quiet.
if IN_COLAB:
    !pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    !pip install -q numpy pandas h5py pyyaml tensorboard wandb optuna mlflow
    !pip install -q tqdm matplotlib seaborn scikit-learn psutil gputil

In [None]:
# Core imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
from datetime import datetime
import json
import yaml
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
import logging
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Setup imports
from notebooks.utils.colab_setup import ColabSetup, SessionMonitor, setup_colab_training
from notebooks.utils.drive_manager import DriveManager, DataStreamer
from notebooks.utils.checkpoint_manager import CheckpointManager, CheckpointScheduler

# Configure logging
# Root logging is configured at INFO level; `logger` is the logger named after
# this module (`__name__`).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Select the compute device and, in Colab, create the session helpers.
# NOTE: drive_manager, checkpoint_manager and session_monitor are only defined
# when IN_COLAB is true, so any later use of them must be guarded by IN_COLAB.
if not IN_COLAB:
    # Local run: use CUDA when available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    colab_setup = setup_colab_training(
        project_name="AlgoSpace-8",
        mount_drive=True,
        setup_wandb=True,
        keep_alive=True,
    )

    # Drive-backed checkpointing plus a runtime monitor capped at 23.5 hours.
    drive_manager = DriveManager(str(PROJECT_PATH))
    checkpoint_manager = CheckpointManager(drive_manager)
    session_monitor = SessionMonitor(max_runtime_hours=23.5)

    # The device is whatever the Colab setup object reports.
    device = colab_setup.device

print(f"🎮 Using device: {device}")

## 2. M-RMS Agent Architectures

In [None]:
@dataclass
class MRMSConfig:
    """Hyperparameters shared by the M-RMS agents, coordinator and trainer.

    Field order is part of the interface because the dataclass constructor
    accepts the fields positionally in declaration order.
    """

    # --- Network dimensions -------------------------------------------------
    state_dim: int = 256        # size of the input state vector (Main Core embeddings)
    hidden_dim: int = 128       # width of the agents' hidden layers
    action_dim: int = 10        # number of discretized risk levels per agent

    # --- Training parameters ------------------------------------------------
    learning_rate: float = 0.0001
    batch_size: int = 256       # default replay batch size
    gamma: float = 0.99         # discount applied to the next-state value
    tau: float = 0.005          # blend factor for soft target-network updates

    # --- Risk limits (fractions, e.g. 0.1 == 10%) ---------------------------
    max_position_size: float = 0.1
    max_stop_loss: float = 0.02
    max_take_profit: float = 0.05

    # --- Replay memory ------------------------------------------------------
    memory_size: int = 100_000      # capacity of the replay memory
    min_memory_size: int = 10_000   # transitions required before training starts

    # --- Ensemble coordinator -----------------------------------------------
    ensemble_hidden_dim: int = 64
    coordination_weight: float = 0.3

In [None]:
class PositionSizer(nn.Module):
    """Agent that turns a state vector into per-level position sizes.

    A shared trunk feeds two 32-feature sigmoid heads (volatility and
    confidence). Trunk features and both head outputs are concatenated and
    mapped to a softmax over ``config.action_dim`` levels, which is then
    scaled by ``config.max_position_size``.
    """

    def __init__(self, config: MRMSConfig):
        super().__init__()
        self.config = config
        width = config.hidden_dim

        # Shared trunk: two Linear -> ReLU -> BatchNorm stages, dropout after the first.
        self.feature_net = nn.Sequential(
            nn.Linear(config.state_dim, width),
            nn.ReLU(),
            nn.BatchNorm1d(width),
            nn.Dropout(0.1),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.BatchNorm1d(width),
        )

        # Risk heads, each producing 32 features.
        self.volatility_head = nn.Linear(width, 32)
        self.confidence_head = nn.Linear(width, 32)

        # The sizing network consumes the trunk features plus both head outputs.
        head_width = self.volatility_head.out_features + self.confidence_head.out_features
        self.position_net = nn.Sequential(
            nn.Linear(width + head_width, width),
            nn.ReLU(),
            nn.Linear(width, config.action_dim),
            nn.Softmax(dim=-1),
        )

    def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Compute position sizes and risk metrics for a batch of states.

        Returns:
            A tuple ``(position_sizes, metrics)``. ``position_sizes`` is the
            softmax output multiplied by ``max_position_size``. ``metrics``
            holds the per-sample mean of each risk head under 'volatility'
            and 'confidence', and the raw softmax under 'position_probs'.
        """
        trunk = self.feature_net(state)

        # Both heads are squashed into (0, 1).
        vol = self.volatility_head(trunk).sigmoid()
        conf = self.confidence_head(trunk).sigmoid()

        probs = self.position_net(torch.cat((trunk, vol, conf), dim=-1))
        sizes = probs * self.config.max_position_size

        return sizes, {
            'volatility': vol.mean(dim=-1),
            'confidence': conf.mean(dim=-1),
            'position_probs': probs,
        }

In [None]:
class StopLossAgent(nn.Module):
    """Sets dynamic stop-loss levels based on market conditions.

    The state is encoded into 64 market features, from which a 16-feature
    sigmoid volatility score is derived. Both are concatenated and mapped to a
    softmax over ``config.action_dim`` levels, scaled by
    ``config.max_stop_loss``.
    """

    def __init__(self, config: "MRMSConfig"):
        super().__init__()
        self.config = config

        # Market condition analyzer
        self.market_analyzer = nn.Sequential(
            nn.Linear(config.state_dim, config.hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_dim),
            nn.Linear(config.hidden_dim, 64),
            nn.ReLU()
        )

        # Volatility-based adjustment
        self.volatility_net = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.Sigmoid()  # 0-1 volatility score
        )

        # Stop-loss level predictor
        self.stop_loss_net = nn.Sequential(
            nn.Linear(64 + 16, config.hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(config.hidden_dim, config.action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, state: torch.Tensor, position_size: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Compute stop-loss levels for a batch of states.

        Args:
            state: Batch of state vectors.
            position_size: Optional position sizes used to tighten the stops.
                May hold one value per sample (``(batch,)`` or ``(batch, 1)``)
                or have the same shape as the stop-loss levels.

        Returns:
            A tuple ``(stop_loss_levels, metrics)``. ``metrics`` holds the
            per-sample mean volatility score ('volatility_score'), the raw
            softmax ('stop_loss_probs') and the final levels
            ('adjusted_stops').
        """
        # Analyze market conditions
        market_features = self.market_analyzer(state)
        volatility_score = self.volatility_net(market_features)

        # Combine features for stop-loss decision
        combined = torch.cat([market_features, volatility_score], dim=-1)
        stop_loss_probs = self.stop_loss_net(combined)

        # Convert to actual stop-loss percentages
        stop_loss_levels = stop_loss_probs * self.config.max_stop_loss

        # Adjust based on position size if provided
        if position_size is not None:
            # A per-sample (batch,) tensor must be aligned with the
            # (batch, action_dim) levels. Without the trailing axis the
            # multiplication fails for most batch sizes, or silently scales
            # along the action axis when batch == action_dim.
            if position_size.dim() == stop_loss_levels.dim() - 1:
                position_size = position_size.unsqueeze(-1)

            # Tighter stops for larger positions (up to 30% tighter at max size)
            size_factor = 1.0 - (position_size / self.config.max_position_size) * 0.3
            stop_loss_levels = stop_loss_levels * size_factor

        metrics = {
            'volatility_score': volatility_score.mean(dim=-1),
            'stop_loss_probs': stop_loss_probs,
            'adjusted_stops': stop_loss_levels
        }

        return stop_loss_levels, metrics

In [None]:
class TakeProfitAgent(nn.Module):
    """Optimizes take-profit targets based on market momentum.

    The state is encoded into 64 momentum features, from which a 16-feature
    tanh trend score is derived. Both are concatenated and mapped to a softmax
    over ``config.action_dim`` levels, scaled by ``config.max_take_profit``.
    """

    def __init__(self, config: "MRMSConfig"):
        super().__init__()
        self.config = config

        # Momentum analyzer
        self.momentum_net = nn.Sequential(
            nn.Linear(config.state_dim, config.hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_dim),
            nn.Linear(config.hidden_dim, 64),
            nn.ReLU()
        )

        # Trend strength estimator
        self.trend_net = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.Tanh()  # -1 to 1 trend score
        )

        # Take-profit predictor
        self.profit_net = nn.Sequential(
            nn.Linear(64 + 16, config.hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(config.hidden_dim, config.action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, state: torch.Tensor, stop_loss: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """Compute take-profit levels for a batch of states.

        Args:
            state: Batch of state vectors.
            stop_loss: Optional stop-loss values used to enforce a minimum
                2:1 risk-reward ratio. May hold one value per sample
                (``(batch,)`` or ``(batch, 1)``) or have the same shape as
                the take-profit levels.

        Returns:
            A tuple ``(take_profit_levels, metrics)``. ``metrics`` holds the
            per-sample mean trend score ('trend_score'), the raw softmax
            ('profit_probs') and the mean take-profit / stop-loss ratio
            ('risk_reward_ratio', ``None`` when no stop-loss is given).
        """
        # Analyze momentum
        momentum_features = self.momentum_net(state)
        trend_score = self.trend_net(momentum_features)

        # Combine for profit target decision
        combined = torch.cat([momentum_features, trend_score], dim=-1)
        profit_probs = self.profit_net(combined)

        # Convert to actual take-profit percentages
        take_profit_levels = profit_probs * self.config.max_take_profit

        risk_reward_ratio = None
        if stop_loss is not None:
            # A per-sample (batch,) tensor must be aligned with the
            # (batch, action_dim) levels. Without the trailing axis the
            # element-wise ops fail for most batch sizes, or silently pair
            # stops with action levels when batch == action_dim.
            if stop_loss.dim() == take_profit_levels.dim() - 1:
                stop_loss = stop_loss.unsqueeze(-1)

            # Ensure minimum risk-reward ratio
            min_rr_ratio = 2.0  # 2:1 risk-reward minimum
            min_profit = stop_loss * min_rr_ratio
            take_profit_levels = torch.maximum(take_profit_levels, min_profit)

            # The epsilon keeps the ratio finite when the stop-loss is zero.
            risk_reward_ratio = (take_profit_levels / (stop_loss + 1e-6)).mean()

        metrics = {
            'trend_score': trend_score.mean(dim=-1),
            'profit_probs': profit_probs,
            'risk_reward_ratio': risk_reward_ratio
        }

        return take_profit_levels, metrics

In [None]:
class RiskCoordinator(nn.Module):
    """Combines the three agents' outputs into one set of risk parameters.

    Each agent output is projected to ``config.ensemble_hidden_dim``, the
    three projections attend to one another, and a small network turns the
    result into one sigmoid scaling factor per agent.
    """

    def __init__(self, config: MRMSConfig):
        super().__init__()
        self.config = config
        embed = config.ensemble_hidden_dim

        # One projection per agent output.
        self.position_processor = nn.Linear(config.action_dim, embed)
        self.stop_processor = nn.Linear(config.action_dim, embed)
        self.profit_processor = nn.Linear(config.action_dim, embed)

        # Attention across the three agent embeddings.
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=embed,
            num_heads=4,
            dropout=0.1,
        )

        # Maps the three attended embeddings to three scaling logits.
        self.decision_net = nn.Sequential(
            nn.Linear(embed * 3, config.hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_dim),
            nn.Linear(config.hidden_dim, embed),
            nn.ReLU(),
            nn.Linear(embed, 3),  # one factor per agent
        )

    def forward(self, position_output: torch.Tensor, stop_output: torch.Tensor, 
                profit_output: torch.Tensor) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        """Scale the agent outputs and reduce each to a single value per sample.

        Returns:
            A tuple ``(final_params, attention_weights)``. ``final_params``
            maps 'position_size', 'stop_loss' and 'take_profit' to the
            per-sample maximum of the corresponding scaled output.
        """
        # Build a length-3 sequence of agent embeddings: (seq_len, batch, features).
        tokens = torch.stack(
            (
                self.position_processor(position_output),
                self.stop_processor(stop_output),
                self.profit_processor(profit_output),
            ),
            dim=0,
        )

        attended, attention_weights = self.cross_attention(tokens, tokens, tokens)

        # Back to batch-first, then flatten the three embeddings per sample.
        batch = position_output.size(0)
        flat = attended.transpose(0, 1).reshape(batch, -1)
        factors = torch.sigmoid(self.decision_net(flat))

        # One (batch, 1) scaling column per agent.
        pos_scale, stop_scale, profit_scale = factors.split(1, dim=-1)
        scaled = {
            'position_size': position_output * pos_scale,
            'stop_loss': stop_output * stop_scale,
            'take_profit': profit_output * profit_scale,
        }

        final_params = {key: value.max(dim=-1)[0] for key, value in scaled.items()}
        return final_params, attention_weights

## 3. M-RMS Ensemble Training System

In [None]:
class MRMSEnsemble:
    """Complete M-RMS ensemble with training capabilities.

    Owns the three risk agents, their target networks, the coordinator, one
    optimizer per trainable network, a replay memory and per-loss metric
    histories.
    """

    def __init__(self, config: "MRMSConfig", device: torch.device):
        self.config = config
        self.device = device

        # Initialize agents
        self.position_sizer = PositionSizer(config).to(device)
        self.stop_loss_agent = StopLossAgent(config).to(device)
        self.take_profit_agent = TakeProfitAgent(config).to(device)
        self.coordinator = RiskCoordinator(config).to(device)

        # Target networks for stable training
        self.target_position = PositionSizer(config).to(device)
        self.target_stop = StopLossAgent(config).to(device)
        self.target_profit = TakeProfitAgent(config).to(device)

        # Initialize target networks as exact copies of the online agents
        self._update_targets(tau=1.0)

        # Optimizers (the coordinator learns at half the agents' rate)
        self.position_opt = optim.Adam(self.position_sizer.parameters(), lr=config.learning_rate)
        self.stop_opt = optim.Adam(self.stop_loss_agent.parameters(), lr=config.learning_rate)
        self.profit_opt = optim.Adam(self.take_profit_agent.parameters(), lr=config.learning_rate)
        self.coord_opt = optim.Adam(self.coordinator.parameters(), lr=config.learning_rate * 0.5)

        # Replay memory, used as a ring buffer once it reaches capacity:
        # _memory_pos is the slot holding the oldest transition.
        self.memory = []
        self._memory_pos = 0

        # Training metrics
        self.metrics = {
            'position_loss': [],
            'stop_loss': [],
            'profit_loss': [],
            'coord_loss': [],
            'risk_reward_ratio': [],
            'position_accuracy': []
        }

    def _online_modules(self):
        """Return the four trainable networks used by ``forward``."""
        return (self.position_sizer, self.stop_loss_agent,
                self.take_profit_agent, self.coordinator)

    def forward(self, state: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Forward pass through entire ensemble.

        Args:
            state: Batch of state vectors with the batch on the first axis.

        Returns:
            The coordinator's 'position_size', 'stop_loss' and 'take_profit'
            plus 'attention_weights' and each agent's metrics under the
            prefixes 'pos_', 'stop_' and 'profit_'.
        """
        # BatchNorm1d cannot compute batch statistics from one sample and
        # raises in training mode, so single-state inference (as done during
        # rollouts) temporarily runs the networks in eval mode.
        switched = []
        if state.size(0) == 1:
            switched = [m for m in self._online_modules() if m.training]
        for module in switched:
            module.eval()

        try:
            # Get individual agent outputs. keepdim=True keeps the per-sample
            # maxima as (batch, 1) so they broadcast against (batch, action_dim).
            position_output, pos_metrics = self.position_sizer(state)
            stop_output, stop_metrics = self.stop_loss_agent(
                state, position_output.max(dim=-1, keepdim=True)[0])
            profit_output, profit_metrics = self.take_profit_agent(
                state, stop_output.max(dim=-1, keepdim=True)[0])

            # Coordinate decisions
            final_params, attention = self.coordinator(position_output, stop_output, profit_output)
        finally:
            for module in switched:
                module.train()

        # Add metrics
        final_params['attention_weights'] = attention
        final_params.update({f'pos_{k}': v for k, v in pos_metrics.items()})
        final_params.update({f'stop_{k}': v for k, v in stop_metrics.items()})
        final_params.update({f'profit_{k}': v for k, v in profit_metrics.items()})

        return final_params

    def remember(self, state: torch.Tensor, action: Dict[str, torch.Tensor], 
                 reward: float, next_state: torch.Tensor, done: bool):
        """Store experience in memory, overwriting the oldest entry when full."""
        capacity = self.config.memory_size
        if capacity <= 0:
            return

        transition = (state, action, reward, next_state, done)
        if len(self.memory) < capacity:
            self.memory.append(transition)
        else:
            # O(1) overwrite instead of an O(n) list.pop(0) on every step.
            self.memory[self._memory_pos] = transition
            self._memory_pos = (self._memory_pos + 1) % capacity

    def train_step(self, batch_size: Optional[int] = None) -> Dict[str, float]:
        """Perform one training step.

        Samples a batch without replacement, trains the three agents and the
        coordinator, soft-updates the target networks and records the losses.

        Returns:
            The four losses keyed by name, or an empty dict when the memory
            holds fewer than ``config.min_memory_size`` transitions.
        """
        if not self.memory or len(self.memory) < self.config.min_memory_size:
            return {}

        # Sampling is without replacement, so never ask for more than is stored.
        batch_size = min(batch_size or self.config.batch_size, len(self.memory))

        # Sample batch
        indices = np.random.choice(len(self.memory), batch_size, replace=False)
        batch = [self.memory[i] for i in indices]

        # Prepare batch tensors
        states = torch.stack([b[0] for b in batch]).to(self.device)
        # Each stored action holds one value per transition (shape (1,) when
        # taken from a single-state forward pass). Flatten to (batch,) so the
        # Q-value and consistency losses do not broadcast to (batch, batch).
        actions = {
            k: torch.stack([b[1][k] for b in batch]).reshape(len(batch)).to(self.device)
            for k in ('position_size', 'stop_loss', 'take_profit')
        }
        rewards = torch.tensor([b[2] for b in batch], dtype=torch.float32).to(self.device)
        next_states = torch.stack([b[3] for b in batch]).to(self.device)
        dones = torch.tensor([b[4] for b in batch], dtype=torch.float32).to(self.device)

        # Train individual agents, then the coordinator
        losses = {
            'position_loss': self._train_position_sizer(
                states, actions['position_size'], rewards, next_states, dones),
            'stop_loss': self._train_stop_loss(
                states, actions['stop_loss'], rewards, next_states, dones),
            'profit_loss': self._train_take_profit(
                states, actions['take_profit'], rewards, next_states, dones),
            'coord_loss': self._train_coordinator(states, actions, rewards),
        }

        # Update target networks
        self._update_targets()

        # Update metrics
        for k, v in losses.items():
            self.metrics[k].append(v)

        return losses

    def _td_loss(self, online, target, states, actions, rewards, next_states, dones):
        """Return the MSE between the online estimate and the bootstrapped target.

        The online estimate is the agent output weighted by the taken action;
        the target is ``reward + gamma * max(target output) * (1 - done)``.
        """
        # One action value per sample as a (batch, 1) column, whether the
        # caller passed (batch,) or (batch, 1).
        action_col = actions.reshape(-1, 1)

        current_output, _ = online(states)
        current_q = (current_output * action_col).sum(dim=-1)

        with torch.no_grad():
            next_output, _ = target(next_states)
            next_q = next_output.max(dim=-1)[0]
            target_q = rewards + self.config.gamma * next_q * (1 - dones)

        return F.mse_loss(current_q, target_q)

    def _train_position_sizer(self, states, actions, rewards, next_states, dones):
        """Train position sizing agent and return the scalar loss."""
        self.position_opt.zero_grad()

        base_loss = self._td_loss(self.position_sizer, self.target_position,
                                  states, actions, rewards, next_states, dones)

        # Penalize large positions.
        # NOTE(review): this term depends only on the replayed actions, not on
        # the network output, so it shifts the reported loss without
        # contributing any gradient.
        risk_penalty = (actions - 0.05).clamp(min=0).mean() * 0.1

        loss = base_loss + risk_penalty
        loss.backward()
        self.position_opt.step()

        return loss.item()

    def _train_stop_loss(self, states, actions, rewards, next_states, dones):
        """Train stop loss agent and return the scalar loss."""
        self.stop_opt.zero_grad()

        base_loss = self._td_loss(self.stop_loss_agent, self.target_stop,
                                  states, actions, rewards, next_states, dones)

        # Reward tighter stops.
        # NOTE(review): like the position penalty, this depends only on the
        # replayed actions and therefore contributes no gradient.
        protection_bonus = -actions.mean() * 0.05

        loss = base_loss + protection_bonus
        loss.backward()
        self.stop_opt.step()

        return loss.item()

    def _train_take_profit(self, states, actions, rewards, next_states, dones):
        """Train take profit agent and return the scalar loss."""
        self.profit_opt.zero_grad()

        loss = self._td_loss(self.take_profit_agent, self.target_profit,
                             states, actions, rewards, next_states, dones)
        loss.backward()
        self.profit_opt.step()

        return loss.item()

    def _train_coordinator(self, states, actions, rewards):
        """Train ensemble coordinator and return the scalar loss."""
        self.coord_opt.zero_grad()

        # Get current predictions; the agents are treated as fixed inputs here
        with torch.no_grad():
            pos_out, _ = self.position_sizer(states)
            stop_out, _ = self.stop_loss_agent(states)
            profit_out, _ = self.take_profit_agent(states)

        # Coordinate
        final_params, _ = self.coordinator(pos_out, stop_out, profit_out)

        # Loss based on reward alignment
        predicted_rr = final_params['take_profit'] / (final_params['stop_loss'] + 1e-6)
        rr_loss = F.mse_loss(predicted_rr, rewards / 10)  # Scale rewards

        # Consistency loss; targets are flattened to match the (batch,) outputs
        consistency_loss = sum(
            F.mse_loss(final_params[k], actions[k].reshape(-1))
            for k in ('position_size', 'stop_loss', 'take_profit')
        ) * 0.1

        loss = rr_loss + consistency_loss
        loss.backward()
        self.coord_opt.step()

        return loss.item()

    def _update_targets(self, tau: Optional[float] = None):
        """Soft update target networks.

        Args:
            tau: Blend factor; ``None`` uses ``config.tau``. ``1.0`` copies the
                online weights and ``0.0`` leaves the targets unchanged.
        """
        # `tau or default` would silently replace an explicit 0.0.
        if tau is None:
            tau = self.config.tau

        pairs = (
            (self.target_position, self.position_sizer),
            (self.target_stop, self.stop_loss_agent),
            (self.target_profit, self.take_profit_agent),
        )
        for target_net, online_net in pairs:
            for target_param, param in zip(target_net.parameters(), online_net.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            # parameters() does not include buffers such as the BatchNorm
            # running statistics; copy them so the targets also match the
            # online networks in eval mode (e.g. after load_models).
            for target_buf, buf in zip(target_net.buffers(), online_net.buffers()):
                target_buf.data.copy_(buf.data)

    def save_models(self, path: str):
        """Save the four online networks and the config to ``path``.

        Target networks, optimizer state and the replay memory are not saved.
        """
        torch.save({
            'position_sizer': self.position_sizer.state_dict(),
            'stop_loss_agent': self.stop_loss_agent.state_dict(),
            'take_profit_agent': self.take_profit_agent.state_dict(),
            'coordinator': self.coordinator.state_dict(),
            'config': asdict(self.config)
        }, path)

    def load_models(self, path: str):
        """Load the four online networks from ``path`` and resync the targets."""
        checkpoint = torch.load(path, map_location=self.device)

        self.position_sizer.load_state_dict(checkpoint['position_sizer'])
        self.stop_loss_agent.load_state_dict(checkpoint['stop_loss_agent'])
        self.take_profit_agent.load_state_dict(checkpoint['take_profit_agent'])
        self.coordinator.load_state_dict(checkpoint['coordinator'])

        # Update targets
        self._update_targets(tau=1.0)

## 4. Training Environment & Data Loading

In [None]:
class RiskEnvironment:
    """Simulated single-step trading environment used to train the M-RMS agents.

    Prices come from a synthetic series generated at construction time and
    states are random placeholder vectors of length ``config.state_dim``.
    """

    def __init__(self, data_path: str, config: MRMSConfig):
        self.config = config
        self.data = self._load_data(data_path)
        self.current_idx = 0
        self.episode_return = 0
        self.episode_length = 0

    def _load_data(self, path: str) -> pd.DataFrame:
        """Build the training frame.

        ``path`` is currently unused: real market data is not loaded yet and a
        synthetic random-walk series is generated instead.
        """
        n_samples = 100000

        # Raw synthetic columns, drawn in this fixed order.
        timestamps = pd.date_range('2020-01-01', periods=n_samples, freq='5min')
        prices = 100 * (1 + np.random.randn(n_samples).cumsum() * 0.001)
        volumes = np.random.lognormal(10, 1, n_samples)
        volatilities = np.abs(np.random.randn(n_samples) * 0.01)
        momenta = np.random.randn(n_samples) * 0.1

        frame = pd.DataFrame({
            'timestamp': timestamps,
            'price': prices,
            'volume': volumes,
            'volatility': volatilities,
            'momentum': momenta,
        })

        # Derived features; the rolling windows leave NaNs in the leading rows.
        frame['returns'] = frame['price'].pct_change()
        frame['log_volume'] = np.log1p(frame['volume'])
        frame['price_ma'] = frame['price'].rolling(20).mean()
        frame['vol_ma'] = frame['volatility'].rolling(10).mean()

        return frame.dropna()

    def reset(self) -> torch.Tensor:
        """Start a new episode at a random row and return the first state."""
        # Stay at least 1000 rows away from either end of the data.
        self.current_idx = np.random.randint(1000, len(self.data) - 1000)
        self.episode_return = 0
        self.episode_length = 0

        return self._get_state()

    def _get_state(self) -> torch.Tensor:
        """Return the current state vector.

        Placeholder: a random vector stands in for the Main Core embeddings.
        """
        return torch.randn(self.config.state_dim)

    def step(self, action: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, float, bool, Dict]:
        """Advance one row and score the given risk parameters.

        Args:
            action: Single-element tensors under 'position_size', 'stop_loss'
                and 'take_profit'.

        Returns:
            ``(next_state, reward, done, info)`` where ``info`` reports the
            realized return, the action values, the risk-reward ratio and the
            volatility-scaled return.
        """
        self.current_idx += 1
        self.episode_length += 1

        # Relative price move from the current row to the next one.
        row = self.data.iloc[self.current_idx]
        current_price = row['price']
        next_price = self.data.iloc[self.current_idx + 1]['price']
        price_change = (next_price - current_price) / current_price

        position_size = action['position_size'].item()
        stop_loss = action['stop_loss'].item()
        take_profit = action['take_profit'].item()

        # Cap the move at the stop-loss / take-profit levels before sizing it.
        if price_change <= -stop_loss:
            realized_move = -stop_loss
        elif price_change >= take_profit:
            realized_move = take_profit
        else:
            realized_move = price_change
        actual_return = realized_move * position_size

        # Volatility-scaled return minus a 2 bps cost on the position size.
        volatility = row['volatility']
        sharpe_component = actual_return / (volatility + 1e-6)
        trading_cost = position_size * 0.0002
        reward = sharpe_component - trading_cost

        # Small bonus when the take-profit is at least twice the stop-loss.
        rr_ratio = take_profit / (stop_loss + 1e-6)
        if rr_ratio >= 2.0:
            reward += 0.01

        self.episode_return += actual_return

        # The episode ends on a length cap, near the end of the data, or once
        # the cumulative return drops to -10%.
        hit_length_cap = self.episode_length >= 1000
        near_data_end = self.current_idx >= len(self.data) - 100
        drawdown_breached = self.episode_return <= -0.1
        done = hit_length_cap or near_data_end or drawdown_breached

        next_state = self._get_state()

        info = {
            'return': actual_return,
            'position_size': position_size,
            'stop_loss': stop_loss,
            'take_profit': take_profit,
            'rr_ratio': rr_ratio,
            'sharpe': sharpe_component,
        }

        return next_state, reward, done, info

## 5. Training Loop with Monitoring

In [None]:
def train_mrms_ensemble(config: MRMSConfig, 
                       n_episodes: int = 10000,
                       save_interval: int = 100,
                       eval_interval: int = 50):
    """Main training loop for M-RMS ensemble.

    Args:
        config: Hyperparameters for the agents, replay memory and environment.
        n_episodes: Episode index at which training stops.
        save_interval: In Colab, checkpointing and the session-time check are
            considered every this many episodes.
        eval_interval: An evaluation is run and printed every this many episodes.

    Returns:
        ``(ensemble, episode_rewards, episode_returns)``: the trained ensemble
        and the per-episode reward and cumulative-return histories.
    """
    from collections import deque

    # Initialize components
    ensemble = MRMSEnsemble(config, device)
    env = RiskEnvironment('data/market_data.h5', config)

    # Training tracking
    episode_rewards = []
    episode_returns = []
    # Only the most recent 100 loss records are ever used (in checkpoints), so
    # keep just those instead of one dict per training step for the whole run.
    training_losses = deque(maxlen=100)

    # Resume from checkpoint if available
    start_episode = 0
    if IN_COLAB:
        resume_info = checkpoint_manager.get_resume_info()
        if resume_info['available']:
            print(f"📂 Resuming from episode {resume_info['episode']}")
            checkpoint = checkpoint_manager.load_latest()
            ensemble.load_models(checkpoint['state']['model_path'])
            start_episode = resume_info['episode']

    # Training loop
    pbar = tqdm(range(start_episode, n_episodes), desc="Training M-RMS")

    for episode in pbar:
        # Reset environment
        state = env.reset()
        episode_reward = 0
        episode_info = []

        done = False
        while not done:
            # Get action from ensemble
            with torch.no_grad():
                action = ensemble.forward(state.unsqueeze(0).to(device))

            # Keep only the keys the environment consumes
            env_action = {
                'position_size': action['position_size'],
                'stop_loss': action['stop_loss'],
                'take_profit': action['take_profit']
            }

            # Step environment
            next_state, reward, done, info = env.step(env_action)

            # Store experience
            ensemble.remember(state, env_action, reward, next_state, done)

            # Update state
            state = next_state
            episode_reward += reward
            episode_info.append(info)

            # Train if enough experience
            if len(ensemble.memory) >= config.min_memory_size:
                losses = ensemble.train_step()
                if losses:
                    training_losses.append(losses)

        # Snapshot the training episode's return now: the evaluation below
        # reuses `env`, and its reset() overwrites env.episode_return.
        episode_return = env.episode_return

        # Record episode metrics
        episode_rewards.append(episode_reward)
        episode_returns.append(episode_return)

        # Calculate episode statistics
        avg_position = np.mean([info['position_size'] for info in episode_info])
        avg_rr_ratio = np.mean([info['rr_ratio'] for info in episode_info])

        # Update progress bar
        pbar.set_postfix({
            'reward': f"{episode_reward:.4f}",
            'return': f"{episode_return:.4f}",
            'position': f"{avg_position:.3f}",
            'RR': f"{avg_rr_ratio:.2f}"
        })

        # Evaluation
        if episode % eval_interval == 0:
            eval_metrics = evaluate_ensemble(ensemble, env, n_episodes=10)
            print(f"\n📊 Episode {episode} Evaluation:")
            print(f"   Avg Return: {eval_metrics['avg_return']:.4f}")
            print(f"   Sharpe Ratio: {eval_metrics['sharpe_ratio']:.3f}")
            print(f"   Max Drawdown: {eval_metrics['max_drawdown']:.4f}")
            print(f"   Win Rate: {eval_metrics['win_rate']:.3f}")

        # Save checkpoint
        if IN_COLAB and episode % save_interval == 0:
            # Check session time
            if session_monitor.is_ending_soon():
                print("\n⚠️ Session ending soon! Saving final checkpoint...")
                save_checkpoint = True
            else:
                save_checkpoint = checkpoint_manager.should_save(episode)

            if save_checkpoint:
                # Save models
                # NOTE(review): /tmp does not outlive the Colab session, yet
                # resuming loads from this recorded path — confirm that
                # checkpoint_manager.save copies the file to Drive.
                model_path = f"/tmp/mrms_models_ep{episode}.pt"
                ensemble.save_models(model_path)

                # Create checkpoint
                checkpoint_state = {
                    'episode': episode,
                    'model_path': model_path,
                    'metrics': {
                        'episode_reward': episode_reward,
                        'episode_return': episode_return,
                        'avg_position': avg_position,
                        'avg_rr_ratio': avg_rr_ratio
                    },
                    'training_losses': list(training_losses)  # Last 100
                }

                # Determine if best
                is_best = episode_reward > checkpoint_manager.best_metric
                if is_best:
                    checkpoint_manager.best_metric = episode_reward

                # Save checkpoint
                checkpoint_manager.save(
                    state=checkpoint_state,
                    metrics=checkpoint_state['metrics'],
                    is_best=is_best,
                    tag='mrms_training'
                )

                print(f"\n💾 Checkpoint saved (episode {episode}, best={is_best})")

            # Check if should stop
            if session_monitor.is_ending_soon(buffer_minutes=10):
                print("\n🛑 Stopping training - session ending in 10 minutes")
                break

    return ensemble, episode_rewards, episode_returns

In [None]:
def evaluate_ensemble(ensemble: MRMSEnsemble, env: RiskEnvironment,
                     n_episodes: int = 10) -> Dict[str, float]:
    """Roll out the ensemble without gradients and summarise its performance.

    The four sub-agents are switched to eval mode for the rollouts and are
    always put back into training mode afterwards, even if a rollout raises.

    Args:
        ensemble: Ensemble exposing ``position_sizer``, ``stop_loss_agent``,
            ``take_profit_agent``, ``coordinator`` and ``forward``.
        env: Environment providing ``reset()``, ``step(action)`` and the
            ``episode_return`` / ``episode_length`` attributes.
        n_episodes: Number of evaluation episodes to run.

    Returns:
        Dict with ``avg_return``, ``std_return``, ``sharpe_ratio``,
        ``max_drawdown``, ``win_rate`` and ``avg_episode_length``. When no
        step was executed (e.g. ``n_episodes <= 0``) every metric is ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    agents = (
        ensemble.position_sizer,
        ensemble.stop_loss_agent,
        ensemble.take_profit_agent,
        ensemble.coordinator,
    )
    for agent in agents:
        agent.eval()

    episode_returns = []
    episode_lengths = []
    all_returns = []
    win_count = 0

    try:
        with torch.no_grad():
            for _ in range(n_episodes):
                state = env.reset()
                done = False

                while not done:
                    # `device` is the notebook-level torch device.
                    action = ensemble.forward(state.unsqueeze(0).to(device))

                    env_action = {
                        'position_size': action['position_size'],
                        'stop_loss': action['stop_loss'],
                        'take_profit': action['take_profit']
                    }

                    # The step reward is not used for evaluation; only the
                    # return reported in `info` is.
                    state, _, done, info = env.step(env_action)

                    step_return = info['return']
                    all_returns.append(step_return)
                    if step_return > 0:
                        win_count += 1

                episode_returns.append(env.episode_return)
                episode_lengths.append(env.episode_length)
    finally:
        # Set back to training mode, also when a rollout failed.
        for agent in agents:
            agent.train()

    if not all_returns:
        # Nothing was evaluated: report neutral metrics rather than dividing
        # by zero or taking the mean of an empty array.
        return {
            'avg_return': 0.0,
            'std_return': 0.0,
            'sharpe_ratio': 0.0,
            'max_drawdown': 0.0,
            'win_rate': 0.0,
            'avg_episode_length': 0.0
        }

    # Calculate metrics
    returns_array = np.array(all_returns)

    return {
        'avg_return': float(np.mean(episode_returns)),
        'std_return': float(np.std(episode_returns)),
        # Scaled by sqrt(252); assumes one step is one trading day - TODO confirm.
        'sharpe_ratio': float(
            np.mean(returns_array) / (np.std(returns_array) + 1e-6) * np.sqrt(252)
        ),
        'max_drawdown': float(calculate_max_drawdown(np.cumsum(returns_array))),
        'win_rate': win_count / len(all_returns),
        'avg_episode_length': float(np.mean(episode_lengths))
    }

def calculate_max_drawdown(cumulative_returns: np.ndarray) -> float:
    """Calculate maximum drawdown from additive cumulative returns.

    The series is converted to an equity curve with a starting value of 1.0
    (``equity = 1 + cumulative_returns``) and the drawdown is measured
    relative to the running equity peak, which includes the starting value.
    Dividing by the running maximum of the raw cumulative sum instead is
    ill-defined: that peak starts near zero and can be negative, which blows
    up or flips the sign of the ratio.

    Args:
        cumulative_returns: Running sum of per-step fractional returns.

    Returns:
        The most negative peak-to-trough drawdown as a fraction (``<= 0``);
        ``0.0`` for an empty series or one that never falls below its peak.
    """
    curve = np.asarray(cumulative_returns, dtype=float)
    if curve.size == 0:
        return 0.0

    equity = 1.0 + curve
    # The peak is at least the initial equity of 1.0, so the denominator is
    # always positive and no epsilon is needed.
    running_peak = np.maximum(np.maximum.accumulate(equity), 1.0)
    drawdown = (equity - running_peak) / running_peak
    return float(np.min(drawdown))

## 6. Main Training Execution

In [None]:
# Initialize configuration
# Hyperparameters for the whole ensemble. How each value is consumed is
# defined by MRMSConfig and the agents elsewhere in the notebook; the notes
# below only state what the later cells of this notebook show.
config = MRMSConfig(
    state_dim=256,  # width of the state/embedding vectors fed to the ensemble
    hidden_dim=128,
    action_dim=10,  # width of the example inputs used to trace the coordinator
    learning_rate=1e-4,
    batch_size=256,
    gamma=0.99,  # presumably the RL discount factor - TODO confirm
    tau=0.005,  # presumably the soft target-update rate - TODO confirm
    max_position_size=0.1,  # upper bound asserted on position_size in the integration test
    max_stop_loss=0.02,  # upper bound asserted on stop_loss in the integration test
    max_take_profit=0.05  # upper bound asserted on take_profit in the integration test
)

print("🚀 Starting M-RMS Ensemble Training")
# asdict() turns the config dataclass into a plain dict so it can be dumped as JSON.
print(f"Configuration: {json.dumps(asdict(config), indent=2)}")

In [None]:
# Train ensemble
# Returns the trained ensemble together with the per-episode reward and
# return histories that the visualization cell plots.
ensemble, episode_rewards, episode_returns = train_mrms_ensemble(
    config=config,
    n_episodes=10000,
    save_interval=100,  # checkpointing is considered every 100 episodes (Colab only)
    eval_interval=50  # presumably the evaluation cadence in episodes - TODO confirm
)

## 7. Results Visualization & Analysis

In [None]:
# Training-progress dashboard: a 2x2 grid of curves
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: reward collected in each episode
axes[0, 0].plot(episode_rewards)
axes[0, 0].set(title='Episode Rewards', xlabel='Episode', ylabel='Reward')

# Top-right: return achieved in each episode
axes[0, 1].plot(episode_returns)
axes[0, 1].set(title='Episode Returns', xlabel='Episode', ylabel='Return')

# Bottom-left: one loss curve per agent, drawn only once position losses exist
if ensemble.metrics['position_loss']:
    for _metric_key, _curve_label in (
        ('position_loss', 'Position'),
        ('stop_loss', 'Stop Loss'),
        ('profit_loss', 'Take Profit'),
        ('coord_loss', 'Coordinator'),
    ):
        axes[1, 0].plot(ensemble.metrics[_metric_key], label=_curve_label)
    axes[1, 0].set(title='Training Losses', xlabel='Training Step', ylabel='Loss')
    axes[1, 0].legend()
    axes[1, 0].set_yscale('log')

# Bottom-right: risk-reward ratio with the 2.0 target drawn as a dashed line
if ensemble.metrics['risk_reward_ratio']:
    axes[1, 1].plot(ensemble.metrics['risk_reward_ratio'])
    axes[1, 1].set(title='Risk-Reward Ratio', xlabel='Training Step', ylabel='RR Ratio')
    axes[1, 1].axhline(y=2.0, color='r', linestyle='--', label='Target RR')
    axes[1, 1].legend()

plt.tight_layout()
plt.show()

In [None]:
# Final evaluation
print("\n📊 Final Evaluation (100 episodes):")
# A new RiskEnvironment is built for the evaluation run.
# NOTE(review): 'data/market_data.h5' is a relative path, so it resolves
# against the current working directory rather than PROJECT_PATH - confirm
# this matches where the training data lives.
final_metrics = evaluate_ensemble(ensemble, RiskEnvironment('data/market_data.h5', config), n_episodes=100)

# Print every metric to four decimal places.
for metric, value in final_metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Persist the trained ensemble and its results (Colab sessions only)
if IN_COLAB:
    print("\n💾 Saving final models to Drive...")

    # Local snapshot of the whole ensemble in the session's temp directory
    final_model_path = "/tmp/mrms_ensemble_final.pt"
    ensemble.save_models(final_model_path)

    # Collect the four agents under their attribute names for the Drive upload
    models_dict = {
        agent_name: getattr(ensemble, agent_name)
        for agent_name in (
            'position_sizer',
            'stop_loss_agent',
            'take_profit_agent',
            'coordinator',
        )
    }

    # Hand models, config and final metrics to the drive manager
    drive_manager.save_model(
        models=models_dict,
        name="mrms_ensemble",
        configs=asdict(config),
        metrics=final_metrics,
        production=True,
    )

    # Write the markdown training summary next to the other results
    summary = colab_setup.create_training_summary(
        metrics=final_metrics,
        save_path=str(drive_manager.results_path / "mrms_training_summary.md"),
    )

    print("✅ All models and results saved to Drive!")

## 8. Integration Testing with Main Core

In [None]:
def test_integration_with_main_core():
    """Smoke-test the ensemble on mock Main MARL Core embeddings.

    Feeds a batch of random embeddings of width ``config.state_dim`` through
    ``ensemble.forward`` and checks that the three risk outputs have one
    value per sample and stay within ``[0, config.max_*]``.

    The sub-agents are switched to eval mode for the forward pass and put
    back into whatever mode they were in afterwards, so the check does not
    depend on the mode a previous cell left them in.

    Raises:
        AssertionError: If an output has the wrong shape or leaves its range.
    """

    print("\n🔧 Testing M-RMS Integration...")

    # Simulate Main Core embeddings
    batch_size = 32
    mock_embeddings = torch.randn(batch_size, config.state_dim).to(device)

    agents = (
        ensemble.position_sizer,
        ensemble.stop_loss_agent,
        ensemble.take_profit_agent,
        ensemble.coordinator,
    )
    previous_modes = [agent.training for agent in agents]
    for agent in agents:
        agent.eval()

    # Test forward pass
    try:
        with torch.no_grad():
            risk_params = ensemble.forward(mock_embeddings)
    finally:
        # Restore each agent's original train/eval mode.
        for agent, was_training in zip(agents, previous_modes):
            agent.train(was_training)

    # Verify outputs: one scalar per sample and within the configured bounds
    limits = {
        'position_size': config.max_position_size,
        'stop_loss': config.max_stop_loss,
        'take_profit': config.max_take_profit,
    }
    for key, upper in limits.items():
        values = risk_params[key]
        assert values.shape == (batch_size,), (
            f"{key} has shape {tuple(values.shape)}, expected ({batch_size},)"
        )
        assert torch.all(values >= 0), f"{key} contains negative values"
        assert torch.all(values <= upper), f"{key} exceeds its maximum of {upper}"

    # Check risk-reward ratios (epsilon guards against a zero stop loss)
    rr_ratios = risk_params['take_profit'] / (risk_params['stop_loss'] + 1e-6)
    avg_rr = rr_ratios.mean().item()

    print("✅ Integration test passed!")
    print(f"   Avg Position Size: {risk_params['position_size'].mean():.4f}")
    print(f"   Avg Stop Loss: {risk_params['stop_loss'].mean():.4f}")
    print(f"   Avg Take Profit: {risk_params['take_profit'].mean():.4f}")
    print(f"   Avg RR Ratio: {avg_rr:.2f}")

# Run integration test
test_integration_with_main_core()

## 9. Export for Production

In [None]:
# Export models for production deployment
if IN_COLAB:
    print("\n📦 Exporting models for production...")

    # Put every agent into eval mode before creating TorchScript versions
    ensemble.position_sizer.eval()
    ensemble.stop_loss_agent.eval()
    ensemble.take_profit_agent.eval()
    ensemble.coordinator.eval()

    # Example inputs for tracing
    example_state = torch.randn(1, config.state_dim).to(device)
    example_position = torch.randn(1, config.action_dim).to(device)
    example_stop = torch.randn(1, config.action_dim).to(device)
    example_profit = torch.randn(1, config.action_dim).to(device)

    # Trace models. torch.jit.trace records the operations executed for these
    # example inputs only.
    # NOTE(review): the scalar second arguments (0.05 and 0.01) look like an
    # example position size and stop loss - confirm against the agents'
    # forward signatures.
    scripted_models = {
        'position_sizer': torch.jit.trace(ensemble.position_sizer, example_state),
        'stop_loss_agent': torch.jit.trace(ensemble.stop_loss_agent, (example_state, torch.tensor([0.05]).to(device))),
        'take_profit_agent': torch.jit.trace(ensemble.take_profit_agent, (example_state, torch.tensor([0.01]).to(device))),
        'coordinator': torch.jit.trace(ensemble.coordinator, (example_position, example_stop, example_profit))
    }

    # Save scripted models
    production_dir = drive_manager.model_path / "production" / "mrms_ensemble"
    # parents=True: the intermediate "production" directory may not exist yet,
    # in which case a plain mkdir() raises FileNotFoundError.
    production_dir.mkdir(parents=True, exist_ok=True)

    for name, model in scripted_models.items():
        model.save(str(production_dir / f"{name}_scripted.pt"))

    # Create deployment package
    deployment_package = drive_manager.create_training_package("mrms_deployment")

    print(f"✅ Production models exported to: {production_dir}")
    print(f"📦 Deployment package created: {deployment_package}")

## Summary

This notebook successfully trained the M-RMS (Multi-Agent Risk Management System) ensemble consisting of:

1. **PositionSizer**: Determines optimal position sizes based on market conditions
2. **StopLossAgent**: Sets dynamic stop-loss levels for risk protection
3. **TakeProfitAgent**: Optimizes take-profit targets based on momentum
4. **RiskCoordinator**: Coordinates all agents for unified risk decisions

The ensemble is now ready for integration with the Main MARL Core system and production deployment.