In [None]:
!pip install torch torchvision

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: C:\Python314\python.exe -m pip install --upgrade pip


In [None]:
import torch


ModuleNotFoundError: No module named 'torch'

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
import random

class _OUNoise:
    """Ornstein-Uhlenbeck process used for temporally correlated exploration.

    Each call to ``sample`` advances the internal state by one step:
    ``x <- x + theta * (mu - x) + sigma * N(0, 1)``.
    """

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.full(size, mu, dtype=np.float64)

    def reset(self):
        """Return the process to its mean (call at episode boundaries)."""
        self.state = np.full(self.size, self.mu, dtype=np.float64)

    def sample(self):
        """Advance the process one step and return a copy of the new state."""
        drift = self.theta * (self.mu - self.state)
        diffusion = self.sigma * np.random.standard_normal(self.size)
        self.state = self.state + drift + diffusion
        return self.state.copy()


class ComradeRLAgent:
    """TD3-style agent: tier-conditioned actor, twin critics, delayed updates.

    A separate tier policy picks a discrete tier; the actor then produces a
    continuous action in [-1, 1] conditioned on that tier.

    Depends on ``Actor``, ``Critic``, ``TierPolicyNetwork``,
    ``PrioritizedReplayBuffer`` and ``TIER_CONFIGS`` being defined elsewhere.
    """

    def __init__(self, state_dim=10, action_dim=5):
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Actor network (policy): State -> Action
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)

        # Twin Critic networks (value): State, Action -> Q-value
        self.critic_1 = Critic(state_dim, action_dim)
        self.critic_2 = Critic(state_dim, action_dim)
        self.critic_1_target = Critic(state_dim, action_dim)
        self.critic_2_target = Critic(state_dim, action_dim)

        # Targets must start as exact copies of the online networks; otherwise
        # the first TD targets come from unrelated random weights.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())

        # Hierarchical tier policy for discrete decisions
        self.tier_policy = TierPolicyNetwork(state_dim)

        # Replay buffer with prioritized experience replay
        self.replay_buffer = PrioritizedReplayBuffer(capacity=1000000)

        # Optimizers used by train(); one optimizer drives both critics.
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = torch.optim.Adam(
            list(self.critic_1.parameters()) + list(self.critic_2.parameters()),
            lr=3e-4,
        )

        # Temporally correlated exploration noise used by select_action().
        self.ou_noise = _OUNoise(action_dim)

        # Hyperparameters tuned for pricing dynamics
        self.gamma = 0.99          # Discount factor
        self.tau = 0.005           # Soft update rate
        self.policy_noise = 0.2    # Target policy smoothing
        self.noise_clip = 0.5      # Noise clipping
        self.policy_freq = 2       # Delayed policy updates
        self.batch_size = 256
        self.exploration_noise = 0.3

        self.total_it = 0

    @staticmethod
    def _tier_index(tier):
        """Convert a tier given as int, index tensor or one-hot tensor to an int."""
        if torch.is_tensor(tier):
            flat = tier.reshape(-1)
            if tier.is_floating_point():
                # One-hot / relaxed one-hot vector: the tier is the arg-max slot.
                return int(flat.argmax().item())
            return int(flat[0].item())
        return int(tier)

    @staticmethod
    def _weighted_critic_loss(current_q1, current_q2, target_q, weights):
        """Importance-weighted MSE summed over both critics (scalar tensor)."""
        td_error_1 = F.mse_loss(current_q1, target_q, reduction='none')
        td_error_2 = F.mse_loss(current_q2, target_q, reduction='none')
        return (weights * td_error_1).mean() + (weights * td_error_2).mean()

    def select_action(self, state, evaluate=False):
        """
        Hierarchical action selection:
        1. Tier decision (discrete) - which tier to optimize for
        2. Continuous action (price adjustment, notification, etc.)

        Args:
            state: Sequence/array of state features for a single step.
            evaluate: If True, act greedily with no exploration noise.

        Returns:
            Tuple ``(action_denorm, tier)``: the denormalized action dict from
            ``denormalize_action`` and the chosen tier as an index tensor of
            shape ``(1,)``.
        """
        state_tensor = torch.FloatTensor(state).unsqueeze(0)

        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            tier_logits = self.tier_policy(state_tensor)
            if evaluate:
                tier = torch.argmax(tier_logits, dim=-1)
            else:
                # Gumbel-Softmax yields a one-hot float sample; reduce it to a
                # class index so both modes hand the same kind of tier onward.
                tier = F.gumbel_softmax(tier_logits, hard=True).argmax(dim=-1)

            # Continuous action from actor
            action = self.actor(state_tensor, tier).cpu().numpy().flatten()

        if not evaluate:
            # Add exploration noise (Ornstein-Uhlenbeck for temporal correlation)
            noise = self.ou_noise.sample()
            action = action + noise * self.exploration_noise
            action = np.clip(action, -1, 1)  # Action space normalized

        # Denormalize actions
        action_denorm = self.denormalize_action(action, tier)

        return action_denorm, tier

    def denormalize_action(self, action, tier):
        """Convert normalized [-1, 1] actions to actual values.

        Args:
            action: Array-like with at least 5 entries, each in [-1, 1].
            tier: Tier as int, index tensor, or one-hot tensor.

        Returns:
            Dict with keys ``delta_price``, ``delta_group_target``,
            ``notification_push``, ``promo_duration``, ``match_aggressiveness``.
        """
        # Assumes TIER_CONFIGS is indexable by integer tier index - TODO confirm.
        tier_config = TIER_CONFIGS[self._tier_index(tier)]

        def _to_int(value):
            # Round to nearest instead of truncating toward zero, so the
            # extreme discrete values are not limited to action == +/-1.
            return int(round(float(value)))

        return {
            'delta_price': np.interp(action[0], [-1, 1], [-20, 20]),  # ±$20
            'delta_group_target': _to_int(np.interp(action[1], [-1, 1], [-2, 2])),  # ±2 members
            'notification_push': _to_int(np.interp(action[2], [-1, 1], [0, tier_config['max_notifications']])),
            'promo_duration': _to_int(np.interp(action[3], [-1, 1], [0, 72])),  # 0-72 hours
            'match_aggressiveness': np.interp(action[4], [-1, 1], [0, 1])  # 0-1 matching score
        }

    def soft_update(self):
        """Polyak-average the online weights into the target networks."""
        pairs = (
            (self.actor, self.actor_target),
            (self.critic_1, self.critic_1_target),
            (self.critic_2, self.critic_2_target),
        )
        with torch.no_grad():
            for online, target in pairs:
                for param, target_param in zip(online.parameters(), target.parameters()):
                    target_param.copy_(self.tau * param + (1 - self.tau) * target_param)

    def train(self):
        """TD3 training with twin critics and delayed policy updates.

        Returns:
            ``{}`` while the buffer holds fewer than ``batch_size`` samples;
            otherwise a dict with ``critic_loss``, ``actor_loss`` (None on
            steps that skip the delayed actor update) and ``q_value``.
        """
        if len(self.replay_buffer) < self.batch_size:
            return {}

        self.total_it += 1

        # Sample with priorities
        # Assumes the buffer returns tensors for state/action/next_state - TODO confirm.
        batch, indices, weights = self.replay_buffer.sample(self.batch_size)
        state, action, reward, next_state, done = batch

        # Force per-sample scalars into (batch, 1) columns so they line up with
        # the critics' (batch, 1) output instead of broadcasting to (batch, batch).
        reward = torch.as_tensor(reward, dtype=torch.float32).reshape(-1, 1)
        done = torch.as_tensor(done, dtype=torch.float32).reshape(-1, 1)
        weights = torch.as_tensor(weights, dtype=torch.float32).reshape(-1, 1)

        # The TD target is a constant for the critic update: build it without
        # autograd so no gradient flows into the target networks.
        with torch.no_grad():
            # NOTE(review): the batch carries no tier, so it is recomputed
            # greedily from the tier policy; store it in the buffer if the
            # acting-time tier must be reproduced exactly.
            tier = self.tier_policy(state).argmax(dim=-1)
            next_tier = self.tier_policy(next_state).argmax(dim=-1)

            # Add target policy smoothing
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip
            )
            next_action = (self.actor_target(next_state, next_tier) + noise).clamp(-1, 1)

            # Compute target Q-value (minimum of twin critics)
            target_q1 = self.critic_1_target(next_state, next_action)
            target_q2 = self.critic_2_target(next_state, next_action)
            target_q = torch.min(target_q1, target_q2)
            target_q = reward + (1 - done) * self.gamma * target_q

        # Update critics
        current_q1 = self.critic_1(state, action)
        current_q2 = self.critic_2(state, action)

        critic_loss = self._weighted_critic_loss(current_q1, current_q2, target_q, weights)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_1.parameters(), max_norm=1.0)
        torch.nn.utils.clip_grad_norm_(self.critic_2.parameters(), max_norm=1.0)
        self.critic_optimizer.step()

        # Delayed policy updates
        actor_loss = None
        if self.total_it % self.policy_freq == 0:
            # Actor loss: maximize Q-value
            actor_action = self.actor(state, tier)
            actor_loss = -self.critic_1(state, actor_action).mean()

            # No entropy term: the policy is deterministic with outputs in
            # [-1, 1], so log(action) is undefined for negative components and
            # produced NaN losses. Exploration comes from the action noise.

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Soft update target networks
            self.soft_update()

        # Update priorities in replay buffer (one scalar per sampled transition)
        priorities = torch.abs(current_q1 - target_q).detach().cpu().numpy().reshape(-1) + 1e-6
        self.replay_buffer.update_priorities(indices, priorities)

        return {
            'critic_loss': critic_loss.item(),
            # `is not None`: a loss of exactly 0.0 is falsy but still valid.
            'actor_loss': actor_loss.item() if actor_loss is not None else None,
            'q_value': current_q1.mean().item()
        }


class Actor(nn.Module):
    """Policy network with tier-conditioned output.

    A shared trunk consumes the state concatenated with a tier one-hot vector
    and feeds five single-output heads (price, group target, notifications,
    promotion, matching). Every head ends in ``Tanh``, so each action
    component lies in [-1, 1]. This is the range that
    ``ComradeRLAgent.denormalize_action`` interpolates from and that the TD3
    noise clipping assumes.

    Args:
        state_dim: Number of state features.
        action_dim: Accepted for interface symmetry with ``Critic``; the
            network always emits the five heads listed above.
        hidden_dim: Width of the shared trunk.
        num_tiers: Number of discrete tiers (size of the one-hot input).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256, num_tiers=4):
        super().__init__()

        self.num_tiers = num_tiers

        self.shared = nn.Sequential(
            nn.Linear(state_dim + num_tiers, hidden_dim),  # +num_tiers for tier one-hot
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU()
        )

        # Heads are created in the original order so parameter names (and
        # therefore existing checkpoints) stay compatible.
        self.price_head = self._make_head(hidden_dim)    # price adjustment
        self.group_head = self._make_head(hidden_dim)    # group target
        self.notify_head = self._make_head(hidden_dim)   # notification volume
        self.promo_head = self._make_head(hidden_dim)    # promotion duration
        self.match_head = self._make_head(hidden_dim)    # matching aggressiveness

    @staticmethod
    def _make_head(hidden_dim):
        """Build one scalar output head squashed to [-1, 1]."""
        return nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Tanh()  # [-1, 1] normalized
        )

    def forward(self, state, tier):
        """Compute the normalized action for ``state`` under ``tier``.

        Args:
            state: Float tensor whose last dimension is ``state_dim``.
            tier: Either an integer index tensor (one index per state row) or
                a floating-point one-hot / relaxed one-hot tensor whose last
                dimension is ``num_tiers`` (e.g. a Gumbel-Softmax sample).

        Returns:
            Tensor with last dimension 5, each component in [-1, 1].
        """
        if tier.is_floating_point():
            # Already a (relaxed) one-hot encoding; F.one_hot would reject it.
            tier_onehot = tier.float()
        else:
            tier_onehot = F.one_hot(tier, num_classes=self.num_tiers).float()

        x = torch.cat([state, tier_onehot], dim=-1)
        x = self.shared(x)

        return torch.cat([
            self.price_head(x),
            self.group_head(x),
            self.notify_head(x),
            self.promo_head(x),
            self.match_head(x)
        ], dim=-1)


class Critic(nn.Module):
    """Single Q-network mapping a (state, action) pair to a scalar value.

    This class holds one network (``q1``); callers that want twin critics
    instantiate it twice.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()

        input_dim = state_dim + action_dim

        # Two hidden blocks of Linear -> LayerNorm -> ReLU, then a scalar output.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.q1 = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return the Q-value for ``state`` and ``action`` joined on the last axis."""
        state_action = torch.cat((state, action), dim=-1)
        return self.q1(state_action)

ModuleNotFoundError: No module named 'torch'

In [None]:
class CurriculumTrainer:
    """
    Train RL agent through progressively complex scenarios.

    Four stages are configured in ``curriculum_stages``:
    1. Static Free Tier   - single tier, low volatility (convergence)
    2. Dynamic Free Tier  - single tier, shocks enabled (adaptation)
    3. Tier Progression   - three tiers with competitors (progression)
    4. Full Market        - all tiers, seasonality, five suppliers (robustness)

    Each stage runs for at most ``duration`` episodes and ends early once
    ``check_stage_completion`` reports success.
    """

    def __init__(self):
        self.curriculum_stages = [
            {
                'name': 'Static Free Tier',
                'duration': 1000,
                'config': {
                    'demand_volatility': 0.1,
                    'supplier_count': 1,
                    'tier_options': ['Free'],
                    'market_shocks': False,
                    'competitor_presence': False
                },
                'success_threshold': {'avg_reward': 50, 'stability': 0.9}
            },
            {
                'name': 'Dynamic Free Tier',
                'duration': 2000,
                'config': {
                    'demand_volatility': 0.3,
                    'supplier_count': 2,
                    'tier_options': ['Free'],
                    'market_shocks': True,
                    'competitor_presence': False
                },
                'success_threshold': {'avg_reward': 80, 'stability': 0.85}
            },
            {
                'name': 'Tier Progression',
                'duration': 3000,
                'config': {
                    'demand_volatility': 0.4,
                    'supplier_count': 3,
                    'tier_options': ['Free', 'Standard', 'Premium'],
                    'market_shocks': True,
                    'competitor_presence': True
                },
                'success_threshold': {'avg_reward': 120, 'tier_upgrade_rate': 0.3}
            },
            {
                'name': 'Full Market',
                'duration': 5000,
                'config': {
                    'demand_volatility': 0.5,
                    'supplier_count': 5,
                    'tier_options': ['Free', 'Standard', 'Premium', 'Gold'],
                    'market_shocks': True,
                    'competitor_presence': True,
                    'seasonality': True
                },
                'success_threshold': {'avg_reward': 200, 'platform_profit': 1000}
            }
        ]

    def train(self, agent):
        """Run every curriculum stage in order, training ``agent`` online.

        Args:
            agent: Object exposing ``select_action``, ``replay_buffer``,
                ``train``, ``save`` and ``exploration_noise``.
        """
        for stage_idx, stage in enumerate(self.curriculum_stages):
            print(f"\n{'='*50}")
            print(f"STAGE {stage_idx + 1}: {stage['name']}")
            print(f"{'='*50}")

            env = ComradeEnvironment(stage['config'])
            episode_rewards = []

            for episode in range(stage['duration']):
                state = env.reset()
                episode_reward = 0
                done = False

                while not done:
                    # NOTE(review): ComradeRLAgent.select_action returns an
                    # (action_dict, tier) tuple; that tuple is what reaches
                    # env.step and the replay buffer here - confirm both
                    # expect it.
                    action = agent.select_action(state, evaluate=False)
                    next_state, reward, done, info = env.step(action)

                    # Store transition with curriculum stage label
                    agent.replay_buffer.add((
                        state, action, reward, next_state, done,
                        {'stage': stage_idx, 'difficulty': stage_idx / 4}
                    ))

                    # Train agent (returned metrics are not used here)
                    agent.train()

                    state = next_state
                    episode_reward += reward

                episode_rewards.append(episode_reward)

                # Progress logging
                if episode % 100 == 0:
                    avg_reward = np.mean(episode_rewards[-100:])
                    print(f"Episode {episode}: Avg Reward = {avg_reward:.2f}, "
                          f"Epsilon = {agent.exploration_noise:.3f}")

                # Automatic stage progression check
                if self.check_stage_completion(episode_rewards, stage['success_threshold']):
                    print(f"✓ Stage {stage['name']} completed at episode {episode}")
                    # Save checkpoint
                    agent.save(f"checkpoint_stage_{stage_idx}.pt")
                    break

            # Decay exploration for next stage
            agent.exploration_noise *= 0.8

    def check_stage_completion(self, rewards, thresholds):
        """Check if agent has mastered current stage.

        Only the ``avg_reward`` and ``stability`` thresholds are evaluated,
        both over the last 100 episode rewards. Other keys (such as
        ``tier_upgrade_rate`` or ``platform_profit``) cannot be derived from
        the reward history and are ignored.

        Args:
            rewards: Per-episode total rewards, oldest first.
            thresholds: Mapping of threshold name to minimum value.

        Returns:
            True when at least 100 episodes exist, at least one supported
            threshold is present, and every supported threshold is exceeded.
        """
        if len(rewards) < 100:
            return False

        recent_rewards = rewards[-100:]
        avg_reward = np.mean(recent_rewards)
        # Stability is 1 minus the coefficient of variation; the epsilon
        # guards against division by zero when the mean reward is 0.
        stability = 1 - (np.std(recent_rewards) / (abs(avg_reward) + 1e-6))

        checks = []
        if 'avg_reward' in thresholds:
            checks.append(avg_reward > thresholds['avg_reward'])
        if 'stability' in thresholds:
            checks.append(stability > thresholds['stability'])

        # all([]) is True; without this guard a stage whose thresholds hold no
        # supported key would be declared complete unconditionally.
        return bool(checks) and all(checks)

In [None]:
class ProductionPricingEngine:
    """
    Deployed RL agent with safety guards and A/B testing.

    Relies on helpers not shown in this block (``build_state_vector``,
    ``should_use_fallback``, ``log_decision``, ``compute_confidence``,
    ``evaluate_on_historical_data``, ``deploy_update``) and on
    ``ComradeRLAgent`` / ``DifferentialEquationPricing``.
    """

    def __init__(self, model_path):
        self.agent = ComradeRLAgent()
        self.agent.load(model_path)
        self.agent.actor.eval()  # Inference mode

        # Safety constraints (hard limits)
        self.price_bounds = {
            'min_margin': 0.05,  # 5% minimum supplier margin
            'max_discount': 0.60,  # 60% max discount from retail
            'max_price_change': 0.20  # 20% max change per hour
        }

        # A/B testing framework
        self.experiment_assignments = {}

        # Fallback rules (if ML fails)
        self.fallback_pricing = DifferentialEquationPricing()

        # Evaluation score of the currently deployed model; measured lazily on
        # the first online_learning_update() call.
        self.current_performance = None

    @staticmethod
    def _first_present(mapping, *keys):
        """Return the value of the first key found in ``mapping``, else None."""
        for key in keys:
            if key in mapping:
                return mapping[key]
        return None

    def get_price_recommendation(self, context):
        """
        Real-time price recommendation with safety checks.

        Args:
            context: Mapping that provides at least ``user_id``,
                ``current_price``, ``supplier_cost`` and ``retail_price``.

        Returns:
            Dict with ``price``, ``target_group_size``,
            ``notification_strategy``, ``promo_duration``, ``confidence`` and
            ``source``. If anything fails, the raw result of
            ``fallback_pricing.compute(context)`` is returned instead.
        """
        import logging  # local import: keeps this block self-contained

        try:
            # Build state vector from real-time data
            state = self.build_state_vector(context)

            # Get RL action
            action, tier = self.agent.select_action(state, evaluate=True)

            # Apply safety constraints
            safe_action = self.apply_safety_constraints(action, context)

            # A/B test: 10% of traffic uses fallback for comparison
            if self.should_use_fallback(context['user_id']):
                safe_action = self.fallback_pricing.compute(context)
                source = 'fallback'
            else:
                source = 'rl_agent'

            # Log for monitoring
            self.log_decision(context, state, action, safe_action, source)

            # The RL action uses 'notification_push' / 'promo_duration', while
            # this method historically read 'notifications' / 'promo'; accept
            # either spelling so the RL path no longer dies on a KeyError.
            # NOTE(review): the RL action only carries 'delta_group_target'
            # (a change, not an absolute size), so 'target_group_size' is None
            # on the RL path until the current target is available here.
            return {
                'price': safe_action['price'],
                'target_group_size': self._first_present(safe_action, 'group_target'),
                'notification_strategy': self._first_present(
                    safe_action, 'notifications', 'notification_push'),
                'promo_duration': self._first_present(
                    safe_action, 'promo', 'promo_duration'),
                'confidence': self.compute_confidence(state),
                'source': source
            }

        except Exception:
            # Graceful degradation: still serve a price, but record why the
            # primary path failed instead of hiding the error.
            logging.getLogger(__name__).exception(
                "RL price recommendation failed; serving fallback pricing")
            return self.fallback_pricing.compute(context)

    def apply_safety_constraints(self, action, context):
        """Hard constraints to prevent harmful pricing.

        Args:
            action: Mapping with either an absolute ``price`` or a
                ``delta_price`` relative to ``context['current_price']``.
            context: Mapping with ``current_price``, ``supplier_cost`` and
                ``retail_price``.

        Returns:
            A new dict: a copy of ``action`` plus the constrained ``price``
            rounded to 2 decimals. ``action`` itself is left unmodified so the
            raw and the constrained decision can both be logged.
        """
        current_price = context['current_price']
        if 'price' in action:
            proposed_price = action['price']
        else:
            # The agent emits a price change, not an absolute price.
            proposed_price = current_price + action['delta_price']

        # Constraint 1: Max price change rate
        max_change = current_price * self.price_bounds['max_price_change']
        if abs(proposed_price - current_price) > max_change:
            proposed_price = current_price + np.sign(proposed_price - current_price) * max_change

        # Constraint 2: Minimum supplier margin
        cost = context['supplier_cost']
        min_price = cost / (1 - self.price_bounds['min_margin'])
        proposed_price = max(proposed_price, min_price)

        # Constraint 3: Maximum discount
        retail = context['retail_price']
        max_discount_price = retail * (1 - self.price_bounds['max_discount'])
        proposed_price = max(proposed_price, max_discount_price)

        safe_action = dict(action)
        safe_action['price'] = round(float(proposed_price), 2)
        return safe_action

    def _snapshot_agent(self):
        """Deep-copy the state of every agent attribute that has a state_dict."""
        import copy  # local import: keeps this block self-contained

        return {
            name: copy.deepcopy(component.state_dict())
            for name, component in vars(self.agent).items()
            if callable(getattr(component, 'state_dict', None))
            and callable(getattr(component, 'load_state_dict', None))
        }

    def _restore_agent(self, snapshot):
        """Load a snapshot from ``_snapshot_agent`` back into the agent."""
        for name, state in snapshot.items():
            getattr(self.agent, name).load_state_dict(state)

    def online_learning_update(self, feedback_batch):
        """
        Continual learning from production feedback.

        Args:
            feedback_batch: Iterable of transitions accepted by the agent's
                replay buffer.

        Returns:
            True if the retrained agent beat the baseline by more than 2% and
            was deployed; False if the previous weights were restored.
        """
        # Establish the baseline from the currently deployed weights if needed.
        if self.current_performance is None:
            self.current_performance = self.evaluate_on_historical_data()
        baseline = self.current_performance

        # train() updates the agent in place, so keep a copy to roll back to.
        snapshot = self._snapshot_agent()

        # Experience replay from production
        for feedback in feedback_batch:
            self.agent.replay_buffer.add(feedback)

        # Conservative update: limited number of gradient steps per batch
        for _ in range(10):
            self.agent.train()

        # Evaluate on holdout set
        eval_reward = self.evaluate_on_historical_data()

        # Only deploy if improvement > 2%. Using abs() keeps the bar above the
        # baseline when the baseline is negative (baseline * 1.02 would lower it).
        required = baseline + 0.02 * abs(baseline)
        if eval_reward > required:
            self.deploy_update()
            self.current_performance = eval_reward
            return True

        # Rollback to previous version
        self._restore_agent(snapshot)
        return False