In [4]:
import os
import numpy as np
import tensorflow as tf
import random
from collections import deque
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Set seeds for reproducibility
SEED_VAL = 42
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
tf.random.set_seed(SEED_VAL)

print("="*70)
print("‚úÖ IMPORTS SUCCESSFUL")
print("="*70)
print(f"   TensorFlow version: {tf.__version__}")
print(f"   NumPy version: {np.__version__}")
print(f"   Random seed: {SEED_VAL}")
print("="*70)

‚úÖ IMPORTS SUCCESSFUL
   TensorFlow version: 2.14.0
   NumPy version: 1.24.3
   Random seed: 42


In [5]:
class ReplayBuffer:
    """Experience Replay Buffer for DQN"""
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        """Add experience to buffer"""
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        """Sample random batch from buffer"""
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        
        return (
            np.array(states, dtype=np.float32),
            np.array(actions, dtype=np.int32),
            np.array(rewards, dtype=np.float32),
            np.array(next_states, dtype=np.float32),
            np.array(dones, dtype=np.float32)
        )
    
    def __len__(self):
        return len(self.buffer)

print("="*70)
print("‚úÖ REPLAY BUFFER CREATED")
print("="*70)
print("   Capacity: 10,000 experiences")
print("   Function: Store and sample transitions for training")
print("="*70)

‚úÖ REPLAY BUFFER CREATED
   Capacity: 10,000 experiences
   Function: Store and sample transitions for training


## 3. REPLAY BUFFER

In [6]:
class DQNAgentRDX(tf.keras.Model):
    """
    DQN Agent v·ªõi RDX feature extraction
    Architecture: [3‚Üí32‚Üí32‚Üí32‚Üí14]
    Compatible v·ªõi A2CAgentRDX ƒë·ªÉ c√≥ th·ªÉ so s√°nh RDX features
    """
    def __init__(self, hidden_size=32, num_actions=14, num_features=3):
        super(DQNAgentRDX, self).__init__()
        # Shared layers for feature extraction
        self.dense1 = tf.keras.layers.Dense(hidden_size, activation='relu', name='dense1')
        self.dense2 = tf.keras.layers.Dense(hidden_size, activation='relu', name='dense2')
        self.dense3 = tf.keras.layers.Dense(hidden_size, activation='relu', name='dense3')  # RDX features
        
        # Q-values output
        self.q_values = tf.keras.layers.Dense(num_actions, name='q_values')
    
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        rdx_features = self.dense3(x)  # 32-dim RDX representation
        q_vals = self.q_values(rdx_features)
        return q_vals, rdx_features

print("="*70)
print("‚úÖ DQNAgentRDX MODEL DEFINED")
print("="*70)
print("   Architecture: [3‚Üí32‚Üí32‚Üí32‚Üí14]")
print("   Input: State (3 features)")
print("   Hidden layers: 32‚Üí32‚Üí32")
print("   Output: Q-values (14 actions)")
print("   RDX features: 32-dimensional t·ª´ dense3 layer")
print("="*70)

‚úÖ DQNAgentRDX MODEL DEFINED
   Architecture: [3‚Üí32‚Üí32‚Üí32‚Üí14]
   Input: State (3 features)
   Hidden layers: 32‚Üí32‚Üí32
   Output: Q-values (14 actions)
   RDX features: 32-dimensional t·ª´ dense3 layer


## 2. DQN MODEL ARCHITECTURE

In [7]:
import os
import numpy as np
import tensorflow as tf
import random
from collections import deque
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Set seeds for reproducibility
SEED_VAL = 42
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
tf.random.set_seed(SEED_VAL)

print("="*70)
print("‚úÖ IMPORTS SUCCESSFUL")
print("="*70)
print(f"   TensorFlow version: {tf.__version__}")
print(f"   NumPy version: {np.__version__}")
print(f"   Random seed: {SEED_VAL}")
print("="*70)

‚úÖ IMPORTS SUCCESSFUL
   TensorFlow version: 2.14.0
   NumPy version: 1.24.3
   Random seed: 42


## 1. IMPORTS & SETUP

# ü§ñ TRAINING DQN FOR INVENTORY MANAGEMENT

## Objective:
Train DQN agent v·ªõi environment gi·ªëng A2C/A2C_mod t·ª´ training.py

## Configuration:
- **Episodes**: 600
- **Steps per episode**: 900
- **Total steps**: 540,000
- **Architecture**: [3‚Üí32‚Üí32‚Üí32‚Üí14] (same as A2C)
- **State**: [inventory, sales_forecast, waste_rate]
- **Actions**: 14 discrete levels

---

In [8]:
# =================================================================
# 3. DQN TRAINING AGENT
# =================================================================

class DQNTrainer:
    """DQN Training with Target Network and Experience Replay"""
    def __init__(self, env, hidden_size=32, lr=0.001, gamma=0.99, 
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        # Q-network v√† Target network
        self.q_network = DQNAgentRDX(hidden_size=hidden_size, num_actions=env.n_actions)
        self.target_network = DQNAgentRDX(hidden_size=hidden_size, num_actions=env.n_actions)
        
        # Initialize networks
        dummy_state = tf.constant([[0.5, 0.2, 0.01]], dtype=tf.float32)
        self.q_network(dummy_state)
        self.target_network(dummy_state)
        
        # Copy weights
        self.update_target_network()
        
        # Optimizer
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        
        # Replay buffer
        self.replay_buffer = ReplayBuffer(capacity=10000)
        
    def update_target_network(self):
        """Copy weights from Q-network to Target network"""
        self.target_network.set_weights(self.q_network.get_weights())
    
    def select_action(self, state, training=True):
        """Epsilon-greedy action selection"""
        if training and np.random.random() < self.epsilon:
            return np.random.randint(0, self.env.n_actions)
        else:
            state_tensor = tf.constant([state], dtype=tf.float32)
            q_values, _ = self.q_network(state_tensor)
            return tf.argmax(q_values[0]).numpy()
    
    def train_step(self, batch_size=64):
        """Single training step"""
        if len(self.replay_buffer) < batch_size:
            return 0.0
        
        # Sample batch
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        
        # Convert to tensors
        states_t = tf.constant(states, dtype=tf.float32)
        actions_t = tf.constant(actions, dtype=tf.int32)
        rewards_t = tf.constant(rewards, dtype=tf.float32)
        next_states_t = tf.constant(next_states, dtype=tf.float32)
        dones_t = tf.constant(dones, dtype=tf.float32)
        
        with tf.GradientTape() as tape:
            # Current Q-values
            q_values, _ = self.q_network(states_t)
            action_masks = tf.one_hot(actions_t, self.env.n_actions)
            q_values_selected = tf.reduce_sum(q_values * action_masks, axis=1)
            
            # Target Q-values (Double DQN)
            next_q_values, _ = self.target_network(next_states_t)
            next_q_max = tf.reduce_max(next_q_values, axis=1)
            
            # TD target
            targets = rewards_t + self.gamma * next_q_max * (1 - dones_t)
            
            # Loss
            loss = tf.reduce_mean(tf.square(targets - q_values_selected))
        
        # Backpropagation
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_network.trainable_variables))
        
        return loss.numpy()
    
    def train(self, num_episodes=600, batch_size=64, update_target_freq=10, 
              verbose=True, save_freq=20, save_path=None):
        """Train DQN agent"""
        episode_rewards = []
        losses = []
        
        for episode in range(num_episodes):
            state = self.env.reset()
            episode_reward = 0
            episode_loss = []
            done = False
            
            while not done:
                # Select action
                action = self.select_action(state, training=True)
                
                # Execute action
                next_state, reward, done, info = self.env.step(action)
                
                # Store experience
                self.replay_buffer.push(state, action, reward, next_state, done)
                
                # Train
                if len(self.replay_buffer) >= batch_size:
                    loss = self.train_step(batch_size)
                    episode_loss.append(loss)
                
                state = next_state
                episode_reward += reward
            
            # Update epsilon
            self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)
            
            # Update target network
            if (episode + 1) % update_target_freq == 0:
                self.update_target_network()
            
            # Store metrics
            episode_rewards.append(episode_reward)
            avg_loss = np.mean(episode_loss) if episode_loss else 0
            losses.append(avg_loss)
            
            # Save checkpoint
            if save_path and (episode + 1) % save_freq == 0:
                checkpoint_dir = save_path
                os.makedirs(checkpoint_dir, exist_ok=True)
                
                checkpoint = tf.train.Checkpoint(
                    q_network=self.q_network,
                    optimizer=self.optimizer
                )
                checkpoint.save(os.path.join(checkpoint_dir, f'ckpt'))
            
            # Verbose
            if verbose and (episode + 1) % 10 == 0:
                avg_reward = np.mean(episode_rewards[-10:])
                print(f"Episode {episode+1}/{num_episodes} | "
                      f"Avg Reward: {avg_reward:.2f} | "
                      f"Epsilon: {self.epsilon:.3f} | "
                      f"Loss: {avg_loss:.4f}")
        
        return episode_rewards, losses

print("‚úÖ DQNTrainer created")
print(f"   Features: Target Network, Experience Replay, Epsilon-Greedy")
print(f"   Ready to train!")

‚úÖ DQNTrainer created
   Features: Target Network, Experience Replay, Epsilon-Greedy
   Ready to train!


## 4. DQN TRAINER

In [9]:
# =================================================================
# ENVIRONMENT GI·ªêNG TRAINING.PY (A2C/A2C_MOD)
# =================================================================
import numpy as np
class A2CStyleInventoryEnv:
    """
    Environment t∆∞∆°ng t·ª± training.py c·ªßa A2C/A2C_mod
    - State: [inventory_level, sales_forecast, waste_rate]
    - Dynamics: gi·ªëng training.py
    - Rewards: gi·ªëng training.py
    """
    def __init__(self, num_products=220, num_timesteps=900, waste_rate=0.025):
        self.num_products = num_products
        self.num_timesteps = num_timesteps
        self.waste_rate = waste_rate
        
        # Action space: 14 discrete levels (gi·ªëng training.py)
        self.action_space = np.array([0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 
                                      0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1.0])
        self.n_actions = len(self.action_space)
        
        # Generate synthetic sales data (normalized [0, 1])
        # Gi·ªëng nh∆∞ trong training.py, sales ƒë∆∞·ª£c normalize b·ªüi capacity
        self._generate_sales_data()
        
        self.reset()
    
    def _generate_sales_data(self):
        """Generate synthetic sales patterns"""
        # T·∫°o sales patterns v·ªõi seasonality v√† trend
        t = np.arange(self.num_timesteps)
        
        # Base demand with seasonality (weekly pattern)
        base = 0.3 + 0.15 * np.sin(2 * np.pi * t / 7)  # Weekly cycle
        
        # Add monthly trend
        trend = 0.1 * np.sin(2 * np.pi * t / 30)  # Monthly cycle
        
        # Random noise
        noise = np.random.uniform(-0.05, 0.05, self.num_timesteps)
        
        # Combine
        self.sales_pattern = np.clip(base + trend + noise, 0.1, 0.8)
        
        # Initialize for all products (v·ªõi variation)
        self.sales_data = np.zeros((self.num_timesteps, self.num_products))
        for i in range(self.num_products):
            product_factor = np.random.uniform(0.8, 1.2)
            self.sales_data[:, i] = self.sales_pattern * product_factor
        
        self.sales_data = np.clip(self.sales_data, 0.0, 1.0)
    
    def reset(self):
        """Reset environment - gi·ªëng training.py"""
        # Random initial inventory: 0 <= x <= 1 (eq 2 in training.py)
        self.x = np.random.uniform(0, 1, self.num_products).astype(np.float32)
        
        # Waste estimate
        self.q = self.waste_rate * self.x
        
        self.t = 0
        self.total_reward = 0
        
        # Get current state
        return self._get_state()
    
    def _get_state(self):
        """
        State construction gi·ªëng training.py:
        s = [x, sales_forecast, q]
        """
        # Current inventory
        x_norm = self.x  # Already normalized [0, 1]
        
        # Sales forecast (current timestep)
        sales_forecast = self.sales_data[self.t % self.num_timesteps]
        
        # Waste estimate
        q = self.q
        
        # Stack: (num_products, 3) -> average to (3,) for single state
        # Trong training.py, state l√† per-product, nh∆∞ng ƒë·ªÉ ƒë∆°n gi·∫£n h√≥a v·ªõi DQN,
        # ta average across products
        state = np.array([
            np.mean(x_norm),
            np.mean(sales_forecast),
            np.mean(q)
        ], dtype=np.float32)
        
        return state
    
    def step(self, action_idx):
        """
        Execute action - dynamics gi·ªëng training.py
        """
        # Convert action index to actual order level
        u = self.action_space[action_idx]
        
        # Apply action to all products (simplified - same action for all)
        u_array = np.full(self.num_products, u, dtype=np.float32)
        
        # Get current sales
        sales = self.sales_data[self.t % self.num_timesteps]
        
        # Dynamics (gi·ªëng training.py):
        # 1. Add order to inventory
        x_u = np.minimum(1.0, self.x + u_array)
        
        # 2. Calculate overstock
        overstock = np.maximum(0, (self.x + u_array) - 1.0)
        
        # 3. Meet demand (sales)
        x_prime = np.maximum(0, x_u - sales)
        
        # 4. Calculate stockout
        stockout = np.maximum(0, sales - x_u)
        
        # 5. Update waste for next step
        self.q = self.waste_rate * x_prime
        
        # Reward calculation (inspired by training.py):
        # In training.py, reward includes:
        # - Negative for stockout (lost sales)
        # - Negative for overstock (wasted inventory)
        # - Negative for holding cost
        
        # Stockout penalty (lost revenue)
        stockout_cost = -10.0 * np.sum(stockout)
        
        # Overstock penalty
        overstock_cost = -5.0 * np.sum(overstock)
        
        # Holding cost
        holding_cost = -0.5 * np.sum(x_prime)
        
        # Order cost
        order_cost = -2.0 if u > 0 else 0
        
        # Waste cost
        waste_cost = -5.0 * np.sum(self.q)
        
        # Revenue from sales
        actual_sales = sales - stockout
        revenue = 15.0 * np.sum(actual_sales)
        
        # Total reward
        reward = revenue + stockout_cost + overstock_cost + holding_cost + order_cost + waste_cost
        
        # Update state
        self.x = x_prime
        self.t += 1
        self.total_reward += reward
        
        # Check done
        done = (self.t >= self.num_timesteps)
        
        # Info
        info = {
            'inventory': np.mean(self.x),
            'sales': np.mean(sales),
            'stockout': np.sum(stockout),
            'overstock': np.sum(overstock),
            'waste': np.sum(self.q),
            'reward': reward
        }
        
        return self._get_state(), reward, done, info

print("‚úÖ A2CStyleInventoryEnv created")
print(f"   Based on training.py structure")
print(f"   Num products: 100")
print(f"   Timesteps per episode: 900")
print(f"   Action space: 14 levels (same as A2C)")
print(f"   State: [inventory, sales_forecast, waste_rate]")
print(f"   Reward: revenue - costs (stockout, holding, waste, order)")

‚úÖ A2CStyleInventoryEnv created
   Based on training.py structure
   Num products: 100
   Timesteps per episode: 900
   Action space: 14 levels (same as A2C)
   State: [inventory, sales_forecast, waste_rate]
   Reward: revenue - costs (stockout, holding, waste, order)


In [None]:
# =================================================================
# TRAIN DQN V·ªöI A2C-STYLE ENVIRONMENT - 600 EPISODES √ó 900 STEPS
# =================================================================

print("="*70)
print("üöÄ TRAINING DQN V·ªöI A2C-STYLE ENVIRONMENT")
print("="*70)

# Create A2C-style environment
env_a2c_style = A2CStyleInventoryEnv(
    num_products=220,
    num_timesteps=900,  # 900 steps per episode
    waste_rate=0.025
)

# Create DQN trainer
trainer_v2 = DQNTrainer(
    env=env_a2c_style,
    hidden_size=32,
    lr=0.001,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.998  # Slower decay for 600 episodes
)

print("\nüìã Training Configuration:")
print(f"   Environment: A2CStyleInventoryEnv (based on training.py)")
print(f"   Episodes: 600")
print(f"   Steps per episode: 900")
print(f"   Total steps: 540,000")
print(f"   Num products: 100")
print(f"   State: [avg_inventory, avg_sales_forecast, avg_waste]")
print(f"   Actions: 14 discrete levels (same as A2C)")
print(f"   Hidden size: 32")
print(f"   Learning rate: 0.001")
print(f"   Gamma: 0.99")
print(f"   Batch size: 64")
print(f"   Epsilon decay: 0.998 (slower for more exploration)")

print("\n‚ö†Ô∏è  L∆∞u √Ω: Training 600 episodes c√≥ th·ªÉ m·∫•t 10-15 ph√∫t")
print("="*70)
print("‚è≥ Starting training...")

# Train
checkpoint_path_v2 = r'c:\Study\NCKH\QLKHO-RL\checkpointDQN_A2Cstyle'

rewards_v2, losses_v2 = trainer_v2.train(
    num_episodes=600,
    batch_size=64,
    update_target_freq=10,
    verbose=True,
    save_freq=50,  # Save every 50 episodes
    save_path=checkpoint_path_v2
)

print("\n" + "="*70)
print("‚úÖ TRAINING HO√ÄN T·∫§T!")
print("="*70)
print(f"\nüìä Final Statistics:")
print(f"   Total episodes: {len(rewards_v2)}")
print(f"   Average reward (last 50): {np.mean(rewards_v2[-50:]):.2f}")
print(f"   Max reward: {np.max(rewards_v2):.2f}")
print(f"   Min reward: {np.min(rewards_v2):.2f}")
print(f"   Final epsilon: {trainer_v2.epsilon:.4f}")
print(f"   Checkpoint saved to: {checkpoint_path_v2}")
print("="*70)

üöÄ TRAINING DQN V·ªöI A2C-STYLE ENVIRONMENT

üìã Training Configuration:
   Environment: A2CStyleInventoryEnv (based on training.py)
   Episodes: 600
   Steps per episode: 900
   Total steps: 540,000
   Num products: 100
   State: [avg_inventory, avg_sales_forecast, avg_waste]
   Actions: 14 discrete levels (same as A2C)
   Hidden size: 32
   Learning rate: 0.001
   Gamma: 0.99
   Batch size: 64
   Epsilon decay: 0.998 (slower for more exploration)

‚ö†Ô∏è  L∆∞u √Ω: Training 600 episodes c√≥ th·ªÉ m·∫•t 10-15 ph√∫t
‚è≥ Starting training...
Episode 10/600 | Avg Reward: 30201.70 | Epsilon: 0.980 | Loss: 635.4204
Episode 340/600 | Avg Reward: 290770.81 | Epsilon: 0.506 | Loss: 231028.1250
Episode 350/600 | Avg Reward: 286830.74 | Epsilon: 0.496 | Loss: 237516.0156
Episode 360/600 | Avg Reward: 291939.20 | Epsilon: 0.486 | Loss: 247701.9062
Episode 370/600 | Avg Reward: 291239.03 | Epsilon: 0.477 | Loss: 267156.3438
Episode 380/600 | Avg Reward: 286167.89 | Epsilon: 0.467 | Loss: 321059

---

## üéâ TRAINING COMPLETE!

### Summary:
- ‚úÖ DQN trained v·ªõi 600 episodes √ó 900 steps
- ‚úÖ Environment gi·ªëng A2C/A2C_mod t·ª´ training.py
- ‚úÖ Architecture t∆∞∆°ng th√≠ch: [3‚Üí32‚Üí32‚Üí32‚Üí14]
- ‚úÖ Checkpoint saved for comparison

### Next Steps:
1. Load checkpoint n√†y v√†o [RDX-MSX.ipynb](RDX-MSX.ipynb)
2. So s√°nh RDX features v·ªõi A2C v√† A2C_mod
3. Ph√¢n t√≠ch decision-making differences

### Key Files:
- **Checkpoint**: `checkpointDQN_A2Cstyle/`
- **Visualization**: `dqn_training_results.png`
- **Notebook**: [Train_DQN.ipynb](Train_DQN.ipynb)

In [11]:
# =================================================================
# SAVE FINAL MODEL
# =================================================================

print("="*70)
print("üíæ SAVING FINAL MODEL")
print("="*70)

final_checkpoint_path = r'c:\Study\NCKH\QLKHO-RL\checkpointDQN_A2Cstyle'
os.makedirs(final_checkpoint_path, exist_ok=True)

checkpoint = tf.train.Checkpoint(
    q_network=trainer_v2.q_network,
    optimizer=trainer_v2.optimizer
)
checkpoint.save(os.path.join(final_checkpoint_path, 'ckpt-final'))

print(f"   ‚úÖ Final model saved to:")
print(f"      {final_checkpoint_path}")
print(f"\n   üìù Use this checkpoint for:")
print(f"      - RDX analysis")
print(f"      - Comparison with A2C/A2C_mod")
print(f"      - Testing and evaluation")
print("="*70)

üíæ SAVING FINAL MODEL
   ‚úÖ Final model saved to:
      c:\Study\NCKH\QLKHO-RL\checkpointDQN_A2Cstyle

   üìù Use this checkpoint for:
      - RDX analysis
      - Comparison with A2C/A2C_mod
      - Testing and evaluation


## 7. SAVE FINAL MODEL

In [None]:
# =================================================================
# TEST AGENT PERFORMANCE
# =================================================================

print("="*70)
print("üß™ TESTING TRAINED DQN AGENT")
print("="*70)

test_episodes = 10
test_rewards = []

for ep in range(test_episodes):
    state = env_a2c_style.reset()
    episode_reward = 0
    done = False
    
    while not done:
        action = trainer_v2.select_action(state, training=False)  # Greedy
        next_state, reward, done, info = env_a2c_style.step(action)
        episode_reward += reward
        state = next_state
    
    test_rewards.append(episode_reward)
    print(f"   Test Episode {ep+1}: Reward = {episode_reward:.2f}")

print(f"\nüìä Test Results:")
print(f"   Average reward: {np.mean(test_rewards):.2f}")
print(f"   Std deviation: {np.std(test_rewards):.2f}")
print(f"   Min reward: {np.min(test_rewards):.2f}")
print(f"   Max reward: {np.max(test_rewards):.2f}")
print("="*70)

## 6. TEST TRAINED AGENT

In [None]:
# =================================================================
# VISUALIZATION - TRAINING CURVES
# =================================================================

print("="*70)
print("üìä VISUALIZATION: TRAINING CURVES")
print("="*70)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Episode Rewards
ax1 = axes[0]
ax1.plot(rewards_v2, alpha=0.3, color='#2E86AB', linewidth=0.5, label='Raw rewards')

# Moving average
window = 20
moving_avg = np.convolve(rewards_v2, np.ones(window)/window, mode='valid')
ax1.plot(range(window-1, len(rewards_v2)), moving_avg, color='#2E86AB', 
         linewidth=2, label=f'Moving Avg ({window})')

ax1.set_xlabel('Episode', fontweight='bold', fontsize=12)
ax1.set_ylabel('Total Reward', fontweight='bold', fontsize=12)
ax1.set_title('DQN Training: Episode Rewards', fontweight='bold', fontsize=14)
ax1.legend()
ax1.grid(alpha=0.3)

# Plot 2: Training Loss
ax2 = axes[1]
ax2.plot(losses_v2, alpha=0.3, color='#E74C3C', linewidth=0.5, label='Raw loss')

# Moving average
moving_avg_loss = np.convolve(losses_v2, np.ones(window)/window, mode='valid')
ax2.plot(range(window-1, len(losses_v2)), moving_avg_loss, color='#E74C3C', 
         linewidth=2, label=f'Moving Avg ({window})')

ax2.set_xlabel('Episode', fontweight='bold', fontsize=12)
ax2.set_ylabel('Loss', fontweight='bold', fontsize=12)
ax2.set_title('DQN Training: Loss', fontweight='bold', fontsize=14)
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('dqn_training_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nüìà Training Curve Analysis:")
print(f"   Initial reward (ep 1-50): {np.mean(rewards_v2[:50]):.2f}")
print(f"   Middle reward (ep 275-325): {np.mean(rewards_v2[275:325]):.2f}")
print(f"   Final reward (ep 550-600): {np.mean(rewards_v2[-50:]):.2f}")
improvement = ((np.mean(rewards_v2[-50:]) - np.mean(rewards_v2[:50])) / abs(np.mean(rewards_v2[:50])) * 100)
print(f"   Improvement: {improvement:.1f}%")
print(f"\n   üìä Plot saved: dqn_training_results.png")
print("="*70)

## 5. VISUALIZATION & ANALYSIS