In [1]:
import numpy as np
from scipy.stats import entropy

class ReinforcementEnvironment:
    def __init__(self, num_bands, energy_cost=2, reward_factor=5, weight=5, max_timestep=180):
        self.num_bands = num_bands
        self.energy_cost = energy_cost
        self.reward_factor = reward_factor
        self.max_timestep = max_timestep
        self.weight = weight
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        self.transition_matrixes = {band: {} for band in range(self.num_bands)}
        self.init_bands()
        self.current_state = self.get_current_state()
    
    def init_bands(self):
        """Initialize each band with two initial signal values (0 or 1)"""
        for band in range(self.num_bands):
            # First signal chosen with equal probability
            t1 = np.random.choice([0, 1])
            
            # Second signal chosen with random probability distribution
            t_m1 = np.random.rand(2,2)
            t_m1 /= t_m1.sum(axis=1,keepdims=True)  # Normalize to create valid probability distribution
            t2 = np.random.choice([0, 1], p=t_m1[t1])
            # t_m2 = {
            #     (0, 0): np.random.rand(2),
            #     (0, 1): np.random.rand(2),
            #     (1, 0): np.random.rand(2),
            #     (1, 1): np.random.rand(2)
            # }
            # for k in t_m2:
            #     t_m2[k] /= t_m2[k].sum()
            t_m2 = {
            (0, 0): np.random.dirichlet([1, 1]),  # Generates a valid probability distribution over {0,1}
            (0, 1): np.random.dirichlet([1, 1]),
            (1, 0): np.random.dirichlet([1, 1]),
            (1, 1): np.random.dirichlet([1, 1])
            }
            self.transition_matrixes[band] = t_m2
            self.signal_band[band] = [t1, t2]  
    
    def step(self, action):
        """
        Execute one time step within the environment
        
        Args:
            action: tuple (band, prediction) where band is the selected frequency band
                   and prediction is the predicted signal value (0 or 1)
        
        Returns:
            tuple: (observation, reward, done, info)
        """
        self.current_timestep += 1
        
        band = action[0]
        prediction = action[1]
        
        reward = self._calculate_reward(self.current_state[band], prediction)
        
        self.generate_state()
        
        observation = self.construct_observation_space()
        
        done = self.current_timestep >= self.max_timestep
        
        info = {
            "timestep": self.current_timestep,
            "correct_prediction": self.current_state[band] == prediction,
            "state": self.current_state
        }
        
        return observation, reward, done, info
    
    def _calculate_reward(self, actual_signal, prediction):
        """Calculate reward based on prediction accuracy and signal value"""
        if actual_signal == prediction:
            # Correct prediction
            reward = self.reward_factor * self.weight - self.energy_cost
        elif actual_signal == 0:
            # Incorrect prediction when signal is 0
            reward = self.reward_factor - self.energy_cost
        else:  # actual_signal == 1
            # Incorrect prediction when signal is 1
            reward = self.reward_factor - self.energy_cost * self.weight
        
        return reward
    
    def generate_state(self):
        """Generate next state for all bands based on transition probabilities"""
        for band in range(self.num_bands):
            # Get last two signals for this band
            p_2 = tuple(self.signal_band[band][-2:])
            
            t_m2 = self.transition_matrixes[band]
            
            next_signal = np.random.choice([0, 1], p=t_m2[p_2])
            
            self.signal_band[band].append(next_signal)
            self.signal_band[band].pop(0)
        
        # Update current state
        self.current_state = self.get_current_state()
        
        return self.current_state
    
    def get_current_state(self):
        """Return the current state as a list of the most recent signal for each band"""
        return [self.signal_band[band][-1] for band in range(self.num_bands)]
    
    def reset(self):
        """Reset the environment to initial state and return initial observation"""
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        self.init_bands()
        self.current_state = self.get_current_state()
        return self.construct_observation_space()
    
    def construct_observation_space(self, window_size=10):
        """
        Construct observation space with entropy calculations for each band
        
        Args:
            window_size: Number of recent signals to consider for entropy calculation
            
        Returns:
            list: Entropy values for each band
        """
        observation = []
        for band in range(self.num_bands):
            signal_values = np.array(self.signal_band[band][-window_size:])
            
            if len(signal_values) <= window_size:
                entropy_value = 0
            else:
                value_counts = np.bincount(signal_values, minlength=2)
                
                probability_distribution = value_counts / len(signal_values)
                
                # Handle edge cases
                if np.all(probability_distribution == 0):
                    entropy_value = 0
                else:
                    # Calculate entropy using scipy function
                    entropy_value = entropy(probability_distribution, base=2)
            
            observation.append(entropy_value)
        
        return observation
    
    def soft_reset(self):
        self.signal_band = {band: self.signal_band[band][-2:] for band in range(self.num_bands)}
        self.current_timestep = 0
        self.generate_state()
        return self.construct_observation_space()
        

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import matplotlib.pyplot as plt
class DuelingDeepQNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        """
        Dueling Deep Q-Network Architecture
        
        Separates value estimation into:
        1. State value function (V)
        2. Advantage function (A)
        
        Args:
            input_dim (int): Dimension of input observation space
            output_dim (int): Number of possible actions
        """
        super(DuelingDeepQNetwork, self).__init__()
        
        # Shared feature layers
        self.feature_layer = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU()
        )
        
        # Value stream
        self.value_stream = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Single value output
        )
        
        # Advantage stream
        self.advantage_stream = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)  # One output per action
        )
    
    def forward(self, x):
        # Ensure input is float tensor and has correct shape
        x = x.float()
        if x.dim() == 1:
            x = x.unsqueeze(0)
        
        # Extract shared features
        features = self.feature_layer(x)
        
        # Compute value and advantage streams
        values = self.value_stream(features)
        advantages = self.advantage_stream(features)
        
        # Combine value and advantage streams
        # Q(s,a) = V(s) + (A(s,a) - mean(A(s,a)))
        action_values = values + (advantages - advantages.mean(dim=1, keepdim=True))
        
        return action_values

class DuelingDQNAgent:
    def __init__(self, env, input_dim, output_dim, exploration_type='epsilon'):
        """
        Dueling DQN Agent with flexible exploration strategies
        
        Args:
            env: Environment
            input_dim: Input dimension
            output_dim: Output dimension
            exploration_type: 'epsilon' or 'softmax'
        """
        self.env = env
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.exploration_type = exploration_type
        
        # Device configuration
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Networks
        self.q_network = DuelingDeepQNetwork(input_dim, output_dim).to(self.device)
        self.target_network = DuelingDeepQNetwork(input_dim, output_dim).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        
        # Hyperparameters
        self.learning_rate = 0.0005
        self.gamma = 0.99  # Discount factor
        
        # Exploration parameters
        if exploration_type == 'epsilon':
            self.epsilon = 1.0
            self.epsilon_min = 0.01
            self.epsilon_decay = 0.9975
            self.epsilon_decay_steps = 500
        else:  # softmax
            self.temperature = 1.0
            self.min_temperature = 0.1
            self.temperature_decay = 0.9995
            self.temperature_decay_steps = 500
        
        # Replay memory
        self.replay_memory = []
        self.memory_size = 20000
        self.batch_size = 128
        
        # Optimizer
        self.optimizer = optim.Adam(
            self.q_network.parameters(), 
            lr=self.learning_rate, 
            weight_decay=1e-5
        )
        
        # Loss function
        self.loss_fn = F.smooth_l1_loss
    
    def select_action(self, state):
        """
        Action selection with flexible exploration strategies
        """
        if self.exploration_type == 'epsilon':
            # Epsilon-greedy exploration
            self.epsilon = max(
                self.epsilon_min, 
                self.epsilon * (self.epsilon_decay ** (1 / self.epsilon_decay_steps))
            )
            
            if np.random.rand() <= self.epsilon:
                # Random exploration
                band = np.random.randint(0, len(state))
                prediction = np.random.randint(0, 2)
                return (band, prediction)
        else:  # softmax
            # Softmax exploration
            self.temperature = max(
                self.min_temperature, 
                self.temperature * (self.temperature_decay ** (1 / self.temperature_decay_steps))
            )
        
        # Exploitation phase (common for both exploration types)
        with torch.no_grad():
            # Prepare state tensor
            state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device).unsqueeze(0)
            
            # Get Q-values from the network
            q_values = self.q_network(state_tensor).squeeze(0).cpu().numpy()
            
            if self.exploration_type == 'softmax':
                # Compute softmax probabilities with temperature scaling
                scaled_q_values = q_values / self.temperature
                scaled_q_values -= np.max(scaled_q_values)
                
                exp_q = np.exp(scaled_q_values)
                action_probs = exp_q / np.sum(exp_q)
                
                # Sample action based on softmax probabilities
                action_idx = np.random.choice(len(q_values), p=action_probs)
            else:
                # Greedy selection for epsilon strategy
                action_idx = np.argmax(q_values)
            
            # Convert action index to (band, prediction)
            band = action_idx // 2
            prediction = action_idx % 2
            
            return (band, prediction)
    
    def store_transition(self, state, action, reward, next_state, done):
        """
        Store transition in replay memory
        """
        experience = (state, action, reward, next_state, done)
        
        if len(self.replay_memory) < self.memory_size:
            self.replay_memory.append(experience)
        else:
            # Replace random experience if memory is full
            idx = random.randint(0, len(self.replay_memory) - 1)
            self.replay_memory[idx] = experience
    
    def experience_replay(self):
        """
        Experience replay with Double DQN approach
        """
        if len(self.replay_memory) < self.batch_size:
            return
        
        # Sample batch
        batch = random.sample(self.replay_memory, self.batch_size)
        
        # Prepare batch tensors
        states = torch.tensor(np.array([b[0] for b in batch]), dtype=torch.float32).to(self.device)
        actions = [b[1] for b in batch]
        rewards = torch.tensor(np.array([b[2] for b in batch]), dtype=torch.float32).to(self.device)
        next_states = torch.tensor(np.array([b[3] for b in batch]), dtype=torch.float32).to(self.device)
        dones = torch.tensor(np.array([b[4] for b in batch]), dtype=torch.float32).to(self.device)
        
        # Current Q-values
        current_q_values = self.q_network(states)
        
        # Double DQN: select actions from main network, evaluate from target
        next_q_values_main = self.q_network(next_states)
        next_q_values_target = self.target_network(next_states)
        
        max_next_actions = next_q_values_main.argmax(1)
        max_next_q_values = next_q_values_target.gather(1, max_next_actions.unsqueeze(1)).squeeze(1)
        
        # Compute target Q-values
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values
        
        # Select current Q-values for taken actions
        batch_actions = [action[0] * 2 + action[1] for action in actions]
        current_q_values = current_q_values.gather(1, torch.tensor(batch_actions, dtype=torch.long).unsqueeze(1).to(self.device)).squeeze(1)
        
        # Compute loss
        loss = self.loss_fn(current_q_values, target_q_values.detach())
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=1)
        self.optimizer.step()
    
    def train(self, episodes=1000):
        """
        Training loop with comprehensive metrics
        """
        total_rewards = []
        self.env.reset()
        
        for episode in range(episodes):
            # Reset environment
            state = self.env.soft_reset()
            total_reward = 0
            done = False
            correct_predictions = 0
            total_predictions = 0
            
            while not done:
                action = self.select_action(state)
                next_state, reward, done, info = self.env.step(action)
                
                total_predictions += 1
                if info['correct_prediction']:
                    correct_predictions += 1
                
                self.store_transition(state, action, reward, next_state, done)
                
                state = next_state
                total_reward += reward
                
                self.experience_replay()
            
            # Periodic target network update
            if episode % 100 == 0:
                self.target_network.load_state_dict(self.q_network.state_dict())
            
            total_rewards.append(total_reward)
            
            # Logging
            if episode % 50 == 0:
                accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
                print(f"Episode {episode}, "
                      f"Total Reward: {total_reward:.2f}, "
                      f"Accuracy: {accuracy:.2%}, "
                      f"{self.exploration_type.capitalize()} Value: {getattr(self, self.exploration_type):.4f}")
        
        return total_rewards
    
def plot_rewards(rewards, window_size=10):
        plt.figure(figsize=(12, 6), facecolor='white')
        plt.plot(rewards, alpha=0.5, color='lightblue', label='Episode Reward')
        moving_average = np.convolve(rewards, np.ones(window_size)/window_size, mode='valid')
        plt.plot(np.arange(window_size-1, len(rewards)), moving_average, color='blue', linewidth=2, label=f'{window_size}-Episode Moving Avg')
        plt.title('Training Reward over Episodes', fontweight='bold')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show() 


def main():
    np.random.seed(42)
    torch.manual_seed(42)
    random.seed(42)
    
    num_bands = 10
    env = ReinforcementEnvironment(num_bands)
    
    input_dim = num_bands
    output_dim = num_bands * 2
    
    # Epsilon-greedy agent
    epsilon_agent = DuelingDQNAgent(env, input_dim, output_dim, exploration_type='epsilon')
    epsilon_rewards = epsilon_agent.train(episodes=500)
    
    # Softmax agent
    softmax_agent = DuelingDQNAgent(env, input_dim, output_dim, exploration_type='softmax')
    softmax_rewards = softmax_agent.train(episodes=500)
    
    # Plotting rewards would be done here
    plot_rewards(epsilon_rewards, title='Dueling DQN with Epsilon Exploration')
    plot_rewards(softmax_rewards, title='Dueling DQN with Softmax Exploration')

# Assuming plot_rewards function is defined similarly to previous implementations

In [3]:
main()

Episode 0, Total Reward: 2172.00, Accuracy: 52.22%, Epsilon Value: 0.9991
Episode 50, Total Reward: 2280.00, Accuracy: 51.67%, Epsilon Value: 0.9551
Episode 100, Total Reward: 2028.00, Accuracy: 57.78%, Epsilon Value: 0.9130
Episode 150, Total Reward: 2264.00, Accuracy: 56.11%, Epsilon Value: 0.8728
Episode 200, Total Reward: 2336.00, Accuracy: 60.00%, Epsilon Value: 0.8343
Episode 250, Total Reward: 1972.00, Accuracy: 57.78%, Epsilon Value: 0.7976
Episode 300, Total Reward: 2164.00, Accuracy: 58.89%, Epsilon Value: 0.7624
Episode 350, Total Reward: 2584.00, Accuracy: 58.33%, Epsilon Value: 0.7288
Episode 400, Total Reward: 2396.00, Accuracy: 67.22%, Epsilon Value: 0.6967
Episode 450, Total Reward: 2616.00, Accuracy: 68.89%, Epsilon Value: 0.6660


AttributeError: 'DuelingDQNAgent' object has no attribute 'softmax'