In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import random
import seaborn as sns
from scipy.stats import entropy
torch.set_default_device("cuda")

In [None]:
class ReinforcemnetEnvironment():
    """Multi-band binary signal environment with a reward for predicting a band.

    Every band keeps a short history (the last two samples) of a 0/1 signal.
    ``generate_state`` advances every band by one sample and ``step`` scores
    an agent's ``(band, prediction)`` action against a given state.

    Args:
        num_bands: Number of independent bands to simulate.
        energy_cost: Cost subtracted from every reward.
        reward_factor: Base reward magnitude.
        weight: Multiplier applied to ``reward_factor`` on a correct prediction.
        max_timestep: Stored on the instance; not used by this class itself.
    """

    def __init__(self,num_bands,energy_cost=0.2,reward_factor=5,weight=2,max_timestep=180):
        self.num_bands = num_bands
        self.energy_cost = energy_cost
        self.reward_factor = reward_factor
        self.max_timestep = max_timestep
        self.weight = weight
        # band index -> list holding the most recent samples of that band
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.init_bands()

    def init_bands(self):
        """Append two initial samples (each 0 or 1) to every band's history."""
        for band in range(self.num_bands):
            # First sample: uniform over {0, 1}.
            first = np.random.choice([0, 1])
            # Second sample: drawn from a freshly sampled, normalised
            # two-element distribution (independent of the first sample).
            second_probs = np.random.rand(2)
            second_probs /= second_probs.sum()
            second = np.random.choice([0, 1], p=second_probs)
            self.signal_band[band].extend([first, second])

    def step(self,state,action):
        """Score an action against ``state``.

        Args:
            state: Sequence indexed by band giving the current signal value.
            action: Sequence whose first item is the band index and whose
                second item is the predicted signal value.

        Returns:
            The reward. It is 0 when the prediction is wrong and the actual
            signal is neither 0 nor 1.
        """
        band = action[0]
        prediction = action[1]
        actual = state[band]

        if actual == prediction:
            return self.reward_factor*self.weight - self.energy_cost
        if actual == 0:
            return self.reward_factor - self.energy_cost
        if actual == 1:
            # NOTE(review): the multiplier here is the literal 2 and does not
            # follow ``self.weight`` - confirm this is intended.
            return self.reward_factor - self.energy_cost*2
        return 0

    def generate_state(self):
        """Advance every band by one sample and return the newest samples.

        The probabilities used for the next sample are re-drawn at random on
        every call, for each band, and are keyed by the band's last two
        samples.

        Returns:
            list: The most recent sample of each band, in band order.
        """
        history_keys = ((0, 0), (0, 1), (1, 0), (1, 1))
        for band in range(self.num_bands):
            history = self.signal_band[band]
            previous_pair = tuple(history[-2:])

            # All four rows are sampled (in this order) even though only one
            # is used, which keeps the consumption of the global NumPy RNG
            # stream unchanged.
            rows = {pair: np.random.rand(2) for pair in history_keys}
            for pair in rows:
                rows[pair] /= rows[pair].sum()

            next_signal = np.random.choice([0, 1], p=rows[previous_pair])
            # Slide the window: add the new sample, drop the oldest.
            history.append(next_signal)
            history.pop(0)

        return [self.signal_band[band][-1] for band in self.signal_band]

    def reset(self):
        """Discard all band histories and re-initialise them."""
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.init_bands()

    def construct_observation_space(self,window_size=2):
        """Return the base-2 entropy of each band's recent samples.

        Args:
            window_size: Number of most recent samples to include. A value
                of 0 or less selects an empty window.

        Returns:
            list: One entropy value per band; 0.0 for an empty window.
        """
        observation = []
        for band in range(self.num_bands):
            history = self.signal_band[band]
            # ``history[-0:]`` is the whole list, not an empty slice, so a
            # non-positive window has to be handled explicitly.
            window = history[-window_size:] if window_size > 0 else []
            if len(window) == 0:
                observation.append(0.0)
                continue

            value_counts = np.bincount(np.array(window), minlength=2)
            # The window is non-empty, so the counts sum to a positive number.
            probability_distribution = value_counts / value_counts.sum()
            observation.append(entropy(probability_distribution, base=2))

        return observation
        
        
    
                
     

In [None]:
import numpy as np
from scipy.stats import entropy

class ReinforcementEnvironment:
    """Multi-band binary signal environment with per-band transition tables.

    Every band keeps its last two 0/1 samples and a table, created in
    ``init_bands``, that maps each possible pair of previous samples to the
    probabilities of the next sample being 0 or 1. An agent picks a band and
    predicts its current value; the observation it gets back is the entropy
    of each band's recent samples.

    Args:
        num_bands: Number of independent bands to simulate.
        energy_cost: Cost subtracted from every reward.
        reward_factor: Base reward magnitude.
        weight: Multiplier for the reward on a correct prediction and for the
            energy cost on a wrong prediction while the signal is 1.
        max_timestep: Number of steps after which ``step`` reports ``done``.
    """

    def __init__(self, num_bands, energy_cost=0.2, reward_factor=5, weight=2, max_timestep=180):
        self.num_bands = num_bands
        self.energy_cost = energy_cost
        self.reward_factor = reward_factor
        self.max_timestep = max_timestep
        self.weight = weight
        # band index -> list holding the two most recent samples
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        # band index -> {(prev2, prev1): [P(next == 0), P(next == 1)]}
        self.transition_matrixes = {band: {} for band in range(self.num_bands)}
        self.init_bands()
        self.current_state = self.get_current_state()

    def init_bands(self):
        """Draw two initial samples and a fresh transition table per band."""
        for band in range(self.num_bands):
            # First sample: uniform over {0, 1}.
            first = np.random.choice([0, 1])

            # Second sample: drawn from the row of a random, row-normalised
            # 2x2 matrix selected by the first sample.
            first_order = np.random.rand(2, 2)
            first_order /= first_order.sum(axis=1, keepdims=True)
            second = np.random.choice([0, 1], p=first_order[first])

            # Table used for all later samples of this band, keyed by the
            # two preceding samples; each row is normalised to sum to 1.
            second_order = {
                (0, 0): np.random.rand(2),
                (0, 1): np.random.rand(2),
                (1, 0): np.random.rand(2),
                (1, 1): np.random.rand(2),
            }
            for key in second_order:
                second_order[key] /= second_order[key].sum()

            self.transition_matrixes[band] = second_order
            self.signal_band[band] = [first, second]

    def step(self, action):
        """Execute one time step within the environment.

        Args:
            action: Sequence ``(band, prediction)`` where ``band`` is the
                selected band index and ``prediction`` is the predicted
                signal value (0 or 1).

        Returns:
            tuple: ``(observation, reward, done, info)``. ``info`` holds the
            timestep, whether the prediction matched the signal that was
            scored, and the state after the transition.
        """
        self.current_timestep += 1

        band = action[0]
        prediction = action[1]

        # Capture the signal being predicted before the environment advances;
        # ``generate_state`` overwrites ``self.current_state``.
        actual_signal = self.current_state[band]
        reward = self._calculate_reward(actual_signal, prediction)

        self.generate_state()
        observation = self.construct_observation_space()

        done = self.current_timestep >= self.max_timestep

        info = {
            "timestep": self.current_timestep,
            # Compared against the scored signal, so it agrees with ``reward``.
            "correct_prediction": actual_signal == prediction,
            "state": self.current_state,
        }

        return observation, reward, done, info

    def _calculate_reward(self, actual_signal, prediction):
        """Return the reward for predicting ``prediction`` given ``actual_signal``."""
        if actual_signal == prediction:
            # Correct prediction.
            return self.reward_factor * self.weight - self.energy_cost
        if actual_signal == 0:
            # Wrong prediction while the signal is 0.
            return self.reward_factor - self.energy_cost
        # Wrong prediction for any other signal value (1 in normal use).
        return self.reward_factor - self.energy_cost * self.weight

    def generate_state(self):
        """Advance every band by one sample using its stored transition table.

        Returns:
            list: The updated current state (newest sample of each band).
        """
        for band in range(self.num_bands):
            history = self.signal_band[band]
            previous_pair = tuple(history[-2:])

            next_probs = self.transition_matrixes[band][previous_pair]
            next_signal = np.random.choice([0, 1], p=next_probs)

            # Slide the two-sample window forward.
            history.append(next_signal)
            history.pop(0)

        self.current_state = self.get_current_state()
        return self.current_state

    def get_current_state(self):
        """Return the most recent sample of each band as a list."""
        return [self.signal_band[band][-1] for band in range(self.num_bands)]

    def reset(self):
        """Re-initialise the bands and the timestep; return the first observation.

        ``init_bands`` is called, so the transition tables are re-drawn too.
        """
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        self.init_bands()
        self.current_state = self.get_current_state()
        return self.construct_observation_space()

    def _band_entropy(self, band, window_size):
        """Return the base-2 entropy of the last ``window_size`` samples of ``band``."""
        history = self.signal_band[band]
        # ``history[-0:]`` is the whole list, not an empty slice, so a
        # non-positive window has to be handled explicitly.
        window = history[-window_size:] if window_size > 0 else []
        if len(window) == 0:
            return 0.0

        value_counts = np.bincount(np.array(window), minlength=2)
        probability_distribution = value_counts / len(window)
        return entropy(probability_distribution, base=2)

    def construct_observation_space(self, window_size=2):
        """Build the observation: one entropy value per band.

        Args:
            window_size: Number of most recent samples to include. A value
                of 0 or less selects an empty window.

        Returns:
            list: Entropy values for each band; 0.0 for an empty window.
        """
        return [self._band_entropy(band, window_size) for band in range(self.num_bands)]

    def soft_reset(self):
        """Reset the environment; currently performs exactly the same work as ``reset``.

        NOTE(review): the name suggests this may have been meant to keep the
        transition tables between episodes - confirm the intended behaviour.
        """
        return self.reset()
        

In [None]:
import numpy as np
from scipy.stats import entropy

class ReinforcementEnvironment:
    """Multi-band binary signal environment with per-band transition tables.

    Every band keeps its last two 0/1 samples and a table, created in
    ``init_bands``, that maps each possible pair of previous samples to the
    probabilities of the next sample being 0 or 1. An agent picks a band and
    predicts its current value; the observation it gets back holds, per band,
    the entropy of recent samples and empirical 0->1 / 1->0 transition rates.

    Args:
        num_bands: Number of independent bands to simulate.
        energy_cost: Cost subtracted from every reward.
        reward_factor: Base reward magnitude.
        weight: Multiplier for the reward on a correct prediction and for the
            energy cost on a wrong prediction while the signal is 1.
        max_timestep: Number of steps after which ``step`` reports ``done``.
    """

    def __init__(self, num_bands, energy_cost=0.2, reward_factor=5, weight=2, max_timestep=180):
        self.num_bands = num_bands
        self.energy_cost = energy_cost
        self.reward_factor = reward_factor
        self.max_timestep = max_timestep
        self.weight = weight
        # band index -> list holding the two most recent samples
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        # band index -> {(prev2, prev1): [P(next == 0), P(next == 1)]}
        self.transition_matrixes = {band: {} for band in range(self.num_bands)}
        self.init_bands()
        self.current_state = self.get_current_state()

    def init_bands(self):
        """Draw two initial samples and a fresh transition table per band."""
        for band in range(self.num_bands):
            # First sample: uniform over {0, 1}.
            first = np.random.choice([0, 1])

            # Second sample: drawn from the row of a random, row-normalised
            # 2x2 matrix selected by the first sample.
            first_order = np.random.rand(2, 2)
            first_order /= first_order.sum(axis=1, keepdims=True)
            second = np.random.choice([0, 1], p=first_order[first])

            # Table used for all later samples of this band, keyed by the
            # two preceding samples; each row is normalised to sum to 1.
            second_order = {
                (0, 0): np.random.rand(2),
                (0, 1): np.random.rand(2),
                (1, 0): np.random.rand(2),
                (1, 1): np.random.rand(2),
            }
            for key in second_order:
                second_order[key] /= second_order[key].sum()

            self.transition_matrixes[band] = second_order
            self.signal_band[band] = [first, second]

    def step(self, action):
        """Execute one time step within the environment.

        Args:
            action: Sequence ``(band, prediction)`` where ``band`` is the
                selected band index and ``prediction`` is the predicted
                signal value (0 or 1).

        Returns:
            tuple: ``(observation, reward, done, info)``. ``info`` holds the
            timestep, whether the prediction matched the signal that was
            scored, and the state after the transition.
        """
        self.current_timestep += 1

        band = action[0]
        prediction = action[1]

        # Capture the signal being predicted before the environment advances;
        # ``generate_state`` overwrites ``self.current_state``.
        actual_signal = self.current_state[band]
        reward = self._calculate_reward(actual_signal, prediction)

        self.generate_state()
        observation = self.construct_observation_space()

        done = self.current_timestep >= self.max_timestep

        info = {
            "timestep": self.current_timestep,
            # Compared against the scored signal, so it agrees with ``reward``.
            "correct_prediction": actual_signal == prediction,
            "state": self.current_state,
        }

        return observation, reward, done, info

    def _calculate_reward(self, actual_signal, prediction):
        """Return the reward for predicting ``prediction`` given ``actual_signal``."""
        if actual_signal == prediction:
            # Correct prediction.
            return self.reward_factor * self.weight - self.energy_cost
        if actual_signal == 0:
            # Wrong prediction while the signal is 0.
            return self.reward_factor - self.energy_cost
        # Wrong prediction for any other signal value (1 in normal use).
        return self.reward_factor - self.energy_cost * self.weight

    def generate_state(self):
        """Advance every band by one sample using its stored transition table.

        Returns:
            list: The updated current state (newest sample of each band).
        """
        for band in range(self.num_bands):
            history = self.signal_band[band]
            previous_pair = tuple(history[-2:])

            next_probs = self.transition_matrixes[band][previous_pair]
            next_signal = np.random.choice([0, 1], p=next_probs)

            # Slide the two-sample window forward.
            history.append(next_signal)
            history.pop(0)

        self.current_state = self.get_current_state()
        return self.current_state

    def get_current_state(self):
        """Return the most recent sample of each band as a list."""
        return [self.signal_band[band][-1] for band in range(self.num_bands)]

    def reset(self):
        """Re-initialise the bands and the timestep; return the first observation.

        ``init_bands`` is called, so the transition tables are re-drawn too.
        """
        self.signal_band = {band: [] for band in range(self.num_bands)}
        self.current_timestep = 0
        self.init_bands()
        self.current_state = self.get_current_state()
        return self.construct_observation_space()

    def construct_observation_space(self, window_size=2):
        """Build the per-band observation: entropy plus empirical transition rates.

        Args:
            window_size: Number of most recent samples used for the entropy.
                The entropy is 0.0 when the window is non-positive or longer
                than the stored history.

        Returns:
            dict: Maps each band index to a dict with keys ``'entropy'``,
            ``'prob_0to1'`` and ``'prob_1to0'``. The two rates are computed
            over the whole stored history of the band and are 0.0 when the
            corresponding starting value never occurs.
        """
        observation = {}
        for band in range(self.num_bands):
            signal_values = np.array(self.signal_band[band])

            # ``signal_values[-0:]`` is the whole array, not an empty slice,
            # so a non-positive window has to be rejected explicitly.
            if window_size <= 0 or len(signal_values) < window_size:
                entropy_value = 0.0
            else:
                recent_signals = signal_values[-window_size:]
                value_counts = np.bincount(recent_signals, minlength=2)
                probability_distribution = value_counts / len(recent_signals)
                entropy_value = entropy(probability_distribution, base=2)

            # Count, over consecutive pairs, how often a 0 is followed by a 1
            # and a 1 is followed by a 0.
            transitions_0to1 = 0
            transitions_1to0 = 0
            total_0 = 0
            total_1 = 0
            for current, following in zip(signal_values[:-1], signal_values[1:]):
                if current == 0:
                    total_0 += 1
                    if following == 1:
                        transitions_0to1 += 1
                elif current == 1:
                    total_1 += 1
                    if following == 0:
                        transitions_1to0 += 1

            prob_0to1 = transitions_0to1 / total_0 if total_0 > 0 else 0.0
            prob_1to0 = transitions_1to0 / total_1 if total_1 > 0 else 0.0

            observation[band] = {
                'entropy': entropy_value,
                'prob_0to1': prob_0to1,
                'prob_1to0': prob_1to0,
            }

        return observation