In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
from scipy.stats import entropy
# Select the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class SBEOS_Environment:
    """Binary band simulator in which an agent predicts the current hidden state.

    The band is a growing array of 0/1 samples.  Each new sample is drawn
    from a randomly initialised transition table keyed on the two most
    recent samples, then perturbed with Gaussian noise, clipped to [0, 1]
    and rounded.  The observation handed to the agent is the base-2 Shannon
    entropy of the last ``window_size`` samples.

    ``reset()`` must be called before ``step()``: it is what initialises
    ``current_timestep`` and ``current_state``.

    Args:
        max_timesteps: Number of ``step()`` calls after which ``done`` is True.
        reward: Value returned for a correct prediction.
        penalty: Magnitude used for incorrect predictions (see ``cal_reward``).
        window_size: Number of trailing samples used for the entropy
            observation and retained across ``reset()``.
    """

    def __init__(self, max_timesteps=180, reward=10, penalty=5, window_size=10):
        self.max_timesteps = max_timesteps
        self.reward = reward
        self.penalty = penalty
        self.window_size = window_size
        self.band = np.array([])
        self.init_band()

    def init_band(self):
        """Draw the first two samples, the transition table and the noise parameters."""
        t1 = np.random.choice([0, 1])
        # Row-normalised 2x2 matrix, used only to draw the second sample.
        t_m1 = np.random.rand(2, 2)
        t_m1 /= t_m1.sum(axis=1, keepdims=True)
        t2 = np.random.choice([0, 1], p=t_m1[t1])
        # One probability vector over {0, 1} for each pair of previous samples.
        t_m2 = {
            (0, 0): np.random.dirichlet([1, 1]),
            (0, 1): np.random.dirichlet([1, 1]),
            (1, 0): np.random.dirichlet([1, 1]),
            (1, 1): np.random.dirichlet([1, 1]),
        }
        # The attribute name (including its spelling) is kept as-is so that
        # any code reading it keeps working.
        self.transiton_matrix = t_m2
        self.band = np.array([t1, t2])
        self.noise_mean = np.random.uniform(-0.1, 0.1)
        self.noise_std = np.random.uniform(0.01, 0.1)

    def generate_state(self):
        """Sample the next noisy 0/1 state, append it to the band and return it."""
        last_two = tuple(self.band[-2:])
        next_state = np.random.choice([0, 1], p=self.transiton_matrix[last_two])
        noise = np.random.normal(self.noise_mean, self.noise_std)
        # Clip before rounding so the result is always exactly 0 or 1.
        noisy_state = int(np.round(np.clip(next_state + noise, 0, 1)))
        self.band = np.append(self.band, noisy_state)
        return noisy_state

    def generate_observation_state(self):
        """Return the base-2 entropy of the last ``window_size`` band samples.

        Returns 0.0 while the band holds fewer than ``window_size`` samples.
        """
        window = self.band[-self.window_size:]
        if len(window) < self.window_size:
            return 0.0
        counts = np.bincount(window, minlength=2)
        # The counts sum to len(window) > 0, so this is a valid distribution.
        return entropy(counts / len(window), base=2)

    def reset(self):
        """Start a new episode and return the first observation.

        Only the last ``window_size`` band samples are kept; the transition
        table and noise parameters are not re-drawn.
        """
        self.band = np.array(self.band[-self.window_size:])
        self.current_timestep = 0
        self.current_state = self.generate_state()
        return self.generate_observation_state()

    def cal_reward(self, actual, prediction):
        """Return the reward for predicting ``prediction`` when the state is ``actual``."""
        if actual == prediction:
            return self.reward
        if actual == 1:
            # The state was 1 but 0 was predicted.
            return -self.penalty
        # NOTE(review): the remaining mismatch (state 0, prediction 1) yields
        # a *positive* ``penalty``.  Kept as in the original code; confirm
        # that this sign is intended.
        return self.penalty

    def step(self, action):
        """Score ``action`` against the current state and advance the band.

        Args:
            action: The agent's prediction of ``current_state``.

        Returns:
            Tuple ``(observation, reward, done, info)``.  ``observation`` is
            the entropy after the new state has been appended.  ``info``
            holds the timestep, whether ``action`` matched the state it was
            scored against, and the newly generated state.
        """
        self.current_timestep += 1
        # Remember the state being judged: ``current_state`` is overwritten
        # below, and ``correct_prediction`` must refer to the same sample the
        # reward was computed from.
        judged_state = self.current_state
        reward = self.cal_reward(judged_state, action)
        self.current_state = self.generate_state()
        observation = self.generate_observation_state()
        done = self.current_timestep >= self.max_timesteps
        info = {
            "timestep": self.current_timestep,
            "correct_prediction": bool(judged_state == action),
            "state": self.current_state,
        }
        return observation, reward, done, info
        


In [None]:
class LSTMDQN(nn.Module):
    """Q-value network: one batch-first LSTM layer feeding a linear head.

    Args:
        input_dim: Size of the feature vector at each sequence position.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced by the linear head.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Run ``x`` through the LSTM and project the final position's output."""
        sequence_out = self.lstm(x)[0]
        # Only the output at the last sequence position feeds the head.
        final_position = sequence_out[:, -1, :]
        return self.fc(final_position)

class DQNAgent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions (actions are ``0..action_size-1``).
        hidden_dim: Hidden size passed to ``LSTMDQN``.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        batch_size: Number of transitions sampled per ``replay()`` call.
        memory_size: Maximum number of transitions kept in the replay buffer.
    """

    def __init__(self, state_size, action_size, hidden_dim=64, lr=0.001, gamma=0.99, batch_size=32, memory_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.lr = lr
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = LSTMDQN(state_size, hidden_dim, action_size).to(self.device)
        self.target_model = LSTMDQN(state_size, hidden_dim, action_size).to(self.device)
        # Start the target network as an exact copy of the online network.
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.loss_fn = nn.MSELoss()
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        Args:
            state: Sequence of ``state_size`` numbers.

        Returns:
            An int action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            # Explore over the whole action space, not only {0, 1}.
            return random.randrange(self.action_size)
        # Shape (1, 1, state_size): batch of one, sequence of one step.
        features = torch.as_tensor(
            np.asarray(state, dtype=np.float32), device=self.device
        ).unsqueeze(0).unsqueeze(0)
        with torch.no_grad():
            action_values = self.model(features)
        return torch.argmax(action_values).item()

    def replay(self):
        """Run one optimisation step on a random minibatch, then decay epsilon.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)

        states, actions, rewards, next_states, dones = zip(*batch)
        # Insert the sequence axis at dim 1 so the batch is laid out as
        # (batch, 1, state_size), matching what act() feeds the network.
        states = torch.as_tensor(
            np.asarray(states, dtype=np.float32), device=self.device
        ).unsqueeze(1)
        next_states = torch.as_tensor(
            np.asarray(next_states, dtype=np.float32), device=self.device
        ).unsqueeze(1)
        actions = torch.as_tensor(
            np.asarray(actions, dtype=np.int64), device=self.device
        ).unsqueeze(1)
        rewards = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32), device=self.device
        )
        dones = torch.as_tensor(
            np.asarray(dones, dtype=np.float32), device=self.device
        )

        # squeeze(1) rather than squeeze(): keeps shape (batch,) even when
        # batch_size == 1, so it always matches the target's shape.
        q_values = self.model(states).gather(1, actions).squeeze(1)
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
            # No bootstrapping from next_state on terminal transitions.
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = self.loss_fn(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay never undershoots the configured floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

# Training the DQN with LSTM on SBEOS_Environment

def train_dqn(episodes=500):
    """Train a fresh DQNAgent on a fresh SBEOS_Environment.

    Each episode runs until the environment reports ``done``.  Every
    transition is stored and followed by one ``replay()`` call; the target
    network is synced once per episode and a summary line is printed.

    Args:
        episodes: Number of episodes to run.
    """
    env = SBEOS_Environment()
    # One scalar observation per step and two possible actions.
    agent = DQNAgent(state_size=1, action_size=2)

    for ep in range(episodes):
        current_obs = env.reset()
        total_reward = 0

        while True:
            chosen_action = agent.act([current_obs])
            upcoming_obs, step_reward, finished, _ = env.step(chosen_action)
            agent.remember(
                [current_obs], chosen_action, step_reward, [upcoming_obs], finished
            )
            total_reward += step_reward
            current_obs = upcoming_obs
            agent.replay()
            if finished:
                break

        agent.update_target_model()
        print(f"Episode {ep+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.4f}")

# Run training with train_dqn's default number of episodes.
train_dqn()