In [19]:
import time
import random
from collections import deque

import numpy as np
import matplotlib.pyplot as plt

In [20]:
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.functional as F
from torchrl.modules import NoisyLinear

import gymnasium as gym

In [21]:
class NN(nn.Module):
    """Dueling, categorical (distributional) Q-network built from ``NoisyLinear`` layers.

    A shared three-layer trunk feeds two heads: a value head producing ``N``
    logits and an advantage head producing ``n_actions * N`` logits. The heads
    are combined in the dueling form (advantage minus its mean over actions)
    and soft-maxed over the atom axis, giving one categorical return
    distribution per action over the fixed support ``atoms``.

    Args:
        n_states: size of the observation vector fed to the first layer.
        n_actions: number of discrete actions.
        N: number of atoms in the support (must be >= 2).
        vmin: smallest return represented by the support.
        vmax: largest return represented by the support.

    Attributes:
        atoms: 1-D tensor of ``N`` evenly spaced values in ``[vmin, vmax]``.
        del_z: spacing between two neighbouring atoms.
    """

    def __init__(self, n_states, n_actions, N = 51, vmin = 0, vmax = 500):
        super().__init__()
        if N < 2:
            raise ValueError("N must be at least 2 so that an atom spacing exists")

        self.n_states = n_states
        self.n_actions = n_actions
        # Honour the caller's atom count; the layer widths below depend on it
        # and must agree with the length of ``atoms``.
        self.N = N
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Registered as (non-persistent) buffers so the support follows the
        # module through ``.to(device)`` instead of being pinned to a device
        # that may differ from the one holding the parameters. Non-persistent
        # keeps ``state_dict()`` keys unchanged.
        atoms = torch.linspace(vmin, vmax, N)
        self.register_buffer("atoms", atoms, persistent = False)
        self.register_buffer("del_z", atoms[1] - atoms[0], persistent = False)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim = 2)

        # Shared trunk.
        self.fc1 = NoisyLinear(self.n_states, 128)
        self.fc2 = NoisyLinear(128, 256)
        self.fc3 = NoisyLinear(256, 256)

        # Value head: one set of atom logits per state.
        self.fcv4 = NoisyLinear(256, 128)
        self.fcv5 = NoisyLinear(128, self.N)

        # Advantage head: one set of atom logits per (state, action).
        self.fca4 = NoisyLinear(256, 128)
        self.fca5 = NoisyLinear(128, self.N * self.n_actions)

    def forward(self, states):
        """Compute per-action return distributions and their expectations.

        Args:
            states: float tensor whose last dimension is ``n_states``
                (assumed to be ``(batch, n_states)`` -- TODO confirm for
                other layouts).

        Returns:
            Tuple ``(qvalues, probs, Q)`` where ``qvalues`` are the dueling
            logits of shape ``(batch, n_actions, N)``, ``probs`` is their
            softmax over the atom axis, and ``Q`` is the expected return
            ``sum(probs * atoms)`` of shape ``(batch, n_actions)``.
        """
        x = self.relu(self.fc1(states))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))

        values = self.fcv5(self.relu(self.fcv4(x)))
        adv = self.fca5(self.relu(self.fca4(x)))

        # (batch, N) -> (batch, 1, N) so it broadcasts across the action axis.
        values = values.unsqueeze(1)
        adv = adv.reshape(-1, self.n_actions, self.N)

        # Dueling combination: centre the advantages over the action axis.
        qvalues = values + (adv - torch.mean(adv, dim = 1, keepdim = True))
        probs = self.softmax(qvalues)
        Q = (probs * self.atoms).sum(dim = 2)
        return qvalues, probs, Q
        

In [22]:
class PrioritizedReplayBuffer:
    """Proportional prioritized replay buffer that stores n-step transitions.

    Raw one-step transitions are collected in a sliding window of at most
    ``steps`` entries and folded into a single n-step transition
    ``(state, action, R, next_state, done, n)`` where ``R`` is the
    ``gamma``-discounted sum of the ``n`` rewards in the window.

    Args:
        size: maximum number of stored n-step transitions.
        steps: maximum number of one-step transitions folded together.
        alpha: exponent applied to priorities when building the sampling
            distribution.
        beta: initial importance-sampling exponent.
        beta_increment: relative growth applied to ``beta`` on every call to
            ``sample`` (``beta *= 1 + beta_increment``, capped at 1).
        ep_err: small constant added to every updated priority.
        gamma: discount factor used for the n-step return.
    """

    def __init__(self, size = 100000, steps = 5, alpha = 0.9, beta = 0.1, beta_increment = 0.001, ep_err = 1e-5, gamma = 0.9):
        self.size = size
        self.steps = steps
        self.alpha = alpha
        self.beta = beta
        self.beta_increment = beta_increment
        self.gamma = gamma
        self.episilon = ep_err

        # ``priority`` and ``buffer`` are appended to together and share the
        # same ``maxlen`` so position i in one matches position i in the other.
        self.priority = deque(maxlen = size)
        self.buffer = deque(maxlen = size)
        self.n_step_buffer = deque(maxlen = steps)

    def _store_oldest(self):
        """Fold the current window into one n-step transition and store it.

        The transition starts at the oldest entry of the window and extends
        until the first terminal entry or the end of the window.
        """
        state, action = self.n_step_buffer[0][:2]
        R = 0
        for idx, (_, _, r, n_s, d) in enumerate(self.n_step_buffer):
            R += (self.gamma ** idx) * r
            next_state, done, steps = n_s, d, idx + 1
            if d:
                break

        self.buffer.append((state, action, R, next_state, done, steps))
        # New transitions get the current maximum priority so they are likely
        # to be sampled at least once.
        self.priority.append(max(self.priority) if self.priority else 1.0)

    def add(self, states, actions, reward, next_state, done):
        """Record a one-step transition and emit any n-step transitions it completes.

        While the episode is running, one n-step transition is stored each
        time the window is full. When ``done`` is truthy the whole window is
        flushed -- every remaining start state gets its (shorter) transition
        -- and the window is emptied, so no transition is stored twice and
        nothing from a finished episode lingers into the next one.
        """
        self.n_step_buffer.append((states, actions, reward, next_state, done))

        if done:
            while self.n_step_buffer:
                self._store_oldest()
                self.n_step_buffer.popleft()
        elif len(self.n_step_buffer) == self.steps:
            # The oldest entry is dropped automatically by ``maxlen`` on the
            # next append.
            self._store_oldest()

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions with probability proportional to ``priority ** alpha``.

        Returns:
            ``(state, action, reward, next_state, done, steps, indices, weights)``
            where the first six are tuples of length ``batch_size``,
            ``indices`` are the sampled buffer positions and ``weights`` are
            importance-sampling weights normalised by their maximum.

        Raises:
            ValueError: if the buffer holds no transitions.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        prios = np.array(list(self.priority), dtype = np.float64)
        prob = prios ** self.alpha
        prob /= prob.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p = prob)
        samples = [self.buffer[i] for i in indices]

        weights = (len(self.buffer) * prob[indices]) ** (-self.beta)
        weights /= weights.max()

        self.beta = min(1.0, self.beta * (1 + self.beta_increment))

        state, action, reward, next_state, done, steps = zip(*samples)
        return state, action, reward, next_state, done, steps, indices, weights

    def update_priorities(self, indices, td_error):
        """Set ``priority[i] = |error| + epsilon`` for each ``(i, error)`` pair.

        Errors are converted to plain floats so the priority deque never holds
        tensor or array scalars. ``indices`` must still refer to the buffer as
        it was when sampled (no ``add`` in between).
        """
        for idx, err in zip(indices, td_error):
            self.priority[idx] = abs(float(err)) + self.episilon

        
        

In [23]:
class Agent:
    """Agent that trains a categorical, dueling, noisy Q-network with double-Q
    action selection, n-step returns and prioritized replay.

    Args:
        env: Gymnasium-style environment (``reset`` returns ``(obs, info)``,
            ``step`` returns a 5-tuple).
        n_states: observation vector size passed to the networks.
        n_actions: number of discrete actions.
        gamma: discount factor, used both for the n-step return in the replay
            buffer and for bootstrapping in ``train``.
        episilon, episilon_decay, episilon_min: epsilon schedule. It is
            decayed and printed once per episode but ``select_action`` is
            purely greedy and never reads it.
            NOTE(review): exploration presumably relies on the NoisyLinear
            layers inside ``NN`` -- confirm.
    """

    def __init__(self, env ,n_states, n_actions, gamma = 0.9, episilon = 1, episilon_decay = 0.995, episilon_min = 0.01):
        self.env = env
        self.n_states = n_states
        self.n_actions = n_actions
        self.gamma = gamma
        self.episilon = episilon
        self.episilon_decay = episilon_decay
        self.episilon_min = episilon_min
        self.target_update_freq = 100
        self.batch_size = 64
        self.steps = 0
        # Running sum of training losses; reset at the start of every episode.
        # Initialised here so ``train`` also works outside ``playingLoop``.
        self.loss_value = 0.0

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.policy_net = NN(n_states, n_actions)
        self.target_net = NN(n_states, n_actions)
        # The buffer's n-step return must use the same discount as ``train``.
        self.buffer = PrioritizedReplayBuffer(gamma = gamma)

        self.policy_net.to(self.device)
        self.target_net.to(self.device)
        # Start the target network from the policy network's weights instead
        # of an unrelated random initialisation.
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.loss = nn.MSELoss(reduction = 'none')
        self.optimizer = Adam(self.policy_net.parameters(), lr = 1e-3)
        self.addMemory()

    def addMemory(self, n_steps = 1000):
        """Pre-fill the replay buffer with ``n_steps`` uniformly random actions."""
        state = self.env.reset()[0]
        for _ in range(n_steps):
            action = self.env.action_space.sample()
            next_state, reward, term, trun, _ = self.env.step(action)
            # Truncation is stored as a terminal transition, same as termination.
            done = int(term or trun)
            self.buffer.add(state, action, reward, next_state, done)
            state = self.env.reset()[0] if done else next_state

    def select_action(self, state):
        """Return the greedy action (arg-max expected return) for one observation."""
        state = torch.tensor(state, dtype = torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            _, _, Q = self.policy_net(state)
        return torch.argmax(Q, dim = 1).item()

    @staticmethod
    def _project_distribution(next_probs_a, b):
        """Distribute each atom's probability onto its two neighbouring support bins.

        Args:
            next_probs_a: ``(batch, N)`` probabilities of the chosen next action.
            b: ``(batch, N)`` fractional bin positions of the shifted atoms.

        Returns:
            ``(batch, N)`` projected target distribution.
        """
        n_atoms = next_probs_a.shape[1]
        # Guard against float round-off pushing a position just past the last bin.
        b = b.clamp(0, n_atoms - 1)
        lower = b.floor().long()
        upper = b.ceil().long()
        # When b lands exactly on a bin, lower == upper and both linear
        # weights are zero; give that bin the full mass instead.
        on_bin = (lower == upper).to(next_probs_a.dtype)

        m = torch.zeros_like(next_probs_a)
        m.scatter_add_(1, lower, next_probs_a * (upper.to(b.dtype) - b + on_bin))
        m.scatter_add_(1, upper, next_probs_a * (b - lower.to(b.dtype)))
        return m

    def train(self):
        """Run one optimisation step on a prioritized batch.

        Samples a batch, builds the projected categorical target from the
        target network (next action chosen by the policy network), minimises
        the importance-weighted cross-entropy, writes the per-sample
        cross-entropy back to the buffer as new priorities, and copies the
        policy weights into the target network every ``target_update_freq``
        steps.
        """
        states, actions, rewards, next_states, done, steps, indices, weights = self.buffer.sample(self.batch_size)

        states = torch.tensor(np.array(states), dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(self.device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(self.device)
        done = torch.tensor(done, dtype=torch.float32).unsqueeze(1).to(self.device)
        steps = torch.tensor(steps, dtype=torch.float32).unsqueeze(1).to(self.device)
        weights = torch.tensor(weights, dtype=torch.float32).unsqueeze(1).to(self.device)

        # Distribution predicted for the action that was actually taken.
        _, probs, _ = self.policy_net(states)
        gather_index = actions.unsqueeze(-1).expand(-1, -1, probs.shape[-1])
        probs_a = probs.gather(1, gather_index).squeeze(1)

        with torch.no_grad():
            # Double-Q: the policy net picks the next action, the target net evaluates it.
            _, _, Q_next = self.policy_net(next_states)
            next_actions = torch.argmax(Q_next, dim=1)

            _, next_probs, _ = self.target_net(next_states)
            rows = torch.arange(next_probs.shape[0], device=next_probs.device)
            next_probs_a = next_probs[rows, next_actions]

            atoms = self.policy_net.atoms
            delta_z = atoms[1] - atoms[0]
            vmin, vmax = atoms[0], atoms[-1]

            # n-step Bellman shift of every atom; ``steps`` is the per-sample n.
            Tz = rewards + (self.gamma ** steps) * atoms.unsqueeze(0) * (1 - done)
            Tz = Tz.clamp(vmin, vmax)

            b = (Tz - vmin) / delta_z
            m = self._project_distribution(next_probs_a, b)

        per_sample_loss = -(m * torch.log(probs_a + 1e-8)).sum(dim=1)
        loss = (weights.squeeze(1) * per_sample_loss).mean()

        # Accumulate a plain float so no autograd graph is kept alive.
        self.loss_value += loss.item()
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        # One priority per sampled transition (the buffer adds its own epsilon).
        new_priorities = per_sample_loss.detach().abs().cpu().numpy()
        self.buffer.update_priorities(indices, new_priorities)

        self.steps += 1
        if self.steps % self.target_update_freq == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def playingLoop(self, num_episodes = 500):
        """Interact with the environment for ``num_episodes`` episodes, training after every step.

        Returns:
            The list of undiscounted episode returns (also kept in
            ``self.rewards_per_episode``).
        """
        self.num_episodes = num_episodes
        self.rewards_per_episode = []
        self.loss_per_episode = []

        for ep in range(num_episodes):
            self.loss_value = 0.0
            state = self.env.reset()[0]
            episode_reward = 0
            done = 0

            while not done:
                action = self.select_action(state)
                next_state, reward, term, trun, _ = self.env.step(action)
                done = int(term or trun)
                self.buffer.add(state, action, reward, next_state, done)

                self.train()

                state = next_state
                episode_reward += reward

            self.episilon = max(self.episilon_min, self.episilon_decay * self.episilon)
            self.rewards_per_episode.append(episode_reward)
            self.loss_per_episode.append(self.loss_value)

            avg_reward = np.mean(self.rewards_per_episode[-10:])
            print(f"Episode {ep} | Avg Reward: {avg_reward:.2f} | Epsilon: {self.episilon:.2f}")

        return self.rewards_per_episode

    def plot(self, interval = 50):
        """Plot moving averages (window ``interval``) of episode reward and episode loss."""
        kernel = np.ones(interval) / interval
        moving_average_rewards = np.convolve(self.rewards_per_episode, kernel, mode = "same")
        moving_average_loss = np.convolve(self.loss_per_episode, kernel, mode = "same")
        plt.plot(range(self.num_episodes), moving_average_rewards, label = "Moving Average")
        plt.plot(range(self.num_episodes), moving_average_loss, label = "Loss Per Episode")
        plt.xlabel("Episodes")
        plt.ylabel("Values per episode")
        # Without this the ``label`` arguments above are never shown.
        plt.legend()

In [None]:
# Train the agent on CartPole-v1.
episodes = 1000
env = gym.make("CartPole-v1", render_mode = "rgb_array")

# Read the network sizes from the environment instead of hard-coding them.
n_states = int(env.observation_space.shape[0])
n_actions = int(env.action_space.n)

agent = Agent(env, n_states, n_actions)
agent.playingLoop(episodes)
# Take the per-episode returns from the agent itself so ``rewards`` is never
# bound to ``None``, whatever ``playingLoop`` returns.
rewards = agent.rewards_per_episode

Episode 0 | Avg Reward: 10.00 | Epsilon: 0.99
Episode 1 | Avg Reward: 11.00 | Epsilon: 0.99
Episode 2 | Avg Reward: 60.67 | Epsilon: 0.99
Episode 3 | Avg Reward: 81.00 | Epsilon: 0.98
Episode 4 | Avg Reward: 102.80 | Epsilon: 0.98
Episode 5 | Avg Reward: 123.17 | Epsilon: 0.97
Episode 6 | Avg Reward: 128.71 | Epsilon: 0.97
Episode 7 | Avg Reward: 133.75 | Epsilon: 0.96
Episode 8 | Avg Reward: 139.89 | Epsilon: 0.96
Episode 9 | Avg Reward: 153.70 | Epsilon: 0.95
Episode 10 | Avg Reward: 175.80 | Epsilon: 0.95
Episode 11 | Avg Reward: 224.60 | Epsilon: 0.94
Episode 12 | Avg Reward: 228.90 | Epsilon: 0.94
Episode 13 | Avg Reward: 263.50 | Epsilon: 0.93
Episode 14 | Avg Reward: 264.70 | Epsilon: 0.93
Episode 15 | Avg Reward: 262.40 | Epsilon: 0.92
Episode 16 | Avg Reward: 274.50 | Epsilon: 0.92
Episode 17 | Avg Reward: 275.50 | Epsilon: 0.91
Episode 18 | Avg Reward: 298.90 | Epsilon: 0.91
Episode 19 | Avg Reward: 295.40 | Epsilon: 0.90
Episode 20 | Avg Reward: 315.00 | Epsilon: 0.90
Episod

In [None]:


# Roll out one greedy episode with the trained policy network.
env = gym.make("CartPole-v1", render_mode = "rgb_array")
state, _ = env.reset()
done = False
policy_net = agent.policy_net
policy_net.eval()
total_reward = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

while not done:
    with torch.no_grad():
        state_tensor = torch.as_tensor(state, dtype = torch.float32, device = device).unsqueeze(0)
        # The network returns (logits, probs, Q); only the expected returns
        # ``Q`` of shape (1, n_actions) are used to pick the greedy action.
        _, _, q_values = policy_net(state_tensor)
        action = torch.argmax(q_values, dim = 1).item()
    next_state, reward, term, trun, _  = env.step(action)
    total_reward += reward
    done = term or trun
    state = next_state

env.close()
print(total_reward)

