In [1]:
import gym
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, namedtuple
import random
import numpy as np

Global valriables

In [29]:
GAMMA = 0.99
AVG_OVER = 5
LR = 0.001
BUFFER_SIZE = 100
BATCH_SIZE = 64
UPDATE_EVERY = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
env_cart_pole = gym.make('CartPole-v1')
obs, info = env_cart_pole.reset(seed = 0)
#print(f'Observation space: {obs}, Info: {info}')

env_acro_bot = gym.make('Acrobot-v1')
obs, info = env_acro_bot.reset(seed = 0)
#print(f'Observation space: {obs}, Info: {info}')


In [4]:
def print_env_details(env):
    print("Env name: ", str(env.env)[34:-3])
    print(f'Observation space: {env.observation_space.shape[0]}')
    print(f'Action space: {env.action_space.n}')

In [5]:
print_env_details(env_cart_pole)
print_env_details(env_acro_bot)

Env name:  CartPoleEnv<CartPole-v1>
Observation space: 4
Action space: 2
Env name:  AcrobotEnv<Acrobot-v1>
Observation space: 6
Action space: 3


# Dueling-DQN

In [60]:
class QNetwork1(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64, activation = F.relu):
        super(QNetwork1, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc_v_1 = nn.Linear(fc2_units, 32)
        self.fc_v_2 = nn.Linear(32, 1)
        self.fc_a_1 = nn.Linear(fc2_units, 32)
        self.fc_a_2 = nn.Linear(32, action_size)
        self.activation = activation

    def forward(self, state):
        x = self.activation(self.fc1(state))
        x = self.activation(self.fc2(x))
        v = self.activation(self.fc_v_1(x))
        v = self.fc_v_2(v)
        a = self.activation(self.fc_a_1(x))
        a = self.fc_a_2(a)
        q = v + (a - a.mean())
        return q

In [61]:
model = QNetwork1(4, 2, 0)
model

QNetwork1(
  (fc1): Linear(in_features=4, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc_v_1): Linear(in_features=64, out_features=32, bias=True)
  (fc_v_2): Linear(in_features=32, out_features=1, bias=True)
  (fc_a_1): Linear(in_features=64, out_features=32, bias=True)
  (fc_a_2): Linear(in_features=32, out_features=2, bias=True)
)

In [44]:
s,_ = env_cart_pole.reset()
print(model(torch.tensor(s).float().unsqueeze(0)))
error = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss = error(model(torch.tensor(s).float().unsqueeze(0)), torch.tensor([0.0, 0.0]).unsqueeze(0))
print(loss)
loss.backward()
optimizer.step()
print(model(torch.tensor(s).float().unsqueeze(0)))
new_loss = error(model(torch.tensor(s).float().unsqueeze(0)), torch.tensor([0.0, 0.0]).unsqueeze(0))
print(new_loss)

tensor([[0.0986, 0.1155]], grad_fn=<AddBackward0>)
tensor(0.0115, grad_fn=<MseLossBackward0>)
tensor([[0.0851, 0.0967]], grad_fn=<AddBackward0>)
tensor(0.0083, grad_fn=<MseLossBackward0>)


In [47]:
import torchviz

# Create a dummy input tensor
dummy_input = torch.randn(1, state_space)

# Generate the computational graph
graph = torchviz.make_dot(model(dummy_input), params=dict(model.named_parameters()))

# Save the graph as a PDF file
graph.render("computational_graph")


'computational_graph.pdf'

In [64]:
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)
    
    def __len__(self):
        return len(self.memory)
    

In [65]:
class DDQN_Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        
        if(len(self.memory) > BATCH_SIZE):
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions.long())

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.qnetwork_local.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        


In [66]:
def dqn_train(agent,env, n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):

    scores_window = deque(maxlen=100)
    ''' last 100 scores for checking if the avg is more than 195 '''

    eps = eps_start
    ''' initialize epsilon '''

    for i_episode in range(1, n_episodes+1):
        state = env.reset()[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)

        eps = max(eps_end, eps_decay*eps)
        ''' decrease epsilon '''

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")

        if i_episode % 100 == 0:
           print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=195.0:
           print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
           break
    return True

### Type-1 Update rule

\begin{align*}
  Q(s,a;\theta) = V(s;\theta) + (A(s,a;\theta) - \frac{1}{|A|}\sum_{a'\epsilon|A|}A(s,a';\theta))
\end{align*}

In [67]:
def forward(self, state):
        x = self.activation(self.fc1(state))
        x = self.activation(self.fc2(x))
        v = self.activation(self.fc_v_1(x))
        v = self.fc_v_2(v)
        a = self.activation(self.fc_a_1(x))
        a = self.fc_a_2(a)
        q = v + (a - a.mean())
        return q
QNetwork1.forward = forward


In [68]:
state_space = env_cart_pole.observation_space.shape[0]
action_space = env_cart_pole.action_space.n


In [69]:
dualing_dqn_agent = DDQN_Agent(state_space, action_space, seed=0)
dqn_train(dualing_dqn_agent, env_cart_pole)

Episode 100	Average Score: 36.74
Episode 200	Average Score: 114.14
Episode 300	Average Score: 187.25
Episode 370	Average Score: 197.42
Environment solved in 370 episodes!	Average Score: 197.42


True

### Type-2 Update Rule

\begin{align*}
  Q(s,a;\theta) = V(s;\theta) + (A(s,a;\theta) - \max_{a'\epsilon|A|}\sum_{a'\epsilon|A|}A(s,a';\theta))
\end{align*}

In [73]:
def forward(self, state):
        x = self.activation(self.fc1(state))
        x = self.activation(self.fc2(x))
        v = self.activation(self.fc_v_1(x))
        v = self.fc_v_2(v)
        a = self.activation(self.fc_a_1(x))
        a = self.fc_a_2(a)
        q = v + (a - a.max())
        return q
QNetwork1.forward = forward


In [74]:
state_space = env_cart_pole.observation_space.shape[0]
action_space = env_cart_pole.action_space.n


In [75]:
dualing_dqn_agent = DDQN_Agent(state_space, action_space, seed=0)
dqn_train(dualing_dqn_agent, env_cart_pole)

Episode 100	Average Score: 35.71
Episode 200	Average Score: 108.00
Episode 262	Average Score: 203.88
Environment solved in 262 episodes!	Average Score: 203.88


True

# Monte-Carlo REINFORCE

### W/O Baseline

### W/ Baseline