In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

import numpy as np
import gym
import random
from collections import deque



In [2]:
import config
from config import learning_rate,buffer_size,tau,gamma,MAX_STEPS,BATCH_SIZE
import exp_buffer
from exp_buffer import BasicBuffer
import network
from network import ConvDQN,DQN

In [3]:
env_id = "CartPole-v0"
env = gym.make(env_id)

In [4]:
env.action_space.sample()

0

In [5]:
env.observation_space.sample()

array([ 4.4741817e+00,  9.9331965e+37, -2.7833691e-01, -3.3279325e+38],
      dtype=float32)

In [6]:
def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
    episode_rewards = []
    episodes=[]
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        episodes.append(episode)
        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)   

            if done or step == max_steps-1:
                episode_rewards.append(episode_reward)
                print("Episode " + str(episode) + ": " + str(episode_reward))
                break
            env.render()
            state = next_state

    return episode_rewards,episodes





class DQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=learning_rate(), gamma=gamma(), tau=tau(), buffer_size=buffer_size()):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
            self.target_model = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
        else:
            self.model = DQN(env.observation_space.shape, env.action_space.n).to(self.device)
            self.target_model = DQN(env.observation_space.shape, env.action_space.n).to(self.device)
        
        # hard copy model parameters to target model parameters
        for target_param, param in zip(self.model.parameters(), self.target_model.parameters()):
            target_param.data.copy_(param)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        
        
    def get_action(self, state, eps=0.20):
        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        
        if(np.random.randn() < eps):
            return self.env.action_space.sample()

        return action

    def compute_loss(self, batch):     
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones)

        # resize tensors
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q = self.model.forward(states).gather(1, actions)
        next_Q = self.target_model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        max_next_Q = max_next_Q.view(max_next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * max_next_Q
        
        loss = F.mse_loss(curr_Q, expected_Q.detach())
        
        
        return loss
    
    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # target network update
        for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)



In [7]:
MAX_EPISODES = 100
MAX_STEPS = 500
BATCH_SIZE = 32

agent = DQNAgent(env, use_conv=False)
episode_rewards,episodes = mini_batch_train(env, agent, MAX_EPISODES, MAX_STEPS, BATCH_SIZE)

Episode 0: 11.0
Episode 1: 9.0
Episode 2: 10.0
Episode 3: 10.0
Episode 4: 11.0
Episode 5: 12.0
Episode 6: 20.0
Episode 7: 12.0
Episode 8: 11.0
Episode 9: 12.0
Episode 10: 13.0
Episode 11: 61.0
Episode 12: 17.0
Episode 13: 12.0
Episode 14: 16.0
Episode 15: 16.0
Episode 16: 14.0
Episode 17: 18.0
Episode 18: 14.0
Episode 19: 14.0
Episode 20: 18.0


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/mac/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-7360d57099c0>", line 6, in <module>
    episode_rewards,episodes = mini_batch_train(env, agent, MAX_EPISODES, MAX_STEPS, BATCH_SIZE)
  File "<ipython-input-6-142b0cd8bd03>", line 15, in mini_batch_train
    agent.update(batch_size)
  File "<ipython-input-6-142b0cd8bd03>", line 92, in update
    loss = self.compute_loss(batch)
  File "<ipython-input-6-142b0cd8bd03>", line 80, in compute_loss
    next_Q = self.target_model.forward(next_states)
  File "/Users/mac/task2 (bonus)/network.py", line 56, in forward
    qvals = self.fc(state)
  File "/Users/mac/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/mac/opt/anaconda3/lib/python3.8/site-packages/torch/nn/

TypeError: object of type 'NoneType' has no len()