In [6]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import utils
import networks
import replay_memory

In [77]:
class Agent:
    """
        Agent to interact and learn from the environment
    """
    def __init__(self, state_size, action_size, seed, lr=0.01):
        """
            ARGS:
            state_size(int):= dimension of each state
            action_size(int):= number of valid actions
            seed(int):= random seed
            lr(float):= learning rate
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.lr = lr
        
        #Instantiate the Policy and Target Networks
        self.DQN_net = networks.QNetwork(self.state_size, self.action_size, seed=seed)
        self.target_net = networks.QNetwork(self.state_size, self.action_size, seed=seed)
        self.optimizer = optim.Adam(self.target_net.parameters(),
                                       lr=self.lr)
        
        #Initialize Replay Memory and time steps
        self.replay_mem = replay_memory.ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, self.seed)
        self.t_step = 0
        
    def step(self, state, action, reward, next_state, done):
        #Add to Experience Replay Memory
        self.replay_mem.add(state, action, reward, next_state, done)
        
        self.t_step = (self.t_step+1) % UPDATE_FREQ
        
        #Sample from ERM if enough samples
        if self.t_step == 0:
            if len(self.replay_mem) > BATCH_SIZE:
                experiences = self.replay_mem.sample()
                self.learn(experiences, GAMMA)
                
    def act(self, state, epsilon=0.):
        """
            Chooses an action based on the given state
        """
        state = torch.from_numpy(state).float().unsqueeze(0)
        self.DQN_net.eval()
        with torch.no_grad():
            action_values = self.DQN_net(state)
        self.DQN_net.train()
        
        #Epsilon greedy selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        
        #Get Q values for next state
        Q_next_states = self.DQN_net(next_states).detach().max(1)[1].unsqueeze(1)
        #Calculate Q targets for all actions
        Q_targets_next = self.target_net(next_states)
        
        Q_targets = torch.tensor((), dtype=torch.float32).new_empty((len(next_states)))
        
        for i in range(0, len(next_states)):
            terminal = dones[i]
            
            action = Q_next_states[i]
            
            if terminal:
                Q_targets[i] = rewards[i]
            else:
                Q_target = rewards[i] + gamma * Q_targets_next[i][action]
                Q_targets[i] = Q_target
        
        #Get expected Q values from DQN network
        Q_expected = self.DQN_net(states).gather(1, actions)
        
        #Compute and minimize loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        #Update target network using policy network parameters
        self.soft_update(self.DQN_net, self.target_net, TAU)
        
    def soft_update(self, online_network, target_network, tau):
        """
            Soft update the target network params using policy network params according to:
            param_target = tau*param_local + (1 - tau)*param_target
            
            ARGS:
            policy_network(PyTorch Network):= network params will be copied FROM
            target_network(PyTorch Network):= network params will be copied TO
            tau(float):= interpolation factor
        """
        for target_param, online_param in zip(target_network.parameters(),
                                              online_network.parameters()):
            target_param.data.copy_(tau*online_param.data + (1-tau)*target_param.data)

In [79]:
env = gym.make('CartPole-v0')
env.seed(0)

[0]

In [111]:
#Hyperparameters
BUFFER_SIZE = 10000
BATCH_SIZE = 128
GAMMA = 0.99
TAU = 0.001
LR = 0.1
UPDATE_FREQ = 4
EPSILON = 1.0
EPS_MIN = 0.01

In [112]:
#Instantiate an agent
agent = Agent(state_size=env.observation_space.shape[0],
              action_size = env.action_space.n,
              seed=0,
              lr=LR)

In [113]:
#Training loop
episodes = 2000
total_R = np.empty(episodes)
eps = EPSILON
max_t = 300 #Max number of iterations, gym caps this env at 200
total_iters = 0

for i_ep in range(1, episodes+1):
    state = env.reset()
    done = False
    total_r = 0
    t = 0
    
    while not done and t < max_t:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        
        if done:
            reward = -200
        
        agent.step(state, action, reward, next_state, done)
        state = next_state

        if reward == 1:
            total_r += reward
        
        t += 1
        
        if done:
            total_iters += t
        
    total_R[i_ep-1] = total_r
    eps = EPS_MIN + (1-EPS_MIN)*np.exp(-0.005*i_ep)
    
    if i_ep % 100 == 0:
        print('Episode: {} \tAverage Reward of Prev 100 Episodes: {:.3f} \t Epsilon: {:.6f}'.format(i_ep,
                                                                                            total_R[i_ep-100:i_ep-1].mean(),
                                                                                            eps))

print('Average reward for last 100 episodes: ', total_R[-100:].mean())
print('Total steps: ', total_iters)

plt.plot(total_R)
plt.title('Rewards')
plt.show()

utils.plot_running_avg(total_R)

Episode: 100 	Average Reward of Prev 100 Episodes: 18.444 	 Epsilon: 0.610465
Episode: 200 	Average Reward of Prev 100 Episodes: 12.616 	 Epsilon: 0.374201
Episode: 300 	Average Reward of Prev 100 Episodes: 10.242 	 Epsilon: 0.230899
Episode: 400 	Average Reward of Prev 100 Episodes: 9.313 	 Epsilon: 0.143982
Episode: 500 	Average Reward of Prev 100 Episodes: 9.162 	 Epsilon: 0.091264
Episode: 600 	Average Reward of Prev 100 Episodes: 8.758 	 Epsilon: 0.059289


KeyboardInterrupt: 