# Approximate q-learning (5 pts)

In this notebook you will teach a __pytorch__ neural network to do Q-learning.

In [27]:
# in google colab uncomment this

# import os

# os.system('apt-get update')
# os.system('apt-get install -y xvfb')
# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')
# os.system('apt-get install -y python-opengl ffmpeg')
# os.system('pip install pyglet==1.5.0')

# XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'

In [28]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
%matplotlib inline

In [29]:
# env = gym.make("CartPole-v0").env
# env.reset()
# n_actions = env.action_space.n
# state_dim = env.observation_space.shape
# state_shape = state_dim[0]

ENV_NAME = 'CartPole-v1'
ENV_NAME = 'CartPole-v0'

def make_env(seed=None):
    # some envs are wrapped with a time limit wrapper by default
    env = gym.make(ENV_NAME).unwrapped
    if seed is not None:
        env.seed(seed)
    return env

env = make_env()
env.reset()
state_shape, n_actions = 4, 2
# state_shape, n_actions = env.observation_space.shape, env.action_space.n

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# those who have a GPU but feel unfair to use it can uncomment:
# device = torch.device('cpu')
device

device(type='cpu')

In [31]:
class DQNAgent(nn.Module):
    def __init__(self, state_shape, n_actions, epsilon=0):

        super().__init__()
        self.epsilon = epsilon
        self.n_actions = n_actions
        self.state_shape = state_shape
        # Define your network body here. Please make sure agent is fully contained here
        self.network = nn.Sequential()

        self.network.add_module('layer1', nn.Linear(state_shape, 64))
        self.network.add_module('relu1', nn.ReLU())
        self.network.add_module('layer2', nn.Linear(64, 64))
        self.network.add_module('relu2', nn.ReLU())
        self.network.add_module('layer3', nn.Linear(64, n_actions))
        
        self.opt = None
        
    def get_action(self, state):
        """
        sample actions with epsilon-greedy policy
        recap: with p = epsilon pick random action, else pick action with highest Q(s,a)
        """
        state = torch.tensor(state[None], dtype=torch.float32)
        q_values = self.network(state).detach().numpy()

        action = np.argmax(q_values)
        if np.random.binomial(1,p=self.epsilon):
            action = np.random.choice(range(q_values.shape[-1]))

        return int( action )
    
            
    def forward(self, state_t):
        """
        takes agent's observation (tensor), returns qvalues (tensor)
        :param state_t: a batch states, shape = [batch_size, *state_dim=4]
        """
        # Use your network to compute qvalues for given state
        qvalues = self.network(state_t)
        
        assert qvalues.requires_grad, "qvalues must be a torch tensor with grad"
        assert len(
            qvalues.shape) == 2 and qvalues.shape[0] == state_t.shape[0] and qvalues.shape[1] == n_actions

        return qvalues

    def get_qvalues(self, states):
        """
        like forward, but works on numpy arrays, not tensors
        """
        model_device = next(self.parameters()).device
        states = torch.tensor(states, device=model_device, dtype=torch.float32)
        qvalues = self.forward(states)
        return qvalues.data.cpu().numpy()

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy. """
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape

        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)

        should_explore = np.random.choice(
            [0, 1], batch_size, p=[1-epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)
#     def get_action(self, state):
#         return int(self.sample_actions(self.get_qvalues([state]))[0])

    def get_qvalues(self, states):
        """
        like forward, but works on numpy arrays, not tensors
        """
        model_device = next(self.parameters()).device
        states = torch.tensor(states, device=model_device, dtype=torch.float32)
        qvalues = self.forward(states)
        return qvalues.data.cpu().numpy()

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy. """
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape

        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)

        should_explore = np.random.choice(
            [0, 1], batch_size, p=[1-epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)

    def compute_td_loss(self, states, actions, rewards, next_states, is_done, target_network, gamma=0.99, check_shapes=False):
        """ Compute td loss using torch operations only. Use the formula above. """
        states = torch.tensor(
            states, dtype=torch.float32)    # shape: [batch_size, state_size]
        actions = torch.tensor(actions, dtype=torch.long)    # shape: [batch_size]
        rewards = torch.tensor(rewards, dtype=torch.float32)  # shape: [batch_size]
        # shape: [batch_size, state_size]
        next_states = torch.tensor(next_states, dtype=torch.float32)
        is_done = torch.tensor(is_done, dtype=torch.uint8)  # shape: [batch_size]

        # get q-values for all actions in current states
        predicted_qvalues = self.network(states)

        # select q-values for chosen actions
        predicted_qvalues_for_actions = predicted_qvalues[
          range(states.shape[0]), actions
        ]

        # compute q-values for all actions in next states
        predicted_next_qvalues = target_network(next_states)

        # compute V*(next_states) using predicted next q-values
        next_state_values =  torch.max(predicted_next_qvalues, dim=-1)[0]
        assert next_state_values.dtype == torch.float32

        # compute "target q-values" for loss - it's what's inside square parentheses in the above formula.
        target_qvalues_for_actions =  rewards + gamma*next_state_values

        # at the last state we shall use simplified formula: Q(s,a) = r(s,a) since s' doesn't exist
        target_qvalues_for_actions = torch.where(
            is_done, rewards, target_qvalues_for_actions)

        # mean squared error loss to minimize
        loss = torch.mean((predicted_qvalues_for_actions -
                           target_qvalues_for_actions.detach()) ** 2)

        if check_shapes:
            assert predicted_next_qvalues.data.dim(
            ) == 2, "make sure you predicted q-values for all actions in next state"
            assert next_state_values.data.dim(
            ) == 1, "make sure you computed V(s') as maximum over just the actions axis and not all axes"
            assert target_qvalues_for_actions.data.dim(
            ) == 1, "there's something wrong with target q-values, they must be a vector"
        return loss
    
    def generate_session(self, target_network, t_max=100, train=False):
        """play env with approximate q-learning agent and train it at the same time"""
        total_reward = 0
        s = env.reset()

        for t in range(t_max):
            a = self.get_action(s)
            next_s, r, done, _ = env.step(a)

            if train:
                self.opt.zero_grad()
                self.compute_td_loss([s], [a], [r], [next_s], [done], target_network).backward()
                self.opt.step()

            total_reward += r
            s = next_s
            if done:
                break
        
        target_network.load_state_dict(agent.state_dict())
        return total_reward
    
    
def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):
    """
    Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
    Whenever game ends, add record with done=True and reset the game.
    It is guaranteed that env has done=False when passed to this function.

    PLEASE DO NOT RESET ENV UNLESS IT IS "DONE"

    :returns: return sum of rewards over time and the state in which the env stays
    """
    s = initial_state
    sum_rewards = 0

    # Play the game for n_steps as per instructions above
    for _ in range(n_steps):
        q=agent.get_qvalues(list([s]))
        a=agent.sample_actions(q)[0]
        next_s, r, done, _ = env.step(a)
        sum_rewards=sum_rewards+r
        exp_replay.add(s, a, r, (not done)*next_s, done)
        s=next_s
        if done:
            s = env.reset()

    return sum_rewards, s

In [32]:
agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)
# This is how you can load weights from agent into target network
target_network.load_state_dict(agent.state_dict())
agent.opt = torch.optim.Adam(agent.parameters(), lr=1e-4)

In [33]:
from replay_buffer import ReplayBuffer
exp_replay = ReplayBuffer(10**4)


In [34]:

# train network
for i in range(1000):
    session_rewards = [agent.generate_session(target_network,
        train=True) for _ in range(100)]
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(
        i, np.mean(session_rewards), network.epsilon))

    agent.epsilon *= 0.99
    assert agent.epsilon >= 1e-4, "Make sure epsilon is always nonzero during training"

    if np.mean(session_rewards) > 300:
        print("You Win!")
        break

NameError: name 'network' is not defined

In [35]:
def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):
    """ Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward. """
    rewards = []
    for _ in range(n_games):
        s = env.reset()
        reward = 0
        for _ in range(t_max):
            qvalues = agent.get_qvalues([s])
            action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]
            s, r, done, _ = env.step(action)
            reward += r
            if done:
                break

        rewards.append(reward)
    return np.mean(rewards)

In [37]:
print(utils)

<module 'utils' from '/home/egor/.local/lib/python3.5/site-packages/utils/__init__.py'>


In [38]:
from tqdm import trange
from IPython.display import clear_output

timesteps_per_epoch = 1
batch_size = 32
total_steps = 4 * 10**4
decay_steps = 1 * 10**4

init_epsilon = 1
final_epsilon = 0.1

loss_freq = 20
refresh_target_network_freq = 100
eval_freq = 1000

max_grad_norm = 5000

mean_rw_history = []
td_loss_history = []
grad_norm_history = []
initial_state_v_history = []

state = env.reset()
for step in trange(total_steps + 1):
#     if not utils.is_enough_ram():
#         print('less that 100 Mb RAM available, freezing')
#         print('make sure everything is ok and make KeyboardInterrupt to continue')
#         try:
#             while True:
#                 pass
#         except KeyboardInterrupt:
#             pass

    agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)

    # play
    _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

    # train
    (states, actions, rewards, next_states, is_done) = exp_replay.sample(batch_size)

    loss =agent.compute_td_loss(states, actions, rewards, next_states, is_done, target_network)

    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
    agent.opt.step()
    agent.opt.zero_grad()

    if step % loss_freq == 0:
        td_loss_history.append(loss.data.cpu().item())
        grad_norm_history.append(grad_norm)

    if step % refresh_target_network_freq == 0:
        # Load agent weights into target_network
        target_network.load_state_dict(agent.state_dict())

    if step % eval_freq == 0:
        # eval the agent
        mean_rw_history.append(evaluate(
            make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)
        )
        initial_state_q_values = agent.get_qvalues(
            [make_env(seed=step).reset()]
        )
        initial_state_v_history.append(np.max(initial_state_q_values))

        clear_output(True)
        print("buffer size = %i, epsilon = %.5f" %
              (len(exp_replay), agent.epsilon))

        plt.figure(figsize=[16, 9])
        plt.subplot(2, 2, 1)
        plt.title("Mean reward per episode")
        plt.plot(mean_rw_history)
        plt.grid()

        assert not np.isnan(td_loss_history[-1])
        plt.subplot(2, 2, 2)
        plt.title("TD loss history (smoothened)")
        plt.plot(utils.smoothen(td_loss_history))
        plt.grid()

        plt.subplot(2, 2, 3)
        plt.title("Initial state V")
        plt.plot(initial_state_v_history)
        plt.grid()

        plt.subplot(2, 2, 4)
        plt.title("Grad norm history (smoothened)")
        plt.plot(utils.smoothen(grad_norm_history))
        plt.grid()

        plt.show()

  0%|          | 0/40001 [00:00<?, ?it/s]


AttributeError: module 'utils' has no attribute 'linear_decay'

In [None]:
s = env.reset()
X=[]
for t in range(3000):
    a = network.get_action(s)
    s, r, done, _ = env.step(a)
    X.append(s)
    if done:
        break
        

plt.plot(range(len(X)),np.array(X)[:,[0,2]])