In [None]:
from collections import namedtuple
import random
from typing import List, NamedTuple

import gym

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# One environment step as stored for replay: the fields are, in order,
# state, action, reward, next_state and done.
Transition = namedtuple(
    typename='Transition',
    field_names='state action reward next_state done',
)

In [None]:
class VectorizeWrapper(gym.Wrapper):
    """Run several independent copies of an environment in lock-step.

    ``make_env`` is called once to build the environment handed to
    ``gym.Wrapper`` and then ``num_envs`` more times to build the
    environments that are actually stepped.

    Args:
        make_env: zero-argument callable returning a fresh environment.
        num_envs: number of environment copies to step in parallel.
    """

    def __init__(self, make_env, num_envs: int=1):
        super().__init__(make_env())
        self.num_envs = num_envs
        self.envs = [make_env() for _ in range(self.num_envs)]

    def reset(self):
        """Reset every sub-environment and return the stacked observations."""
        # Iterate over the environments themselves; `range(self.envs)` would
        # raise TypeError because `self.envs` is a list.
        return np.array([env.reset() for env in self.envs])

    def reset_at(self, idx: int):
        """Reset only the sub-environment at ``idx`` and return its observation."""
        return self.envs[idx].reset()

    def step(self, actions):
        """Apply ``actions[i]`` to sub-environment ``i``.

        Returns:
            Tuple ``(next_states, rewards, dones, infos)`` of arrays whose
            first axis indexes the sub-environments.
        """
        assert len(actions) == len(self.envs)
        next_states, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            next_state, reward, done, info = env.step(action)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        # No extra list nesting here: all four arrays must share the same
        # leading (environment) axis, matching what `reset` returns.
        return np.array(next_states), np.array(rewards), \
               np.array(dones), np.array(infos)

    def close(self):
        """Close every sub-environment, then the wrapped environment."""
        for env in self.envs:
            env.close()
        return super().close()

In [None]:
class DiscreteToBoxCoverter(gym.ObservationWrapper):
    """Observation wrapper around ``env``.

    TODO: this class is unfinished - it defines neither an ``observation``
    override nor a new ``observation_space``, so no conversion is performed
    yet.
    """

    def __init__(self, env):
        # A bare `super` expression is a no-op; the parent initialiser has to
        # be called explicitly so that the wrapped env is registered.
        super().__init__(env)

In [51]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with uniform sampling.

    Once ``capacity`` items have been stored, each new item overwrites the
    oldest one.
    """

    def __init__(self, capacity: int = 100000):
        self.capacity = capacity
        self.buffer = []
        # Index of the slot the next pushed transition will occupy.
        self.position = 0

    def push(self, transition):
        """Store ``transition``, overwriting the oldest entry when full."""
        if len(self.buffer) < self.capacity:
            # Still filling up: `position` equals `len(buffer)` here.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size: int) -> List[NamedTuple]:
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        if len(self.buffer) < batch_size:
            # Report the buffer's size, not its entire contents.
            raise ValueError(f"Can't sample {batch_size} num elements from buffer of size {len(self.buffer)}")
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [52]:
def create_model(state_dim: int, action_dim: int, hidden_sizes: List[int]):
    """Build a fully connected network with one output per action.

    Args:
        state_dim: size of the flat input vector (an iterable giving the full
            input shape is also accepted).
        action_dim: number of output units.
        hidden_sizes: width of each hidden ``Dense`` layer, in order; every
            hidden layer is followed by a ``LeakyReLU``.

    Returns:
        An uncompiled ``tf.keras`` ``Sequential`` model; the final ``Dense``
        layer has no activation.
    """
    # Keras expects a shape tuple, not a bare int, so wrap scalars.
    try:
        input_shape = tuple(state_dim)
    except TypeError:
        input_shape = (state_dim,)

    model = tf.keras.models.Sequential()
    model.add(layers.InputLayer(input_shape=input_shape))
    for hidden_size in hidden_sizes:
        model.add(layers.Dense(hidden_size))
        model.add(layers.LeakyReLU())
    model.add(layers.Dense(action_dim))

    return model

def sync_models(model1, model2):
    """Copy the weights of ``model1`` into ``model2``."""
    source_weights = model1.get_weights()
    model2.set_weights(source_weights)

In [53]:
class EpsScheduler:
    """Linearly decaying epsilon schedule.

    Epsilon starts at ``init_eps`` and decreases by a constant amount per
    step until it reaches ``end_eps`` after ``steps`` steps, where it stays.

    Args:
        init_eps: epsilon at step 0.
        end_eps: lower bound on epsilon.
        steps: number of steps over which epsilon decays.

    Raises:
        ValueError: if ``steps`` is not positive.
    """

    def __init__(self, init_eps: float = 1.0, end_eps: float = 0.25, steps: int = 10_000):
        if steps <= 0:
            raise ValueError(f"steps must be positive, got {steps}")
        self.init_eps = init_eps
        self.end_eps = end_eps
        self.steps = steps
        # Per-step decrement. Uses `end_eps`, the attribute that is actually
        # assigned above.
        self.step = (self.init_eps - self.end_eps) / self.steps
        self.cur_step = 0

    def __call__(self, current_step = None):
        """Return epsilon for ``current_step``.

        When ``current_step`` is omitted the internal counter is used and then
        advanced by one; an explicit ``current_step`` leaves the counter
        untouched.
        """
        if current_step is None:
            current_step = self.cur_step
            self.cur_step += 1
        eps = max(self.end_eps, self.init_eps - self.step * current_step)
        return eps

In [54]:
class Agent:
    """Base class for agents; subclasses supply ``act`` and ``update``."""

    def __init__(self, state_dim: int, action_dim: int):
        """Remember the state and action dimensions."""
        self.state_dim, self.action_dim = state_dim, action_dim

    def act(self, state):
        """Not implemented in the base class."""
        raise NotImplementedError()

    def update(self, transitions):
        """Not implemented in the base class."""
        raise NotImplementedError()

In [58]:
class DQN(Agent):
    """Epsilon-greedy Q-learning agent backed by a single Q-network.

    Args:
        state_dim: size of the state vector fed to the network.
        action_dim: number of discrete actions (network outputs).
        hidden_sizes: hidden layer widths passed to ``create_model``.
        gamma: discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_sizes: List[int], gamma: float=0.95):
        super(DQN, self).__init__(state_dim, action_dim)
        # `update` reads self.gamma, so it has to be stored here.
        self.gamma = gamma
        self.q_net = create_model(state_dim, action_dim, hidden_sizes)
        self.optimizer = tf.keras.optimizers.Adam()
        self.eps = EpsScheduler()

    def act(self, state):
        """Pick one action per row of ``state`` epsilon-greedily.

        ``state.shape[0]`` is taken as the batch size. Every call advances
        the epsilon schedule by one step.

        Returns:
            Integer array with one action index per batch row.
        """
        batch_size = state.shape[0]
        eps = self.eps()
        explore = np.random.rand(batch_size) < eps
        random_steps = np.random.randint(self.action_dim, size=batch_size)
        best_q_steps = np.argmax(self.q_net(state), axis=1)
        # np.where keeps the result integer-valued; blending through a float
        # mask would yield float actions, which cannot be used as indices by
        # tf.one_hot in `update`.
        return np.where(explore, random_steps, best_q_steps)

    def _prepare_batches(self, transitions):
        """Split a sequence of transitions into one array per field."""
        state = np.array([t.state for t in transitions])
        action = np.array([t.action for t in transitions])
        reward = np.array([t.reward for t in transitions])
        next_state = np.array([t.next_state for t in transitions])
        done = np.array([t.done for t in transitions])
        return state, action, reward, next_state, done

    def update(self, transitions):
        """Take one gradient step on the squared TD error of ``transitions``."""
        state, action, reward, next_state, done = self._prepare_batches(transitions)

        # The bootstrap target is a constant with respect to the gradient.
        Q_next = tf.stop_gradient(tf.reduce_max(self.q_net(next_state), axis=1))
        reward = tf.cast(reward, Q_next.dtype)
        not_done = 1.0 - tf.cast(done, Q_next.dtype)
        # Terminal transitions keep their reward; only the bootstrapped
        # next-state value is masked out.
        target = reward + not_done * self.gamma * Q_next

        with tf.GradientTape() as tape:
            q_values = self.q_net(state)
            # The mask must share the network's dtype, and the sum has to run
            # over the action axis only so there is one Q-value per sample.
            action_mask = tf.one_hot(tf.cast(action, tf.int32), self.action_dim,
                                     dtype=q_values.dtype)
            Q_pred = tf.reduce_sum(q_values * action_mask, axis=1)
            loss = tf.reduce_mean((Q_pred - target) ** 2)
        gradients = tape.gradient(loss, self.q_net.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_net.trainable_variables))

In [None]:
class DDQN(Agent):
    """Skeleton of an agent holding an online network and a target network.

    The target network is created with the same architecture as the online
    network and initialised with the same weights.

    TODO: ``act`` and ``update`` are still placeholders.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_sizes: List[int], gamma: float=0.95):
        # Inheriting from Agent is required for this call: object.__init__
        # does not accept the two dimension arguments.
        super(DDQN, self).__init__(state_dim, action_dim)
        self.gamma = gamma
        self.q_net = create_model(state_dim, action_dim, hidden_sizes)
        self.target_net = create_model(state_dim, action_dim, hidden_sizes)
        sync_models(self.q_net, self.target_net)
        self.optimizer = tf.keras.optimizers.Adam()

    def act(self, state):
        """Placeholder: currently returns ``None``."""
        pass

    def update(self, transitions=None):
        """Placeholder: opens a gradient tape but performs no update.

        ``transitions`` is accepted so the signature agrees with
        ``Agent.update``; it is optional so that ``update()`` still works.
        """
        # GradientTape has to be instantiated to be used as a context manager.
        with tf.GradientTape() as tape:
            pass