# Imports and Overview

In [52]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Model

# Buffer 
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Further support
import numpy as np
from tqdm.notebook import tqdm_notebook

# Environment
import gym
import highway_env
import random




# Experience Replay Buffer

In [53]:
# Has to save (observation, next state, action, reward, terminal state) for every transition
from numpy import float32


class Buffer:
    '''
    Uniform experience replay buffer storing
    (observation, next state, action, reward, terminal state) tuples.
    '''

    def __init__(
        self,
        batch_size=50,
        buffer_length=1000,
        observation_size=192*48,
        action_size=2
    ):
        '''
        Set up the element specs and the underlying TF-Agents buffer.

        batch_size: number of entries written by a single call to add()
        buffer_length: passed to the replay buffer as its maximum length
        observation_size: number of values in one stored observation
        action_size: number of values in one stored action

        Buffer size = batch_size * buffer_length
        '''
        # add() needs the batch size again later on
        self.batch_size = batch_size

        # Shapes shared by several of the specs below
        observation_shape = [1, observation_size]
        single_value_shape = [1, 1]

        # Layout of one stored transition; the order matters to callers
        self.data_spec = (
            tf.TensorSpec(observation_shape, tf.dtypes.float64, "Observation"),
            tf.TensorSpec(observation_shape, tf.dtypes.float64, "Next state"),
            tf.TensorSpec([1, action_size], tf.dtypes.int32, "Action"),
            tf.TensorSpec(single_value_shape, tf.dtypes.float64, "Reward"),
            # True / False flag
            tf.TensorSpec(single_value_shape, tf.dtypes.bool, "Terminal State"),
        )

        # The actual storage
        self.buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self.data_spec,
            batch_size=batch_size,
            max_length=buffer_length,
        )

    def obtain_buffer_specs(self):
        '''
        Return the tuple of tensor specs describing one stored transition.
        '''
        return self.data_spec

    def add(self, items):
        '''
        Write one transition into the buffer.

        items: list or tuple of tensors, expected to line up with data_spec

        Every tensor in "items" is repeated batch_size times along a new
        leading axis, and the resulting batch is handed to add_batch().
        The batch is also kept on the instance as self.batched_values.
        '''
        def replicate(tensor):
            # batch_size identical copies stacked on a new first axis
            return tf.stack([tensor for _ in range(self.batch_size)])

        self.batched_values = tf.nest.map_structure(replicate, items)

        self.buffer.add_batch(self.batched_values)

    def sample(self):
        '''
        Return the buffer as a dataset (single_deterministic_pass disabled).
        '''
        dataset = self.buffer.as_dataset(single_deterministic_pass=False)
        return dataset




# Environment

In [54]:
class EnvironmentInteractor:
  '''
  Fill a replay buffer with transitions from randomly sampled actions.

  config: dict passed to the environment's configure() method
  buffer: object providing obtain_buffer_specs() and add(items)
  environment_name: id passed to gym.make()
  '''

  def __init__(self, config, buffer, environment_name = "highway-fast-v0"):
    self.config = config

    self.env = gym.make(environment_name)
    self.env.configure(config)

    self.buffer = buffer

  def create_trajectories(self, iterations):
    '''
    Step the environment "iterations" times and store every transition.

    Actions are drawn with action_space.sample(). Each transition is added
    to the buffer as (state, next_state, action, reward, done), and the
    environment is reset whenever an episode ends.
    '''
    state = self.env.reset()

    # Target shape of each transition component, as declared by the buffer.
    # These never change, so look them up once instead of on every step.
    shapes = [
        spec.shape.as_list() for spec in self.buffer.obtain_buffer_specs()
    ]

    for _ in range(iterations):
      action = self.env.action_space.sample()
      next_state, reward, done, info = self.env.step(action)

      # tf.constant reshapes each value to the shape the buffer expects
      transition = (state, next_state, action, reward, done)
      self.buffer.add(tuple(
          tf.constant(value, shape=shape)
          for value, shape in zip(transition, shapes)
      ))

      # Start a fresh episode once the current one is over
      state = self.env.reset() if done else next_state

  def __del__(self):
    '''
    Close the environment when the interactor is garbage collected.
    '''
    # __init__ may have failed before self.env was assigned
    env = getattr(self, "env", None)
    if env is not None:
      env.close()

In [55]:

# Replay buffer that writes a single entry per add() call
buffer = Buffer(batch_size=1)

# Environment settings passed to EnvironmentInteractor
config = {
    "observation": {
        "type": "GrayscaleObservation",
        "observation_shape": (192, 48),
        "stack_size": 1,
        "weights": [0.01, 0.01, 0.98],  # weights for RGB conversion
        "scaling": 1.5,
    },
    "policy_frequency": 1,  # was at 2
}

# Collect ten random-action transitions into the buffer
environment_interactor = EnvironmentInteractor(config, buffer)
environment_interactor.create_trajectories(10)

# Read three samples back from the buffer
dataset = buffer.sample()
iterator = iter(dataset)

print("Iterator trajectories:")
trajectories = []
while len(trajectories) < 3:
  # Each dataset element is a pair; only its first part is kept
  t, _ = next(iterator)
  trajectories.append(t)

print(trajectories)

# Shapes of every sampled tensor
print(tf.nest.map_structure(lambda tensor: tensor.shape, trajectories))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.position = np.array(position, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.int(np.clip(np.round(x * (self.target_speeds.size - 1)), 0, self.target_speeds.size - 1))


Iterator trajectories:
[(<tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100., 100., 100., ..., 100., 100., 100.]])>, <tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100., 100., 100., ..., 100., 100., 100.]])>, <tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[4, 4]])>, <tf.Tensor: shape=(1, 1), dtype=float64, numpy=array([[0.7152461]])>, <tf.Tensor: shape=(1, 1), dtype=bool, numpy=array([[False]])>), (<tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100., 100., 100., ..., 100., 100., 100.]])>, <tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100., 100., 100., ..., 100., 100., 100.]])>, <tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[2, 2]])>, <tf.Tensor: shape=(1, 1), dtype=float64, numpy=array([[0.73333861]])>, <tf.Tensor: shape=(1, 1), dtype=bool, numpy=array([[False]])>), (<tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100., 100., 100., ..., 100., 100., 100.]])>, <tf.Tensor: shape=(1, 9216), dtype=float64, numpy=array([[100.,

# Actor Critic

# World model

# World model & agent training loops

# Hyperparam inits
- Agent data collection in the environment + adding the data to the experience replay buffer (ERB) (+ measure at which reward the loop stops?)
- World model training loop on data sampled from the ERB
- Agent training loop with world model feedback
 

# Function execution