# Imports and Overview

In [7]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Model

from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Further support
import numpy as np
from tqdm.notebook import tqdm_notebook



# Environment

In [18]:
import gym
import highway_env
import random
import numpy as np

# Smoke test: build the highway environment with grayscale image observations
# and drive it with random actions for a few steps.
env = gym.make("highway-v0")

config = {
    "observation": {
        "type": "GrayscaleObservation",
        "observation_shape": (192, 48),
        "stack_size": 1,
        # Weights for the RGB -> grayscale conversion
        "weights": [0.01, 0.01, 0.98],
        "scaling": 1.5,
    },
    # Previously 2
    "policy_frequency": 1,
}

env.configure(config)
# A single reset after configure() is enough: the configuration is applied on
# reset, so an additional reset before configure() would be thrown away.
state = env.reset()

try:
    for _step in range(10):
        action = env.action_space.sample()
        next_state, reward, done, _info = env.step(action)

        # `a` keeps a summary of the most recent transition for inspection.
        a = [state.shape, next_state.shape, action, reward, done]

        # Stepping an episode that has already terminated is undefined in gym,
        # so start a fresh episode as soon as `done` is reported.
        state = env.reset() if done else next_state

    print(a)
finally:
    # Release the environment even if a step raised.
    env.close()


[(1, 192, 48), (1, 192, 48), 0, 0.8037606638879525, False]


# Experience Replay Buffer

In [11]:
# Stores one transition per entry: (observation, next state, action, reward, terminal flag)
from numpy import float32


class Buffer:
    """Experience replay buffer for environment transitions.

    Thin wrapper around ``TFUniformReplayBuffer``. Every stored entry is a
    tuple ``(observation, next state, action, reward, terminal flag)`` whose
    element shapes and dtypes are described by ``self.data_spec``.
    """

    def __init__(
        self,
        batch_size=50,
        buffer_length=1000,
        observation_size=192*48,
        action_size=2
    ):
        """Create the replay buffer.

        Total capacity is ``batch_size * buffer_length`` transitions.

        Args:
            batch_size: Number of transitions written per ``add`` call.
            buffer_length: Number of entries kept per batch row.
            observation_size: Length of a flattened observation.
            action_size: Length of the action vector.
        """
        # Save batch size for the other methods of the buffer
        self.batch_size = batch_size

        def _row_spec(width, dtype, name):
            # Every element is stored as a single row of shape [1, width].
            return tf.TensorSpec(shape=[1, width], dtype=dtype, name=name)

        # Tell the buffer what data (and which shapes/dtypes) to expect
        self.data_spec = (
            _row_spec(observation_size, tf.dtypes.float32, "Observation"),
            _row_spec(observation_size, tf.dtypes.float32, "Next state"),
            _row_spec(action_size, tf.dtypes.int32, "Action"),
            _row_spec(1, tf.dtypes.float32, "Reward"),
            # Boolean flag: True when the transition ended the episode
            _row_spec(1, tf.dtypes.bool, "Terminal State"),
        )

        # Create the underlying TF-Agents buffer
        self.buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=self.data_spec,
            batch_size=batch_size,
            max_length=buffer_length,
        )

    def get_data_spec(self):
        """Return the tuple of ``tf.TensorSpec`` describing one transition."""
        return self.data_spec

    def stupid(self):
        """Backward-compatible alias of ``get_data_spec``."""
        return self.get_data_spec()

    def add(self, items):
        """Write transitions to the buffer.

        Args:
            items: List or tuple with one element per entry of
                ``self.data_spec``. Each element is either

                * a single transition element with the same rank as its spec
                  (e.g. shape ``[1, observation_size]``); it is replicated
                  ``batch_size`` times to fill the batch, or
                * an already batched element with one extra leading dimension
                  (e.g. shape ``[batch_size, 1, observation_size]``); it is
                  passed through unchanged.
        """
        def _ensure_batched(item, spec):
            tensor = tf.convert_to_tensor(item)
            # One more dimension than the spec means the caller already
            # supplied the leading batch dimension.
            if tensor.shape.rank == spec.shape.rank + 1:
                return tensor
            return tf.stack([tensor] * self.batch_size)

        # check_types=False so that `items` may be a list although the spec
        # is a tuple.
        self.batched_values = tf.nest.map_structure(
            _ensure_batched, items, self.data_spec, check_types=False
        )

        # Add to buffer
        self.buffer.add_batch(self.batched_values)

    def sample(self, sample_batch_size=None):
        """Return a dataset that samples stored transitions indefinitely.

        Args:
            sample_batch_size: Optional number of transitions per dataset
                element. ``None`` (default) yields single transitions.

        Returns:
            The dataset produced by the underlying buffer's ``as_dataset``.
        """
        return self.buffer.as_dataset(
            sample_batch_size=sample_batch_size,
            single_deterministic_pass=False,
        )


# Demo: fill a tiny buffer with identical dummy transitions and read some back.
buffer = Buffer(batch_size=1, observation_size=4)
# Specs describing the shape of every element of a transition
data_spec = buffer.stupid()

# Raw values for (observation, next state, action, reward, terminal flag);
# tf.constant fills each one out to the shape requested by its spec.
dummy_values = (
    [1.0, 2.0, 3.0, 4.0],
    [1.0, 2.0, 3.0, 4.0],
    [5],
    [6.0],
    [True],
)
target_shapes = [spec.shape.as_list() for spec in data_spec]

for _repeat in range(4):
    transition = tuple(
        tf.constant(value, shape=shape)
        for value, shape in zip(dummy_values, target_shapes)
    )
    buffer.add(transition)

dataset = buffer.sample()
iterator = iter(dataset)

print("Iterator trajectories:")
# Each dataset element is a (trajectory, buffer_info) pair; keep the trajectory.
trajectories = [next(iterator)[0] for _draw in range(3)]

print(trajectories)

print(tf.nest.map_structure(lambda tensor: tensor.shape, trajectories))


Iterator trajectories:
[(<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[5, 5]])>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[6.]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=bool, numpy=array([[ True]])>), (<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[5, 5]])>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[6.]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=bool, numpy=array([[ True]])>), (<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1., 2., 3., 4.]], dtype=float32)>, <tf.Tensor: shape=(1, 2

# Actor Critic Agent

# World model

# World model & agent training loops

# Hyperparam inits
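1. Agent data collection in the environment, adding the collected transitions to the experience replay buffer (ERB). Open question: should we also measure at which reward the loop stops?
2. World-model training loop on data sampled from the ERB.
3. Agent training loop with world-model feedback.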
 

# Function execution