# Imports and Overview

In [13]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Model
# Layer
from tensorflow.keras.layers import Dense, Layer, Conv2DTranspose, Conv2D, GlobalAveragePooling2D, Reshape, BatchNormalization, GRUCell, MaxPooling2D, Flatten
import tensorflow_probability as tfp



# Buffer 
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Further support
import numpy as np
from typing import NamedTuple
from tqdm.notebook import tqdm_notebook

# Environment
import gym
import highway_env
import random





# Experience Replay Buffer

In [4]:
# Has to save (Observation, action, reward, terminal state)
from numpy import float32


class Buffer:

    def __init__(
        self,
        batch_size=1,
        buffer_length=1000, 
        observation_size=(128,32,1),
        action_size=1
    ):
        '''
        Create replay buffer

        Buffer size = batch_size * buffer_length

        '''
        # Save batch size for other functions of buffer
        # NOT the usual batch size in Deep Learning
        # Batches in Uniform Replay Buffer describe size of input added to the buffer
        self.batch_size = batch_size

        # Tell buffer what data & which size to expect
        self.data_spec = (
            tf.TensorSpec(
                shape= observation_size,
                dtype=tf.dtypes.float64,
                name="Observation"
            ),
            tf.TensorSpec(
                shape=observation_size,
                dtype=tf.dtypes.float64,
                name="Next state"
            ),
            tf.TensorSpec(
                shape=[action_size],
                dtype=tf.dtypes.int32,
                name="Action"
            ),
            tf.TensorSpec(
                # Reward size
                shape=[1, ],
                dtype=tf.dtypes.float64,
                name="Reward"
            ),
            tf.TensorSpec(
                shape=[1, ],
                # Either 0 or 1 
                dtype=tf.dtypes.bool,
                name="Terminal State"
            )
        )

        # Create the buffer 
        self.buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            self.data_spec, batch_size, buffer_length
        )

    def obtain_buffer_specs(self):
        return self.data_spec

    def add(self, items):
        '''
        length of items must be equal to batch size

        items: list or tuple of batched data from (50, 5)


        '''
        # Combine all values from "items" in tensor
        # Not sure wether we need tf.nest.map_structure
        batched_values = tf.nest.map_structure(
            lambda t: tf.stack([t] * self.batch_size),
            items
        )
        
        print(batched_values)
        # Add to batch
        self.buffer.add_batch(batched_values)

    def sample(self, batch_size, prefetch_size):
        data = self.buffer.as_dataset(single_deterministic_pass=True)
        data = data.batch(batch_size).prefetch(prefetch_size)
        #later we want these to sequences
        return data




# Environment

In [5]:
class EnvironmentInteractor:

  def __init__(self, config, buffer, environment_name = "highway-fast-v0"):
    self.config = config

    self.env = gym.make(environment_name)    
    self.env.configure(config)

    self.buffer = buffer
    # Save sizes of the stupid tensors
    self.data_spec = self.buffer.obtain_buffer_specs()
  

  

  def create_trajectories(self, iterations):
    state = self.env.reset()
    


    for _ in range(iterations):
        action = self.env.action_space.sample()
        next_state, reward, done, _ = self.env.step(action)
        
        self.buffer.add((
            tf.constant(state, shape=self.data_spec[0].shape.as_list()),
            tf.constant(next_state, shape=self.data_spec[1].shape.as_list()),
            tf.constant(action, shape=self.data_spec[2].shape.as_list()),
            tf.constant(reward, shape=self.data_spec[3].shape.as_list()),
            tf.constant(done, shape=self.data_spec[4].shape.as_list())
        ))

        state = next_state
        
        if done:
          state = self.env.reset()


  def __del__(self):
    self.env.close()

# Parameters

In [14]:
# Image size
image_shape = (128,32, 1)

# Long term memory of GRU
hidden_unit_size = 200

# Z in paper
stochastic_state_shape = (32,32)
stochastic_state_size = stochastic_state_shape[0] * stochastic_state_shape[1]

#
action_size = 1
#

#
mlp_hidden_layer_size = 100
batch_size = 50

# TODO different variable names for network inp/outp sizes




# World model

In [17]:
class WorldModel:


    def __init__(self) -> None:
        super().__init__()

        self.encoder = self.create_encoder()
        self.decoder = self.create_decoder()
        self.reward_model = self.create_reward_predictor()
        self.discount_model = self.create_discount_predictor()


    def create_encoder(self, input_size=image_shape, output_size=hidden_unit_size):
        # Third dimension might be obsolete
        encoder_input = tf.keras.Input(shape=input_size)
        x = Conv2D(16, (3, 3), activation="elu", padding="same")(encoder_input) # 16 layers of filtered 192x48 features
        x = MaxPooling2D((2, 2), padding="same")(x) # 64 / 96x24
        x = Conv2D(32, (3, 3), activation="elu", padding="same")(x) # 64 / 96x24
        x = MaxPooling2D((2, 2), padding="same")(x) # 64 / 96x24
        x = Conv2D(64, (3, 3), activation="elu", padding="same")(x) # 64 / 48x12
        x = MaxPooling2D((2, 2), padding="same")(x) # 64 / 48x12
        x = GlobalAveragePooling2D()(x) # 64
        encoder_output = Dense(output_size, activation = "elu")(x)

        encoder = tf.keras.Model(encoder_input, encoder_output, name="Encoder")

        return encoder


    # Input size = 1024(z:32x32) + 200(size of hidden state)
    # Output size = game frame
    def create_decoder(
        self, 
        input_size=stochastic_state_size + hidden_unit_size, 
        output_size=image_shape
    ):
        # Third dimension might be obsolete
        decoder_input = tf.keras.Input(shape=input_size)
        # TODO WIE SCHLIMM IST EIN MLP HIER?
        x = Dense(256, activation= "elu")(decoder_input)
        x = Reshape((32, 8, 1))(x) 
        # TODO Check whether correct reshape happens
        #tf.debugging.assert_equal(x)
        x = Conv2DTranspose(16, (3, 3), strides=2, activation="elu", padding="same")(x)
        x = BatchNormalization()(x)
        x = Conv2DTranspose(1, (3, 3), strides=2, activation="elu", padding="same")(x)
        x = Flatten()(x)
        # Might needs shape as Tensor  #event_shape=output_size
        decoder_output = tfp.layers.IndependentNormal(event_shape=output_size)(x)


        decoder = tf.keras.Model(decoder_input, decoder_output, name="Decoder")

        return decoder
    

        # Input: concatination of h and z
    # Output: float predicting the obtained reward
    def create_reward_predictor(
        self, 
        input_size=hidden_unit_size+stochastic_state_size,
        output_size=1
    ):
        reward_predictor_input = tf.keras.Input(shape=input_size)
        x = Dense(mlp_hidden_layer_size, activation="elu")(reward_predictor_input)
        x = Dense(mlp_hidden_layer_size, activation="elu")(x)
        x = Dense(mlp_hidden_layer_size)(x)
        # Creates indipendent normal distribution
        # Hope is that it learns to output variables over reward space [0,1]
        reward_predictor_output = tfp.layers.IndependentNormal()(x)

        create_reward_predictor = tf.keras.Model(
            reward_predictor_input,
            reward_predictor_output,
            name="create_reward_predictor"
        )

        return create_reward_predictor
    

        # Input: concatination of h and z
    # Output: float predicting the obtained reward
    def create_discount_predictor(
        self, 
        input_size=hidden_unit_size+stochastic_state_size,
        output_size=1
    ):
        discount_predictor_input = tf.keras.Input(shape=input_size)
        x = Dense(mlp_hidden_layer_size, activation="elu")(discount_predictor_input)
        x = Dense(mlp_hidden_layer_size, activation="elu")(x)
        # Create 1 output sampled from bernoulli distribution
        discount_predictor_output = tfp.layers.IndependentBernoulli()(x)

        create_discount_predictor = tf.keras.Model(
            discount_predictor_input,
            discount_predictor_output,
            name="create_discount_predictor"
        )

        return create_discount_predictor



class RSSMState(NamedTuple):
    logits: tf.Tensor = tf.zeros(shape=(batch_size, stochastic_state_size))
    stochastic_state_z: tf.Tensor = tf.zeros(shape=(batch_size, stochastic_state_size))
    hidden_rnn_state: tf.Tensor = tf.zeros(shape=(batch_size, hidden_unit_size))
    
    @classmethod
    def from_list(cls, rssm_states):
        logits = tf.stack([rssm_state.logits for rssm_state in rssm_states])
        stochastic_state_z = tf.stack([rssm_state.stochastic_state_z for rssm_state in rssm_states])
        hidden_rnn_state = tf.stack([rssm_state.hidden_rnn_state for rssm_state in rssm_states])

        cls(logits, stochastic_state_z, hidden_rnn_state)


class RSSM:


    def __init__(self) -> None:
        super().__init__()

        self.state_action_embedder = self.create_stochastic_state_action_embedder()
        self.rnn = self.create_rnn()
        self.prior_model = self.create_prior_stochastic_state_embedder()
        self.posterior_model = self.create_posterior_stochastic_state_embedder()

    # 
    def create_stochastic_state_action_embedder(
        self, 
        input_size=stochastic_state_size + action_size,
        output_size=hidden_unit_size
    ):
        state_action_input = tf.keras.Input(shape=input_size)
        state_action_output = Dense(output_size, activation = "elu")(state_action_input)

        stochastic_state_action_embedder = tf.keras.Model(
            state_action_input,
            state_action_output,
            name="stochastic_state_action_embedder"
        )

        return stochastic_state_action_embedder


    # Contains GRU cell
    def create_rnn(
        self, 
        input_size=(hidden_unit_size, 1),
        output_size=hidden_unit_size
    ):
        rnn_input = tf.keras.Input(shape=input_size)
        rnn_output = GRUCell(output_size)(rnn_input)

        rnn = tf.keras.Model(
            rnn_input,
            rnn_output,
            name="rnn"
        )

        return rnn


    # Gets probabilities for each element of class in each category
    # Turns these (32x32) probabilities into categoricals (either 0 or 1)
    def sample_stochastic_state(self, logits):
        # logits Output from MLP
        #Onehot logits and create distribution from it (tfp.distrib)
        # sample from distribution = sample (32x32)
        # Apply softmax on sample to get probabillities = probs
        # Do sample = + probs - stop_grad(probs) (gradients)
        #return sample, gradients
        logits = tf.reshape(logits, shape=(-1, *stochastic_state_shape))
        logits_distribution = tfp.distributions.OneHotCategorical(logits)
        sample = logits_distribution.sample() 
        sample += logits_distribution.prob(sample) - tf.stop_gradient(logits_distribution.prob(sample))

        return sample

    def dream(self, previous_rssm_state: RSSMState, previous_action: tf.Tensor, isTerminal: bool=False):
        state_action_embedding = self.state_action_embedder(tf.concat([previous_rssm_state.stochastic_state_z * isTerminal, previous_action]))
        hidden_rnn_state = self.rnn(state_action_embedding, previous_rssm_state.hidden_rnn_state * isTerminal)

        prior_logits = self.prior_logits(hidden_rnn_state)
        prior_stochastic_state_z = self.sample_stochastic_state(prior_logits)
        prior_rssm_state = RSSMState(prior_logits, prior_stochastic_state_z, hidden_rnn_state)

        return prior_rssm_state

    def dreaming_rollout(self, horizon: int, actor: tf.keras.Model, previous_rssm_state: RSSMState):
        pass

    def observe(self, encoded_state: tf.Tensor, previous_action: tf.Tensor, previous_isTerminal: tf.Tensor, previous_rssm_state: RSSMState):
        prior_rssm_state = self.dream(previous_rssm_state, previous_action, previous_isTerminal)

        encoded_state_and_hidden_state = tf.concat([prior_rssm_state.hidden_rnn_state, encoded_state], axis=1)

        posterior_logits = self.posterior_model(encoded_state_and_hidden_state)
        posterior_stochastic_state_z = self.sample_stochastic_state(posterior_logits)
        posterior_rssm_state = RSSMState(posterior_logits, posterior_stochastic_state_z, prior_rssm_state.hidden_rnn_state)

        return prior_rssm_state, posterior_rssm_state


    def observing_rollout(self, encoded_states: tf.Tensor, actions: tf.Tensor, isTerminals: tf.Tensor, previous_rssm_state: RSSMState):
        priors = []
        posteriors = []

        for encoded_state, action, isTerminal in zip(encoded_states, actions, isTerminals):
            previous_action = action * isTerminal
            prior_rssm_state, posterior_rssm_state = self.observe(encoded_state, previous_action, isTerminal, previous_rssm_state)
            
            priors.append(prior_rssm_state)
            posteriors.append(posterior_rssm_state)
            
            previous_rssm_state = posterior_rssm_state

        priors = RSSMState.from_list(priors)
        posteriors = RSSMState.from_list(posteriors)
        return priors, posteriors


    # Z^ in paper
    def create_prior_stochastic_state_embedder(
        self, 
        input_size=hidden_unit_size,
        output_size=stochastic_state_size
    ):
        state_embedder_input = tf.keras.Input(shape=input_size)
        x = Dense(mlp_hidden_layer_size, activation="elu")(state_embedder_input)
        # Activation function removed
        state_embedder_output = Dense(output_size)(x)

        create_prior_stochastic_state_embedder = tf.keras.Model(
            state_embedder_input,
            state_embedder_output,
            name="create_prior_stochastic_state_embedder"
        )

        return create_prior_stochastic_state_embedder


    # Z in paper
    # Input size = concatenated output of RNN with output of CNN
    def create_posterior_stochastic_state_embedder(
        self, 
        input_size=hidden_unit_size+hidden_unit_size,
        output_size=stochastic_state_size
    ):
        state_embedder_input = tf.keras.Input(shape=input_size)
        x = Dense(mlp_hidden_layer_size, activation="elu")(state_embedder_input)
        # Activation function removed
        state_embedder_output = Dense(output_size)(x)

        create_posterior_stochastic_state_embedder = tf.keras.Model(
            state_embedder_input,
            state_embedder_output,
            name="create_posterior_stochastic_state_embedder"
        )

        return create_posterior_stochastic_state_embedder

# Test Everything

In [None]:
buffer = Buffer(batch_size=1)
config = {
        "observation": {
            "type": "GrayscaleObservation",
            "observation_shape": (128, 32),
            "stack_size": 1,
            # weights for RGB conversion
            "weights": [0.01, 0.01, 0.98],  
            "scaling": 1.5,
        },
        # was at 2
        "policy_frequency": 1 
    }

environment_interactor = EnvironmentInteractor(config, buffer)
environment_interactor.create_trajectories(300)

# Sample from buffer
data = buffer.sample(batch_size=50, prefetch_size=70)

world_model = WorldModel()
rssm = RSSM()


for sequence in data:
    state, next_state, action, reward, done = sequence[0]
    print(state.shape)
    encoder.predict(state)




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.position = np.array(position, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.int(np.clip(np.round(x * (self.target_speeds.size - 1)), 0, self.target_speeds.size - 1))


(<tf.Tensor: shape=(1, 128, 32, 1), dtype=float64, numpy=
array([[[[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        [[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        [[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        ...,

        [[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        [[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        [[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]]]])>, <tf.Tensor: shape=(1, 128, 32, 1), dtype=float64, numpy=
array([[[[100.],
         [100.],
         [100.],
         ...,
         [100.],
         [255.],
         [100.]],

        [[100.],
         [100.],
        

In [None]:
iterator = iter(dataset)
print("Iterator trajectories:")
trajectories = []
for _ in range(3):
  t, _ = next(iterator)
  trajectories.append(t)

print(trajectories)

print(tf.nest.map_structure(lambda t: t.shape, trajectories))

NameError: name 'dataset' is not defined

# World ModelTraining Loop

# Actor Critic

# World model & agent training loops

# Hyperparam inits
Agent Data collection in environment + adding data to ERB (+ measure at which reward loop stops?) 
World model loop on data sampled from ERB
Agent training loop with world model feedback
 

# Function execution

In [None]:
# Instantiate environment and network objects
# Loop:
# Pass respective inputs to networks
# Collect outputs
# Compute individuall losses
# Add together to 1 big loss
# Propagate with gradient Tape through network


# compute the loss of an input for the model and optimize/tweak according the parameters
def train_step(model, input, target, loss_function, optimizer):
    # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
    with tf.GradientTape() as tape:
        prediction = model(input)
        loss = loss_function(target, prediction)
        gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# TODO move hyperparams to the rest
epochs = 32

# define loss-function and optimizer
cross_entropy_loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()

for epoch in range(epochs): 


    for world_model_input in tqdm(data):
        train_loss = train_step()


array([-0.02432574,  0.01747934, -0.02349192,  0.01419894], dtype=float32)

: 