# Beschreibung Reinforcement Learning
Elemente des „Reinforcement Learning“ sind:
* Eine Umgebung (engl. environment)
* Durchführbare Aktionen (engl. actions)
* Algorithmen (engl. policy)
* Beobachtungen (engl. observations)

# Installationspakete

In [None]:
!pip install gym
!pip install pygame
!pip install numpy
!pip install tensorflow

# Programmcode

In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque

## Einfaches Beispiel

* Verfügbare Aktionen (engl. actions) sind im CartPole-Beispiel die Beschleunigung des Autos nach links (0) oder nach rechts (1)
* Observation-Array enthält folgende Infos: Position des Autos, Geschwindigkeit des Autos, Winkel des Stabs, Winkelgeschwindigkeit des Stabs

### Basic Policy
Methode stellt einen Basis-Algorithmus dar. Wenn der Winkel des Stabes kleiner 0 ist, dann wird das Auto nach
links beschleunigt (Aktion-Wert 0), andernfalls nach rechts beschleunigt (Aktion-Wert 1).

In [None]:
def basic_policy(observation):
    """Choose an action from the pole angle alone.

    Reads the value at index 2 of ``observation`` (the pole angle in the
    CartPole observation) and returns 0 (push left) when it is negative,
    otherwise 1 (push right).
    """
    pole_angle = observation[2]
    return 0 if pole_angle < 0 else 1

In [None]:
environment = gym.make("CartPole-v1")
observation = environment.reset()

totals = []

for episode in range(500):
    episode_rewards = 0
    observation = environment.reset()

    # Play at most 200 steps; leave early once the environment reports done
    for step in range(200):
        # The hand-written policy picks the action from the current observation
        action = basic_policy(observation)

        # Apply the action to the environment
        observation, reward, done, info = environment.step(action)

        # Accumulate the reward collected during this episode
        episode_rewards += reward

        if done:
            # Terminal state reached (pole fell over): end the episode
            break

    totals.append(episode_rewards)

In [None]:
# Summary statistics over the per-episode reward totals.
# print() converts each argument with str(), and sep="" joins label and value
# without an extra space.
print("Durchschnitt: ", np.mean(totals), sep="")
print("Standardabweichung: ", np.std(totals), sep="")
print("Minimal: ", np.min(totals), sep="")
print("Maximal: ", np.max(totals), sep="")

## Einfaches Tensorflow-Beispiel

* [tf.GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape)
* np.newaxis
* [tf.random.uniform](https://www.tensorflow.org/api_docs/python/tf/random/uniform)
* [tf.constant](https://www.tensorflow.org/api_docs/python/tf/constant)
* [tf.cast](https://www.tensorflow.org/api_docs/python/tf/cast)
* [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/math/reduce_mean)
* [tf.concat](https://www.tensorflow.org/api_docs/python/tf/concat)
* mean()
* std()

Erstellt ein neuronales Netz.

In [None]:
# Policy network: takes the 4 observation values and produces a single
# sigmoid output.
policy_layers = [
    keras.layers.Dense(5, activation="elu", input_shape=[4]),
    keras.layers.Dense(1, activation="sigmoid"),
]
model = keras.models.Sequential(policy_layers)

In [None]:
def play_one_step(environment, observation, model, loss_function):
    """Play a single step with the policy network and compute its gradients.

    The model is fed the current observation and its output is interpreted
    as the probability of pushing left (action 0). An action is sampled from
    that probability, the sampled action is treated as the target, and the
    gradients of the resulting loss are computed. They are only returned, not
    applied.

    Args:
        environment: Environment whose ``step(action)`` returns
            ``(observation, reward, done, info)``.
        observation: Current observation (indexable with ``np.newaxis``).
        model: Callable model with a ``trainable_variables`` attribute.
        loss_function: Callable ``loss_function(y_true, y_pred)``.

    Returns:
        Tuple ``(observation, reward, done, gradients)`` where ``gradients``
        holds one entry per trainable variable of ``model``.
    """
    # GradientTape must be instantiated; the bare class is not a context manager
    with tf.GradientTape() as tape:
        # Feed the current state to the model; the output is the probability
        # of going left
        probability_of_left = model(observation[np.newaxis])

        # Sample the action: True (1 = right) with probability 1 - p_left
        action = tf.random.uniform([1, 1]) > probability_of_left

        # Pretend the sampled action was the right one: the target
        # probability of going left is 1 for action 0 and 0 for action 1
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_function(y_target, probability_of_left))

    gradients = tape.gradient(loss, model.trainable_variables)

    observation, reward, done, info = environment.step(int(action[0, 0].numpy()))

    return observation, reward, done, gradients

In [None]:
def play_multiple_episodes(environment, number_of_episodes, max_steps, model, loss_function):
    """Play several episodes and collect rewards and gradients of every step.

    Args:
        environment: Environment that is reset at the start of each episode.
        number_of_episodes: How many episodes to play.
        max_steps: Upper bound on the number of steps per episode.
        model: Model forwarded to ``play_one_step``.
        loss_function: Loss function forwarded to ``play_one_step``.

    Returns:
        Tuple ``(all_rewards, all_gradients)``: two lists with one inner list
        per episode, each inner list holding one entry per played step.
    """
    all_rewards, all_gradients = [], []

    for _ in range(number_of_episodes):
        episode_rewards, episode_gradients = [], []
        observation = environment.reset()

        # Play up to max_steps steps; stop early when the step reports done
        for _ in range(max_steps):
            observation, reward, done, gradients = play_one_step(
                environment, observation, model, loss_function
            )
            episode_rewards.append(reward)
            episode_gradients.append(gradients)

            if done:
                break

        all_rewards.append(episode_rewards)
        all_gradients.append(episode_gradients)

    return all_rewards, all_gradients

* numpy.array()

In [None]:
def discount_rewards(rewards, discount_factor):
    """Return the discounted sum of future rewards for every step.

    Entry ``i`` of the result equals
    ``rewards[i] + discount_factor * result[i + 1]``; the last entry is the
    last reward itself.

    Args:
        rewards: Sequence of per-step rewards of one episode.
        discount_factor: Factor applied to the following step's return.

    Returns:
        A float NumPy array with the same length as ``rewards``.
    """
    # Force a float array: for integer rewards np.array() would infer an
    # integer dtype and the in-place updates below would be truncated.
    discounted = np.array(rewards, dtype=float)

    # Walk backwards so each step can reuse the return of its successor
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor

    return discounted

In [None]:
def discount_and_normalize_rewards(all_rewards, discount_factor):
    """Discount the rewards of every episode and normalize them jointly.

    Each episode is discounted with ``discount_rewards``. Mean and standard
    deviation are then computed over the discounted values of all episodes
    together and used to standardize every episode.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_factor: Discount factor forwarded to ``discount_rewards``.

    Returns:
        List with one array of normalized discounted rewards per episode.
    """
    discounted_per_episode = [
        discount_rewards(episode_rewards, discount_factor)
        for episode_rewards in all_rewards
    ]

    # Statistics are taken over all steps of all episodes at once
    pooled = np.concatenate(discounted_per_episode)
    rewards_mean = pooled.mean()
    rewards_std = pooled.std()

    return [
        (discounted - rewards_mean) / rewards_std
        for discounted in discounted_per_episode
    ]

* tf.reduce_mean

In [None]:
environment = gym.make("CartPole-v1")
observation = environment.reset()

number_of_iterations = 150
number_of_episodes_per_update = 10
max_steps = 200
discount_factor = 0.95

optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_function = keras.losses.binary_crossentropy

for iteration in range(number_of_iterations):
    # Collect rewards and gradients for a batch of episodes with the current policy
    all_rewards, all_gradients = play_multiple_episodes(
        environment, number_of_episodes_per_update, max_steps, model, loss_function
    )
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    all_mean_gradients = []

    for var_index in range(len(model.trainable_variables)):
        # Weight the gradient of every step by that step's normalized
        # discounted reward and average over all steps of all episodes.
        mean_gradients = tf.reduce_mean(
            [
                final_reward * all_gradients[episode_index][step][var_index]
                for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_reward in enumerate(final_rewards)
            ],
            axis=0,
        )

        all_mean_gradients.append(mean_gradients)

    # One optimizer update per iteration with the reward-weighted mean gradients
    optimizer.apply_gradients(zip(all_mean_gradients, model.trainable_variables))

## Deep-Q-Learning-Beispiel

In [None]:
# Q-network: takes the 4 observation values, passes them through two hidden
# ELU layers of 32 units and produces 2 linear outputs.
q_network_layers = [
    keras.layers.Dense(32, activation="elu", input_shape=[4]),
    keras.layers.Dense(32, activation="elu"),
    keras.layers.Dense(2),
]
model = keras.models.Sequential(q_network_layers)

In [None]:
def epsilon_greedy_policy(state, epsilon=0):
    """Pick an action with an epsilon-greedy rule.

    With probability ``epsilon`` a random action (0 or 1) is returned.
    Otherwise the module-level ``model`` predicts the Q-values for ``state``
    and the index of the largest one is returned.

    Args:
        state: Current state; only used when the greedy branch is taken.
        epsilon: Probability of choosing a random action.

    Returns:
        The index of the selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(2)

    # Greedy branch: a batch of one state goes in, its Q-values come out
    q_values = model.predict(state[np.newaxis])[0]
    return np.argmax(q_values)

In [None]:
def sample_experiences(batch_size):
    """Draw a random batch of experiences from the replay buffer.

    Reads the module-level ``replay_buffer``, whose entries are 5-tuples
    ``(state, action, reward, next_state, done)``. Indices are drawn
    independently, so the same experience may appear more than once in a
    batch.

    Args:
        batch_size: Number of experiences to sample.

    Returns:
        Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
        arrays, each with ``batch_size`` entries along the first axis.
    """
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]

    # Regroup the batch field by field: one array per tuple position
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)
    ]

    return states, actions, rewards, next_states, dones

In [None]:
def play_one_step(environment, state, epsilon):
    """Play one epsilon-greedy step and store the transition.

    The action comes from ``epsilon_greedy_policy``. The resulting transition
    ``(state, action, reward, next_state, done)`` is appended to the
    module-level ``replay_buffer``.

    Args:
        environment: Environment whose ``step(action)`` returns
            ``(next_state, reward, done, info)``.
        state: State the action is chosen for.
        epsilon: Exploration probability forwarded to the policy.

    Returns:
        Tuple ``(next_state, reward, done, info)`` as produced by the step.
    """
    chosen_action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, info = environment.step(chosen_action)

    # Remember the full transition for later training batches
    transition = (state, chosen_action, reward, next_state, done)
    replay_buffer.append(transition)

    return next_state, reward, done, info

In [None]:
# Fresh CartPole environment for the Deep Q-Learning example
environment = gym.make("CartPole-v1")
observation = environment.reset()

# Settings read by training_step()
discount_factor = 0.95
loss_function = keras.losses.mean_squared_error
optimizer = keras.optimizers.Adam(learning_rate=0.001)

def training_step(batch_size):
    """Run one Deep Q-Learning update on a sampled batch of experiences.

    Uses the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``discount_factor``. The target for each sampled transition is
    ``reward + (1 - done) * discount_factor * max_a Q(next_state, a)``. The
    model's Q-value for the action actually taken is moved toward that
    target with a single optimizer step.

    Args:
        batch_size: Number of experiences to sample for this update.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)

    # (1 - dones) drops the future term for transitions that ended the episode
    target_Q_values = rewards + (1 - dones) * discount_factor * max_next_Q_values
    target_Q_values = target_Q_values.reshape(-1, 1)

    # One mask column per action; the count is taken from the model output
    # itself instead of relying on a separately defined global.
    n_outputs = next_Q_values.shape[1]
    mask = tf.one_hot(actions, n_outputs)

    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        # Keep only the Q-value of the action that was actually taken
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_function(target_Q_values, Q_values))

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
replay_buffer = deque(maxlen=2000)
batch_size = 32

for episode in range(600):
    observation = environment.reset()

    # Exploration rate: starts at 1, falls linearly with the episode number
    # and never drops below 0.01. It depends only on the episode, so it is
    # computed once per episode.
    epsilon = max(1 - episode / 500, 0.01)

    # Play at most 200 steps; stop early when the step reports done
    for step in range(200):
        observation, reward, done, info = play_one_step(environment, observation, epsilon)

        if done:
            break

    # Training starts only after the first episodes have filled the buffer a bit
    if episode > 50:
        training_step(batch_size)