# Adam Diouri - Ahmed Amine Ghorbel - Farid Biao

# CartPole Reinforcement Learning Project

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import Model
import tensorflow_probability as tfp
import gym
import time

In [None]:
class MakeModel(Model):
  """Policy network: three 64-unit ReLU dense layers and a softmax output layer."""

  def __init__(self, num_actions):
    """Create the layers.

    Args:
      num_actions: number of units in the softmax output layer.
    """
    super().__init__()
    dense = tf.keras.layers.Dense
    # Hidden stack (created in the same order they are applied in `call`).
    self.fc1 = dense(64, activation='relu')
    self.fc2 = dense(64, activation='relu')
    self.fc3 = dense(64, activation='relu')
    # Output head: softmax over `num_actions` units.
    self.action = dense(num_actions, activation='softmax')

  def call(self, state):
    """Run `state` through fc1 -> fc2 -> fc3 -> action and return the result.

    Args:
      state: input converted with `tf.convert_to_tensor` before the first layer.

    Returns:
      The output of the softmax `action` layer.
    """
    out = tf.convert_to_tensor(state)
    for layer in (self.fc1, self.fc2, self.fc3, self.action):
      out = layer(out)
    return out

In [None]:
class Agent:
  """Monte-Carlo policy-gradient (REINFORCE-style) agent.

  The agent keeps three per-episode buffers (states, actions, rewards).
  `choose_action` fills the action buffer; the caller fills the other two via
  `store_state` / `store_reward`; `learn` consumes all three and clears them.
  """

  def __init__(self, gamma=0.05, lr=0.001, n_actions=2):
    """Build the policy network, its optimizer and the empty episode buffers.

    Args:
      gamma: discount factor applied to each future reward.
      lr: learning rate passed to the Adam optimizer.
      n_actions: number of outputs of the policy network.
    """
    # NOTE(review): with the default of 0.05 future rewards are discounted
    # almost completely, so every return is close to the immediate reward.
    # Policy-gradient setups commonly use a value near 1 (e.g. 0.99); consider
    # passing `gamma` explicitly.
    self.gamma = gamma
    self.lr = lr
    self.model = MakeModel(n_actions)
    self.opt = tf.keras.optimizers.Adam(learning_rate=self.lr)
    self.action_memory = []  # Sampled actions (tensors), one per step
    self.reward_memory = []  # Rewards, one per step
    self.state_memory = []  # States, one per step

  def choose_action(self, state):
    """Sample an action for `state` from the policy and remember it.

    The state is wrapped in a batch of one, so the model output has shape
    (1, number_actions). That output is used as the probabilities of a
    categorical distribution from which the action is sampled. The sampled
    tensor is appended to `action_memory` immediately; the state itself is NOT
    stored here (the caller uses `store_state`).

    Args:
      state: a single observation.

    Returns:
      The sampled action as a Python int.
    """
    prob = self.model(np.array([state]))
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    action = dist.sample()
    self.action_memory.append(action)
    return int(action.numpy()[0])

  def _discounted_returns(self):
    """Return the discounted return for every stored reward.

    Uses the recurrence G[t] = r[t] + gamma * G[t+1] (with G past the end = 0),
    i.e. G[t] = r[t] + gamma*r[t+1] + gamma^2*r[t+2] + ...
    `reward_memory` is only read, never modified.
    """
    running = 0
    returns = []
    for r in reversed(self.reward_memory):
      running = r + self.gamma * running
      returns.append(running)
    returns.reverse()
    return returns

  def learn(self):
    """Update the policy from the stored episode, then clear the buffers.

    For each stored (state, action) pair the model is re-evaluated, the loss
    from `calc_loss` is computed with that step's discounted return, and one
    Adam step is applied. Steps are processed in order, one optimizer update
    per step.
    """
    discnt_rewards = self._discounted_returns()

    for state, action, reward in zip(self.state_memory, self.action_memory, discnt_rewards):
      # Only the forward pass and the loss need to be recorded by the tape.
      with tf.GradientTape() as tape:
        p = self.model(np.array([state]), training=True)
        loss = self.calc_loss(p, action, reward)
      # Differentiate and apply the update outside the recording context.
      grads = tape.gradient(loss, self.model.trainable_variables)
      self.opt.apply_gradients(zip(grads, self.model.trainable_variables))

    # Start the next episode with fresh buffers.
    self.reward_memory = []
    self.action_memory = []
    self.state_memory = []

  def calc_loss(self, prob, action, reward):
    """Return the policy-gradient loss for one step.

    Args:
      prob: model output used as categorical probabilities.
      action: the action that was sampled for this step.
      reward: the discounted return associated with this step.

    Returns:
      `-log_prob(action) * reward`, where `log_prob` comes from the
      categorical distribution built from `prob`.
    """
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    log_prob = dist.log_prob(action)
    loss = -log_prob*reward
    return loss

  def store_reward(self, reward):
    """Append `reward` to the episode's reward buffer."""
    self.reward_memory.append(reward)

  def store_state(self, state):
    """Append `state` to the episode's state buffer."""
    self.state_memory.append(state)

In [None]:
env = gym.make('CartPole-v0')
# CartPole gives +1 reward on every step, so the learning signal comes from how
# many future steps follow an action. Agent's default gamma (0.05) discounts
# those future rewards almost entirely, which leaves every return at ~1.05 and
# gives the policy nothing to distinguish good actions from bad ones. Use a
# discount factor close to 1 instead.
agent = Agent(gamma=0.99)
num_episodes = 10000

In [None]:
%load_ext tensorboard
LOG_DIR = '/tmp/log'
%tensorboard --

In [None]:
# Play `num_episodes` episodes; after each one, update the policy from the
# rewards/states/actions the agent collected and print the episode score.
for i in range(num_episodes):
  reset_out = env.reset()
  # gym >= 0.26 returns (observation, info) from reset(); older releases
  # return the observation alone.
  state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
  score = 0
  done = False
  while not done:
    action = agent.choose_action(state=state)
    step_out = env.step(action)
    if len(step_out) == 5:
      # gym >= 0.26: (observation, reward, terminated, truncated, info)
      state_, reward, terminated, truncated, _ = step_out
      done = terminated or truncated
    else:
      # older gym: (observation, reward, done, info)
      state_, reward, done, _ = step_out
    agent.store_reward(reward)
    # Store the state the action was chosen in, not the resulting one.
    agent.store_state(state)
    state = state_
    score += reward
    # Remove comment to render the GUI
    # env.render()
  # The loop above only exits once the episode is done.
  agent.learn()
  print(f'Episode done: {i+1}\t|\t Score recieved: {score}')

Streaming output truncated to the last 5000 lines.
Episode done: 5001	|	 Score recieved: 8.0
Episode done: 5002	|	 Score recieved: 9.0
Episode done: 5003	|	 Score recieved: 9.0
Episode done: 5004	|	 Score recieved: 10.0
Episode done: 5005	|	 Score recieved: 8.0
Episode done: 5006	|	 Score recieved: 9.0
Episode done: 5007	|	 Score recieved: 10.0
Episode done: 5008	|	 Score recieved: 10.0
Episode done: 5009	|	 Score recieved: 10.0
Episode done: 5010	|	 Score recieved: 10.0
Episode done: 5011	|	 Score recieved: 9.0
Episode done: 5012	|	 Score recieved: 10.0
Episode done: 5013	|	 Score recieved: 8.0
Episode done: 5014	|	 Score recieved: 10.0
Episode done: 5015	|	 Score recieved: 10.0
Episode done: 5016	|	 Score recieved: 10.0
Episode done: 5017	|	 Score recieved: 9.0
Episode done: 5018	|	 Score recieved: 9.0
Episode done: 5019	|	 Score recieved: 9.0
Episode done: 5020	|	 Score recieved: 8.0
Episode done: 5021	|	 Score recieved: 11.0
Episode done: 5022	|	 Score recieved: 9.0
E