<a href="https://colab.research.google.com/github/Aryan8912/CUDA_Practice/blob/main/SAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
class ActorCriticNetwork(keras.Model):
  def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512,
               name='actor_critic', chkpt_dir='tmp/actor_critic'):
    super(ActorCriticNetwork, self).__init__()
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.n_actions = n_actions
    self.model_name = name
    self.checkpoint_dir = chkpt_dir
    self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ac.weights.h5')

    self.fc1 = Dense(self.fc1_dims, activation='relu')
    self.fc2 = Dense(self.fc2_dims, activation='relu')
    self.v = Dense(1, activation=None)
    self.pi = Dense(n_actions, activation='softmax')

  def call(self, state):
    value = self.fc1(state)
    value = self.fc2(value)

    v = self.v(value)
    pi = self.pi(value)

    return v, pi

In [6]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp

class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)

        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))


    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        _, probs = self.actor_critic(state)

        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        log_prob = action_probabilities.log_prob(action)
        self.action = action

        return action.numpy()[0]

    def save_models(self):
        print('... saving models ...')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, state_, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32) # not fed to NN
        with tf.GradientTape(persistent=True) as tape:
            state_value, probs = self.actor_critic(state)
            state_value_, _ = self.actor_critic(state_)
            state_value = tf.squeeze(state_value)
            state_value_ = tf.squeeze(state_value_)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            delta = reward + self.gamma*state_value_*(1-int(done)) - state_value
            actor_loss = -log_prob*delta
            critic_loss = delta**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(zip(
            gradient, self.actor_critic.trainable_variables))

In [8]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp

class Agent:
    def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        self.action = None
        self.action_space = [i for i in range(self.n_actions)]

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)

        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))


    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        _, probs = self.actor_critic(state)

        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        log_prob = action_probabilities.log_prob(action)
        self.action = action

        return action.numpy()[0]

    def save_models(self):
        print('... saving models ...')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, state_, done):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32) # not fed to NN
        with tf.GradientTape(persistent=True) as tape:
            state_value, probs = self.actor_critic(state)
            state_value_, _ = self.actor_critic(state_)
            state_value = tf.squeeze(state_value)
            state_value_ = tf.squeeze(state_value_)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            delta = reward + self.gamma*state_value_*(1-int(done)) - state_value
            actor_loss = -log_prob*delta
            critic_loss = delta**2
            total_loss = actor_loss + critic_loss

        gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables)
        self.actor_critic.optimizer.apply_gradients(zip(
            gradient, self.actor_critic.trainable_variables))

In [9]:
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file):
  running_avg = np.zeros(len(scores))
  for i in range(len(running_avg)):
    running_avg[i] = np.mean(score[max(0, i-100):(i+1)])
  plt.plot(x, running_avg)
  plt.title('Running average of previous 100 scores')
  plt.savefig(figure_file)

In [16]:
import os
import gym
import numpy as np
from gym import wrappers

if __name__ == '__main__':
    #env = gym.make('LunarLander-v2')
    env = gym.make('CartPole-v0')
    agent = Agent(alpha=1e-5, n_actions=env.action_space.n)
    n_games = 1800
    # uncomment this line and do a mkdir tmp && mkdir video if you want to
    # record video of the agent playing the game.
    #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
    filename = 'cartpole_1e-5_1024x512_1800games.png'

    figure_file = 'plots/' + filename

    best_score = env.reward_range[0]
    score_history = []
    load_checkpoint = False

    if load_checkpoint:
        agent.load_models()

    # Create the directory if it doesn't exist
    if not os.path.exists(agent.actor_critic.checkpoint_dir):
        os.makedirs(agent.actor_critic.checkpoint_dir)

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            if not load_checkpoint:
                agent.learn(observation, reward, observation_, done)
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            if not load_checkpoint:
                agent.save_models()

        print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)

    if not load_checkpoint:
        x = [i+1 for i in range(n_games)]
        plot_learning_curve(x, score_history, figure_file)

  logger.warn(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


... saving models ...
episode  0 score 27.0 avg_score 27.0
episode  1 score 22.0 avg_score 24.5
episode  2 score 24.0 avg_score 24.3
episode  3 score 15.0 avg_score 22.0
episode  4 score 24.0 avg_score 22.4
... saving models ...
episode  5 score 51.0 avg_score 27.2
episode  6 score 27.0 avg_score 27.1
episode  7 score 22.0 avg_score 26.5
episode  8 score 25.0 avg_score 26.3
episode  9 score 9.0 avg_score 24.6
episode  10 score 10.0 avg_score 23.3
episode  11 score 33.0 avg_score 24.1
episode  12 score 18.0 avg_score 23.6
episode  13 score 12.0 avg_score 22.8
episode  14 score 10.0 avg_score 21.9
episode  15 score 34.0 avg_score 22.7
episode  16 score 16.0 avg_score 22.3
episode  17 score 61.0 avg_score 24.4
episode  18 score 31.0 avg_score 24.8
episode  19 score 12.0 avg_score 24.1
episode  20 score 15.0 avg_score 23.7
episode  21 score 32.0 avg_score 24.1
episode  22 score 43.0 avg_score 24.9
episode  23 score 16.0 avg_score 24.5
episode  24 score 29.0 avg_score 24.7
episode  25 score

TypeError: 'float' object is not subscriptable