Thanks to https://medium.com/swlh/cartpole-with-policy-gradient-tensorflow-2-x-3a7a14b9cc03

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import Model
import tensorflow_probability as tfp

In [2]:
class MakeModel(Model):
    def __init__(self,num_actions):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(32,activation='relu')
        self.fc2 = tf.keras.layers.Dense(32,activation='relu')
        self.action = tf.keras.layers.Dense(num_actions,activation='softmax')
        

    def call(self,state):
        x = tf.convert_to_tensor(state)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.action(x)
        return x

In [3]:
class Agent:
    def __init__(self,gamma=0.95,lr=0.001,n_actions=2):
        self.gamma = gamma
        self.lr = lr
        self.model = MakeModel(n_actions)
        self.opt = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.action_memory = []
        self.reward_memory = []
        self.state_memory = []

    def choose_action(self,state):
        prob = self.model(np.array([state]))
        dist = tfp.distributions.Categorical(probs=prob,dtype=tf.float32)
        action = dist.sample()
        self.action_memory.append(action)
        return int(action.numpy()[0])

    def store_reward(self,reward):
        self.reward_memory.append(reward)

    def store_state(self,state):
        self.state_memory.append(state)

    def learn(self):
        # G = np.zeros_like(self.reward_memory)
        # for i in range(len(self.reward_memory)):
        #     discount = 1
        #     g_sum = 0
        #     for j in range(i,len(self.reward_memory)):
        #         g_sum += self.reward_memory[j] * discount
        #         discount *= self.gamma
        #     G[i] = g_sum
        sum_reward = 0
        discnt_rewards = []
        self.reward_memory.reverse()
        for r in self.reward_memory:
            sum_reward = r + self.gamma*sum_reward
            discnt_rewards.append(sum_reward)
        discnt_rewards.reverse() 
        

        for state,action,reward in zip(self.state_memory,self.action_memory,discnt_rewards):
            with tf.GradientTape() as tape:
                p = self.model(np.array([state]),training=True)
                loss = self.calc_loss(p,action,reward)
                grads = tape.gradient(loss,self.model.trainable_variables)
                self.opt.apply_gradients(zip(grads,self.model.trainable_variables))

        self.reward_memory = []
        self.action_memory = []
        self.state_memory = []

    def calc_loss(self,prob,action,reward):
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob*reward
        return loss

In [4]:
import gym
import time

In [5]:
score_arr = []

In [6]:
env = gym.make('CartPole-v1')
agent = Agent()
num_episodes = 500

for i in range(num_episodes):
    state = env.reset()
    score = 0
    rewards = []
    states = []
    actions = []
    done = False
    while not done:
        action = agent.choose_action(state)
        state_,reward,done,_ = env.step(action)
        agent.store_reward(reward)
        agent.store_state(state)
        state = state_
        score += reward
        # env.render()
        if done:
            agent.learn()
            score_arr.append(score)
            if i % 50:
                print(f'episode done: {i+1}\t score recieved: {score}')

  logger.warn(


episode done: 2	 score recieved: 25.0
episode done: 3	 score recieved: 15.0
episode done: 4	 score recieved: 22.0
episode done: 5	 score recieved: 20.0
episode done: 6	 score recieved: 11.0
episode done: 7	 score recieved: 14.0
episode done: 8	 score recieved: 10.0
episode done: 9	 score recieved: 16.0
episode done: 10	 score recieved: 16.0
episode done: 11	 score recieved: 20.0
episode done: 12	 score recieved: 12.0
episode done: 13	 score recieved: 15.0
episode done: 14	 score recieved: 25.0
episode done: 15	 score recieved: 23.0
episode done: 16	 score recieved: 11.0
episode done: 17	 score recieved: 34.0
episode done: 18	 score recieved: 20.0
episode done: 19	 score recieved: 12.0
episode done: 20	 score recieved: 12.0
episode done: 21	 score recieved: 18.0
episode done: 22	 score recieved: 66.0
episode done: 23	 score recieved: 13.0
episode done: 24	 score recieved: 15.0
episode done: 25	 score recieved: 15.0
episode done: 26	 score recieved: 9.0
episode done: 27	 score recieved: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(score_arr)