In [1]:
import gym
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import os
import numpy as np
import math
import random
import matplotlib as plt

# Create the CartPole-v1 environment that the later cells use through `env`.
env = gym.make("CartPole-v1")
# Reset once; as the cell's last expression, the returned value is displayed.
env.reset()


array([-0.02678249,  0.0294101 , -0.01768038,  0.03180382], dtype=float32)

In [2]:
# Length of the observation vector plus one (prints 5 for this environment).
# NOTE(review): `in_shape` is not referenced by any other visible cell — the
# Agent is constructed from observation_space.shape[0] directly; confirm
# whether this value is still needed.
in_shape = (env.observation_space.shape[0] + 1)
print(in_shape)

5


In [3]:
class Agent:
    """Epsilon-greedy value-learning agent backed by a small dense Keras network.

    The network maps a state vector of length ``state_size`` to one estimated
    return per action.  Steps recorded through :meth:`update_memory` are turned
    into Monte Carlo regression targets by :meth:`replay`.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the value network.

        Args:
            state_size: Number of entries in one observation vector.
            action_size: Number of discrete actions the agent can choose from.
        """
        # NOTE(review): weight_backup is not read anywhere in this class;
        # save_model/load_model default to "proto.h5" instead.
        self.weight_backup      = "cartpole_weight.h5"
        self.state_size         = state_size
        self.action_size        = action_size
        # NOTE(review): `store` is never used by the visible code.
        self.store              = []
        # Recorded steps, each [state, action, step_index]; consumed by replay().
        self.memory             = []
        self.learning_rate      = 0.001
        # Discount factor applied when reconstructing returns in replay().
        self.gamma              = 0.95
        # Probability of taking a random action; decays once per take_action().
        self.exploration_rate   = 1.0
        self.exploration_min    = 0.01
        self.exploration_decay  = 0.9999
        self.model              = self._build_model()

    def _build_model(self):
        """Build and compile the state -> per-action value network.

        Returns:
            A compiled ``keras.Model`` with ``state_size`` inputs and
            ``action_size`` linear outputs, trained with MSE loss.
        """
        # Input: one state vector.  A tuple shape is the documented form.
        inputs = keras.Input(shape=(self.state_size,))
        # The first projection deliberately has no activation (linear).
        x = layers.Dense(16)(inputs)
        x = layers.Dense(64, activation="relu")(x)
        x = layers.Dense(64, activation="relu")(x)
        x = layers.Dense(32, activation="relu")(x)
        # Output: expected return for each action.
        outputs = layers.Dense(self.action_size, activation="linear")(x)
        model = keras.Model(inputs=inputs, outputs=outputs, name="cartpole-model")
        # Pass the stored learning rate explicitly so the attribute is honoured.
        optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss="mse")
        model.summary()
        return model

    def update_memory(self, step):
        """Append one recorded step to the replay memory.

        Args:
            step: ``[state, action, step_index]`` where ``step_index`` is the
                zero-based position of the step inside its episode.
        """
        self.memory.append(step)

    def take_action(self, state):
        """Choose an action epsilon-greedily and decay the exploration rate.

        Args:
            state: Current observation vector of length ``state_size``.

        Returns:
            The chosen action as a plain ``int`` in ``[0, action_size)``.
        """
        # Exploration (declining over time): draw first, so the network is
        # only evaluated when its answer is actually going to be used.
        if random.random() <= self.exploration_rate:
            action = random.randint(0, self.action_size - 1)
        else:
            # Exploit: action with the best predicted score.
            action = int(np.argmax(self.predict(state)[0]))

        # Decay epsilon but clamp it so it never drops below the floor.
        self.exploration_rate = max(
            self.exploration_min,
            self.exploration_rate * self.exploration_decay,
        )
        return action

    def predict(self, state):
        """Return the network's value estimates for a single state.

        Args:
            state: Observation vector of length ``state_size``.

        Returns:
            Array of shape ``(1, action_size)``.
        """
        batch = np.asarray(state, dtype=np.float32).reshape(1, self.state_size)
        # verbose=0 keeps the per-call progress bar out of the notebook output.
        return self.model.predict(batch, verbose=0)

    def _discounted_returns(self, step_indices, step_reward=1.0):
        """Reconstruct the discounted return G(t) for every recorded step.

        Episode boundaries are detected wherever the step index fails to
        increase.  Every step is assumed to earn ``step_reward`` (CartPole pays
        +1 per step), because the memory does not store rewards.

        Args:
            step_indices: Per-step index within its episode, in recording order.
            step_reward: Reward credited to each step.

        Returns:
            ``np.ndarray`` of float32 returns, one per recorded step.
        """
        count = len(step_indices)
        returns = np.zeros(count, dtype=np.float32)
        running = 0.0
        # Walk backwards so each return can reuse its successor's return.
        for pos in range(count - 1, -1, -1):
            ends_episode = (
                pos == count - 1 or step_indices[pos + 1] <= step_indices[pos]
            )
            if ends_episode:
                running = 0.0
            running = step_reward + self.gamma * running
            returns[pos] = running
        return returns

    def replay(self, epochs = 1):
        """Fit the network on the recorded steps, then clear the memory.

        The one-step Bellman update
        ``Q(s, a) += alpha * (R(s, a) + gamma * max Q(s', a') - Q(s, a))``
        needs the successor state, which the memory does not hold, so the
        targets are Monte Carlo returns instead: for every step the taken
        action's output is regressed towards the discounted return while the
        other outputs keep their current predictions.

        Args:
            epochs: Number of passes over the recorded steps.
        """
        if not self.memory:
            return

        states = np.array([step[0] for step in self.memory], dtype=np.float32)
        actions = np.array([step[1] for step in self.memory], dtype=np.int64)
        returns = self._discounted_returns([step[2] for step in self.memory])

        # Start from the current predictions so untaken actions contribute
        # zero error; copy so the assignment never hits a read-only buffer.
        targets = np.array(self.model.predict(states, verbose=0))
        targets[np.arange(len(actions)), actions] = returns

        self.model.fit(states, targets, epochs=epochs, verbose=0)

        # Drop the consumed steps so the memory does not grow without bound.
        self.memory = []

    def save_model(self, name = "proto.h5"):
        """Save the full model to ``name`` (defaults to "proto.h5")."""
        self.model.save(name)

    def load_model(self, name = "proto.h5"):
        """Replace the current model with the one stored at ``name``."""
        self.model = keras.models.load_model(name)

In [4]:
# Build the agent: state size is the observation vector length, action size is
# the number of discrete actions. Constructing it prints the model summary.
agent = Agent(env.observation_space.shape[0], env.action_space.n)

Model: "cartpole-model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 16)                80        
                                                                 
 dense_1 (Dense)             (None, 64)                1088      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 dense_4 (Dense)             (None, 2)                 66        
                                                                 
Total params: 7,474
Trainable params: 7,474
Non-trai

In [5]:
#agent.load_model("proto.h5")

In [6]:
# Training loop: play episodes, record every step, and train after each one.
MAX_EPISODES = 5_000   # hard cap so Run All terminates without a manual interrupt
TARGET_INDEX = 400     # stop once an episode's final step index exceeds this

try:
    for _ in range(MAX_EPISODES):
        observation = env.reset()
        #env.render()
        index = 0  # zero-based index of the current step within the episode

        while True:
            action = agent.take_action(observation)
            next_observation, reward, done, _ = env.step(action)

            # Step format expected by the agent: [state, action, step_index].
            agent.update_memory([observation, action, index])
            observation = next_observation

            if done:
                break
            index += 1

        agent.replay()
        print(f"e: {agent.exploration_rate}")
        print(f"Done after {index} steps")
        print("------------------------")

        if index > TARGET_INDEX:
            print("DONE")
            break
finally:
    # Release the environment even if the run is interrupted or fails.
    env.close()

e: 0.9968049550435942
Done after 31 steps
------------------------
e: 0.9929247929420344
Done after 38 steps
------------------------
e: 0.9918331216146347
Done after 10 steps
------------------------
e: 0.9907426505254558
Done after 10 steps
------------------------
e: 0.9898513387241272
Done after 8 steps
------------------------
e: 0.9852097216922968
Done after 46 steps
------------------------
e: 0.9837329411318304
Done after 14 steps
------------------------
e: 0.9822583741922568
Done after 14 steps
------------------------
e: 0.9805898701598169
Done after 16 steps
------------------------
e: 0.9776523621367642
Done after 29 steps
------------------------
e: 0.9748211359851373
Done after 28 steps
------------------------
e: 0.9726787798222825
Done after 21 steps
------------------------
e: 0.9708323524791846
Done after 18 steps
------------------------
e: 0.9698619568847847
Done after 9 steps
------------------------
e: 0.9686018925558499
Done after 12 steps
----------------------

e: 0.7768464044376642
Done after 27 steps
------------------------
e: 0.7758371098299682
Done after 12 steps
------------------------
e: 0.7745967010244447
Done after 15 steps
------------------------
e: 0.7735129702441328
Done after 13 steps
------------------------
e: 0.7724307557011127
Done after 13 steps
------------------------
e: 0.7710415615298397
Done after 17 steps
------------------------
e: 0.7691162689782794
Done after 24 steps
------------------------
e: 0.7679634017970517
Done after 14 steps
------------------------
e: 0.7661990275609417
Done after 22 steps
------------------------
e: 0.7647445588667632
Done after 18 steps
------------------------
e: 0.7627587064463331
Done after 25 steps
------------------------
e: 0.760778010793336
Done after 25 steps
------------------------
e: 0.759181973593113
Done after 20 steps
------------------------
e: 0.7582714561179215
Done after 11 steps
------------------------
e: 0.7567563530573902
Done after 19 steps
----------------------

KeyboardInterrupt: 

In [None]:
# Persist the trained network through the agent's own helper, which writes
# the model to "proto.h5".
agent.save_model()

In [None]:
# Display the environment's action space (its size is what the Agent receives
# as action_size via env.action_space.n).
env.action_space