In [1]:
import gym
import random
import numpy as np
from keras.layers import Dense, Flatten
from keras.models import Sequential
import matplotlib.pyplot as plt

# from keras.optimizers import Adam

In [2]:
# Environment shared by every later cell (random rollout, training, testing).
ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)

In [3]:
# Size of the observation vector: the first entry of the observation
# space's shape tuple.
observation_shape = env.observation_space.shape
states = observation_shape[0]
print("States", states)

States 4


In [4]:
# Number of discrete actions the environment accepts.
action_space = env.action_space
actions = action_space.n
print("Actions", actions)

Actions 2


In [5]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes = 10
for episode in range(1, episodes + 1):
    # At the beginning of each episode reset the game.
    state = env.reset()
    # The episode runs until the environment reports it is finished.
    done = False
    # Accumulated reward for this episode.
    score = 0
    while not done:
        # Visualize each step.
        env.render()
        # Sample from the environment's own action space instead of a
        # hard-coded [0, 1] list, so the loop stays valid for any environment.
        action = env.action_space.sample()
        # Execute the action; only the reward and the done flag are used here.
        n_state, reward, done, info = env.step(action)
        # Keep track of rewards.
        score += reward
    print("episode {} score {}".format(episode, score))

episode 1 score 15.0
episode 2 score 12.0
episode 3 score 9.0
episode 4 score 21.0
episode 5 score 14.0
episode 6 score 27.0
episode 7 score 28.0
episode 8 score 21.0
episode 9 score 10.0
episode 10 score 11.0


In [6]:
def agent(states, actions, hidden_units=(24, 24, 24)):
    """Build the fully connected network used as the agent's Q-function.

    Args:
        states: Number of features in one observation. The network input
            has shape ``(1, states)`` and is flattened before the first
            dense layer.
        actions: Number of output units; one linear output per action.
        hidden_units: Widths of the ReLU hidden layers, in order. The
            default of three 24-unit layers reproduces the original
            architecture.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # NOTE(review): the leading 1 in the input shape presumably matches the
    # RL library's observation window of length 1 -- confirm.
    model.add(Flatten(input_shape=(1, states)))
    # One ReLU layer per requested width instead of copy-pasted add() calls.
    for units in hidden_units:
        model.add(Dense(units, activation="relu"))
    # Linear head: unbounded outputs, one value per action.
    model.add(Dense(actions, activation="linear"))
    return model


# Size the network directly from the environment's observation and action
# spaces.
model = agent(
    states=env.observation_space.shape[0],
    actions=env.action_space.n,
)

In [7]:
# Print the layer-by-layer architecture and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 24)                120       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 24)                600       
                                                                 
 dense_3 (Dense)             (None, 2)                 50        
                                                                 
Total params: 1,370
Trainable params: 1,370
Non-trainable params: 0
_________________________________________________________________


In [8]:
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

# Epsilon-greedy exploration policy, constructed with the library's default
# settings (no arguments are overridden here).
policy = EpsGreedyQPolicy()

In [9]:
# Combine the network and the exploration policy into a SARSA agent whose
# action count comes from the environment.
sarsa = SARSAAgent(
    nb_actions=env.action_space.n,
    model=model,
    policy=policy,
)

In [10]:
# Compile the agent with the "adam" optimizer (selected by name) and report
# "mse" as a metric during training.
sarsa.compile("adam", metrics=["mse"])

In [11]:
# Train the agent on the environment for a fixed budget of steps, logging
# one line per episode (verbose=2).
# NOTE(review): the log reports roughly 30 steps per second with
# visualize=True; rendering may be the bottleneck -- consider turning it off
# for long runs (confirm).
TRAINING_STEPS = 50000
sarsa.fit(env, nb_steps=TRAINING_STEPS, visualize=True, verbose=2)

Training for 50000 steps ...
    10/50000: episode: 1, duration: 0.322s, episode steps: 10, steps per second: 31, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.123 [-2.544, 1.602], 


  metrics = np.array(self.metrics[episode])


    19/50000: episode: 2, duration: 0.655s, episode steps: 9, steps per second: 14, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.157 [-2.823, 1.741], 
    29/50000: episode: 3, duration: 0.334s, episode steps: 10, steps per second: 30, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.129 [-3.037, 1.984], 
    38/50000: episode: 4, duration: 0.299s, episode steps: 9, steps per second: 30, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.138 [-2.237, 1.402], 
    46/50000: episode: 5, duration: 0.269s, episode steps: 8, steps per second: 30, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.183 [-2.603, 1.519], 
    60/50000: episode: 6, duration: 0.461s, episode steps: 14, steps per second: 30, episode reward: 14.000, mean rewa

<keras.callbacks.History at 0x25298fbae08>

In [12]:
# Evaluate the trained agent and report its mean episode reward.
# The episode count is defined once so the printed message cannot drift from
# the number of games actually requested.
test_episodes = 100
scores = sarsa.test(env, nb_episodes=test_episodes, visualize=True)
episode_rewards = scores.history["episode_reward"]
print(
    "Average score over {} test games:{}".format(
        test_episodes, np.mean(episode_rewards)
    )
)

Testing for 100 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
Episode 11: reward: 500.000, steps: 500
Episode 12: reward: 500.000, steps: 500
Episode 13: reward: 500.000, steps: 500
Episode 14: reward: 500.000, steps: 500
Episode 15: reward: 500.000, steps: 500
Episode 16: reward: 500.000, steps: 500
Episode 17: reward: 500.000, steps: 500
Episode 18: reward: 500.000, steps: 500
Episode 19: reward: 138.000, steps: 138
Episode 20: reward: 141.000, steps: 141
Episode 21: reward: 500.000, steps: 500
Episode 22: reward: 500.000, steps: 500
Episode 23: reward: 500.000, steps: 500
Episode 24: reward: 500.000, steps: 500
Episode 25: reward: 

In [13]:
# Persist the trained weights to sarsa_cartpole.h5, replacing any existing
# file of that name (overwrite=True).
sarsa.save_weights("sarsa_cartpole.h5", overwrite=True)

In [14]:
# Load the weights stored in sarsa_cartpole.h5 back into the agent.
sarsa.load_weights("sarsa_cartpole.h5")

In [15]:
# Run 50 rendered test episodes with the weights currently loaded in the
# agent. The result is bound to a descriptive name rather than `_`, because
# IPython uses `_` for the previous cell's output and a manual assignment
# shadows it.
replay_history = sarsa.test(env, nb_episodes=50, visualize=True)

Testing for 50 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 145.000, steps: 145
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
Episode 11: reward: 500.000, steps: 500
Episode 12: reward: 500.000, steps: 500
Episode 13: reward: 139.000, steps: 139
Episode 14: reward: 500.000, steps: 500
Episode 15: reward: 500.000, steps: 500
Episode 16: reward: 132.000, steps: 132
Episode 17: reward: 500.000, steps: 500
Episode 18: reward: 500.000, steps: 500
Episode 19: reward: 500.000, steps: 500
Episode 20: reward: 500.000, steps: 500
Episode 21: reward: 500.000, steps: 500
Episode 22: reward: 500.000, steps: 500
Episode 23: reward: 500.000, steps: 500
Episode 24: reward: 500.000, steps: 500
Episode 25: reward: 5