In [19]:
import gym
import random
import numpy as np
import tensorflow as tf
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [20]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [21]:
env_name = "Acrobot-v1"
env = gym.make(env_name)

In [22]:
print("Observation space:", env.action_space)
print("action space:", env.action_space)

Observation space: Discrete(3)
action space: Discrete(3)


In [23]:
## Actions are +1, -1, 0 torque

In [42]:
#Observe what a random agent will do
state = env.reset()
score = 0
for t in range(200):
    action = env.action_space.sample()
    env.render()
    state, reward, done, _ = env.step(action) #state, reward, whether model finished
    score += reward
    if done:
        break 
print('Final score:', score)
env.close()

Final score: -200.0


In [29]:
#Extract states and actions
states = env.observation_space.shape[0]
actions = env.action_space.n

In [30]:
## Create model. Takes state and actions to train

def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1,states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [31]:
#Creates instance of model 
model = build_model(states, actions)
model.summary() # 4 different states, 24 dense nodes, passes out actions in final node

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 6)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                168       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 75        
Total params: 843
Trainable params: 843
Non-trainable params: 0
_________________________________________________________________


In [32]:
#import keras dependencies for training.
from rl.agents import DQNAgent #
from rl.policy import BoltzmannQPolicy #Policy based RL
from rl.memory import SequentialMemory 

In [33]:
#Train model
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae']) #pass through optimizer and mean abs error metric 
dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)




    7/10000 [..............................] - ETA: 3:45 - reward: -1.0000  



27 episodes - episode_reward: -359.296 [-500.000, -164.000] - loss: 0.746 - mae: 19.114 - mean_q: -28.174

Interval 2 (10000 steps performed)
54 episodes - episode_reward: -186.500 [-500.000, -120.000] - loss: 1.356 - mae: 29.317 - mean_q: -43.006

Interval 3 (20000 steps performed)
58 episodes - episode_reward: -173.517 [-313.000, -93.000] - loss: 1.102 - mae: 26.286 - mean_q: -38.365

Interval 4 (30000 steps performed)
53 episodes - episode_reward: -186.019 [-500.000, -104.000] - loss: 1.080 - mae: 25.938 - mean_q: -37.863

Interval 5 (40000 steps performed)
done, took 1031.992 seconds


<tensorflow.python.keras.callbacks.History at 0x2146a9d7fd0>

In [34]:
#Build agent. Pass through model and actions agent can take
def build_agent(model, actions): 
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=70000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [13]:
env = gym.make(env_name)

In [35]:
 #Use test method. Pass through enviornment, 100 games.
scores = dqn.test(env, nb_episodes=100, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -90.000, steps: 91
Episode 2: reward: -87.000, steps: 88
Episode 3: reward: -68.000, steps: 69
Episode 4: reward: -70.000, steps: 71
Episode 5: reward: -86.000, steps: 87
Episode 6: reward: -70.000, steps: 71
Episode 7: reward: -68.000, steps: 69
Episode 8: reward: -83.000, steps: 84
Episode 9: reward: -77.000, steps: 78
Episode 10: reward: -69.000, steps: 70
Episode 11: reward: -161.000, steps: 162
Episode 12: reward: -95.000, steps: 96
Episode 13: reward: -132.000, steps: 133
Episode 14: reward: -68.000, steps: 69
Episode 15: reward: -140.000, steps: 141
Episode 16: reward: -69.000, steps: 70
Episode 17: reward: -59.000, steps: 60
Episode 18: reward: -94.000, steps: 95
Episode 19: reward: -68.000, steps: 69
Episode 20: reward: -67.000, steps: 68
Episode 21: reward: -80.000, steps: 81
Episode 22: reward: -104.000, steps: 105
Episode 23: reward: -472.000, steps: 473
Episode 24: reward: -83.000, steps: 84
Episode 25: reward: -68.000, steps

In [44]:
_ = dqn.test(env, nb_episodes=15, visualize=True)

Testing for 15 episodes ...
Episode 1: reward: -68.000, steps: 69
Episode 2: reward: -67.000, steps: 68
Episode 3: reward: -80.000, steps: 81
Episode 4: reward: -95.000, steps: 96
Episode 5: reward: -86.000, steps: 87
Episode 6: reward: -88.000, steps: 89
Episode 7: reward: -94.000, steps: 95
Episode 8: reward: -80.000, steps: 81
Episode 9: reward: -75.000, steps: 76
Episode 10: reward: -83.000, steps: 84
Episode 11: reward: -68.000, steps: 69
Episode 12: reward: -91.000, steps: 92
Episode 13: reward: -71.000, steps: 72
Episode 14: reward: -60.000, steps: 61
Episode 15: reward: -60.000, steps: 61


In [45]:
env.close()