In [1]:
import tensorflow as tf
import gym
import random


In [2]:
import keras

Using TensorFlow backend.


In [2]:
env = gym.make('CartPole-v0')
#size of the observation vector and number of available actions in our environment
actions = env.action_space.n
states = env.observation_space.shape[0]

In [3]:
#rebind `states` to the observation-space object itself so it can be inspected
#NOTE: after this cell `states` is no longer the integer size computed earlier
states = env.observation_space
for item in (type(states), states):
    print(item)

<class 'gym.spaces.box.Box'>
Box(4,)


In [4]:
#first entry of the observation-space shape = number of values in one observation
n_state_values = states.shape[0]
print('number of states available in the env:', n_state_values)
print('number of actions we can take[LEFT or RIGHT]:', actions)

number of states available in the env: 4
number of actions we can take[LEFT or RIGHT]: 2


# Visualising our cart-pole when taking random steps

In [6]:
#number of episodes to play; kept in its own name so the loop variable
#below does not overwrite it
episodes = 10

#run 10 episodes with a purely random policy
for episode in range(1, episodes + 1):
    #reset env and score
    state = env.reset()
    done = False
    score = 0

    while not done:  # the episode ends as soon as the env reports done
        #render/visualise our cartpole
        env.render()
        #take a random step (left or right)
        action = random.choice([0, 1])

        #retrieve the next state, reward and termination flag for this step
        n_state, reward, done, info = env.step(action)
        #accumulate the reward obtained at this step
        score += reward

    print('Episode {}, Score {}'.format(episode, score))

#close the render window once all episodes are finished
env.close()
        
    

Episode 1, Score 15.0
Episode 2, Score 21.0
Episode 3, Score 23.0
Episode 4, Score 43.0
Episode 5, Score 49.0
Episode 6, Score 23.0
Episode 7, Score 25.0
Episode 8, Score 20.0
Episode 9, Score 16.0
Episode 10, Score 12.0


# Create Deep Learning model


In [3]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten
from tensorflow.keras.optimizers import Adam

In [4]:
#The model receives the size of the observation vector (4 values for CartPole)
#and the number of possible actions (2)

def build_model(states, actions):
    """Build the fully connected network used by the agent.

    Parameters
    ----------
    states : int
        Number of values in one observation; used as the last dimension of
        the ``(1, states)`` input shape.
    actions : int
        Number of units in the linear output layer (one per action).

    Returns
    -------
    Sequential
        Uncompiled model: Flatten -> Dense(24, relu) -> Dense(24, relu)
        -> Dense(actions, linear).
    """
    return Sequential([
        Flatten(input_shape=(1, states)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ])

In [25]:
#pass the integer observation size explicitly: `states` was rebound to the
#observation-space object in an earlier cell and is no longer a number
model = build_model(env.observation_space.shape[0], actions)

In [26]:
#print the layer-by-layer summary of the model built above
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


# Build Agent with Keras RL

In [5]:

#type of agent
from rl.agents import DQNAgent
#Action-selection policy: BoltzmannQPolicy picks actions at random according to the Q-values
from rl.policy import BoltzmannQPolicy
#Creating memory for DQN Agent
from rl.memory import SequentialMemory

In [6]:
#in this function we will pass the model and the number of actions we can take
def build_agent(model, actions, memory_limit=5000, warmup_steps=10,
                target_model_update=1e-2):
    """Wrap a Keras model in a DQN agent.

    Parameters
    ----------
    model : keras model
        Network passed to the agent as ``model``.
    actions : int
        Number of actions, passed to the agent as ``nb_actions``.
    memory_limit : int, optional
        ``limit`` of the SequentialMemory (default 5000).
    warmup_steps : int, optional
        Value passed as ``nb_steps_warmup`` (default 10).
    target_model_update : float, optional
        Value passed as ``target_model_update`` (default 1e-2).

    Returns
    -------
    DQNAgent
        The agent, not yet compiled.
    """
    policy = BoltzmannQPolicy()
    #window_length=1 presumably has to match the leading 1 of the model's
    #input_shape=(1, states) -- confirm before changing either
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    #for the dqn agent we pass our model, memory, policy and other key variables
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   nb_actions=actions, nb_steps_warmup=warmup_steps,
                   target_model_update=target_model_update)
    return dqn

## Training our DQN agent

In [29]:
dqn = build_agent(model, actions)
#compile the agent with the Adam optimizer and track mae as a metric
#(`learning_rate` is used instead of the deprecated `lr` argument name)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

#train for 50000 environment steps without rendering
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)




    1/10000 [..............................] - ETA: 2:16:59 - reward: 1.0000



96 episodes - episode_reward: 103.156 [11.000, 200.000] - loss: 2.582 - mae: 19.350 - mean_q: 38.961

Interval 2 (10000 steps performed)
53 episodes - episode_reward: 188.415 [22.000, 200.000] - loss: 4.918 - mae: 35.576 - mean_q: 71.144

Interval 3 (20000 steps performed)
53 episodes - episode_reward: 190.698 [34.000, 200.000] - loss: 12.832 - mae: 43.663 - mean_q: 87.193

Interval 4 (30000 steps performed)
51 episodes - episode_reward: 194.647 [74.000, 200.000] - loss: 18.142 - mae: 44.279 - mean_q: 88.318

Interval 5 (40000 steps performed)
done, took 1269.226 seconds


<tensorflow.python.keras.callbacks.History at 0x33e49a90>

- We have essentially trained our agent to balance a cartpole in the environment 
- We can see our agent reaches a mean episode reward of ~194 (out of a maximum of 200) in the last reported training interval

In [31]:
#retesting agent on the environment 

#evaluate the agent on the env for 100 episodes without rendering
scores = dqn.test(env, nb_episodes=100, visualize=False)
#per-episode rewards recorded in the returned history object
episode_rewards = scores.history['episode_reward']
print(episode_rewards)

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

In [33]:
#run 15 more test episodes; visualize is left at its default here
#(presumably this renders the environment -- confirm against the keras-rl version in use)
test_with_visualuzation=dqn.test(env, nb_episodes=15)

Testing for 15 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200


# Saving the model

In [34]:
#save the agent's weights to 'dqn_weights.h5f'; overwrite=True is passed so an existing file is replaced
dqn.save_weights('dqn_weights.h5f', overwrite=True)

### Rebuilding the model and env from saved model

In [35]:
#drop the trained objects so the next cells have to rebuild them from scratch
del model, dqn, env

In [7]:
#re-create the environment
env = gym.make('CartPole-v0')

#number of actions available in the environment
actions = env.action_space.n
#number of values in one observation of the environment
states = env.observation_space.shape[0]

#build model
model = build_model(states, actions)

#build and compile the dqn agent
#(`learning_rate` is used instead of the deprecated `lr` argument name)
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [8]:
#load the weights saved in 'dqn_weights.h5f' into the rebuilt dqn agent
dqn.load_weights('dqn_weights.h5f')

In [9]:
#test the reloaded agent for 15 episodes; visualize is left at its default here
#(presumably this renders the environment -- confirm against the keras-rl version in use)
test_with_visualuzation=dqn.test(env, nb_episodes=15)

Testing for 15 episodes ...




Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
