# Simple Q-learning

In [1]:
import gym
from gym import wrappers
import numpy as np

In [2]:
# Create the FrozenLake environment and wrap it in a Monitor that writes
# its results under MONITOR_DIR.
MONITOR_DIR = '/tmp/frozenlake_q-learning'
base_env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(base_env, MONITOR_DIR)

# Set the timestep limit on the environment spec.
env.spec.timestep_limit = 100

[2017-09-10 16:11:02,145] Making new env: FrozenLake-v0
[2017-09-10 16:11:05,489] Creating monitor directory /tmp/frozenlake_q-learning


In [3]:
# Tabular Q-learning: Q holds one row per state and one column per action.
n_actions = env.action_space.n
Q = np.zeros([env.observation_space.n, n_actions])
lr = 0.8 # learning rate
y = 0.95 # discount
num_episodes = 2000
max_steps = 105 # hard cap on the number of steps taken in one episode

rewards = [] # total reward collected in each episode
for i in range(num_episodes):
    # Reset environment and get the first observation
    s = env.reset()
    rAll = 0

    for step in range(max_steps):
        # Choose an action greedily from the Q-table, perturbed by Gaussian
        # noise whose scale shrinks as 1 / (episode + 1) so that exploration
        # fades over time. The float literal keeps the scale from collapsing
        # to zero under integer division.
        noise = np.random.randn(n_actions) * (1.0 / (i + 1))
        a = np.argmax(Q[s, :] + noise)
        s1, r, d, _ = env.step(a)

        # Move Q[s, a] towards the bootstrapped target r + y * max_a' Q[s1, a']
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            # The environment reported the episode as finished
            break
    rewards.append(rAll)

[2017-09-10 16:11:05,524] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000000.json
[2017-09-10 16:11:05,526] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000001.json
[2017-09-10 16:11:05,532] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000008.json
[2017-09-10 16:11:05,542] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000027.json
[2017-09-10 16:11:05,563] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000064.json
[2017-09-10 16:11:05,596] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000125.json
[2017-09-10 16:11:05,641] Starting new video recorder writing to /tmp/frozenlake_q-learning/openaigym.video.0.21902.video000216.json
[2017-09-10 16:11:05,752] Starting new video recorder writing to /tmp

In [4]:
# Average reward per episode across the whole training run.
mean_reward = float(sum(rewards)) / num_episodes
print("Score over time: " + str(mean_reward))

Score over time: 0.451


In [5]:
# Show the learned table; Q has one row per state and one column per action.
print("Final Q-Table Values", Q)

Final Q-Table Values [[  9.71010236e-02   7.86331069e-03   7.60328486e-03   6.36374254e-03]
 [  1.31156676e-04   1.20741365e-03   2.58309539e-04   8.15404608e-02]
 [  1.24982103e-03   8.97638422e-02   9.30770097e-03   3.04034533e-03]
 [  7.74912670e-04   2.47262227e-03   1.21626153e-04   5.23123361e-02]
 [  1.47224938e-01   1.16966173e-04   1.44886674e-03   1.16346300e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  3.10872124e-04   5.67266036e-06   1.67405625e-01   8.73992282e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.89044922e-05   3.87305393e-04   8.94174865e-04   9.41439259e-02]
 [  1.49642717e-03   5.72485364e-02   3.87260942e-03   1.87062697e-03]
 [  5.53201529e-02   0.00000000e+00   6.69974510e-05   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.71069871e-01   0

In [6]:
# Close the monitored environment (the captured log reports the monitor
# results as written at this point).
env.close()
# Optional scoreboard upload, left disabled. The API key that was hardcoded
# here has been redacted: never commit credentials. Read the key from the
# environment instead (requires `import os`), e.g.
# gym.upload('/tmp/frozenlake_q-learning', api_key=os.environ['OPENAI_GYM_API_KEY'])

[2017-09-10 16:11:09,797] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/frozenlake_q-learning')
[2017-09-10 16:11:09,802] [FrozenLake-v0] Uploading 2000 episodes of training data
[2017-09-10 16:11:14,040] [FrozenLake-v0] Uploading videos of 11 training episodes (1387 bytes)
[2017-09-10 16:11:14,897] [FrozenLake-v0] Creating evaluation object from /tmp/frozenlake_q-learning with learning curve and training video
[2017-09-10 16:11:15,306] 
****************************************************
You successfully uploaded your evaluation on FrozenLake-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_KIgrRxuTTfypRt63SIt2Aw

****************************************************


# Q-Network Learning

In [1]:
import gym
from gym import wrappers
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Create the FrozenLake environment and wrap it in a Monitor that writes
# its results under MONITOR_DIR.
MONITOR_DIR = '/tmp/frozenlake_network-q-learning'
base_env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(base_env, MONITOR_DIR)

# Set the timestep limit on the environment spec.
env.spec.timestep_limit = 100

[2017-09-10 16:23:45,183] Making new env: FrozenLake-v0
[2017-09-10 16:23:45,187] Creating monitor directory /tmp/frozenlake_network-q-learning


In [4]:
# Reset TensorFlow's default graph before the network below is defined
# (presumably so that re-running the notebook does not pile duplicate ops
# into one graph -- confirm).
tf.reset_default_graph()

In [7]:
# Feed-forward part of the network
inputs1 = tf.placeholder(shape=[1,16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Qout = tf.matmul(inputs1, W)
predict = tf.argmax(Qout, 1)

# Back propagation using loss that is squared error
nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

Train the network

In [None]:
init = tf.initialize_all_variables()

# Set params
y = 0.99 # discount
e = 0.1
num_episodes

