In [1]:
import gym
import numpy as np
import tensorflow as tf

In [2]:
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.78, # optimum = .8196
)

In [3]:
env = gym.make('FrozenLakeNotSlippery-v0')
s = env.reset()

rewards = []
actions = []
states = [s]
for _ in range(100):
    env.render()
    a = env.action_space.sample()
    s, r, d, _ = env.step(a)
    rewards.append(r)
    actions.append(a)
    states.append(s)
    if d==True:
        break

[2017-06-20 06:50:15,363] Making new env: FrozenLakeNotSlippery-v0



[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG


折扣累积函数实现

In [4]:
num_episodes = 2000
num_exp = 100

discount = 0.99
Q = np.zeros([env.observation_space.n,env.action_space.n])

for i in range(num_episodes):
    s = env.reset()
    r_all = 0
    for j in range(num_exp):
        # 根据Q-table选择最优动作，为开始中加入随机项，其值随次数增多而减小，模拟探索／利用策略
        a = np.argmax(Q[s,] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        s1, r, d, _ = env.step(a)
        # 更新Q-table
        Q[s,a] = Q[s,a] + (r + discount*np.max(Q[s1,]-Q[s,a]))
        r_all += r
        s = s1
        if d==True:
            break
    
print Q

[[ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          0.          0.        ]]


神经网络实现

In [38]:
tf.reset_default_graph()

inputs = tf.placeholder(dtype=tf.float32, shape=[1,16])
W = tf.Variable(tf.random_uniform([16,4],minval=0,maxval=0.1))
Q = tf.matmul(inputs, W)
out = tf.argmax(Q, 1)

Qplus = tf.placeholder(dtype=tf.float32, shape=[1,4])
loss = tf.reduce_sum(tf.square(Qplus - Q))
optimize = tf.train.GradientDescentOptimizer(learning_rate=0.01)

update = optimize.minimize(loss)

In [55]:
num_episodes = 2000
num_exp = 100
e = 0.1
discount = 0.99
r_list = []
j_list = []

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        s = env.reset()
        rAll = 0
        for j in range(num_exp):
            a = sess.run(out, feed_dict={inputs:np.identity(16)[s:(s+1)]})
            if np.random.randn(1) < e:
                a[0] = env.action_space.sample()
            s1, r, d, _ = env.step(a[0])
            targetQ = sess.run(Q, feed_dict={inputs:np.identity(16)[s1:(s1+1)]})
            targetQ[0, a[0]] = r + discount*np.max(targetQ)
            _,Qw = sess.run([update,W], feed_dict={inputs:np.identity(16)[s:(s+1)], Qplus:targetQ})
            s = s1
            rAll += r
            if d==True:
                # 降低探索的概率
                e = 1.0/(i+10)
                break
        r_list.append(rAll)
        j_list.append(j)

In [56]:
Qw

array([[ 0.05831863,  0.06324577,  0.04077114,  0.05069289],
       [ 0.04550828,  0.05993972,  0.03847277,  0.05840481],
       [ 0.05555288,  0.05983341,  0.04727112,  0.05982288],
       [ 0.04594327,  0.06094423,  0.05360661,  0.06851841],
       [ 0.05967407,  0.06496553,  0.03774201,  0.04750047],
       [ 0.03905617,  0.02351878,  0.03271607,  0.05973833],
       [ 0.05554296,  0.04283049,  0.04565795,  0.05821531],
       [ 0.03585453,  0.01711981,  0.06851705,  0.06862446],
       [ 0.06349712,  0.07504875,  0.03240879,  0.04109855],
       [ 0.04754862,  0.04103345,  0.0416178 ,  0.05780173],
       [ 0.05191639,  0.05615814,  0.07320867,  0.04694156],
       [ 0.06624027,  0.05776247,  0.07247669,  0.06645908],
       [ 0.06540928,  0.08389746,  0.02745999,  0.03374217],
       [ 0.05370254,  0.01984292,  0.07395524,  0.08590403],
       [ 0.05304103,  0.04729204,  0.19151625,  0.04600854],
       [ 0.09376138,  0.05290662,  0.07220618,  0.00825945]], dtype=float32)

In [57]:
[[ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          0.          0.        ]]

<tf.Tensor 'MatMul:0' shape=(1, 4) dtype=float32>