In [1]:
import gym
import numpy as np
import tensorflow as tf

# FrozenLake确定版本

In [2]:
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.78, # optimum = .8196
)

随机尝试FrozenLake问题

In [3]:
env = gym.make('FrozenLakeNotSlippery-v0')
s = env.reset()

for _ in range(100):
    env.render()
    a = env.action_space.sample()
    s, r, d, _ = env.step(a)
    if d==True:
        break

[2017-06-24 08:38:52,639] Making new env: FrozenLakeNotSlippery-v0



[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG


# 折扣累积函数实现

In [4]:
num_episodes = 2000
num_exp = 100

discount = 0.99
Q = np.zeros([env.observation_space.n,env.action_space.n])

for i in range(num_episodes):
    s = env.reset()
    r_all = 0
    for j in range(num_exp):
        # 根据Q-table选择最优动作，为开始中加入随机项，其值随次数增多而减小，模拟探索／利用策略
        a = np.argmax(Q[s,] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        s1, r, d, _ = env.step(a)
        # 更新Q-table
        Q[s,a] = Q[s,a] + (r + discount*np.max(Q[s1,]-Q[s,a]))
        r_all += r
        s = s1
        if d==True:
            break
    
print Q

[[ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          1.01010101  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.01010101  0.        ]
 [ 0.          0.          0.          0.        ]]


# 神经网络实现

In [5]:
tf.reset_default_graph()

inputs = tf.placeholder(dtype=tf.float32, shape=[1,16])
W = tf.Variable(tf.random_uniform([16,4],minval=0,maxval=0.01))
Q = tf.matmul(inputs, W)
out = tf.argmax(Q, 1)

Qplus = tf.placeholder(dtype=tf.float32, shape=[1,4])
loss = tf.reduce_sum(tf.square(Qplus - Q))
optimize = tf.train.GradientDescentOptimizer(learning_rate=0.1)

update = optimize.minimize(loss)

In [7]:
num_episodes = 2000
num_exp = 100
e = 0.1
discount = 0.99
r_list = []
j_list = []

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        s = env.reset()
        rAll = 0
        for j in range(num_exp):
            a,targetQ = sess.run([out, Q], feed_dict={inputs:np.identity(16)[s:(s+1)]})
            if np.random.randn(1) < e:
                a[0] = env.action_space.sample()
            s1, r, d, _ = env.step(a[0])
            Q1 = sess.run(Q, feed_dict={inputs:np.identity(16)[s1:(s1+1)]})
            targetQ[0, a[0]] = r + discount*np.max(Q1)
            _,Qw = sess.run([update,W], feed_dict={inputs:np.identity(16)[s:(s+1)], Qplus:targetQ})
            s = s1
            rAll += r
            if d==True:
                # 随着次数的增多，不断降低探索的概率
                e = 1.0/((i/50)+10)
                break
        r_list.append(rAll)
        j_list.append(j)
        
print Qw

[[ 0.00932195  0.00941612  0.00941612  0.00932195]
 [ 0.00932195  0.00951123  0.00932195  0.00941612]
 [ 0.00941612  0.00939542  0.00917731  0.00931969]
 [ 0.00927864  0.00589476  0.00914054  0.0067129 ]
 [ 0.00941612  0.00932195  0.00951123  0.00932195]
 [ 0.00939793  0.00960731  0.00221813  0.00609945]
 [ 0.0095078   0.00349769  0.00476513  0.00696958]
 [ 0.00466632  0.00539051  0.00580434  0.00222694]
 [ 0.00932195  0.00902112  0.00919807  0.00941612]
 [ 0.00930827  0.00718343  0.00615196  0.00862864]
 [ 0.00221011  0.00639263  0.00259545  0.0039196 ]
 [ 0.0016076   0.00025851  0.00510592  0.00206563]
 [ 0.00778738  0.00628488  0.00911224  0.00363264]
 [ 0.008194    0.0069425   0.00517019  0.00840881]
 [ 0.00758353  0.00226784  0.00114562  0.0039397 ]
 [ 0.00682094  0.00621753  0.00688601  0.00319589]]
