In [30]:
# --- Imports: standard library first, then third-party ---
import random
import time

import gym
import numpy as np
import tensorflow
import tensorflow.compat.v1 as tf
from gym.envs.registration import register
from IPython.display import clear_output

# --- Configuration ---
# Switch to TF1-style graph mode *before* seeding: the v1 seed call sets a
# graph-level seed on the default graph, so it has to run once graph mode is
# active rather than while TensorFlow is still executing eagerly.
tf.disable_v2_behavior()

# Single source of truth for every random number generator used below.
SEED = 420
random.seed(SEED)
np.random.seed(SEED)
tensorflow.random.set_seed(SEED)
# NOTE(review): a later tf.reset_default_graph() creates a fresh graph, which
# presumably does not inherit this graph-level seed -- re-seed after any reset.
tf.set_random_seed(SEED)

In [31]:
# https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py
# https://github.com/openai/gym/blob/master/gym/envs/__init__.py
# Register a deterministic (non-slippery) 4x4 FrozenLake variant.
# Re-running this cell would register the same id twice; only that specific
# gym error is ignored so that genuine mistakes (bad kwargs, typos, Ctrl-C)
# still surface instead of being silently swallowed.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        # NOTE(review): threshold/optimum figures look copied from the slippery
        # FrozenLake-v0 registration -- confirm they suit the no-slip variant.
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Environment id already registered earlier in this kernel session.
    pass

# Alternative environments that can be swapped in:
# env_name = "CartPole-v1"
# env_name = "MountainCar-v0"
# env_name = "MountainCarContinuous-v0"
# env_name = "Acrobot-v1"
# env_name = "Pendulum-v0"
# env_name = "FrozenLake-v0"
env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
# Last expression of the cell: displays the concrete action-space class.
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [32]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Supports both discrete action spaces (an integer in ``[0, n)``) and
    box-like continuous spaces (an array sampled between ``low`` and ``high``).
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what is needed for sampling.

        Args:
            env: environment exposing an ``action_space`` attribute. For a
                discrete space ``.n`` is read; otherwise ``.low``, ``.high``
                and ``.shape`` are read.
        """
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of Discrete.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise a
            numpy array of shape ``action_shape`` drawn between the bounds.
        """
        if self.is_discrete:
            # Draws the same value random.choice(range(n)) would.
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [33]:
class QNAgent(Agent):
    """Epsilon-greedy Q-learning agent backed by a one-layer TF1 network.

    The network maps a one-hot encoded discrete state through a single dense
    layer (named ``"q_table"``) to one Q-value per action.
    """

    def __init__(self, env, discount_rate=0.9, learning_rate=0.01,
                 eps_decay=0.99, seed=None):
        """Build the graph and open a session.

        Args:
            env: environment with discrete observation and action spaces.
            discount_rate: factor applied to the bootstrapped next-state value.
            learning_rate: step size handed to the Adam optimizer.
            eps_decay: multiplier applied to ``eps`` after each finished
                episode (defaults to the previously hard-coded 0.99).
            seed: optional graph-level seed applied after the default graph is
                reset; ``None`` leaves the fresh graph unseeded (the previous
                behaviour).
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.eps = 1.0  # start fully exploratory
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.seed = seed
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build_model(self):
        """Create placeholders, the Q layer, the loss and the training op."""
        tf.reset_default_graph()
        # The reset produces a brand-new graph, so a graph-level seed set
        # earlier no longer applies; re-apply it here when one was requested.
        seed = getattr(self, "seed", None)
        if seed is not None:
            tf.set_random_seed(seed)

        self.state_in = tf.placeholder(tf.int32, shape=[1])
        self.action_in = tf.placeholder(tf.int32, shape=[1])
        self.target_in = tf.placeholder(tf.float32, shape=[1])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # q_state holds the Q-values of every action for the current state.
        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # Element-wise product with the one-hot action leaves only the taken
        # action's Q-value (e.g. [-0.2, 0, 0, 0]); summing extracts it.
        # reduce_max would be wrong here: a negative Q-value would lose to the
        # zeros left in the masked-out positions.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action))
        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        # Despite the attribute name, this is the op that performs one Adam step.
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})
        # argmax over the flattened (1, action_size) result is the action index.
        action_greedy = np.argmax(q_state)
        # The random action is always drawn, so the RNG stream does not depend
        # on which branch ends up being taken.
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Run one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        if done:
            # No bootstrapping from a finished episode; skip the forward pass.
            max_q_next = 0.0
        else:
            q_next = self.sess.run(self.q_state, feed_dict={self.state_in: [next_state]})[0]
            max_q_next = np.max(q_next)
        q_target = reward + self.discount_rate * max_q_next

        feed = {self.state_in: [state], self.action_in: [action], self.target_in: [q_target]}
        self.sess.run(self.optimizer, feed_dict=feed)

        if done:
            # Anneal exploration once per finished episode.
            self.eps = self.eps * self.eps_decay

    def __del__(self):
        """Close the session if construction got far enough to create one."""
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()
        

In [37]:
# Build the Q-network agent; hyperparameters are spelled out (same values as
# the constructor defaults) so they are visible where the agent is created.
agent = QNAgent(env, discount_rate=0.9, learning_rate=0.01)

Action size: 4
State size: 16


In [39]:
# Train for 100 episodes, showing the latest step, the rendered map and the
# current weight matrix of the "q_table" layer after every environment step.
total_reward = 0  # cumulative reward across ALL episodes, not per episode

# The layer's kernel is the same variable on every step, so resolve it once
# instead of re-entering the variable scope inside the inner loop.
with tf.variable_scope("q_table", reuse=True):
    q_kernel = tf.get_variable("kernel")

for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        # Disabled experiment: penalise ending an episode anywhere other than
        # state 15.
#         if done and next_state !=15:
#             reward=-0.5
        # NOTE(review): `done` presumably also turns True when the registered
        # max_episode_steps limit is hit, and train() then treats that step as
        # terminal -- confirm this is intended.
        agent.train((state,action,next_state,reward,done))
        state = next_state
        total_reward += reward

        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        weights = agent.sess.run(q_kernel)
        print(weights)
        # wait=True defers clearing until the next output arrives (less flicker).
        clear_output(wait=True)

s: 15 a: 2
Episode: 99, Total reward: 73.0, eps: 0.13397967485796175
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[ 0.06523675  0.07017484  0.05906906  0.08723167]
 [ 0.07150782 -0.515132   -0.07802173  0.09812926]
 [ 0.0684272   0.21908918 -0.09400233 -0.00274765]
 [ 0.0719268  -0.2527575  -0.08098038 -0.13532001]
 [ 0.12610865  0.13444243 -0.40535855  0.13172986]
 [-0.27663392 -0.42904347  0.14204443  0.07371294]
 [-0.09707171  0.26722014 -0.15264829  0.06165149]
 [ 0.52954185 -0.542935    0.54214966  0.30800295]
 [ 0.1908543  -0.5110466   0.30814245  0.20012614]
 [ 0.18896708  0.1126323   0.38994366 -0.35970733]
 [ 0.262737    0.36673206 -0.41757014  0.33486882]
 [ 0.16353804 -0.5332946   0.449063   -0.36168173]
 [-0.37550336 -0.10144603 -0.3428325   0.0663262 ]
 [-0.48149237 -0.04618061  0.05461503  0.33457702]
 [ 0.1991122   0.26590595  0.57742614  0.42087302]
 [ 0.44857305 -0.3713547  -0.20806307 -0.41181913]]


In [46]:
# Monte Carlo estimate of the probability of seeing exactly 4 heads in
# 6 fair coin flips (a flip counts as heads when random.random() > 0.5).
# For reference, the exact value is C(6, 4) / 2**6 = 15/64 = 0.234375.
total = 0          # number of trials that produced exactly 4 heads
t2 = 10000000      # number of simulated trials
for _trial in range(t2):
    heads_seen = sum(1 for _flip in range(6) if random.random() > 0.5)
    total += heads_seen == 4
print(total / t2)

0.2345922
