# Q-Learning Numpy
Q-learning implemented with NumPy to play the Frozen Lake environment from OpenAI Gym.

## CartPole-v1
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

In [60]:
import gym
import numpy as np

In [72]:
# Build the Gym environment the agent interacts with.
# (The notebook previously used the id "FrozenLake-v0" here.)
env_id = "CartPole-v1"
env = gym.make(env_id)

In [73]:
# Show the lower and then the upper limit of every observation component
obs_space = env.observation_space
for bound in (obs_space.low, obs_space.high):
    print(bound)

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [None]:
# TODO: discretize the continuous observations (e.g. with np.digitize)
# so that a state can be used as a Q-table index

In [74]:
# Sizes later used to dimension the Q-table.
# NOTE(review): `observation_space.shape` is presumably a tuple for the
# CartPole observation (the printed bounds have 4 entries), not a count of
# states -- confirm before passing it to np.zeros as a Q-table dimension.
state_size = env.observation_space.shape
action_size = env.action_space.n

In [66]:
# Start from an all-zero Q-table: rows are indexed by state, columns by action
q_shape = (state_size, action_size)
qtable = np.zeros(q_shape)
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [67]:
# Hyperparameters
total_episodes = 100000       # Total episodes
learning_rate = 0.2           # Learning rate
max_steps = 100               # Max steps per episode (name read by the loops)
epsido_length = max_steps     # Old misspelled name, kept as an alias
gamma = 0.9                   # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.05            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate for exploration prob

In [68]:
# Total reward collected in each training episode
rewards = []

# Train for a fixed number of episodes
for episode in range(total_episodes):
    # Every episode starts from a freshly reset environment
    state = env.reset()
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy action selection: draw a uniform random number and
        # compare it with the current exploration rate.
        exp_exp_tradeoff = np.random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: take the action with the largest Q value for this state
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: take a random action
            action = env.action_space.sample()

        # Apply the action (a) and observe the outcome state (s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of all actions available in s'
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] = (qtable[state, action] +
                                 learning_rate * (td_target - qtable[state, action]))

        total_rewards += reward

        # The outcome state becomes the current state for the next step
        state = new_state

        # Stop stepping as soon as the environment reports the episode is over
        if done:
            break

    # Decay epsilon exponentially from max_epsilon toward min_epsilon
    # (less and less exploration as training progresses)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Average reward per episode over the whole training run
print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)

Score over time: 0.31276
[[0.06750343 0.05193442 0.05640886 0.04888681]
 [0.04080654 0.01923145 0.01881644 0.04515681]
 [0.04179711 0.03899454 0.03059155 0.02821348]
 [0.02056088 0.0199648  0.01001099 0.03353127]
 [0.09285666 0.04358047 0.04751317 0.05441464]
 [0.         0.         0.         0.        ]
 [0.02980485 0.00953963 0.04048369 0.02422391]
 [0.         0.         0.         0.        ]
 [0.0778595  0.07727393 0.07065936 0.16392884]
 [0.10823278 0.21936239 0.16986442 0.05251152]
 [0.27503764 0.10682137 0.14313746 0.11233695]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.23001888 0.34904871 0.38873794 0.21920675]
 [0.34010503 0.70357338 0.32244571 0.43469428]
 [0.         0.         0.         0.        ]]


In [69]:
# Reset once before the evaluation episodes begin
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # Act greedily: pick the action index with the highest Q value
        # for the current state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode still running: continue from the outcome state
            state = new_state
            continue

        # Episode finished: render only this final state, then report the
        # index of the step on which it ended
        env.render()
        print("Number of steps", step)
        break
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 16
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 10
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 17
****************************************************
EPISODE  3
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Number of steps 25
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 91
