<a href="https://colab.research.google.com/github/Ekpenyong-Esu/Deep-RL-Exercise/blob/main/gym_mountain_car.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!apt update && apt install xvfb
#!pip install gym[classic_control]

In [None]:
import cv2
from google.colab.patches import cv2_imshow

import os
os.environ["SDL_VIDEODRIVER"] = "dummy"
from IPython import display

In [None]:
%matplotlib notebook

import gym
import numpy as np  
import matplotlib.pyplot as plt

In [None]:
# Build the MountainCar-v0 environment; later cells reset, step, and render this `env` object
env = gym.make("MountainCar-v0")

**TASK: Write a function to create a numpy array holding the bins for the observations of the car (position and velocity).** <br />
Feel free to experiment with different numbers of bins per observation.
The function should take one argument: the number of bins per observation. <br />
Hint: You can find the observation ranges here: https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py
<br /> Hint: You will probably need around 25 bins for good results, but feel free to use fewer to reduce training time. <br />


In [None]:
# Observation bins

def create_bins(num_bins_per_observation):
    """Build evenly spaced bin edges for the two observation components.

    Args:
        num_bins_per_observation: number of edges generated for each component.

    Returns:
        A numpy array with one row per component: row 0 spans the car
        position range [-1.2, 0.6] and row 1 spans the car velocity range
        [-0.07, 0.07].
    """
    observation_ranges = (
        (-1.2, 0.6),    # car position
        (-0.07, 0.07),  # car velocity
    )
    edges = [
        np.linspace(low, high, num_bins_per_observation)
        for low, high in observation_ranges
    ]
    return np.array(edges)

In [None]:
# Discretization resolution shared by the rest of the notebook
NUM_BINS = 40  # number of bin edges per observation component
BINS = create_bins(num_bins_per_observation=NUM_BINS)  # bin edges: row 0 = position, row 1 = velocity

Now we need the code to discretize the observations. We can reuse the code from the last notebook.

In [None]:
# Discretize the observation space
def discretize_observation(observations, bins):
    """Map each continuous observation component to the index of its bin.

    Args:
        observations: sequence of observation values, one per component.
        bins: sequence of increasing bin-edge arrays; ``bins[i]`` holds the
            edges used for ``observations[i]``.

    Returns:
        Tuple of plain ints, one per component, each within
        ``[0, len(bins[i]) - 1]`` so it can directly index a table that has
        ``len(bins[i])`` entries along that axis.
    """
    binned_observations = []
    for i, observation in enumerate(observations):
        edges = bins[i]
        # np.digitize returns len(edges) for values at or above the last edge,
        # which is one past the end of a table with len(edges) entries per axis.
        # Fold those values into the last bin instead of overflowing.
        last_valid_index = max(len(edges) - 1, 0)
        bin_index = min(int(np.digitize(observation, edges)), last_valid_index)
        binned_observations.append(bin_index)
    return tuple(binned_observations)  # tuple so the result can index the Q-table directly

In [None]:
# Sanity-check the binning helpers on a small 5-edge grid

test_bins = create_bins(5)
# Row 0 must hold evenly spaced position edges, row 1 evenly spaced velocity edges
np.testing.assert_almost_equal(test_bins[0], [-1.2 , -0.75, -0.3 ,  0.15,  0.6])
np.testing.assert_almost_equal(test_bins[1], [-0.07 , -0.035,  0.   ,  0.035,  0.07 ])

# -0.9 lies between the position edges -1.2 and -0.75 (index 1);
# 0.03 lies between the velocity edges 0.0 and 0.035 (index 3)
test_observation = np.array([-0.9, 0.03])
discretized_test_bins = discretize_observation(test_observation, test_bins)
assert discretized_test_bins == (1, 3)

In [None]:
# Q-table: one axis per discretized observation component plus one axis for the actions

q_table_shape = (NUM_BINS,) * 2 + (env.action_space.n,)
q_table = np.zeros(shape=q_table_shape)  # every state-action value starts at zero
print(q_table.shape)

In [None]:
# Epsilon-greedy policy

def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    """Choose an action with the epsilon-greedy rule.

    A uniform random draw is compared with ``epsilon``. When the draw exceeds
    ``epsilon`` the action with the highest Q-value stored for
    ``discrete_state`` is returned; otherwise a random action index is sampled
    from the action space of the notebook-level ``env``.

    Args:
        epsilon: exploration threshold compared against the random draw.
        q_table: numpy array indexed by the discrete state, with the action
            values on the last axis.
        discrete_state: tuple of bin indices identifying the current state.

    Returns:
        The index of the selected action.
    """
    exploit = np.random.random() > epsilon
    if not exploit:
        return np.random.randint(0, env.action_space.n)
    return np.argmax(q_table[discrete_state])

In [None]:
# Hyperparameters

EPOCHS = 30000  # number of training episodes run by the training loop
BURN_IN = 100  # first epoch at which epsilon is reduced
epsilon = 1  # initial exploration rate: with epsilon at 1 every action is chosen at random

EPSILON_END= 10000  # last epoch at which epsilon is reduced
EPSILON_REDUCE = 0.0001  # amount subtracted from epsilon after each epoch in [BURN_IN, EPSILON_END]

ALPHA = 0.8  # learning rate: weight of the temporal-difference error in each Q-value update
GAMMA = 0.9  # discount factor applied to the best Q-value of the next state

**TASK: Fill out the function to compute the next Q value.**

In [None]:
def compute_next_q_value(old_q_value, reward, next_optimal_q_value):
    """Return the updated Q-value for one Q-learning step.

    Args:
        old_q_value: current Q-value of the state-action pair being updated.
        reward: reward received for taking the action.
        next_optimal_q_value: highest Q-value available in the next state.

    Returns:
        ``old_q_value`` moved towards the discounted target
        ``reward + GAMMA * next_optimal_q_value`` by the fraction ``ALPHA``.
    """
    td_target = reward + GAMMA * next_optimal_q_value
    td_error = td_target - old_q_value
    return old_q_value + ALPHA * td_error


**TASK: Create a function to reduce epsilon. Feel free to choose any reduction method you want. We'll use a reduction with BURN_IN and EPSILON_END limits in the solution. We'll also show a way to reduce epsilon based on the number of epochs. Feel free to experiment here.**

In [None]:
def reduce_epsilon(epsilon, epoch):
    """Return the exploration rate to use after the given epoch.

    Args:
        epsilon: current exploration rate.
        epoch: index of the epoch that has just finished.

    Returns:
        ``epsilon`` lowered by ``EPSILON_REDUCE`` when ``epoch`` lies in the
        inclusive window ``[BURN_IN, EPSILON_END]``; otherwise ``epsilon``
        unchanged.
    """
    in_decay_window = BURN_IN <= epoch <= EPSILON_END
    return epsilon - EPSILON_REDUCE if in_decay_window else epsilon

In [None]:
epochs = []  # store the epoch for plotting

# Tabular Q-learning: one episode per epoch, epsilon reduced between episodes
for epoch in range(EPOCHS):
    # Report progress once per milestone epoch rather than once per environment step
    if epoch % 10000 == 0:
        print(f"EPOCH is {epoch}  ")

    # Start a new episode and map its first observation onto the bins
    initial_state = env.reset()
    discretized_state = discretize_observation(initial_state, BINS)
    done = False  # set by env.step once the episode has ended

    epochs.append(epoch)

    while not done:
        # Pick the next action with the epsilon-greedy policy
        action = epsilon_greedy_action_selection(epsilon, q_table, discretized_state)

        # Apply the action and discretize the resulting observation
        next_state, reward, done, info = env.step(action)
        next_state_discretized = discretize_observation(next_state, BINS)

        # Current estimate for (state, action) and the best estimate reachable from the next state
        old_q_value = q_table[discretized_state + (action,)]
        next_optimal_q_value = np.max(q_table[next_state_discretized])

        # Write the updated estimate back into the table
        next_q = compute_next_q_value(old_q_value, reward, next_optimal_q_value)
        q_table[discretized_state + (action,)] = next_q

        # The next state becomes the current state for the following step
        discretized_state = next_state_discretized

    # Reduce exploration once the episode is finished
    epsilon = reduce_epsilon(epsilon, epoch)

In [None]:
# Roll out the greedy policy from the Q-table and show every frame inline
observation = env.reset()
try:
    for counter in range(3000):
        # Replace the previously shown frame with the current one. Only the
        # rgb_array frame is displayed; SDL runs with the dummy video driver,
        # so a human-mode window would never be visible.
        display.clear_output(wait=True)
        frame = env.render(mode='rgb_array')
        cv2_imshow(frame)
        cv2.waitKey(1)

        # Discretize the observation and take the highest-valued action for it
        discrete_state = discretize_observation(observation, BINS)
        action = np.argmax(q_table[discrete_state])

        # Apply the action in the environment
        observation, reward, done, info = env.step(action)

        if done:
            print("done")
            break
finally:
    # Close the environment even when rendering or stepping raises
    env.close()
