* Written and coded by: Chirag Mirani
* On Tuesday, January 25, 2022

# 1. Importing Required Libraries
* Key library here is Openai Gym, which allows us to import OpenAI environments
* This "from IPython.display import clear_output" is used to clear output from Jupyter Notebook

In [18]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output


# 2. Importing Taxi-v3 environment
This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning. There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and drop him off in another. You receive +20 points for a successful dropoff, and lose 1 point for every timestep it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.



In [13]:
env = gym.make('Taxi-v3')

# 3. Go through ten random action episode in the Taxi-v3 enviroment
* 1. 10 episodes are iterated through via for loop
* 2. We initialize each episode
* 3. We perform action until we are done for each game episode
    * 1. We render the environment
    * 2. We take a random action and store the next state, reward, done (True or false) and info for each action taken
    * 3. We accumulate our total score, which is a running sum of reward received for each action (through one complete episode)
    * 4. We clear the output after each action to display the next screen. 
    * 5. After each epiosde, we display the total score

In [14]:
episodes = 10

for episode in range (1, episodes):
    state = env.reset()
    done= False
    score =0
    
    while not done:
        env.render()
        state, reward, done, info = env.step(env.action_space.sample())
        score+=reward
        clear_output(wait=True)
    print ('Episode: {}\nScore: {}'.format (episode, score))
env.close()

Episode: 9
Score: -713


# Here we build our Q-table
# What?? What the heck is a Q-table???
* Q-table is a fancy name for a look up table for the environment that the agent can reference and update.  Specifically, along the rows you have different states encountered and along the columns is an action the agent can take.  The cells will hold the value received for state,action pair.  Interesting thought here is that how would the agent know the size of this q-table at inception because he hasn't explored the environment. 
* Hence Q-table is initialized to state x actions matrix

In [23]:
actions = env.action_space.n
state = env.observation_space.n

# initialized our Q-table
q_table = np.zeros((state, actions))


array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [31]:
#parameters for Q-learning
num_episodes = 10000
max_steps_per_episode = 100
learning_rate =0.1
discount_rate = 0.99
exploration_rate=1
max_exploration_rate =1
min_exploration_rate = 0.01
exploration_decay_rate=0.001
rewards_all_episodes=[]

In [33]:
#Q-learning Algorithm

for episodes in range(num_episodes):
    state = env.reset()
    done=False
    rewards_current_episode =0
    
    for step in range(max_steps_per_episode):
        
        #Exploration vs Exploitation trade-off
        exploration_treshold=np.random.uniform(0,1)
        if exploration_treshold>exploration_rate:
            action=np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        
        #Update Q-Table
        q_table[state, action] = learning_rate*(reward + discount_rate*np.max(q_table[new_state,:]))+q_table[state,action] \
                                -learning_rate*(q_table[state,action])
       
        state=new_state
        rewards_current_episode +=reward
        
        if done == True:
            break
                
    exploration_rate =min_exploration_rate+ \
                    (max_exploration_rate-min_exploration_rate)*np.exp(-exploration_decay_rate*episode)
    
    rewards_all_episodes.append(rewards_current_episode)

print("***** Training Finished******")

***** Training Finished******


In [34]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 4.72961412,  5.865136  ,  4.86149983,  6.15531319,  7.44936047,
        -2.80174748],
       [10.38150481, 11.65009807, 10.41929749, 11.6711564 , 12.88259705,
         2.64633382],
       ...,
       [13.42805882, 14.8796532 , 13.49330736, 11.96069377,  4.41648867,
         4.4819251 ],
       [ 6.05846999,  7.95668135,  6.12379843,  7.73069972, -2.99643012,
        -2.7902915 ],
       [16.78791948, 15.69059773, 17.06967515, 18.56699302,  8.15224757,
         8.11085919]])

In [None]:
#Calculate and print average reward per thousand episodes
