### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
import time
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
# from keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

In [2]:
import warnings
warnings.filterwarnings('ignore')

#### Defining Time Matrix

In [3]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

In [4]:
Time_matrix.shape

(5, 5, 24, 7)

#### Tracking the state-action pairs for checking convergence


In [5]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name):
    """Serialise ``obj`` with pickle into the file ``<name>.pkl``.

    Args:
        obj: Any picklable Python object.
        name: Path of the output file without the ``.pkl`` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Use the most recent pickle protocol available to this interpreter.
        writer = pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

### Agent Class

If you are using this framework, complete the following parts of the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma * max_a' Q(s', a')
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state-action and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [6]:
class DQNAgent:
    """Deep Q-Network agent for the cab-driver environment.

    The agent owns a replay memory and a single Keras network that maps an
    encoded state vector (length ``state_size``) to one Q-value per action
    (length ``action_size``).

    Note:
        ``get_action`` and ``train_model`` read the module-level ``env``
        object (``env.requests`` and ``env.state_encod_arch2``), so an
        environment must be bound to that name before they are called.
    """

    def __init__(self, state_size, action_size):
        """Store the problem dimensions, hyperparameters, memory and model.

        Args:
            state_size: Length of the encoded state vector fed to the network.
            action_size: Number of actions, i.e. number of network outputs.
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.01
        self.epsilon = 1
        self.epsilon_max = 1
        # Rate constant; read by the training loop when it recomputes epsilon.
        self.epsilon_decay = 0.0009
        self.epsilon_min = 0.001
        self.batch_size = 32
        # Replay memory; the oldest experiences are dropped beyond 2000 entries.
        self.memory = deque(maxlen=2000)

        # A single network is used both to act and to compute training targets
        # (there is no separate target network).
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with two hidden ReLU layers
            and one linear output per action.
        """
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # Q-values may be negative (rewards can be negative), so the output
        # layer must be linear; a ReLU output would clamp them at zero.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()

        return model

    def get_action(self, state):
        """Choose an action index with an epsilon-greedy policy.

        Both branches are restricted to the action indices that
        ``env.requests(state)`` reports as possible for this state.

        Args:
            state: Current environment state, in the form accepted by
                ``env.requests`` and ``env.state_encod_arch2``.

        Returns:
            Index (into the action space) of the chosen action.
        """
        possible_actions_index, _ = env.requests(state)

        if np.random.random() > self.epsilon:
            # Exploitation: best predicted Q-value among the possible actions.
            encoded_state = np.array(env.state_encod_arch2(state)).reshape(1, self.state_size)
            q_values = self.model.predict(encoded_state)
            q_values_possible = [q_values[0][i] for i in possible_actions_index]
            best_position = int(np.argmax(q_values_possible))
            return possible_actions_index[best_position]

        # Exploration: sample uniformly from the possible actions only, rather
        # than from the whole action space.
        return random.choice(list(possible_actions_index))

    def append_sample(self, state, action, reward, next_state, terminal_state):
        """Append one experience tuple to the replay memory.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received for the action.
            next_state: State reached after the action.
            terminal_state: Whether the episode ended with this transition.
        """
        self.memory.append((state, action, reward, next_state, terminal_state))

    def train_model(self):
        """Fit the network on one random mini-batch from the replay memory.

        Does nothing until the memory holds more than ``batch_size`` samples.
        For each sampled transition the Q-value of the action taken is
        replaced by ``reward`` (terminal) or
        ``reward + discount_factor * max(Q(next_state, .))`` (non-terminal);
        the other outputs keep the network's current predictions.
        """
        if len(self.memory) <= self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        update_input = np.zeros((self.batch_size, self.state_size))
        next_state_input = np.zeros((self.batch_size, self.state_size))
        actions, rewards, terminal_flags = [], [], []

        for i, (state, action, reward, next_state, is_terminal) in enumerate(mini_batch):
            update_input[i] = env.state_encod_arch2(state)
            next_state_input[i] = env.state_encod_arch2(next_state)
            actions.append(action)
            rewards.append(reward)
            terminal_flags.append(is_terminal)

        # 1. Current predictions for the sampled states; these become the
        #    training targets after the taken action's entry is overwritten.
        target = self.model.predict(update_input)

        # 2. Predictions for the successor states, used for bootstrapping.
        target_q_net = self.model.predict(next_state_input)

        # 3. Overwrite only the Q-value of the action that was actually taken.
        for i in range(self.batch_size):
            if terminal_flags[i]:
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + self.discount_factor * np.amax(target_q_net[i])

        # 4. One gradient pass over the mini-batch.
        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)

    def save(self, name):
        """Save the network weights to the file ``name``."""
        self.model.save_weights(name)

In [7]:
#Initializing necessary variables
Episodes = 10000  # number of training episodes to run
max_run_time = 24*30  # per-episode time budget in hours (24 hours x 30 days)
m = 5 # number of cities; logged states show city indices 0 and 4, so presumably 0 ... m-1 (confirm against Env)
t = 24 # number of hours, ranges from 0 .... t-1
d = 7  # number of days, ranges from 0 ... d-1

### DQN block

In [11]:
start_time = time.time()  # wall-clock start of the whole training run
rewards_per_episode = []
episodes = []

# Build the environment once to discover the action space, then create the agent.
env = CabDriver()
action_space, state_space, state = env.reset()
state_size = m+t+d
action_size = len(action_space)
dqn_agent = DQNAgent(state_size, action_size)

for episode in range(Episodes):

    terminal_state = False
    reward_this_episode = 0

    # Fresh environment and start state for every episode. DQNAgent reads the
    # module-level `env`, so it must be rebound here.
    env = CabDriver()
    action_space, state_space, state = env.reset()
    init_state = state
    total_time = 0  # hours driven so far in this episode

    while not terminal_state:

        # 1. Pick epsilon-greedy action from possible actions for the current state
        action_index = dqn_agent.get_action(state)
        action = env.action_space[action_index]

        # 2. Evaluate your reward and next state
        reward = env.reward_func(state, action, Time_matrix)
        next_state = env.next_state_func(state, action, Time_matrix)

        # The episode ends once the accumulated time reaches max_run_time.
        time_for_step = env.calc_total_time(state, action, Time_matrix)
        total_time += time_for_step
        if total_time < max_run_time:
            # 3. Append the experience to the memory
            #    (terminal_state is always False on this branch).
            dqn_agent.append_sample(state, action_index, reward, next_state, terminal_state)
            # 4. Train the model by calling function dqn_agent.train_model
            dqn_agent.train_model()
            # 5. Keep a track of rewards
            reward_this_episode += reward
            state = next_state
        else:
            # The step that overruns the time budget is discarded.
            terminal_state = True

    rewards_per_episode.append(reward_this_episode)
    episodes.append(episode)

    # Exponentially decay epsilon from epsilon_max towards epsilon_min.
    # The exponent must be negative; a positive one makes epsilon grow and
    # the agent would never exploit what it has learned.
    dqn_agent.epsilon = dqn_agent.epsilon_min + (dqn_agent.epsilon_max - dqn_agent.epsilon_min) * np.exp(-dqn_agent.epsilon_decay * episode)

    if episode % 5 == 0:
        print('Episode ',episode, 
              '\tInitial State: ', init_state, 
              '\tEpisode Reward: ',reward_this_episode, 
              '\tTotal Ride Time: ', total_time)

    # Periodic checkpoint of the network weights.
    if episode % 2000 == 0:
        dqn_agent.save('CabDriver_DQN_Model.h5')

# Save the final weights; the periodic checkpoint alone would miss the
# episodes after the last multiple of 2000.
dqn_agent.save('CabDriver_DQN_Model.h5')

end_time = time.time()
print('Elapsed time: ', (end_time - start_time))

Episode  0 	Initial State:  (2, 14, 6) 	Episode Reward:  -10 	Total Ride Time:  723
Episode  5 	Initial State:  (3, 8, 4) 	Episode Reward:  -25 	Total Ride Time:  723
Episode  10 	Initial State:  (1, 22, 5) 	Episode Reward:  -186 	Total Ride Time:  724
Episode  15 	Initial State:  (0, 23, 6) 	Episode Reward:  52 	Total Ride Time:  720
Episode  20 	Initial State:  (3, 19, 0) 	Episode Reward:  179 	Total Ride Time:  720
Episode  25 	Initial State:  (1, 12, 4) 	Episode Reward:  -45 	Total Ride Time:  721
Episode  30 	Initial State:  (1, 17, 5) 	Episode Reward:  105 	Total Ride Time:  721
Episode  35 	Initial State:  (1, 12, 3) 	Episode Reward:  55 	Total Ride Time:  725
Episode  40 	Initial State:  (2, 20, 5) 	Episode Reward:  -49 	Total Ride Time:  726
Episode  45 	Initial State:  (2, 14, 3) 	Episode Reward:  101 	Total Ride Time:  720
Episode  50 	Initial State:  (1, 5, 1) 	Episode Reward:  239 	Total Ride Time:  721
Episode  55 	Initial State:  (1, 17, 1) 	Episode Reward:  36 	Total Ri

Episode  480 	Initial State:  (1, 20, 2) 	Episode Reward:  5 	Total Ride Time:  731
Episode  485 	Initial State:  (3, 4, 6) 	Episode Reward:  -289 	Total Ride Time:  723
Episode  490 	Initial State:  (2, 8, 1) 	Episode Reward:  41 	Total Ride Time:  722
Episode  495 	Initial State:  (2, 20, 2) 	Episode Reward:  -237 	Total Ride Time:  728
Episode  500 	Initial State:  (4, 19, 2) 	Episode Reward:  6 	Total Ride Time:  720
Episode  505 	Initial State:  (1, 7, 1) 	Episode Reward:  171 	Total Ride Time:  722
Episode  510 	Initial State:  (1, 3, 2) 	Episode Reward:  146 	Total Ride Time:  726
Episode  515 	Initial State:  (3, 23, 0) 	Episode Reward:  -268 	Total Ride Time:  723
Episode  520 	Initial State:  (2, 11, 2) 	Episode Reward:  -210 	Total Ride Time:  721
Episode  525 	Initial State:  (0, 2, 6) 	Episode Reward:  -75 	Total Ride Time:  726
Episode  530 	Initial State:  (1, 4, 1) 	Episode Reward:  -62 	Total Ride Time:  724
Episode  535 	Initial State:  (2, 4, 4) 	Episode Reward:  -17

KeyboardInterrupt: 

In [None]:
# Scratch check of the sizes passed to DQNAgent, using a separate environment
# instance. Note: this rebinds the notebook-level names action_space,
# state_space, state, state_size and action_size.
temp = CabDriver()
action_space, state_space, state = temp.reset()
state_size = m+t+d
action_size = len(action_space)
print(action_size, state_size)

In [None]:
# Length of the encoded state vector (cities + hours + days), i.e. state_size
m+t+d

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# time = np.arange(0,10000)
# epsilon = []
# for i in range(0,10000):
#     epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
# plt.plot(time, epsilon)
# plt.show()