### <strong> Jack's Car Rental Problem (2nd Version) </strong>
This notebook provides a solution for the modified version of "Jack's Car Rental Problem" presented as "Exercise 4.7" in the "Reinforcement Learning: An Introduction, Second Edition" book by Sutton and Barto.

##### <strong> Original Problem Description </strong>
Jack manages two locations for a nationwide car rental company. Each day, some number of customers arrive at each location to rent cars. If Jack has a car available, he rents it out and is credited 10 dollars by the national company. If he is out of cars at that location, then the business is lost. Cars become available for renting the day after they are returned. To help ensure that cars are available where they are needed, Jack can move them between the two locations overnight, at a cost of 2 dollars per car moved. We assume that the number of cars requested and returned at each location are Poisson random variables. Suppose λ (the expected value of the Poisson distribution) is 3 and 4 for rental requests at the first and second locations, and 3 and 2 for returns. To simplify the problem slightly, we assume that there can be no more than 20 cars at each location (any additional cars are returned to the nationwide company, and thus disappear from the problem) and a maximum of five cars can be moved from one location to the other in one night. We take the discount rate to be γ = 0.9 and formulate this as a continuing finite MDP, where the time steps are days, the state is the number of cars at each location at the end of the day, and the actions are the net numbers of cars moved between the two locations overnight.

##### <strong> Modification </strong>
One of Jack’s employees at the first location rides a bus home each night and lives near the second location. She is happy to shuttle one car to the second location for free. Each additional car still costs \$2, as do all cars moved in the other direction. In addition, Jack has limited parking space at each location. If more than 10 cars are kept overnight at a location (after any moving of cars), then an additional cost of \$4 must be incurred to use a second parking lot (independent of how many cars are kept there).

In [8]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np

In [9]:
class Location:
    """A rental location with truncated Poisson request and return models.

    At construction time the Poisson distributions for daily rental requests
    and daily returns are truncated to the outcomes whose probability is at
    least ``epsilon`` and then renormalised so the kept probabilities sum
    to 1. The resulting ``(count, probability)`` lists are cached and exposed
    through ``request_count()`` and ``return_count()``.

    Args:
        capacity: Maximum number of cars the location can hold.
        lambda_request: Poisson rate of daily rental requests.
        lambda_return: Poisson rate of daily returns.
        epsilon: Outcomes with probability below this threshold are dropped.

    Raises:
        ValueError: If ``epsilon`` is not positive, or is so large that no
            outcome of one of the distributions reaches it.
    """

    def __init__(self, capacity, lambda_request, lambda_return, epsilon = 0.005):
        self.capacity = capacity
        self.lambda_request = lambda_request
        self.lambda_return = lambda_return
        self.epsilon = epsilon
        self.request_seq = self.__truncated_poisson(lambda_request)
        self.return_seq = self.__truncated_poisson(lambda_return)

    @staticmethod
    def __poisson_pmf(x, lam):
        """Return P(X = x) for X ~ Poisson(lam)."""
        return lam**x * math.exp(-lam) / math.factorial(x)

    def __truncated_poisson(self, lam):
        """Return the renormalised Poisson(lam) outcomes with pmf >= epsilon.

        The result is a list of ``(count, probability)`` pairs in increasing
        order of ``count``.
        """
        if self.epsilon <= 0:
            # With a non-positive threshold the upper tail is never cut off.
            raise ValueError("epsilon must be positive, got %r" % (self.epsilon,))

        pmf = Location.__poisson_pmf
        count = 0
        prob = pmf(count, lam)

        # Skip the negligible lower tail.
        while prob < self.epsilon:
            # pmf(k + 1) / pmf(k) = lam / (k + 1), so once count >= lam the
            # pmf can only shrink: no later outcome will reach epsilon.
            if count >= lam:
                raise ValueError(
                    "epsilon=%r is larger than every Poisson(%r) probability"
                    % (self.epsilon, lam)
                )
            count += 1
            prob = pmf(count, lam)

        # Collect outcomes until the upper tail becomes negligible.
        kept = []
        prob_sum = 0.0
        while prob >= self.epsilon:
            prob_sum += prob
            kept.append((count, prob))
            count += 1
            prob = pmf(count, lam)

        # Renormalise so the truncated distribution sums to 1.
        return [(kept_count, kept_prob / prob_sum) for kept_count, kept_prob in kept]

    def request_count(self):
        """Return the cached ``(count, probability)`` list for rental requests."""
        return self.request_seq

    def return_count(self):
        """Return the cached ``(count, probability)`` list for returns."""
        return self.return_seq

In [10]:
class Environment:
    """Model of the modified Jack's Car Rental MDP.

    A state is a pair ``(cars_at_location1, cars_at_location2)``. An action is
    the net number of cars moved overnight: a positive action moves cars from
    location 1 to location 2, a negative action moves them the other way.
    """

    def __init__(self):
        self.actions = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
        # Location(capacity, lambda_request, lambda_return)
        self.location1 = Location(20, 3, 3)
        self.location2 = Location(20, 4, 2)
        self.rental_price = 10.0
        self.move_cost = -2.0
        # More than `parking_limit` cars kept overnight at a location adds
        # `parking_cost` to the reward, regardless of how many cars exceed it.
        self.parking_limit = 10
        self.parking_cost = -4.0

        self.states = [
            (i, j)
            for i in range(self.location1.capacity + 1)
            for j in range(self.location2.capacity + 1)
        ]

    def get_states(self):
        """Return the list of all ``(i, j)`` states."""
        return self.states

    def get_actions(self):
        """Return the list of all actions (net cars moved from location 1 to 2)."""
        return self.actions

    def __get_reward(self, state, action, loc1_req_count, loc2_req_count, loc1_ret_count=None, loc2_ret_count=None):
        """Return the one-day reward for ``action`` in ``state``.

        The reward is the (negative) moving cost, plus the (negative) overflow
        parking cost at each location, plus the rental income for the requests
        that can be served. The return counts do not influence the reward;
        the parameters are accepted only for call compatibility.
        """
        cars1 = state[0] - action
        cars2 = state[1] + action

        # The first car moved from location 1 to location 2 is free.
        paid_moves = (action - 1) if action > 0 else abs(action)

        reward = 0
        reward += self.move_cost * paid_moves

        reward += self.parking_cost if cars1 > self.parking_limit else 0
        reward += self.parking_cost if cars2 > self.parking_limit else 0

        # Cars beyond a location's capacity are not available for rent.
        reward += self.rental_price * min(min(cars1, self.location1.capacity), loc1_req_count)
        reward += self.rental_price * min(min(cars2, self.location2.capacity), loc2_req_count)

        return reward

    def get_nextState_reward_prob(self, state, action):
        """Yield ``(next_state, reward, probability)`` for every outcome.

        Iterates over every combination of request and return counts at both
        locations. The same ``next_state`` may be yielded more than once;
        callers are expected to sum over all yielded triples.
        """
        cap1 = self.location1.capacity
        cap2 = self.location2.capacity

        # Cars on hand after the overnight move, clipped to capacity.
        avail1 = min(state[0] - action, cap1)
        avail2 = min(state[1] + action, cap2)

        for loc1_req_count, loc1_req_prob in self.location1.request_count():
            left1 = max(avail1 - loc1_req_count, 0)
            for loc2_req_count, loc2_req_prob in self.location2.request_count():
                left2 = max(avail2 - loc2_req_count, 0)
                # Reward and request probability do not depend on returns,
                # so compute them once per request pair.
                reward = self.__get_reward(state, action, loc1_req_count, loc2_req_count)
                req_prob = loc1_req_prob * loc2_req_prob
                for loc1_ret_count, loc1_ret_prob in self.location1.return_count():
                    i = min(left1 + loc1_ret_count, cap1)
                    partial_prob = req_prob * loc1_ret_prob
                    for loc2_ret_count, loc2_ret_prob in self.location2.return_count():
                        j = min(left2 + loc2_ret_count, cap2)
                        yield (int(i), int(j)), reward, partial_prob * loc2_ret_prob

    @staticmethod
    def is_valid(state, action):
        """Return True if ``action`` moves no more cars than the source holds.

        After the move neither location may have a negative number of cars.
        """
        return (state[0] - action >= 0) and (state[1] + action >= 0)

In [11]:
class Agent:
    """Policy-iteration solver for a car-rental style environment.

    The environment must provide ``location1.capacity``,
    ``location2.capacity``, ``get_states()``, ``get_actions()``,
    ``is_valid(state, action)`` and
    ``get_nextState_reward_prob(state, action)``.

    Attributes set by ``policy_iteration``:
        V: 2-D array of state values indexed by ``[cars1, cars2]``.
        policy: 2-D integer array holding the chosen action for each state.
        counter: 1-based index of the current policy-iteration round.
    """

    def __init__(self, env, discount = 0.9):
        self.env = env
        self.V = None
        self.policy = None
        self.discount = discount
        self.counter = 1

    def policy_iteration(self, theta = 0.01):
        """Alternate policy evaluation and improvement until the policy is stable.

        After every round the value function and the policy are saved as PNG
        images under ``outputs/state_value_functions`` and
        ``outputs/policies`` (created if missing).

        Args:
            theta: Convergence threshold passed to ``policy_evaluation``.
        """
        shape = (self.env.location1.capacity + 1, self.env.location2.capacity + 1)
        self.V = np.zeros(shape)
        # Actions are whole numbers of cars, so store them as integers.
        self.policy = np.zeros(shape, dtype=int)
        self.counter = 1

        # Build the paths portably and make sure the directories exist.
        value_dir = os.path.join("outputs", "state_value_functions")
        policy_dir = os.path.join("outputs", "policies")
        os.makedirs(value_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        while True:
            prev_V = np.copy(self.V)

            self.policy_evaluation(theta)

            stable_policy = self.policy_improvement()

            if np.greater(self.V, prev_V).all():
                print("State Value Improved!")
            else:
                print("State Value did not Improve!")

            self.save_state_value_function(os.path.join(value_dir, f"V{self.counter}.png"))
            self.save_policy(os.path.join(policy_dir, f"p{self.counter}.png"))
            self.counter += 1

            if stable_policy:
                print("-------- Policy Iteration Done! --------")
                break
            else:
                print("-------- Policy Iteration " + str(self.counter) + " --------")

    def _expected_return(self, state, action):
        """Return the expected one-step return of ``action`` in ``state`` under ``self.V``."""
        total = 0
        for next_s, reward, prob in self.env.get_nextState_reward_prob(state, action):
            total += prob * (reward + self.discount * self.V[next_s[0], next_s[1]])
        return total

    def policy_evaluation(self, theta):
        """Update ``self.V`` in place for the current policy.

        Sweeps over all states until the largest change in one sweep is
        below ``theta``. Values updated earlier in a sweep are used for the
        states visited later in the same sweep.
        """
        while True:
            delta = 0

            for s in self.env.get_states():
                old_value = self.V[s[0], s[1]]
                self.V[s[0], s[1]] = self._expected_return(s, self.policy[s[0], s[1]])
                delta = max(delta, abs(old_value - self.V[s[0], s[1]]))

            print("---- Delta: ", delta)
            if delta < theta:
                print("Policy Evaluation Done!")
                break

    def policy_improvement(self):
        """Make the policy greedy with respect to ``self.V``.

        Only actions accepted by ``env.is_valid`` are considered; ties keep
        the first best action in ``env.get_actions()`` order.

        Returns:
            True if no state's action changed, False otherwise.
        """
        policy_stable = True
        for s in self.env.get_states():
            old_action = self.policy[s[0], s[1]]

            best_action = old_action
            # -inf so that any valid action wins, however negative its value.
            best_action_value = float("-inf")

            for action in self.env.get_actions():
                if not self.env.is_valid(s, action):
                    continue

                action_value = self._expected_return(s, action)
                if action_value > best_action_value:
                    best_action = action
                    best_action_value = action_value

            self.policy[s[0], s[1]] = best_action

            if old_action != self.policy[s[0], s[1]]:
                policy_stable = False

        return policy_stable

    @staticmethod
    def _save_heatmap(data, filename):
        """Render ``data`` as an image with a colorbar and save it to ``filename``."""
        plt.imshow(data)
        plt.colorbar()
        plt.savefig(filename)
        plt.close("all")

    def save_state_value_function(self, filename):
        """Save the current state-value function as an image."""
        self._save_heatmap(self.V, filename)

    def save_policy(self, filename):
        """Save the current policy as an image."""
        self._save_heatmap(self.policy, filename)

In [12]:
# Build the rental environment and solve it with policy iteration,
# using a coarse evaluation threshold to keep the run time manageable.
env = Environment()
agent = Agent(env=env)
agent.policy_iteration(theta=0.5)

---- Delta:  171.86254673870607
---- Delta:  120.31289055878005
---- Delta:  82.38775781013965
---- Delta:  65.04429434827725
---- Delta:  51.91012301121762
---- Delta:  40.44585426529238
---- Delta:  31.643466942938062
---- Delta:  25.14344499939739
---- Delta:  21.02597262278755
---- Delta:  17.621385114134284
---- Delta:  14.738416495098647
---- Delta:  12.304194252125228
---- Delta:  10.25381107740418
---- Delta:  8.530792820113675
---- Delta:  7.0862713884503705
---- Delta:  5.878005901564052
---- Delta:  4.869541656074318
---- Delta:  4.029513112318284
---- Delta:  3.3310435865995487
---- Delta:  2.751206679211748
---- Delta:  2.270533318796481
---- Delta:  1.8725597711764408
---- Delta:  1.5434165597798142
---- Delta:  1.2714589805644891
---- Delta:  1.046939042558563
---- Delta:  0.8617174594636481
---- Delta:  0.7090132915479899
---- Delta:  0.5831881491523632
---- Delta:  0.47956151352468623
Policy Evaluation Done!
State Value Improved!
-------- Policy Iteration 2 --------
--