In [1]:
import math
import numpy as np

In [15]:
class Jack:
    """Agent for the two-location car-rental problem.

    Holds a tabular state-value function ``V`` and a tabular ``Policy``,
    both indexed by ``[cars at location 1, cars at location 2]``.
    """

    def __init__(self):
        self.move_limit = 5 # Can be in [-5, 5]
        self.gamma = 0.9 # Discount rate
        self.car_limit = 20
        # One state per possible car count: {0 car, 1 car, ..., car_limit car}
        self.states_in_every_store = self.car_limit + 1
        table_shape = (self.states_in_every_store, self.states_in_every_store)
        self.V = np.zeros(table_shape)
        self.Policy = np.zeros(table_shape)
        #self.Policy = np.random.randint(low=-5,high=5,size=(21,21))

    def greedy_action(self, obs):
        """Look up the policy's action for an observation.

        Args:
            obs: 4-tuple ``(loc1_car_no, loc1_returned_yesterday,
                loc2_car_no, loc2_returned_yesterday)``.

        Returns:
            The entry of ``self.Policy`` for the car counts obtained after
            adding yesterday's returns to each location.
        """
        loc1_car_no, loc1_returned_yesterday, loc2_car_no, loc2_returned_yesterday = obs
        # Returned cars become available before the agent acts. Clamp to
        # [0, car_limit] on BOTH sides: a negative count would otherwise be
        # used as a negative index and silently wrap to the end of the table.
        loc1_total = min(max(loc1_car_no + loc1_returned_yesterday, 0), self.car_limit)
        loc2_total = min(max(loc2_car_no + loc2_returned_yesterday, 0), self.car_limit)
        return self.Policy[int(loc1_total), int(loc2_total)]

<img src="./images/Jacks_Rental_problem.jpeg" width=900 height=300>

In [21]:
class Location:
    """Plain record describing one rental location.

    Attributes:
        car_no: value passed in as the location's current car count.
        returned_yesterday: value passed in as the number of cars returned
            the previous day.
        request_lambda: rate parameter stored for rental requests.
        return_lambda: rate parameter stored for returns.
    """

    def __init__(self, car_no, returned_yesterday, request_lambda, return_lambda):
        # Current stock and pending returns
        self.car_no, self.returned_yesterday = car_no, returned_yesterday
        # Rate parameters for requests and returns
        self.request_lambda, self.return_lambda = request_lambda, return_lambda
        
class Business:
    """Simulated environment with two rental locations.

    The state is the 4-tuple
    ``(loc1.car_no, loc1.returned_yesterday, loc2.car_no, loc2.returned_yesterday)``.
    Requests and returns at each location are drawn from (truncated) Poisson
    distributions parameterised by that location's ``request_lambda`` and
    ``return_lambda``.
    """

    def __init__(self):
        self.loc1 = Location(car_no=20,returned_yesterday=0,request_lambda=3,return_lambda=3)
        self.loc2 = Location(car_no=20,returned_yesterday=0,request_lambda=4,return_lambda=2)
        self.state = self.loc1.car_no, self.loc1.returned_yesterday, self.loc2.car_no, self.loc2.returned_yesterday

        self.car_limit = 20
        self.profit_renting = 10
        self.moving_car_price = -2

    @staticmethod
    def poisson_number_gen(lam, n=20):
        """Draw one sample from a Poisson(lam) distribution truncated to ``[0, n-1]``.

        Uses inverse-transform sampling over the first ``n`` probability masses.

        Args:
            lam: Poisson rate.
            n: number of outcomes (0 .. n-1) to tabulate.

        Returns:
            int: the sampled count. If the uniform draw exceeds the tabulated
            cumulative mass (truncated tail or rounding), the largest
            tabulated outcome ``n - 1`` is returned instead of running off
            the end of the table.
        """
        # NOTE: declared without `self` -- this is a staticmethod, and every
        # caller invokes it as `poisson_number_gen(lam=...)`.
        u = np.random.uniform(0, 1)
        cumulative = 0.0
        for k in range(n):
            cumulative += math.pow(lam, k) * math.exp(-lam) / math.factorial(k)
            if u <= cumulative:
                return k
        return max(n - 1, 0)

    def _clamp(self, cars):
        """Restrict a car count to the valid range ``[0, car_limit]``."""
        return min(max(cars, 0), self.car_limit)

    def reset(self):
        """Start a new episode and return the initial state.

        Both locations start with 20 cars, one day of requests is subtracted,
        and one day of returns is sampled into ``returned_yesterday``.
        """
        self.loc1.car_no = 20
        self.loc2.car_no = 20
        self.loc1.returned_yesterday = 0
        self.loc2.returned_yesterday = 0
        loc1_req = self.poisson_number_gen(lam=self.loc1.request_lambda)
        self.loc1.car_no -= loc1_req
        loc2_req = self.poisson_number_gen(lam=self.loc2.request_lambda)
        self.loc2.car_no -= loc2_req
        self.loc1.returned_yesterday = self.poisson_number_gen(lam=self.loc1.return_lambda)
        self.loc2.returned_yesterday = self.poisson_number_gen(lam=self.loc2.return_lambda)
        self.state = self.loc1.car_no, self.loc1.returned_yesterday, self.loc2.car_no, self.loc2.returned_yesterday
        return self.state

    @staticmethod
    def general_step(init_state, action):
        """
        Actual state is a combination of the init_state and the generated random requests and returns.
        The problem is that state cannot be pre-determined as it was the case for the GridWorld environment.
        """
        # Placeholder: not implemented, returns None.
        pass

    def step(self, action): ## Actions={-5,-4,...,-1,0,1,...,4,5}
        """Advance the simulation by one day.

        Args:
            action: signed number of cars moved overnight; the value is added
                to location 1 and subtracted from location 2.

        Returns:
            tuple: ``(state, reward, done)`` where ``reward`` is the rental
            income for cars actually rented minus the moving cost, and
            ``done`` is True once either location has no cars left.
        """
        # Adding the returned cars considering agent's actions
        self.loc1.car_no = self._clamp(self.loc1.car_no + (self.loc1.returned_yesterday + action))
        self.loc2.car_no = self._clamp(self.loc2.car_no + (self.loc2.returned_yesterday - action))

        # Only cars that are physically available can be rented (and paid for);
        # requests beyond the current stock are lost business.
        loc1_req = self.poisson_number_gen(lam=self.loc1.request_lambda)
        loc1_rented = min(loc1_req, self.loc1.car_no)
        self.loc1.car_no -= loc1_rented

        print("loc1_req:", loc1_req)
        loc2_req = self.poisson_number_gen(lam=self.loc2.request_lambda)
        loc2_rented = min(loc2_req, self.loc2.car_no)
        self.loc2.car_no -= loc2_rented

        self.loc1.returned_yesterday = self.poisson_number_gen(lam=self.loc1.return_lambda)
        self.loc2.returned_yesterday = self.poisson_number_gen(lam=self.loc2.return_lambda)

        self.state = self.loc1.car_no, self.loc1.returned_yesterday, self.loc2.car_no, self.loc2.returned_yesterday
        # moving_car_price is stored as a negative number; abs() makes the
        # cost independent of the direction of the move.
        reward = (loc1_rented * self.profit_renting) + (loc2_rented * self.profit_renting) - abs(action * self.moving_car_price)
        done = False
        if self.loc1.car_no <= 0 or self.loc2.car_no <= 0:
            done = True
        return self.state, reward, done

<img src="images/Ch04-PolicyIter.png" width=600 height=200>

In [25]:
def areDeltasLargerThanTheta(d, t, while_counter):
    """Decide whether the evaluation loop should run another sweep.

    Args:
        d: array of per-state deltas (any shape, e.g. 1-D or 2-D).
        t: convergence threshold; its absolute value is used.
        while_counter: number of sweeps completed so far.

    Returns:
        bool: True while fewer than two sweeps have run (turning the caller's
        ``while`` into a do..while loop), or when any element of ``d`` has an
        absolute value larger than ``abs(t)``; False otherwise.
    """
    if while_counter < 2: ## Turning the while to a do..while loop
        return True
    # Compare element-wise over the whole array: indexing only the first axis
    # yields rows for a 2-D input, and `if row > t` is an ambiguous truth value.
    return bool(np.any(np.abs(np.asarray(d)) > abs(t)))
# Driver script: an iterative policy-evaluation loop (sweep all states until
# every delta is <= theta).
#
# NOTE(review): this cell appears to have been adapted from a GridWorld
# notebook and does not match the classes defined in this file:
#   * `Business` defines no `worldValues`, `get_value`, `set_value` or `actions`;
#   * `Jack` has a `Policy` array attribute but no callable `policy()`;
#   * `GridWorld` is not defined in any cell visible here -- TODO confirm
#     whether it is expected to come from another notebook/module.
# As written, the first `print(world.worldValues)` would raise AttributeError.
#def iterative_policy_evaluation():
if __name__ == "__main__":
    world = Business()
    agent = Jack()
    theta = 1e-3 ## Every element in delta will be compared to this value
    # NOTE(review): V_s is assigned but never read in this cell. It is 20x20,
    # whereas Jack.V / Jack.Policy are 21x21 (0..20 cars) -- TODO confirm size.
    V_s = np.zeros((20,20))
    deltas = np.zeros((20,20)) # [0, 0 ...]
    while_count = 0
    while areDeltasLargerThanTheta(deltas, theta, while_count):
        ################
        # Progress report for the first few sweeps and then every 10th sweep.
        k = while_count
        if k == 0 or k == 1 or k == 2 or k == 3 or k % 10 == 0:
            print("K =", k)
            print("--------")
            print(world.worldValues)
        ################
        deltas = np.zeros((20,20)) # [0, 0 ...] Zeroing deltas every time
        while_count+=1
        # NOTE(review): states are iterated as the scalars 1..14, which looks
        # like a GridWorld indexing rather than (loc1 cars, loc2 cars) pairs.
        for state in range(1,15):
            v = world.get_value(state) ## v <- V(s)
            new_v = 0
            for action in range(len(world.actions)):                   ## Generating the \sum \pi(a|s)
            # This is an undiscounted update (no \gamma)
                new_s, r = GridWorld.general_step(state, action)       ## p(s',r | s,a)
                new_v += agent.policy() * (r + world.get_value(new_s)) ## Adding the value of every action to the value of the state
            world.set_value(state, new_v)                              ## V(s) <- Expression
            # NOTE(review): with a 2-D `deltas`, `deltas[state]` is a whole row,
            # so max(row, scalar) would presumably fail -- verify once the loop runs.
            deltas[state] = max(deltas[state], abs(new_v - v))         ## \delta <- max(\delta, |v - V(s)|)
# Final report. These run at module level, outside the __main__ guard.
print("Number of main loops, Last K =", while_count)
print("--------")
print(world.worldValues)

Init State: (18, 4, 15, 2)
0.0
loc1_req: 3
(17, 1, 15.0, 6) 50.0
0.0
loc1_req: 3
(15.0, 4, 13, 2) 100.0
0.0
loc1_req: 3
(16.0, 4, 10.0, 1) 80.0
0.0
loc1_req: 3
(17.0, 2, 6.0, 3) 80.0
0.0
loc1_req: 7
(12.0, 5, 8.0, 2) 80.0
0.0
loc1_req: 0
(17.0, 2, 8.0, 1) 20.0
0.0
loc1_req: 3
(16.0, 1, 8.0, 1) 40.0
0.0
loc1_req: 4
(13.0, 2, 6.0, 2) 70.0
0.0
loc1_req: 3
(12.0, 4, 1.0, 0) 100.0
0.0
loc1_req: 4
(12.0, 6, -4.0, 2) 90.0
Episode_Reward: 710.0
Init State: (14, 2, 14, 4)
0.0
loc1_req: 2
(14.0, 3, 11.0, 5) 90.0
0.0
loc1_req: 3
(14.0, 4, 12.0, 4) 70.0
0.0
loc1_req: 2
(16.0, 1, 13.0, 1) 50.0
0.0
loc1_req: 2
(15.0, 0, 12.0, 4) 40.0
0.0
loc1_req: 1
(14.0, 4, 12.0, 2) 50.0
0.0
loc1_req: 3
(15.0, 1, 12.0, 1) 50.0
0.0
loc1_req: 4
(12.0, 1, 7.0, 2) 100.0
0.0
loc1_req: 1
(12.0, 0, 6.0, 0) 40.0
0.0
loc1_req: 5
(7.0, 5, 5.0, 1) 60.0
0.0
loc1_req: 1
(11.0, 0, 3.0, 0) 40.0
0.0
loc1_req: 2
(9.0, 3, -5.0, 4) 100.0
Episode_Reward: 690.0
Init State: (20, 3, 14, 4)
0.0
loc1_req: 3
(17, 3, 14.0, 2) 70.0
0.0
loc1_