In [1]:
import numpy as np
from scipy.stats import poisson

In [2]:
# Poisson means of the daily rental requests and returns at lots 1 and 2.
requested_1, requested_2 = 3, 4
returned_1, returned_2 = 3, 2
# Same means in the order [requests 1, requests 2, returns 1, returns 2].
mu = [requested_1, requested_2, returned_1, returned_2]

# Discount factor applied to the value of the next state.
y = 0.9

# Largest number of cars a lot can hold, and the transfer limit per night.
max_cars = 20
max_overtake = 5

# Reward per car moved between lots (a cost) and per car rented out.
rew_down = -2
rew_up = 10

In [3]:
def car_interval(mu, e):
    """Return the run of counts whose Poisson(mu) probability stays above ``e``.

    Counts ``0..max_cars`` are scanned. The interval starts at the first count
    whose probability is strictly greater than ``e`` and ends just before the
    first later count whose probability is strictly less than ``e`` (or at
    ``max_cars`` if no such count exists).

    Parameters
    ----------
    mu : float
        Mean of the Poisson distribution.
    e : float
        Probability threshold.

    Returns
    -------
    range
        The selected counts; an empty range when no count in ``0..max_cars``
        has probability above ``e``.
    """
    # One vectorised pmf evaluation instead of a scipy call per count.
    probs = poisson.pmf(np.arange(max_cars + 1), mu)

    above = np.flatnonzero(probs > e)
    if above.size == 0:
        # Nothing is likely enough: empty interval (the old fallback,
        # range([0, 0]), raised TypeError).
        return range(0)
    start = int(above[0])

    # First count at or after `start` that drops below the threshold.
    below = np.flatnonzero(probs[start:] < e)
    stop = start + int(below[0]) if below.size else max_cars + 1
    return range(start, stop)

In [4]:
# Consider only the numbers of rented and returned cars whose probability
# of occurring is greater than e.
e = 0.01
int_req_1, int_ret_1, int_req_2, int_ret_2 = (
    car_interval(rate, e)
    for rate in (requested_1, returned_1, requested_2, returned_2)
)

In [5]:
# One entry per state (cars at lot 1, cars at lot 2): the value table starts
# at zero and the initial policy moves no cars anywhere.
values = np.zeros((max_cars + 1,) * 2)
policy = np.zeros_like(values, dtype=int)

In [6]:
# Value estimation

def new_value(state, action, values):
    """Expected discounted return of taking ``action`` in ``state``.

    Parameters
    ----------
    state : sequence of two ints
        Cars at lot 1 and lot 2 before the transfer.
    action : int
        Cars moved from lot 1 to lot 2 (negative: from lot 2 to lot 1).
    values : 2-D array
        Current state-value table, indexed ``[cars at lot 1, cars at lot 2]``.

    Returns
    -------
    float
        Transfer cost plus the probability-weighted sum of rental reward and
        discounted next-state value over every combination of requests and
        returns in the module-level ``int_req_*`` / ``int_ret_*`` intervals.

    Notes
    -----
    Reads the module-level ``mu``, ``y``, ``max_cars``, ``rew_down`` and
    ``rew_up``. Because the intervals are truncated, the probabilities used
    here do not necessarily sum to one.
    """
    # Cars at each lot after the transfer, clipped to what a lot can hold.
    # Callers are expected to pass feasible actions; the clip keeps the
    # table indices valid either way.
    new_state = [
        min(max(state[0] - action, 0), max_cars),
        min(max(state[1] + action, 0), max_cars),
    ]
    v = abs(action) * rew_down  # cost of moving the cars

    # Probabilities of every count, computed once per call instead of once
    # per combination inside the nested loops.
    p_out_1 = poisson.pmf(np.array(int_req_1), mu[0])
    p_out_2 = poisson.pmf(np.array(int_req_2), mu[1])
    p_in_1 = poisson.pmf(np.array(int_ret_1), mu[2])
    p_in_2 = poisson.pmf(np.array(int_ret_2), mu[3])

    for out_1, p1 in zip(int_req_1, p_out_1):
        # Rentals are limited by the cars actually available at the lot.
        cars_req1 = min(out_1, new_state[0])
        for out_2, p2 in zip(int_req_2, p_out_2):
            cars_req2 = min(out_2, new_state[1])
            p12 = p1 * p2

            # Income from the rentals at both lots.
            reward = (cars_req1 + cars_req2) * rew_up

            for in_1, p3 in zip(int_ret_1, p_in_1):
                # Cars left at lot 1 once returns arrive, capped at capacity.
                cars_ret1 = min(new_state[0] - cars_req1 + in_1, max_cars)
                p123 = p12 * p3
                for in_2, p4 in zip(int_ret_2, p_in_2):
                    # Lot 2 receives its own returns (in_2, not in_1).
                    cars_ret2 = min(new_state[1] - cars_req2 + in_2, max_cars)

                    v += p123 * p4 * (reward + y * values[cars_ret1, cars_ret2])
    return v

In [16]:
# Policy evaluation

def policy_evoluation(values, Q):
    """Sweep the value table under the current policy until it stabilises.

    Every state is re-estimated in place with ``new_value`` using the action
    stored in the module-level ``policy`` table. At least one sweep is always
    performed; sweeping stops once the largest absolute change in a sweep is
    no longer greater than ``Q``.

    Parameters
    ----------
    values : 2-D array
        State-value table; updated in place.
    Q : float
        Convergence threshold on the largest per-sweep change.

    Returns
    -------
    2-D array
        The same ``values`` array after the final sweep.
    """
    n_rows, n_cols = values.shape
    while True:
        old_values = values.copy()
        for i in range(n_rows):
            for j in range(n_cols):
                # Updated entries are visible to later states in this sweep.
                values[i, j] = new_value([i, j], policy[i, j], values)
        dif = abs(old_values - values).max()
        print(dif)
        # Checked after the sweep so that a threshold of 1 or more no longer
        # skips evaluation altogether; written as `not >` so NaN also stops.
        if not dif > Q:
            break
    return values

In [8]:
# Policy improvement

def policy_improvement(policy, values):
    """Make the policy greedy with respect to ``values``.

    For every state the feasible transfers are scored with ``new_value`` and
    the best one is written into ``policy`` in place. A transfer is feasible
    when it moves at most ``max_overtake`` cars and no more cars than the
    source lot holds. Ties go to the smallest (most negative) action.

    Parameters
    ----------
    policy : 2-D int array
        Action table indexed ``[cars at lot 1, cars at lot 2]``; updated in place.
    values : 2-D array
        State-value table used to score the actions.

    Returns
    -------
    bool
        ``True`` when no entry of the policy changed, ``False`` otherwise.
    """
    old_policy = policy.copy()
    n_rows, n_cols = policy.shape
    for i in range(n_rows):
        for j in range(n_cols):
            # Negative actions take cars from lot 2 (j available),
            # positive actions take cars from lot 1 (i available).
            act_min = max(-j, -max_overtake)
            act_max = min(max_overtake, i)
            policy[i, j] = max(
                range(act_min, act_max + 1),
                key=lambda action: new_value([i, j], action, values),
            )
    # `policy == old_policy` is element-wise and cannot be used in an `if`;
    # compare the whole tables instead.
    return bool(np.array_equal(policy, old_policy))

In [17]:
# Policy iteration: evaluate the current policy, then improve it greedily,
# leaving early if the improvement step reports an unchanged policy.
# Only a single evaluate/improve round is run here.
Q = 0.01  # convergence threshold passed to policy evaluation
for _ in range(1):
    values = policy_evoluation(values, Q)
    if policy_improvement(policy, values):
        break

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 14
3 15
3 16
3 17
3 18
3 19
3 20
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
4 10
4 11
4 12
4 13
4 14
4 15
4 16
4 17
4 18
4 19
4 20
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 18
5 19
5 20
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 10
6 11
6 12
6 13
6 14
6 15
6 16
6 17
6 18
6 19
6 20
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
7 8
7 9
7 10
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 20
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
8 10
8 11
8 12
8 13
8 14
8 15
8 16
8 17
8 18
8 19
8 20
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
9 10
9 11
9 12
9 13
9 14
9 15
9 16
9 17
9 18
9 19
9 20
10 0
10 1
10 2
10 3
10 4
10 5
10 6
10 7
10 8
10 9


KeyboardInterrupt: 

![avatar](s2.png)