In [None]:
import torch

class InventoryEnv:
    def __init__(self, capacity=40, demand_dist='uniform'):
        assert demand_dist in ['uniform', 'poisson']
        self.capacity = capacity
        self.gamma = 0.25

        # 成本设置
        self.variable_cost = 1 
        self.fixed_cost = 200 # 固定成本
        self.holding_cost = 1 # 存活成本
        self.selling_price = 10 # 销售价格

        if demand_dist=='uniform':
            self.max_demand = capacity # 最大需求量不可以超过存货的最大值
            # demand只会出现在0, 1, ..., 40这几个值，因此是41个值
            self.demand_lst = torch.arange(self.max_demand+1) # shape = (capacity+1, )
            self.demand_probs = torch.ones(self.max_demand+1)/(self.max_demand+1)
        else:
            raise NotImplementedError("后面再写poisson")
    
    def get_state_space(self):
        """
        state = 当前库存数量
        如果仓库容量是capacity，那么库存不可能小于0，并且不可能大于capacity
        """
        return torch.arange(self.capacity+1)
    
    def get_action_space(self):
        """
        action = 订货数量(order quantity)
        可以订货0个，也可以订货1个、2个，一直到capacity个
        """
        return torch.arange(self.capacity+1)
    
    def one_step_cost(self, state, action, demand):
        """
        state: 当前库存 
        action: 订货量
        demand: 需求量
        """
        # 订货成本：如果进货a，那么固定成本为fixed_cost*I(a>0)+c*a
        acquisition = self.fixed_cost * (action>0) + self.variable_cost * action
        # 库存成本：如果demand<(state+action)，说明有剩余
        holding = self.holding_cost*torch.clamp(state+action-demand, min=0)
        # 缺货成本：p*max(demand-state-action, 0)
        lost_sales = self.selling_price * torch.clamp(demand-state-action, min=0)
        return acquisition+holding+lost_sales
    
    def simulation(self, policy, num_episodes=100, days_per_episode=100):
        """
        用策略policy进行模拟
        return:
        : sim_states: (L, T+1)
        : sim_actions: (L, T)
        : sim_demands: (L, T)
        : sim_costs: (L, T)
        """
        L = num_episodes # 表示轨迹的数量
        T = days_per_episode # 表示每个轨迹有多少天
        # s0, a0, r0, s1, a1, r1, s2, a2, r2 # 轨迹1，并且有三天

        # 初始化结果，表示经过模拟后的states, actions, demands, costs
        # 分别有L个轨迹，每个轨迹有T天
        sim_states = torch.zeros((L, T+1))
        sim_actions = torch.zeros((L, T))
        sim_demands = torch.zeros((L, T))
        sim_costs = torch.zeros((L, T))

        # 随机初始库存
        sim_states[:, 0] = torch.randint(low=0, high=self.capacity+1, size = (L, ))

        for t in range(T):
            # 当前episode的库存state
            states = sim_states[:, t] # 当前episode的第t天的策略
            print(states)

            # 根据策略选择动作（动作概率分布）
            with torch.no_grad():
                # policy输入shape必须是(L, 1)
                action_probs = policy(states.view(-1, 1))

                # 构造可行动作mask（不能超过capacity-state)
                action_idx = torch.arange(self.capacity+1).unsqueeze(0) # shape = (1, capacity+1)
                limits = (self.capacity-states.long()).unsqueeze(1) # shape = (L, 1)
                # unsqueeze(0)可以让维度1的位置加入一个1
                mask = (action_idx<=limits).float()

                # 屏蔽非法动作
                masked_probs = action_probs * mask

                # 防止除以0，重新归一化
                masked_probs = masked_probs/masked_probs.sum(dim=1, keepdim=True)

            # 采样动作
            actions = torch.multinomial(masked_probs, num_samples=1).squeeze(1) # shape = (L, )

            # 随机生成需求
            demands = torch.multinomial(self.demand_probs, num_samples = L, replacement=True)

            # 计算成本
            costs = self.one_step_cost(states, actions, demands)

            # 更新下一个state
            next_states = torch.clamp(states+actions-demands, 0, self.capacity)

            # 写入结果
            sim_states[:, t+1] = next_states
            sim_actions[:, t] = actions
            sim_demands[:, t] = actions
            sim_costs[:, t] = costs
        return sim_states, sim_actions, sim_demands, sim_costs

In [11]:
import torch.nn as nn
import torch

class ValueNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim = 128):
        """
        V(s): 输入state，比如库存数，输出一个标量value
        input_dim: 状态维度，这里是1，只有库存一个数
        hidden_dim: 隐藏层维度
        """
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), 
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), 
            nn.Tanh(), 
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1)

        )
    def forward(self, x):
        """
        x: shape = (batch_size, input_dim)
        return: shape = (batch_size, 1)
        """
        return self.model(x)

states = torch.tensor([[0.0],[10],[20],[30],[40]]) # shape = (5, 1) 
value_net = ValueNetwork(input_dim=1, hidden_dim=128)
values = value_net(states)
print("input states shape = ", states.shape)
print("output values shape = ", values.shape)
print("values = ", values)

input states shape =  torch.Size([5, 1])
output values shape =  torch.Size([5, 1])
values =  tensor([[-0.1468],
        [-0.2407],
        [-0.2691],
        [-0.2735],
        [-0.2740]], grad_fn=<AddmmBackward0>)


In [14]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim = 128):
        """
        input: state (batch_size x input_dim)
        output: the probabilties of actions (batch_size x output_dim)
        """
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), 
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.model(x)

policy_network = PolicyNetwork(input_dim=1, output_dim=41)
states = torch.tensor([[0.0],[10.0],[20.0],[30.0],[40.0]])
probs = policy_network(states)
print("probs.shape:", probs.shape)
print("每行概率之和:", probs.sum(dim=1))

probs.shape: torch.Size([5, 41])
每行概率之和: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)


In [2]:
env = InventoryEnv(capacity = 40)
print("demand_lst:", env.demand_lst.shape)
print("demand_probs:", env.demand_probs.shape)
print("sum of probs:", env.demand_probs.sum())

demand_lst: torch.Size([41])
demand_probs: torch.Size([41])
sum of probs: tensor(1.)


In [None]:
print(env.get_action_space()) # 动作空间的值，表示可以订购的数量
print(env.get_state_space()) # 状态空间的值，表示当前的库存量

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40])


In [3]:
states = torch.tensor([10, 20, 5, 0])
actions = torch.tensor([5, 3, 10, 0])
demands = torch.tensor([12, 20, 1, 0])

print(env.one_step_cost(states, actions, demands))


tensor([208, 206, 224,   0])


In [None]:
env = InventoryEnv(capacity=40)

# 假 policy：每个动作概率都一样
policy_dummy = lambda x: torch.ones((len(x), env.capacity+1)) / (env.capacity+1)

# 总共有3个episode，每个episode有5天，现在需要通过这三个episode来进行模拟
# 生成state，action，demand，cost
states, actions, demands, costs = env.simulation(policy_dummy, num_episodes=3, days_per_episode=5)

print("states.shape  =", states.shape)    # 期望 (3, 6)
print("actions.shape =", actions.shape)   # 期望 (3, 5)
print("demands.shape =", demands.shape)   # 期望 (3, 5)
print("costs.shape   =", costs.shape)     # 期望 (3, 5)


tensor([23., 20., 26.])
tensor([ 0., 19., 13.])
tensor([19.,  0.,  0.])
tensor([18.,  5.,  0.])
tensor([ 3.,  0., 11.])
states.shape  = torch.Size([3, 6])
actions.shape = torch.Size([3, 5])
demands.shape = torch.Size([3, 5])
costs.shape   = torch.Size([3, 5])
