In [1]:
%matplotlib inline

from tqdm import tqdm
import numpy as np
import pandas as pd
import numpy as np

weekMax = 12
init_oh = 800

season = np.array([1]*weekMax)
season = season/sum(season)

In [19]:
def generateSimpleData(delta, alpha, scale, season, init_oh, disc_len, weekMax):

    price = np.append(np.ones( weekMax - disc_len), (1 - delta)*np.ones(disc_len))
    
    slsu = np.zeros(weekMax)
    bst_est = np.zeros(weekMax)
    on_hand = np.zeros(weekMax)
    on_hand[0]=init_oh
    rew = np.zeros(weekMax)
    
    for weekNo in range(weekMax):
        if weekNo:
            on_hand[weekNo] = on_hand[weekNo-1] - slsu[weekNo-1]   
            
        curr_mean =scale*weekMax*season[weekNo]*(price[ weekNo]**(-alpha))
        demand = np.random.poisson(curr_mean)
        slsu[weekNo] = min(demand, on_hand[weekNo])
        rew[weekNo] = slsu[weekNo]*price[weekNo]
        bst_est[weekNo] = curr_mean
            # slsu[ShopNo, weekNo] = effictivityScale[ShopNo]*season[weekNo]*(priceMatrix[ShopNo, weekNo]**(-alpha))

    return price,  slsu, on_hand, rew, bst_est

### Точная оценка Q и V

In [21]:
state_cols = [ 'oh_after', 'days_left']
next_state_cols = [col + '_next' for col in state_cols]

def gen_rl_df(delta, alpha, scale, season, init_oh, unit_price =1, weekMax = 12, disc_len = None):
    if disc_len !=None:
        dics =disc_len
    else:
        dics = np.random.randint(0, weekMax+1)
    price,  slsu, on_hand, rew, best_est = generateSimpleData(delta, alpha, scale, \
                                                    season, init_oh, disc_len = dics, weekMax = weekMax)

    res_ser = rew*unit_price
    days_left = np.arange(weekMax, 0, -1)-1.


    df = pd.concat([pd.Series(price), 
               pd.Series(season),
               pd.Series(on_hand),
               pd.Series(days_left), 
               pd.Series(slsu), 
               pd.Series(res_ser).shift(-1), 
               pd.Series(best_est)], 1)

    
    df.columns = ['price','season', 'on_hand','days_left', 'slsu' , 'rev_ser', 'bst_est']
    df['d_r_sh'] = df.on_hand / df.slsu
    df['oh_after'] = df.on_hand - df.slsu


    df['action'] = (df.price -1. ).fillna(0)
    
    bin_d_left = pd.get_dummies(df.days_left)
    dl_cols = [ 'dl_'+str(col) for col in bin_d_left.columns]
    bin_d_left.columns = dl_cols

    for col, vals in bin_d_left.iteritems():
        df[col] = vals.values
    
    bin_cols = dl_cols
    next_bin_cols = [col + '_next' for col in bin_cols]
    
    df_shifted = df[state_cols+bin_cols].shift(-1)
    df_shifted.columns = next_state_cols + next_bin_cols

    return pd.concat([df[state_cols+bin_cols], df_shifted, df[['action', 'rev_ser', 'slsu', 'bst_est']]], 1), (bin_cols, next_bin_cols)
    


def V(oh_left, reg_sales,disc_sales,delta, days_left  ):
    res_list = [1*min(reg_sales*days_left,oh_left)]
    
    for i in range(days_left):
        reg_price_sales = min(i*reg_sales, oh_left)
        cor_proce_sales = min(disc_sales*(days_left - i),oh_left - reg_price_sales)
        res_list.append(reg_price_sales*1. + cor_proce_sales*(1- delta))

    return max(res_list)

def Q(oh_left, reg_sales,disc_sales,delta, days_left, action):
    if not days_left:
        return 0
    
    if action >= 0:
        res_list = [1*min(reg_sales*days_left,oh_left)]
        
        for i in range(1, days_left+1):
            reg_price_sales = min(i*reg_sales, oh_left)
            cor_proce_sales = min(disc_sales*(days_left - i),oh_left - reg_price_sales)
            res_list.append(reg_price_sales*1. + cor_proce_sales*(1- delta))

        return  max(res_list)
    
    return (1- delta)*min(disc_sales*days_left,oh_left)


### Попытка с бинаризацией 

In [4]:
samples_list = []
for i in (range(1000)):
    tmp_df, (bin_state, next_bin_state) = gen_rl_df(delta = .35, alpha = 2., scale = 40, season = season, init_oh = init_oh)
    samples_list.append(tmp_df)
    
data_samples = pd.concat(samples_list)




df =data_samples.fillna(0)
df['dumm_0'] = 0.
df['dumm_m35'] = -.35

df['V'] = df.apply(lambda x:V(x['oh_after'], 40, 94.67, .35, int(x['days_left'])), 1)
df['Q_keep'] = df.apply(lambda x:Q(x['oh_after'], 40, 94.67, .35, int(x['days_left']), 0), 1)
df['Q_disc'] = df.apply(lambda x:Q(x['oh_after'], 40, 94.67, .35, int(x['days_left']), -.35), 1)

In [5]:
num_actions = 2
m35_order = 1
z_order = 0

state_cols_curr = ['oh_after'] + bin_state
next_state_cols_curr = [col + '_next' for col in state_cols_curr]
useful_cols = ['days_left', 'action', 'rev_ser', 'days_left_next']


used_cols = useful_cols +state_cols_curr+next_state_cols_curr
col_n2no =  {name:i for i, name in  enumerate(used_cols)}

X = df[used_cols].values


state_cols_nos = [col_n2no[name] for name in state_cols_curr]
next_state_cols_nos = [col_n2no[name] for name in next_state_cols_curr]

state_cols_nos, next_state_cols_nos


([4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
 [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [23]:
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D
from keras.layers.core import Dense, Flatten, Dropout
from keras.optimizers import SGD, RMSprop,Adagrad



# define the architecture of the network
model = Sequential()
model.add(Dense(8, input_dim=len(state_cols_nos), init="uniform", activation="relu"))
model.add(Dense(8, activation="relu", kernel_initializer="uniform"))
model.add(Dense(num_actions))
model.compile(RMSprop(lr=0.0001), 'MSE')



  # Remove the CWD from sys.path while we load stuff.


### вместе

In [None]:
err_list = []
q_0_hist = []
for i in tqdm(range(150000), mininterval=10):
    idx = np.random.randint(df.shape[0], size = 500)
    X_curr = X[idx]
    
    last_day_ind = np.where(X_curr[:, col_n2no['days_left_next']] ==0)
    m35_action_ind = np.where(X_curr[:, col_n2no['action']] == -0.35)
    z_action_ind = np.where(X_curr[:, col_n2no['action']] ==0.)
    

    next_pred = model.predict(X_curr[:, next_state_cols_nos])
    next_pred[last_day_ind] = 0.
    
    
    target    = model.predict(X_curr[:, state_cols_nos])
    target[last_day_ind] = 0.
    
    
    V_next = np.max(next_pred, 1)


    target[m35_action_ind, m35_order] = X_curr[m35_action_ind, col_n2no['rev_ser']] + V_next[m35_action_ind] 
    target[z_action_ind, z_order]     = X_curr[z_action_ind, col_n2no['rev_ser']]   + V_next[z_action_ind]

    err = model.train_on_batch(X_curr[:,state_cols_nos], target)
    err_list.append(err)
    if not i%500:
        mean_q_0 = next_pred[np.where(X_curr[:, col_n2no['days_left_next']] ==10)].mean(0)
        q_0_hist.append(mean_q_0)
        print('err, mean_q_0',err, mean_q_0 )




  0%|          | 0/150000 [00:00<?, ?it/s][A[A[A

err, mean_q_0 1159.8162 [1.1361476 0.6300827]
err, mean_q_0 1108.0824 [17.241789 11.79391 ]





  1%|          | 795/150000 [00:10<31:18, 79.43it/s][A[A[A

err, mean_q_0 1090.7914 [54.854763 35.46076 ]
err, mean_q_0 1066.6434 [119.131546  77.01834 ]





  1%|          | 1662/150000 [00:20<29:46, 83.04it/s][A[A[A

err, mean_q_0 1243.9375 [214.79024 141.6203 ]
err, mean_q_0 1756.756 [338.45132 234.39214]





  2%|▏         | 2529/150000 [00:30<29:11, 84.18it/s][A[A[A

err, mean_q_0 1819.6536 [470.693  368.7067]





  2%|▏         | 3397/150000 [00:40<28:48, 84.82it/s][A[A[A

err, mean_q_0 1923.3962 [496.83133 444.72382]
err, mean_q_0 2224.118 [494.09915 443.22675]





  3%|▎         | 4265/150000 [00:50<28:31, 85.16it/s][A[A[A

err, mean_q_0 1699.8341 [495.15897 445.05124]
err, mean_q_0 2449.1116 [498.3612 448.8466]





  3%|▎         | 5131/150000 [01:00<28:17, 85.35it/s][A[A[A

err, mean_q_0 1646.8152 [491.0441  440.78357]





  4%|▍         | 5998/150000 [01:10<28:03, 85.54it/s][A[A[A

err, mean_q_0 2373.3723 [489.99673 440.4265 ]
err, mean_q_0 2313.67 [497.38928 447.1933 ]





  5%|▍         | 6865/150000 [01:20<27:50, 85.66it/s][A[A[A

err, mean_q_0 1874.436 [494.9224 444.6216]
err, mean_q_0 1571.3085 [493.17014 444.10657]





  5%|▌         | 7731/150000 [01:30<27:38, 85.76it/s][A[A[A

err, mean_q_0 1778.0635 [493.92038 443.69974]
err, mean_q_0 1997.9142 [496.51437 445.07208]





  6%|▌         | 8599/150000 [01:40<27:26, 85.86it/s][A[A[A

err, mean_q_0 2385.8608 [490.18723 440.8799 ]





  6%|▋         | 9468/150000 [01:50<27:15, 85.94it/s][A[A[A

err, mean_q_0 1713.9341 [493.5276  443.72632]
err, mean_q_0 2222.979 [493.5753  443.39612]





  7%|▋         | 10337/150000 [02:00<27:04, 85.99it/s][A[A[A

err, mean_q_0 1876.5623 [496.83594 446.3624 ]
err, mean_q_0 2024.5377 [499.11404 448.06073]





  7%|▋         | 11205/150000 [02:10<26:52, 86.05it/s][A[A[A

err, mean_q_0 1893.1741 [496.6931  445.84314]
err, mean_q_0 1823.0208 [496.0735  446.07205]





  8%|▊         | 12074/150000 [02:20<26:41, 86.11it/s][A[A[A

err, mean_q_0 1851.7441 [496.04807 445.65012]





  9%|▊         | 12943/150000 [02:30<26:30, 86.15it/s][A[A[A

err, mean_q_0 2412.7756 [489.9221  439.74194]
err, mean_q_0 2110.7668 [491.55624 442.06393]





  9%|▉         | 13812/150000 [02:40<26:19, 86.20it/s][A[A[A

err, mean_q_0 1720.1117 [503.78403 452.63013]
err, mean_q_0 1980.5278 [505.61853 453.4099 ]





 10%|▉         | 14682/150000 [02:50<26:09, 86.24it/s][A[A[A

err, mean_q_0 2901.0388 [500.0042  448.40747]
err, mean_q_0 1644.3767 [495.6776 445.6001]





 10%|█         | 15552/150000 [03:00<25:58, 86.27it/s][A[A[A

err, mean_q_0 1085.995 [501.5488 450.4258]





 11%|█         | 16423/150000 [03:10<25:47, 86.31it/s][A[A[A

err, mean_q_0 1325.3652 [497.77353 447.0142 ]
err, mean_q_0 1646.8258 [497.4117  446.69604]





 12%|█▏        | 17294/150000 [03:20<25:37, 86.34it/s][A[A[A

err, mean_q_0 1863.3423 [501.25055 449.73795]
err, mean_q_0 1512.8103 [501.58035 449.94592]





 12%|█▏        | 18164/150000 [03:30<25:26, 86.36it/s][A[A[A

err, mean_q_0 1856.0892 [499.8644  449.12314]
err, mean_q_0 2023.4725 [503.61993 452.15594]





 13%|█▎        | 19035/150000 [03:40<25:15, 86.39it/s][A[A[A

err, mean_q_0 1722.3451 [507.54767 454.75076]





 13%|█▎        | 19906/150000 [03:50<25:05, 86.42it/s][A[A[A

err, mean_q_0 1609.1193 [508.345   456.60703]
err, mean_q_0 1720.1692 [502.7322  452.48535]





 14%|█▍        | 20775/150000 [04:00<24:55, 86.43it/s][A[A[A

err, mean_q_0 1027.3319 [495.94897 446.18448]
err, mean_q_0 1213.034 [503.05728 451.03113]





 14%|█▍        | 21645/150000 [04:10<24:44, 86.45it/s][A[A[A

err, mean_q_0 1103.7836 [509.66022 457.15118]





 15%|█▌        | 22515/150000 [04:20<24:34, 86.47it/s][A[A[A

err, mean_q_0 1065.6046 [501.6689  450.24686]
err, mean_q_0 1295.6375 [508.212   456.01114]





 16%|█▌        | 23385/150000 [04:30<24:24, 86.48it/s][A[A[A

err, mean_q_0 1231.875 [507.9106  455.93765]
err, mean_q_0 1121.0773 [507.27878 455.13333]





 16%|█▌        | 24254/150000 [04:40<24:13, 86.49it/s][A[A[A

err, mean_q_0 1088.2493 [510.54056 458.10068]
err, mean_q_0 1134.6383 [503.57275 453.1849 ]





 17%|█▋        | 25124/150000 [04:50<24:03, 86.51it/s][A[A[A

err, mean_q_0 1104.0048 [506.6862  456.34885]





 17%|█▋        | 25994/150000 [05:00<23:53, 86.51it/s][A[A[A

err, mean_q_0 1598.8945 [510.86856 459.30426]
err, mean_q_0 925.499 [505.05954 454.6443 ]





 18%|█▊        | 26863/150000 [05:10<23:43, 86.53it/s][A[A[A

err, mean_q_0 1035.3063 [515.4035  463.83075]
err, mean_q_0 1051.527 [506.36526 456.45682]





 18%|█▊        | 27732/150000 [05:20<23:32, 86.54it/s][A[A[A

err, mean_q_0 664.02423 [514.7157  463.44608]
err, mean_q_0 938.477 [507.87372 458.26797]





 19%|█▉        | 28602/150000 [05:30<23:22, 86.55it/s][A[A[A

err, mean_q_0 1066.5627 [515.85706 465.6553 ]





 20%|█▉        | 29472/150000 [05:40<23:12, 86.56it/s][A[A[A

err, mean_q_0 670.2291 [498.34424 451.4188 ]
err, mean_q_0 558.386 [507.8218 460.8202]





 20%|██        | 30343/150000 [05:50<23:02, 86.57it/s][A[A[A

err, mean_q_0 521.906 [504.56488 458.85226]
err, mean_q_0 707.3935 [506.14767 461.04254]





 21%|██        | 31214/150000 [06:00<22:52, 86.58it/s][A[A[A

err, mean_q_0 516.9381 [508.92133 462.43442]
err, mean_q_0 458.9322 [498.49875 452.6806 ]





 21%|██▏       | 32084/150000 [06:10<22:41, 86.59it/s][A[A[A

err, mean_q_0 485.29987 [505.67578 458.90305]





 22%|██▏       | 32954/150000 [06:20<22:31, 86.60it/s][A[A[A

err, mean_q_0 470.05884 [501.6876  455.07568]
err, mean_q_0 323.4839 [503.59067 457.6412 ]


In [17]:
samples_list = []
for i in (range(10)):
    tmp_df, (bin_state, next_bin_state) = gen_rl_df(delta = .35, alpha = 2., scale = 40, season = season, init_oh = init_oh)
    samples_list.append(tmp_df)
    
data_samples = pd.concat(samples_list)




df =data_samples.fillna(0)
df['dumm_0'] = 0.
df['dumm_m35'] = -.35

df['V'] = df.apply(lambda x:V(x['oh_after'], 40, 94.67, .35, int(x['days_left'])), 1)
df['Q_keep'] = df.apply(lambda x:Q(x['oh_after'], 40, 94.67, .35, int(x['days_left']), 0), 1)
df['Q_disc'] = df.apply(lambda x:Q(x['oh_after'], 40, 94.67, .35, int(x['days_left']), -.35), 1)




pred_df = pd.DataFrame(model.predict(df[state_cols_curr]), columns=  ['q_keep_pred', 'q_disc_pred'])
df['q_keep_pred'] = pred_df['q_keep_pred']
df['q_disc_pred'] = pred_df['q_disc_pred']

df[ ['days_left', 'oh_after','action','rev_ser' ,'Q_keep', 'Q_disc', 'q_keep_pred', 'q_disc_pred'] ]


Unnamed: 0,days_left,oh_after,action,rev_ser,Q_keep,Q_disc,q_keep_pred,q_disc_pred
0,11.0,758.0,0.00,42.00,562.7000,492.7000,605.606445,605.254517
1,10.0,716.0,0.00,59.80,521.4000,465.4000,562.277100,564.369385
2,9.0,624.0,-0.35,64.35,461.6000,405.6000,493.778778,495.886627
3,8.0,525.0,-0.35,54.60,397.2500,341.2500,419.461212,422.364288
4,7.0,441.0,-0.35,49.40,342.6500,286.6500,378.826080,375.215546
5,6.0,365.0,-0.35,53.30,283.0710,237.2500,312.397369,305.904449
6,5.0,283.0,-0.35,68.90,225.9500,183.9500,224.437149,224.028183
7,4.0,177.0,-0.35,58.50,160.0000,115.0500,157.969620,143.083282
8,3.0,87.0,-0.35,56.55,87.0000,56.5500,89.803215,66.849716
9,2.0,0.0,-0.35,0.00,0.0000,0.0000,11.290747,0.086078




 16%|█▌        | 23970/150000 [05:13<27:28, 76.44it/s][A[A

In [22]:
tmp_df = gen_rl_df(delta = .35, alpha = 2., scale = 40, season = season, init_oh = init_oh, disc_len=0)

pred_df = pd.DataFrame(model.predict(df[state_cols_curr]), columns=  ['q_keep_pred', 'q_disc_pred'])
df['q_keep_pred'] = pred_df['q_keep_pred']
df['q_disc_pred'] = pred_df['q_disc_pred']

df[ ['days_left', 'oh_after','action','rev_ser' ,'Q_keep', 'Q_disc', 'q_keep_pred', 'q_disc_pred'] ]

Unnamed: 0,days_left,oh_after,action,rev_ser,Q_keep,Q_disc,q_keep_pred,q_disc_pred
0,11.0,758.0,0.00,42.00,562.7000,492.7000,605.606445,605.254517
1,10.0,716.0,0.00,59.80,521.4000,465.4000,562.277100,564.369385
2,9.0,624.0,-0.35,64.35,461.6000,405.6000,493.778778,495.886627
3,8.0,525.0,-0.35,54.60,397.2500,341.2500,419.461212,422.364288
4,7.0,441.0,-0.35,49.40,342.6500,286.6500,378.826080,375.215546
5,6.0,365.0,-0.35,53.30,283.0710,237.2500,312.397369,305.904449
6,5.0,283.0,-0.35,68.90,225.9500,183.9500,224.437149,224.028183
7,4.0,177.0,-0.35,58.50,160.0000,115.0500,157.969620,143.083282
8,3.0,87.0,-0.35,56.55,87.0000,56.5500,89.803215,66.849716
9,2.0,0.0,-0.35,0.00,0.0000,0.0000,11.290747,0.086078
