# Generate Raw Data

In [4]:
import gym
import random
import numpy as np
import gym_gridworld
import pandas as pd

random_state = 0
np.random.seed(random_state)
random.seed(random_state)

# The environment is created with slide=True, deterministic=False and seeded
# with the same value as the RNGs above.
env = gym.make('gridworld-v0', slide=True, deterministic=False)
env.seed(random_state)
action_size = 3

# NOTE(review): the recorded output of the following cell reports 1000
# episodes, so this was presumably 1000 when the dataset was generated - confirm.
episodes = 1
columns = ['episode_id', 'transition_id', 'state', 'action', 'immediate_reward',
           'delayed_reward', 'infer_reward', 'infer_reward_gp', 'done', 'next_state']

# Transitions are collected as plain dicts and turned into a frame once at the
# end. Growing the frame with DataFrame.append copied it on every step
# (quadratic cost) and that method no longer exists in pandas >= 2.0.
records = []

for ep in range(episodes):
    state = env.reset()
    print(np.array(state))
    done = False
    delayed_reward = 0
    transition_id = 0
    while not done:
#         env.render()
        # uniformly random policy
        action = np.random.choice(range(action_size))
        next_state, reward, done, info = env.step(action)
        delayed_reward += reward

        # The accumulated episode reward is only stored on the terminal
        # transition; every earlier transition gets delayed_reward = 0.
        records.append({'episode_id': ep, 'transition_id': transition_id,
                        'state': np.array(state), 'action': action,
                        'immediate_reward': reward,
                        'delayed_reward': delayed_reward if done else 0,
                        'done': done, 'next_state': np.array(next_state)})

        if done and ep % 100 == 0:
            print("Episode:", ep, "| Total Reward:", round(delayed_reward, 2))

        transition_id += 1
        state = next_state

# infer_reward / infer_reward_gp are not part of the records, so they are
# created as all-NaN columns to be filled in by the later sections.
df = pd.DataFrame(records, columns=columns)

df

[0 0]
Episode: 0 | Total Reward: -2.7


Unnamed: 0,episode_id,transition_id,state,action,immediate_reward,delayed_reward,infer_reward,infer_reward_gp,done,next_state
0,0,0,"[0, 0]",0,-0.1,0.0,,,False,"[0, 1]"
1,0,1,"[0, 1]",1,-1.1,0.0,,,False,"[1, 1]"
2,0,2,"[1, 1]",0,0.9,0.0,,,False,"[1, 2]"
3,0,3,"[1, 2]",1,-0.1,0.0,,,False,"[2, 2]"
4,0,4,"[2, 2]",1,-0.1,0.0,,,False,"[3, 2]"
5,0,5,"[3, 2]",2,-0.1,0.0,,,False,"[3, 1]"
6,0,6,"[3, 1]",0,-0.1,0.0,,,False,"[3, 2]"
7,0,7,"[3, 2]",2,-0.1,0.0,,,False,"[3, 1]"
8,0,8,"[3, 1]",0,-0.1,0.0,,,False,"[3, 2]"
9,0,9,"[3, 2]",0,-0.1,0.0,,,False,"[3, 3]"


In [2]:
# Make delayed_reward numeric, then put the rows in (episode, transition)
# order with a fresh 0..N-1 index.
df['delayed_reward'] = pd.to_numeric(df['delayed_reward'])
df = df.sort_values(by=['episode_id', 'transition_id']).reset_index(drop=True)

n_transitions = len(df)
n_episodes = len(df['episode_id'].unique())
print("Total transitions:", n_transitions, " | Total episodes:", n_episodes)

# NOTE(review): the later sections read '../data/gridworldchi_ndm_slide_1k.pkl'
# ("ndm", not "dm") - confirm which file name is intended here.
df.to_pickle('../data/gridworldchi_dm_slide_1k.pkl')

Total transitions: 35570  | Total episodes: 1000


# LSTM Reward Inference

In [3]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from keras.models import Sequential, Model
import keras.layers as layers
from keras.optimizers import Adam
from keras import backend as K
from keras.layers.merge import _Merge, Multiply
from copy import deepcopy
import keras

# Seed NumPy and the standard-library RNG.
# NOTE(review): no TensorFlow/Keras seed is set in this cell, so model training
# further down may still differ between runs - confirm whether that matters.
random_state=0
np.random.seed(random_state)
random.seed(random_state)

# Transition table for the LSTM section. `dataset` is also the path the frame
# is written back to at the end of this section.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
dataset = '../data/gridworldchi_ndm_slide_1k.pkl'
df = pd.read_pickle(dataset)
df

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


Unnamed: 0,episode_id,transition_id,state,action,immediate_reward,delayed_reward,infer_reward,infer_reward_gp,done,next_state
0,0,0,"[0, 0]",0,-0.1,0.0,,,False,"[0, 1]"
1,0,1,"[0, 1]",0,-0.1,0.0,,,False,"[0, 2]"
2,0,2,"[0, 2]",1,0.9,0.0,,,False,"[1, 2]"
3,0,3,"[1, 2]",2,-1.1,0.0,,,False,"[1, 1]"
4,0,4,"[1, 1]",2,-0.1,0.0,,,False,"[1, 2]"
...,...,...,...,...,...,...,...,...,...,...
35935,999,34,"[13, 1]",0,-0.1,0.0,,,False,"[13, 2]"
35936,999,35,"[13, 2]",0,-0.1,0.0,,,False,"[13, 3]"
35937,999,36,"[13, 3]",0,-0.1,0.0,,,False,"[13, 4]"
35938,999,37,"[13, 4]",0,-0.1,0.0,,,False,"[13, 5]"


In [4]:
# Non-null row counts per episode, summarised across episodes.
rows_per_episode = df.groupby(['episode_id']).count()
summ = rows_per_episode.describe()

# Longest episode, kept as the float value that describe() reports.
max_step = summ.at['max', 'transition_id']
# Number of trailing transitions used per episode: the 75th-percentile length.
history = int(summ.at['75%', 'transition_id'])
# Number of distinct actions present in the data.
action_size = len(pd.unique(df['action']))

print("History needed:", history, "| Max step:", max_step, "| Action size:", action_size)
summ

History needed: 41 | Max step: 105.0 | Action size: 3


Unnamed: 0,transition_id,state,action,immediate_reward,delayed_reward,infer_reward,infer_reward_gp,done,next_state
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,35.94,35.94,35.94,35.94,35.94,0.0,0.0,35.94,35.94
std,10.211595,10.211595,10.211595,10.211595,10.211595,0.0,0.0,10.211595,10.211595
min,19.0,19.0,19.0,19.0,19.0,0.0,0.0,19.0,19.0
25%,29.0,29.0,29.0,29.0,29.0,0.0,0.0,29.0,29.0
50%,34.0,34.0,34.0,34.0,34.0,0.0,0.0,34.0,34.0
75%,41.0,41.0,41.0,41.0,41.0,0.0,0.0,41.0,41.0
max,105.0,105.0,105.0,105.0,105.0,0.0,0.0,105.0,105.0


In [5]:
def get_features(row, action_size=action_size):
    """Build the one-timestep feature vector for a transition.

    Parameters
    ----------
    row : pandas.Series or int
        A transition exposing ``state`` and ``next_state``, each indexable at
        positions 0 and 1. If an ``int`` is passed instead, an all-zero filler
        vector of that length is produced (used to pad short episodes).
    action_size : int
        Kept for backward compatibility. The action is not part of the
        returned features, so this value is not used.

    Returns
    -------
    list
        A one-element list wrapping the feature list
        ``[state[0], state[1], next_state[0], next_state[1]]``, or wrapping
        the zero filler.
    """
    # dummy data for filler
    if isinstance(row, int):
        return [[0] * row]

    # An earlier version also assembled a one-hot action tuple here but never
    # returned it; that dead code is gone and the output is unchanged.
    state, next_state = row['state'], row['next_state']
    return [[state[0], state[1], next_state[0], next_state[1]]]

# LSTM MODEL
def get_model(state_size, history, unit):
    """Assemble the chained per-step LSTM model and compile it.

    The model has ``history`` inputs, each of shape ``(None, state_size)``.
    Every input goes through its own LSTM layer (``unit`` units) followed by
    its own 1-unit linear Dense layer. The hidden and cell state returned by
    one step's LSTM are used as the initial state of the next step's LSTM.
    The per-step Dense outputs are combined with an ``Add`` layer, which is
    the model output.

    Parameters
    ----------
    state_size : int
        Size of the feature vector of a single timestep.
    history : int
        Number of chained steps (one model input per step).
    unit : int
        Number of units in every LSTM layer.

    Returns
    -------
    Model compiled with MSE loss and ``Adam(lr=0.01)``.
    """
    def _timestep(carry=None):
        # one input layer per step
        step_in = layers.Input(shape=(None, state_size))
        # LSTM that emits the full sequence plus its final hidden/cell state
        recurrent = layers.LSTM(unit, return_sequences=True, return_state=True)
        if carry is None:
            seq, h_state, c_state = recurrent(step_in)
        else:
            seq, h_state, c_state = recurrent(step_in, initial_state=carry)
        # linear read-out used as this step's immediate reward
        step_reward = layers.Dense(1, activation='linear')(seq)
        return step_in, step_reward, [h_state, c_state]

    # the first step starts from the default initial state
    first_in, first_reward, carry = _timestep()
    inps = [first_in]
    outs = [first_reward]

    # every further step is seeded with the previous step's final state
    for _ in range(history - 1):
        step_in, step_reward, carry = _timestep(carry)
        outs.append(step_reward)
        inps.append(step_in)

    # upper layer: the delayed reward is the sum of the per-step outputs
    delayed = layers.Add()(outs)

    model = Model(inps, delayed)
    model.compile(loss='mse', optimizer=Adam(lr=0.01))

    return model


In [6]:
# Quick look at the features of one arbitrary transition
row = df.loc[12]
dummy = get_features(row)
print("Dummy feature:", dummy)


# Create INPUT OUTPUT for LSTM
feature_length = len(get_features(df.loc[0])[0])

# While filling, X[k] holds for every episode the features of the k-th
# transition counted back from the episode end (k = 0 is the last transition).
X = [[] for _ in range(history)]
y = []

for ep in sorted(df['episode_id'].unique()):
    newest_first = df.loc[df['episode_id']==ep].sort_values('transition_id', ascending=False)

    step_count = 0
    for _, row in newest_first.iterrows():
        if step_count == 0:
            # target: the delayed reward stored on the episode's last transition
            y.append([[row['delayed_reward']]])
        X[step_count].append(get_features(row))
        step_count += 1
        if step_count == history:
            break

    # episodes shorter than `history` are topped up with all-zero features
    for pad_slot in range(step_count, history):
        X[pad_slot].append(get_features(feature_length))

y = np.array(y)
# Flip into chronological order, so the zero padding ends up at the front
X = [np.array(slot) for slot in reversed(X)]
    


Dummy feature: [[3, 3, 4, 3]]


In [7]:
def run_and_optimize(X, y, df):
    """Fit one model per candidate LSTM width and keep the best one.

    For each width in ``[8, 16, 24, 32]`` a fresh model from ``get_model`` is
    trained on ``(X, y)`` for 200 epochs. The tensors feeding the model's last
    layer are read out as inferred immediate rewards and written to
    ``df['infer_reward']``, aligned to the end of each episode: the last
    history step goes to the episode's highest ``transition_id``, the step
    before it to the next lower id, and so on. Transitions older than
    ``history`` steps keep an inferred reward of 0.

    Uses the notebook-level names ``feature_length``, ``history``,
    ``get_model`` and ``keras``.

    Parameters
    ----------
    X : list
        Model inputs, one entry per history step. Entry ``k`` of each step is
        assumed to belong to the k-th episode in sorted ``episode_id`` order,
        which is how the preprocessing cell builds it.
    y : array-like
        Training targets passed to ``model.fit``.
    df : pandas.DataFrame
        Needs ``episode_id``, ``transition_id``, ``immediate_reward`` and
        ``delayed_reward``. Its ``infer_reward`` column is overwritten in
        place, so pass a copy if the original must stay intact.

    Returns
    -------
    tuple
        ``(best_unit, best_error, best_df)`` for the width with the lowest
        per-episode MAE between summed delayed and summed inferred reward.
    """
    best_error = 99999
    best_unit = 8
    best_df = df.copy()

    # These columns never change inside the search, so look them up once.
    episode_ids = sorted(df['episode_id'].unique())
    episode_col = df['episode_id'].values
    transition_col = df['transition_id'].values

    for unit in [8, 16, 24, 32]:
        # Build a model
        print('--running ' + str(unit) + '--')
        model = get_model(state_size=feature_length, history=history, unit=unit)
        model.fit(X,y, epochs=200)

        # This is the infer reward: the inputs of the final layer, one per step.
        # NOTE(review): the trailing 1 looks like a learning-phase flag -
        # confirm it is what this Keras version expects.
        f = keras.backend.function(model.input, model.layers[-1].input)
        infer_reward = f([X,1])

        print("--calculating error--")
        # Float buffer from the start, so the column is never created as int
        # and then filled with floats.
        inferred = np.zeros(len(df), dtype=float)
        # `position` (not the episode id itself) indexes the model output, so
        # episode ids need not be contiguous or start at 0.
        for position, ep in enumerate(episode_ids):
            if ep%100==0:
                print("Copying immediate reward for Ep:", ep)

            rows = np.flatnonzero(episode_col == ep)
            transitions = transition_col[rows].astype(int)
            # History slot of every transition, counted back from the last one;
            # negative slots are transitions older than the history window.
            slots = (history - 1) - (transitions.max() - transitions)
            keep = (slots >= 0) & (transitions >= 0)
            inferred[rows[keep]] = [infer_reward[h][position][0][0] for h in slots[keep]]
        df['infer_reward'] = inferred

        # Only the two reward columns are summed; the frame also holds array
        # valued columns (state, next_state) that should not be aggregated.
        a = df.groupby(['episode_id'])[['delayed_reward', 'infer_reward']].sum()
        mae_delayed = sum(abs(a['delayed_reward'] - a['infer_reward']))/len(a)
        mae_imm = sum(abs(df['immediate_reward'] - df['infer_reward']))/len(df)
        print("MAE Error Delayed:", mae_delayed)
        print("MAE Error Immediate:", mae_imm)
        if mae_delayed<best_error:
            best_unit = unit
            best_error = mae_delayed
            best_df = df.copy()
    return best_unit, best_error, best_df


In [8]:
# Search over the LSTM widths. A copy is passed in because run_and_optimize
# overwrites the infer_reward column of the frame it receives; the returned
# best frame then replaces df.
unit, err, df = run_and_optimize(X, y, df.copy())
print("Best Units:", unit, "| MAE:", err)

--running 8--

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 

Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
--calculating error--
Copying immediate reward for Ep: 0
Copying immediate reward for Ep: 100
Copying immediate reward for Ep: 200
Copying immediate reward for Ep: 300
Copying immediate reward for Ep: 400
Copying immediate reward for Ep: 500
Copying immediate reward for Ep: 600
Copying immediate reward for Ep: 700
Copying immediate reward for Ep: 800
Copying immediate reward for Ep: 900
MAE Error Delayed: 0.22344178561456454
MAE Error Immediate: 0.3888679167158371
--running 16--
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epo

Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
--calculating error--
Copying immediate reward for Ep: 0
Copying immediate reward for Ep: 100
Copying immediate reward for Ep: 200
Copying immediate reward for Ep: 300
Copying immediate reward for Ep: 400
Copying immediate reward for Ep: 500
Copying immediate reward for Ep: 600
Copying immediate reward for Ep: 700
Copying immediate reward for Ep: 800
Copying immediate reward for Ep: 900
MAE Error Delayed: 0.23618871278669662
MAE Error Immediate: 0.29041932969730616
--running 24--
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch

Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 13

Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
--calculating error--
Copying immediate reward for Ep: 0
Copying immediate reward for Ep: 100
Copying immediate reward for Ep: 200
Copying immediate reward for Ep: 300
Copying immediate reward for Ep: 400
Copying immediate reward for Ep: 500
Copying immediate reward for Ep: 600
Copying immediate reward for Ep: 70

Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
--calculating error--
Copying immediate reward for Ep: 0
Copying immediate reward for Ep: 100
Copying immediate reward

In [9]:
# Build a model
# model = get_model(state_size=feature_length, history=history, unit=16)
# keras.utils.plot_model(model)

In [10]:
# model.fit(X,y, epochs=500)

# # This is the infer reward
# f = keras.backend.function(model.input, model.layers[-1].input) 
# infer_reward = f([X,1])

# pred = model.predict(X)

In [11]:
# df['infer_reward'] = 0
# for ep in sorted(df['episode_id'].unique()):
#     ep_total = 0
#     if ep%100==0:
#         print("Running Ep:", ep)
        
#     curr_trans = max(df.loc[df['episode_id']==ep, 'transition_id'])
#     # this loops from last transitions to back
#     for h in reversed(range(history)):
#         immediate_reward = infer_reward[h][ep][0][0]
#         ep_total += immediate_reward
#         if curr_trans>=0:
#             df.loc[(df['episode_id']==ep) & (df['transition_id']==curr_trans), 'infer_reward'] = immediate_reward
#             curr_trans-=1
    

In [12]:
import math
# Compare each episode's summed inferred reward with its summed delayed reward
a = df.groupby(['episode_id']).sum()
print("MAE Error Delayed:", sum(abs(a['delayed_reward'] - a['infer_reward']))/len(a))
# print("MAE Error Immediate:", sum(abs(df['immediate_reward'] - df['infer_reward']))/len(df))

# NOTE: this overwrites the file the frame was loaded from, now including the
# infer_reward column.
df.to_pickle(dataset)

# Kept as the cell's final expression so the notebook actually renders it;
# placed mid-cell, its result was silently discarded.
a.describe()

MAE Error Delayed: 0.22344178561456454


# Gaussian Process (GP) Reward Inference

In [13]:
import delayed_reward_fun
import numpy as np
import pandas as pd
import time
import random

# Seed NumPy and the standard-library RNG for the GP section.
random_state=0
np.random.seed(random_state)
random.seed(random_state)

# Same file the LSTM section wrote back to, so infer_reward is already filled.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
dataset = '../data/gridworldchi_ndm_slide_1k.pkl'

df = pd.read_pickle(dataset)
df

Unnamed: 0,episode_id,transition_id,state,action,immediate_reward,delayed_reward,infer_reward,infer_reward_gp,done,next_state
0,0,0,"[0, 0]",0,-0.1,0.0,0.000000,,False,"[0, 1]"
1,0,1,"[0, 1]",0,-0.1,0.0,0.000000,,False,"[0, 2]"
2,0,2,"[0, 2]",1,0.9,0.0,0.000000,,False,"[1, 2]"
3,0,3,"[1, 2]",2,-1.1,0.0,0.000000,,False,"[1, 1]"
4,0,4,"[1, 1]",2,-0.1,0.0,0.000000,,False,"[1, 2]"
...,...,...,...,...,...,...,...,...,...,...
35935,999,34,"[13, 1]",0,-0.1,0.0,-0.029318,,False,"[13, 2]"
35936,999,35,"[13, 2]",0,-0.1,0.0,-0.550923,,False,"[13, 3]"
35937,999,36,"[13, 3]",0,-0.1,0.0,-0.047019,,False,"[13, 4]"
35938,999,37,"[13, 4]",0,-0.1,0.0,-1.038777,,False,"[13, 5]"


In [14]:

def _discounted_array(filled_till, x, size, gamma):
    arr = []
    val = 1
    for i in range(size):
        if i<filled_till:
            arr.append(0)
        elif i==filled_till:
            arr.append(val)
        elif i<filled_till+x:
            val = val * gamma
            arr.append(val)
        else:
            arr.append(0)
    return arr, filled_till+x


def _build_discount_matrix(df, n_transitions, gamma):
    """Stack one discount-weight row per episode into a 2-D array.

    Episodes are taken in ``groupby('episode_id')`` order, and each one is
    assumed to occupy the next contiguous run of rows in ``df``.
    """
    weights = []
    filled = 0
    for x in df.groupby('episode_id')['action'].count():
        arr, filled = _discounted_array(filled, x, n_transitions, gamma)
        weights.append(arr)
    return np.array(weights)


def _get_inferred_processed_df(df, mio_r_dif, cross_validation=True):
    """
    Infer per-transition rewards from episode-level delayed rewards with a
    Gaussian process.

    This version uses original implementation from Dr Chi's lab
    (the ``delayed_reward_fun`` module).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``state``, ``action``, ``done``, ``delayed_reward``
        and ``episode_id``. Rows are assumed to be grouped by episode in
        ascending ``episode_id`` order.
    mio_r_dif
        Forwarded unchanged to ``delayed_reward_fun.gaussian_process`` during
        cross-validation only; the final fit does not receive it.
    cross_validation : bool
        When True, grid-search the discount factor, RBF gamma and noise sigma
        and keep the combination with the lowest reported error. When False,
        the defaults (discount 1, gamma 0.5, sigma 0.01) are used directly.

    Returns
    -------
    tuple
        ``(E_r_R, C_r_R)`` as returned by the final ``gaussian_process`` call;
        per the original comment, the expected value and covariance of the
        inferred reward.
    """

    feature_state = np.array(df['state'].tolist())
    action = np.array(df['action'].tolist())
    # one delayed reward per row flagged as done, as a column vector
    temp = np.array(df.loc[df['done']==True]['delayed_reward'].tolist())
    delayed_reward = temp.reshape((len(temp), 1))

    temp = np.array(df['episode_id'].tolist())
    episode_index = temp.reshape((len(temp), 1))

    #transfer features to feature action pairs
    phi, _ = delayed_reward_fun.prepare_phi(feature_state, action)

    final_H_P_gamma_R = 0.5
    final_H_P_sigma_R = 0.01
    final_H_P_gamma = 1
    # Defined up front so the summary print below also works when
    # cross-validation is skipped (it used to raise NameError in that case).
    min_error = None
    if cross_validation:
        H_P_discounted_reward_search = [1, 0.95, 0.9]
        # rbf (gaussian) kernel variables K(x, y) = exp(-gamma ||x-y||^2)
        H_P_gamma_search = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
        # Noise hyperparameter
        H_P_sigma_search = [0.01, 0.05, 0.1]
        # Error
        GP_error = np.zeros((len(H_P_discounted_reward_search), len(H_P_gamma_search), len(H_P_sigma_search)))
        min_error = 999999999999

        for discounted_reward_index, discounted_reward in enumerate(H_P_discounted_reward_search):
            # the discount matrix only depends on the discount factor
            D = _build_discount_matrix(df, len(feature_state), discounted_reward)

            for H_P_gamma_index, H_P_gamma_R in enumerate(H_P_gamma_search):
                for H_P_sigma_index, H_P_sigma_R in enumerate(H_P_sigma_search):
                    print("cross validation Hyperparameters are", discounted_reward, H_P_gamma_R, H_P_sigma_R)
                    t1 = time.time()
                    # NOTE(review): the positional True is presumably the
                    # "return cross-validation error" switch - confirm.
                    GP_error[discounted_reward_index, H_P_gamma_index, H_P_sigma_index] = delayed_reward_fun.gaussian_process(
                        phi, D, episode_index, delayed_reward, 'rbf',
                        H_P_gamma_R, H_P_sigma_R, True,
                        mio_r_dif=mio_r_dif)

                    current_error = GP_error[discounted_reward_index, H_P_gamma_index, H_P_sigma_index]
                    if current_error < min_error:
                        min_error = current_error
                        final_H_P_gamma_R = H_P_gamma_R
                        final_H_P_sigma_R = H_P_sigma_R
                        final_H_P_gamma = discounted_reward

                    print("sum of squared error is:", GP_error[discounted_reward_index, H_P_gamma_index, H_P_sigma_index])
                    print("training takes", time.time()- t1 , "seconds")
                    print("************************")

    print("Final:", "gamma:", final_H_P_gamma, "gamma_R:", final_H_P_gamma_R, "sigma_R:", final_H_P_sigma_R, "mse:", min_error)
    D = _build_discount_matrix(df, len(feature_state), final_H_P_gamma)
    # Expected value cov of inferred reward
    E_r_R, C_r_R = delayed_reward_fun.gaussian_process(phi, D, episode_index, delayed_reward, 'rbf',
                                                       final_H_P_gamma_R, final_H_P_sigma_R, False)

    return E_r_R, C_r_R

In [None]:
# Runs the full hyperparameter grid search (cross_validation defaults to True).
# In the recorded output a single setting took roughly 3000-3700 seconds.
E_r, C_r = _get_inferred_processed_df(df, mio_r_dif=0)

# assumes E_r has one entry per row of df, in row order - TODO confirm
df['infer_reward_gp'] = E_r

cross validation Hyperparameters are 1 0.1 0.01
sum of squared error is: 458.40355243907777
training takes 3056.6725368499756 seconds
************************
cross validation Hyperparameters are 1 0.1 0.05
sum of squared error is: 470.6683273531572
training takes 3689.5273818969727 seconds
************************
cross validation Hyperparameters are 1 0.1 0.1


In [None]:
# Compare the GP-inferred rewards with the delayed rewards (per episode) and
# with the true immediate rewards (per transition)
a = df.groupby(['episode_id']).sum()
print("MAE Error Delayed:", sum(abs(a['delayed_reward'] - a['infer_reward_gp']))/len(a))
print("MAE Error Immediate:", sum(abs(df['immediate_reward'] - df['infer_reward_gp']))/len(df))

# NOTE: this overwrites the file the frame was loaded from, now including the
# infer_reward_gp column.
df.to_pickle(dataset)

# Kept as the cell's final expression so the notebook actually renders it;
# placed mid-cell, its result was silently discarded.
a.describe()