# Generate Raw Data

In [4]:
import gym
import random
import numpy as np
import gym_gridworld
import pandas as pd

# Seed every RNG the rollout touches so the generated dataset is reproducible.
random_state = 0
np.random.seed(random_state)
random.seed(random_state)

env = gym.make('gridworld-v0', deterministic=False)
env.seed(random_state)
# Number of discrete actions the random policy samples from.
# NOTE(review): hard-coded rather than read from the environment - confirm it
# matches the action space of 'gridworld-v0'.
action_size = 2

episodes = 10000
columns = ['episode_id', 'transition_id', 'state', 'action', 'immediate_reward',
           'delayed_reward', 'infer_reward', 'infer_reward_gp', 'done', 'next_state']

# Collect one dict per transition and build the DataFrame once at the end.
# Growing a frame row by row with DataFrame.append copies the whole frame on
# every call (quadratic in the number of transitions), and the method was
# removed in pandas 2.0.
records = []

for ep in range(episodes):
    state = env.reset()
    done = False
    delayed_reward = 0  # running sum of the rewards seen so far in this episode
    transition_id = 0
    while not done:
        # Uniform random policy over the `action_size` actions.
        action = np.random.choice(range(action_size))
        next_state, reward, done, info = env.step(action)
        delayed_reward += reward

        records.append({
            'episode_id': ep,
            'transition_id': transition_id,
            'state': np.array(state),
            'action': action,
            'immediate_reward': reward,
            # The accumulated episode reward is only revealed on the terminal
            # transition; every earlier transition stores 0.
            'delayed_reward': delayed_reward if done else 0,
            'done': done,
            'next_state': np.array(next_state),
        })
        transition_id += 1
        state = next_state

    if ep % 100 == 0:
        print("Episode:", ep, "| Total Reward:", round(delayed_reward, 2))

# Passing `columns` keeps the original column order and creates the
# 'infer_reward' / 'infer_reward_gp' columns (absent from the records) as NaN.
# Column dtypes are now inferred from the values instead of all being object.
df = pd.DataFrame(records, columns=columns)
df

Episode: 0 | Total Reward: 1.0
Episode: 100 | Total Reward: 4.0
Episode: 200 | Total Reward: 5.0
Episode: 300 | Total Reward: 2.0
Episode: 400 | Total Reward: 0.0
Episode: 500 | Total Reward: 10.0
Episode: 600 | Total Reward: 6.0
Episode: 700 | Total Reward: -5.0
Episode: 800 | Total Reward: -5.0
Episode: 900 | Total Reward: 3.0
Episode: 1000 | Total Reward: -4.0
Episode: 1100 | Total Reward: 0.0
Episode: 1200 | Total Reward: 8.0
Episode: 1300 | Total Reward: 22.0
Episode: 1400 | Total Reward: -1.0
Episode: 1500 | Total Reward: -10.0
Episode: 1600 | Total Reward: -4.0
Episode: 1700 | Total Reward: 14.0
Episode: 1800 | Total Reward: -5.0
Episode: 1900 | Total Reward: 4.0
Episode: 2000 | Total Reward: 26.0
Episode: 2100 | Total Reward: 1.0
Episode: 2200 | Total Reward: -3.0
Episode: 2300 | Total Reward: 27.0
Episode: 2400 | Total Reward: 8.0
Episode: 2500 | Total Reward: -1.0
Episode: 2600 | Total Reward: -1.0
Episode: 2700 | Total Reward: 4.0
Episode: 2800 | Total Reward: -9.0
Episode: 

Unnamed: 0,episode_id,transition_id,state,action,immediate_reward,delayed_reward,infer_reward,infer_reward_gp,done,next_state
0,0,0,"[0, 0]",0,0.0,0,,,False,"[0, 1]"
1,0,1,"[0, 1]",0,0.0,0,,,False,"[0, 2]"
2,0,2,"[0, 2]",1,0.0,0,,,False,"[1, 2]"
3,0,3,"[1, 2]",1,5.0,0,,,False,"[2, 2]"
4,0,4,"[2, 2]",0,0.0,0,,,False,"[2, 3]"
...,...,...,...,...,...,...,...,...,...,...
164787,9999,12,"[6, 6]",1,0.0,0,,,False,"[7, 6]"
164788,9999,13,"[7, 6]",1,0.0,0,,,False,"[8, 6]"
164789,9999,14,"[8, 6]",1,0.0,0,,,False,"[9, 6]"
164790,9999,15,"[9, 6]",0,0.0,0,,,False,"[9, 7]"


In [5]:
# Coerce the delayed reward to a numeric dtype, order the rows by episode and
# then by step within the episode, and renumber the index from 0.
df = (
    df.assign(delayed_reward=pd.to_numeric(df['delayed_reward']))
      .sort_values(by=['episode_id', 'transition_id'])
      .reset_index(drop=True)
)

n_transitions = len(df)
n_episodes = len(df['episode_id'].unique())
print("Total transitions:", n_transitions, " | Total episodes:", n_episodes)

# Persist the cleaned transitions for later use.
df.to_pickle('../data/gridworld_ndm_10k_fixed.pkl')

Total transitions: 164792  | Total episodes: 10000
