In [9]:
# Standard library and numerics.
import os
import numpy as np

# Notebook display and plotting.
from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt

# IPython magics: draw figures inline in the notebook and request the
# 'retina' figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_style('ticks')

# Global matplotlib defaults: font size 16 and axes title size 16.
matplotlib.rcParams.update({'font.size': 16})
matplotlib.rc('axes', titlesize=16)

# Project code. `load_checkpoint` is what the later cells use to read the
# saved experiment results. `info_bandit`, `bandit` and `gym` are not
# referenced in the visible cells -- presumably kept for their import side
# effects or for later use; TODO confirm before removing.
from infomercial.exp import info_bandit
from infomercial.local_gym import bandit
from infomercial.exp.info_bandit import load_checkpoint
import gym

In [2]:
ls ../data/

exp1.log                                exp2_BanditOneHigh1000-v0_9.pkl
exp1_1.pkl                              exp2_BanditOneHigh121-v0_1.pkl
exp1_2.pkl                              exp2_BanditOneHigh121-v0_10.pkl
exp2.log                                exp2_BanditOneHigh121-v0_11.pkl
exp2_BanditHardAndSparse10-v0_1.pkl     exp2_BanditOneHigh121-v0_12.pkl
exp2_BanditHardAndSparse10-v0_10.pkl    exp2_BanditOneHigh121-v0_13.pkl
exp2_BanditHardAndSparse10-v0_11.pkl    exp2_BanditOneHigh121-v0_14.pkl
exp2_BanditHardAndSparse10-v0_12.pkl    exp2_BanditOneHigh121-v0_15.pkl
exp2_BanditHardAndSparse10-v0_13.pkl    exp2_BanditOneHigh121-v0_16.pkl
exp2_BanditHardAndSparse10-v0_14.pkl    exp2_BanditOneHigh121-v0_17.pkl
exp2_BanditHardAndSparse10-v0_15.pkl    exp2_BanditOneHigh121-v0_18.pkl
exp2_BanditHardAndSparse10-v0_16.pkl    exp2_BanditOneHigh121-v0_19.pkl
exp2_BanditHardAndSparse10-v0_17.pkl    exp2_BanditOneHigh121-v0_2.pkl
exp2_BanditHardAndSparse10-v0_18.pkl    exp2_BanditOn

In [12]:
# Experiment configuration shared by the cells below.

# Directory holding the checkpoint files. The default is the original
# machine-specific location; set the INFOMERCIAL_DATA_PATH environment
# variable to point somewhere else without editing the notebook.
data_path = os.environ.get(
    "INFOMERCIAL_DATA_PATH", "/Users/qualia/Code/infomercial/data/"
)

# Checkpoint files are named f"{exp_name}_{env}_{run}.pkl" with run = 1..num_exps.
exp_name = "exp2"
num_exps = 50          # number of runs loaded per bandit
num_episodes = 10000   # number of entries in each per-run trace

# Bandit environments to analyse, in the order they are processed.
env_names = [
    "BanditOneHot2-v0",
    "BanditOneHot10-v0",
    "BanditOneHot121-v0",
    "BanditOneHot1000-v0",
    "BanditOneHigh2-v0",
    "BanditOneHigh10-v0",
    "BanditOneHigh121-v0",
    "BanditOneHigh1000-v0",
    "BanditHardAndSparse2-v0",
    "BanditHardAndSparse10-v0",
    "BanditHardAndSparse121-v0",
    "BanditHardAndSparse1000-v0",
]

In [17]:
# Collect the per-run traces for every bandit.
# Each trace array has shape (num_episodes, num_exps): one row per episode,
# one column per run. `best` holds a single value per run.
scores_E, scores_R = {}, {}
values_E, values_R = {}, {}
controlling, actions, best = {}, {}, {}

# Key in the loaded checkpoint -> dict that receives that trace.
trace_stores = {
    "scores_E": scores_E,
    "scores_R": scores_R,
    "values_E": values_E,
    "values_R": values_R,
    "policies": controlling,
    "actions": actions,
}

for env in env_names:
    # Zero-filled storage for this env, allocated before any file is read.
    for store in trace_stores.values():
        store[env] = np.zeros((num_episodes, num_exps))
    best[env] = np.zeros(num_exps)

    # Checkpoint files are numbered from 1, columns from 0.
    for col in range(num_exps):
        checkpoint_file = os.path.join(
            data_path, f"{exp_name}_{env}_{col + 1}.pkl"
        )
        result = load_checkpoint(checkpoint_file)
        for key, store in trace_stores.items():
            store[env][:, col] = result[key]
        best[env][col] = result["best"]

In [18]:
# Gather stats by bandit: total R, p_best, avg score.
# The trace arrays are laid out as (num_episodes, num_exps), so axis 0 runs
# over episodes and axis 1 runs over experiments.

# Total reward collected by each run: one value per experiment.
total_R = {}
for env in env_names:
    total_R[env] = scores_R[env].sum(axis=0)

# Est. prob. that the action was correct: for each episode, the fraction of
# runs whose action equals the best action.
p_best = {}
for env in env_names:
    # Use a separate name here. Assigning to `best` would replace the dict of
    # per-run best actions with a scalar, which breaks the next iteration and
    # any re-run of this cell.
    # NOTE(review): the first run's best action is applied to every run --
    # assumes it is the same across runs of one env; TODO confirm.
    best_action = best[env][0]
    p_best[env] = (actions[env] == best_action).mean(axis=1)

# Avg scores: per-episode mean across runs.
avg_scores_E = {}
avg_scores_R = {}
for env in env_names:
    avg_scores_E[env] = scores_E[env].mean(axis=1)
    avg_scores_R[env] = scores_R[env].mean(axis=1)

In [31]:
# For each bandit:
# Plot raw actions, with p_best overlaid
# Plot scores, avg score overlaid



In [None]:
# For all bandits plot total reward, 
# with a line added to indicate 
# the max. exp. total value