# CARL Bandits

## Setup

Let us start by importing a couple of generic libraries.

In [1]:
import os
os.chdir(os.path.abspath(os.path.pardir))

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm

from src.SCMMappings import Abstraction
from src.measuring import ICEvaluator
from src.examples import smokingmodels as sm
import src.evaluationsets as esets

import src.bandit_envs as BE
import src.bandit_agents as BA
import CARLa_trials as CTr
import CARLa_plots as Cplt
import CARLa_abserr as Cae

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [3]:
np.random.seed(0)

In [4]:
def alpha_pushforward(p,alphamap):
    sizecodomain = np.max(list(alphamap.values()))+1
    alphap = np.zeros(sizecodomain)
    for k in alphamap.keys():
        alphap[alphamap[k]] += p[k]
    return alphap

In [5]:
def build_action_counter(n_actions,history_actions):
    counter = np.zeros(n_actions)
    for a in history_actions:
        counter[a] += 1
    return counter
    
def get_action_distributions(n_actions,Q,c,action_counter):
    p = np.zeros(n_actions)
    p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1
    return p

## Model 1 setup

In [6]:
import src.examples.randomgenerators as rg

In [7]:
MphiS = np.array([[.8],[.2]])
MphiT = np.array([[.2,.8],[.8,.2]])
MphiC = np.array([[.7,.3],[.3,.7]])
M0 = sm.M_pgmpy_chain_STC(MphiS,MphiT,MphiC,S='T',T='M',C='Y')

MphiS = np.array([[.8],[.2]]) 
MphiC = np.array([[.7,.3],[.3,.7]])
M1 = sm.M_pgmpy_chain_SC(MphiS,MphiC,S='T_',C='Y_')   
R = ['T','Y']
a = {'T': 'T_',
    'Y': 'Y_'}
alphas = {'T_': np.array([[0,1],[1,0]]),
        'Y_': np.array([[0,1],[1,0]])}

In [8]:
Ab = Abstraction(M0,M1,R,a,alphas)
Ae = ICEvaluator(Ab)
J = esets.get_causal_sets_in_M0_with_directed_path_in_M1_and_M0(Ab.M0,Ab.M1,Ab.a)
Ae.compute_overall_error(J)

0.2290841420720751

In [9]:
map_rewards = {i:np.argmax(alphas['Y_'][:,i]) for i in range(alphas['Y_'].shape[1])}
    
ydomain0 = lambda x:x
ydomain1 = lambda x:x
map_ydomains = {ydomain0(i):ydomain1(j) for i,j in map_rewards.items()}

actions0=[{}, {'T':0}, {'T':1}]
actions1=[{}, {'T_':0}, {'T_':1}]
map_actions = {0: 0, 1: 2, 2: 1}

## Experiment settings

In [10]:
params_to_save_start = set(dir())
simulname = 'simul_2_2A'

n_trials = 20
n_steps = 500

### Bandits setting

In [11]:
B0 = BE.SCMEnv(scm=Ab.M0, actions=[{}, {'T':0}, {'T':1}], target='Y')
B1 = BE.SCMEnv(scm=Ab.M1, actions=[{}, {'T_':0}, {'T_':1}], target='Y_')

### Learning setting

In [12]:
bandits = [B0,B1,B1]

labels = ['M0: ucb','M1: ucb','M1: imitation']
trainalg = ['ucb', 'ucb', 'imitation']
params = {}

# Alg params
params['Qinit'] = [BA.initialize_Q_ones,BA.initialize_Q_ones,BA.initialize_Q_ones]

# UCB params
params['c'] = [1.0] * len(labels)

## Statistics settings

In [13]:
agentsA = [[] for _ in range(len(labels))]

params_to_save = set(dir()) - params_to_save_start
simulparams = [s for s in params_to_save if s[0]!='_']

# Training

In [14]:
for t in tqdm(range(n_trials)):
    ag0 = CTr.run_trial(bandits[0],n_steps,trainalg[0],
                        {k: params[k][0] for k in params.keys()},
                        map_actions,map_rewards,map_ydomains)
    agentsA[0].append(ag0)
    
    for i in range(1,len(labels)):
        ag = CTr.run_trial(bandits[i],n_steps,trainalg[i],
                           {k: params[k][i] for k in params.keys()},
                            map_actions,map_rewards,map_ydomains,base_bandit=ag0)
        agentsA[i].append(ag)

  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / sel

  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / sel

  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / sel

  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / sel

  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / self.action_counter  )
  U = self.c * np.sqrt( np.log(np.sum(self.action_counter)) / sel

# Persistence

In [15]:
stats = {simulparams[i]: eval(simulparams[i]) for i in range(len(simulparams))}

# Evaluation of the results

In [16]:
cumregret0A = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B0.actions)
    Qs = agentsA[0][trial].history_Qs
    actions = agentsA[0][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][0], build_action_counter(n_actions,actions[0:step+1]))
        cumregret0A[trial,step]= cumregret0A[trial,step-1]-np.sum(p*B0.optim_gaps) 

  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1
  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1


In [17]:
cumregret1A = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B1.actions)
    Qs = agentsA[1][trial].history_Qs
    actions = agentsA[1][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][1], build_action_counter(n_actions,actions[0:step+1]))
        cumregret1A[trial,step]= cumregret1A[trial,step-1]-np.sum(p*B1.optim_gaps)   

  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1
  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1


In [18]:
cumregretAA = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B1.actions)
    Qs = agentsA[0][trial].history_Qs
    actions = agentsA[0][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][0], build_action_counter(n_actions,actions[0:step+1]))
        alphap = alpha_pushforward(p,map_actions)
        cumregretAA[trial,step]= cumregretAA[trial,step-1]-np.sum(alphap*B1.optim_gaps) 

  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1
  p[np.argmax(Q + c * np.sqrt( np.log(np.sum(action_counter)) / action_counter  ))] = 1


## Model 2 setup

In [19]:
MphiS = np.array([[.8],[.2]])
MphiT = np.array([[.2,.8],[.8,.2]])
MphiC = np.array([[.7,.3],[.3,.7]])
M0 = sm.M_pgmpy_chain_STC(MphiS,MphiT,MphiC,S='T',T='M',C='Y')

MphiS = np.array([[.8],[.2]]) 
MphiC = np.dot(MphiC,MphiT)
M1 = sm.M_pgmpy_chain_SC(MphiS,MphiC,S='T_',C='Y_')   
R = ['T','Y']
a = {'T': 'T_',
    'Y': 'Y_'}
alphas = {'T_': np.eye(2),
        'Y_': np.eye(2)}

In [20]:
Ab = Abstraction(M0,M1,R,a,alphas)
Ae = ICEvaluator(Ab)
J = esets.get_causal_sets_in_M0_with_directed_path_in_M1_and_M0(Ab.M0,Ab.M1,Ab.a)
Ae.compute_overall_error(J)

0

In [21]:
map_rewards = {i:np.argmax(alphas['Y_'][:,i]) for i in range(alphas['Y_'].shape[1])}
    
ydomain0 = lambda x:x
ydomain1 = lambda x:x
map_ydomains = {ydomain0(i):ydomain1(j) for i,j in map_rewards.items()}

actions0=[{}, {'T':0}, {'T':1}]
actions1=[{}, {'T_':0}, {'T_':1}]
map_actions = {0: 0, 1: 1, 2: 2}

## Experiment settings

In [22]:
params_to_save_start = set(dir())
simulname = 'simul_2_2B'

n_trials = 20
n_steps = 500

### Bandits setting

In [23]:
B0 = BE.SCMEnv(scm=Ab.M0, actions=[{}, {'T':0}, {'T':1}], target='Y')
B1 = BE.SCMEnv(scm=Ab.M1, actions=[{}, {'T_':0}, {'T_':1}], target='Y_')

### Learning setting

In [24]:
bandits = [B0,B1,B1]

labels = ['Base: ucb','Abs: ucb','Abs: imitation']
trainmode = ['direct','direct','imitation']
trainalg = ['ucb', 'ucb', 'ucb']
params = {}

# Alg params
params['Qinit'] = [BA.initialize_Q_ones,BA.initialize_Q_ones,BA.initialize_Q_ones]

# UCB params
params['c'] = [1.0] * len(labels)

## Statistics settings

In [25]:
agentsB = [[] for _ in range(len(labels))]

params_to_save = set(dir()) - params_to_save_start
simulparams = [s for s in params_to_save if s[0]!='_']

# Training

In [26]:
for t in tqdm(range(n_trials)):
    ag0 = CTr.run_trial(bandits[0],n_steps,trainmode[0],trainalg[0],
                        {k: params[k][0] for k in params.keys()},
                        map_actions,map_rewards,map_ydomains)
    agentsB[0].append(ag0)
    
    for i in range(1,len(labels)):
        ag = CTr.run_trial(bandits[i],n_steps,trainmode[i],trainalg[i],
                           {k: params[k][i] for k in params.keys()},
                            map_actions,map_rewards,map_ydomains,base_bandit=ag0)
        agentsB[i].append(ag)

  0%|                                                                                                           | 0/20 [00:00<?, ?it/s]


UnboundLocalError: local variable 'Ag' referenced before assignment

# Persistence

In [None]:
stats = {simulparams[i]: eval(simulparams[i]) for i in range(len(simulparams))}

# Evaluation of the results

In [None]:
cumregret0B = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B0.actions)
    Qs = agentsB[0][trial].history_Qs
    actions = agentsB[0][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][0], build_action_counter(n_actions,actions[0:step+1]))
        cumregret0B[trial,step]= cumregret0B[trial,step-1]-np.sum(p*B0.optim_gaps) 

In [None]:
cumregret1B = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B1.actions)
    Qs = agentsB[1][trial].history_Qs
    actions = agentsB[1][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][1], build_action_counter(n_actions,actions[0:step+1]))
        cumregret1B[trial,step]= cumregret1B[trial,step-1]-np.sum(p*B1.optim_gaps)   

In [None]:
cumregretAB = np.zeros((n_trials,n_steps))

for trial in range(n_trials):
    n_actions = len(B1.actions)
    Qs = agentsB[0][trial].history_Qs
    actions = agentsB[0][trial].history_actions

    for step in range(n_steps):
        p = get_action_distributions(n_actions,Qs[step],params['c'][0], build_action_counter(n_actions,actions[0:step+1]))
        alphap = alpha_pushforward(p,map_actions)
        cumregretAB[trial,step]= cumregretAB[trial,step-1]-np.sum(alphap*B1.optim_gaps) 

In [None]:
import matplotlib
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 18}

plt.rc('font', **font)

In [None]:
plt.plot(np.mean(cumregretAA,axis=0)-np.mean(cumregret1A,axis=0),label='Non-zero IC')
plt.plot(np.mean(cumregretAB,axis=0)-np.mean(cumregret1B,axis=0),label='Zero IC',linestyle='--')
plt.xlabel('Steps')
plt.ylabel('Regret Difference')
plt.legend()

In [None]:
style = ['-','--']
for trainmode in [1,2]:
    cumregrets = np.array([agentsB[trainmode][j].get_cumulative_regret() for j in range(n_trials)])
    avg = np.mean(cumregrets,axis=0)
    std = np.std(cumregrets,axis=0)

    plt.plot(np.arange(n_steps),avg,label=labels[trainmode],color="C{}".format(trainmode-1),linestyle=style[trainmode-1])
    plt.fill_between(np.arange(n_steps),avg+std,avg-std, color='C{}'.format(trainmode-1), alpha=0.3)
    
    plt.xlabel('Steps')
    plt.ylabel('Cumulative regret')
    plt.legend()
    #ax[i,0].set_title(labels[i])
    #ax[i,0].plot(np.arange(n_steps),np.max(truerewards[i])*np.arange(n_steps),'k')