In [1]:
import json
import sys
sys.path.append('../simulated_fqi/')
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt 
import numpy as np
import torch
import random
import shap
import configargparse
import torch
import torch.optim as optim

from environments import CartPoleRegulatorEnv
from environments import CartEnv
from environments import AcrobotEnv
from models.agents import NFQAgent
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
from train import fqi

In [None]:
def runFQI(
    verbose=True,
    is_contrastive=False,
    epoch=1000,
    init_experience=200,
    evaluations=5,
    force_left=5,
    random_seed=None,
    reward_weights=np.asarray([0.1] * 5),
):
    """Train an NFQ agent on background/foreground cart-pole tasks and print how long it balances.

    Steps performed:
      1. Build train/eval ``CartPoleRegulatorEnv`` instances for group 0
         (background, "bg") and group 1 (foreground, "fg").
      2. Collect ``init_experience`` rollouts from each training env (and the
         same number from each eval env) by calling ``generate_rollout(None, ...)``.
      3. For up to ``epoch + 1`` iterations: rebuild the pattern set from the
         training rollouts, train, and evaluate on both eval envs.
         Once the bg evaluation succeeds three epochs in a row, ``freeze_shared``
         is set on the network and the optimizer is switched to the fg layers.
         The loop stops early once the fg evaluation succeeds three epochs in a row.
      4. Run ``evaluations`` final evaluations per group with ``max_steps = 1000``
         and print the episode lengths.

    Args:
        verbose: If true, log per-epoch status and print per-evaluation results.
        is_contrastive: Forwarded to the environments and the network; also
            selects which parameters the initial optimizer covers.
        epoch: Upper bound on training iterations (the loop runs ``epoch + 1`` times).
        init_experience: Number of rollouts collected from each environment.
        evaluations: Number of final evaluation rounds per group.
        force_left: Forwarded to every environment constructor.
        random_seed: Accepted for interface compatibility; not used by this function.
        reward_weights: Forwarded to ``NFQAgent.generate_pattern_set``.

    Returns:
        None. Results are printed / logged.
    """
    learning_rate = 1e-1
    success_window = 3  # consecutive successful evaluations required
    frozen_warmup_epochs = 50  # epochs without training right after freezing

    # Setup environments (same cart mass for both groups).
    bg_cart_mass = 1.0
    fg_cart_mass = 1.0

    def make_env(group, masscart, mode):
        return CartPoleRegulatorEnv(
            group=group,
            masscart=masscart,
            mode=mode,
            force_left=force_left,
            is_contrastive=is_contrastive,
        )

    train_env_bg = make_env(0, bg_cart_mass, "train")
    train_env_fg = make_env(1, fg_cart_mass, "train")
    eval_env_bg = make_env(0, bg_cart_mass, "eval")
    eval_env_fg = make_env(1, fg_cart_mass, "eval")

    # Log to File, Console, TensorBoard, W&B
    logger = get_logger()

    # Initial experience: bg transitions followed by fg transitions.
    all_rollouts = _collect_rollouts(train_env_bg, train_env_fg, init_experience)
    # Held-out rollouts from the eval environments. Fixed: the original appended
    # the *training* fg rollouts here instead of the eval ones. The set is not
    # consumed further down yet; it is still generated so the eval environments
    # are stepped exactly as before.
    all_rollouts_test = _collect_rollouts(eval_env_bg, eval_env_fg, init_experience)  # noqa: F841

    # Setup agent
    nfq_net = ContrastiveNFQNetwork(
        state_dim=train_env_bg.state_dim, is_contrastive=is_contrastive
    )
    if is_contrastive:
        # Phase 1 of contrastive training only updates the shared layers.
        optimizer = optim.Adam(
            itertools.chain(
                nfq_net.layers_shared.parameters(),
                nfq_net.layers_last_shared.parameters(),
            ),
            lr=learning_rate,
        )
    else:
        optimizer = optim.Adam(nfq_net.parameters(), lr=learning_rate)
    nfq_agent = NFQAgent(nfq_net, optimizer)

    bg_success_queue = [0] * success_window
    fg_success_queue = [0] * success_window
    epochs_fg = 0
    frozen_epochs = 0
    # Defined up front so the status line cannot hit an unbound name if the
    # network starts out with ``freeze_shared`` already set.
    loss = float("nan")

    # ``epoch_idx`` (not ``epoch``) so the loop does not shadow the parameter.
    for epoch_idx in range(epoch + 1):
        pattern_set = nfq_agent.generate_pattern_set(
            all_rollouts, reward_weights=reward_weights
        )
        state_action_b, target_q_values, groups = pattern_set

        if not nfq_net.freeze_shared:
            loss = nfq_agent.train((state_action_b, target_q_values, groups))
        else:
            # After freezing, training is paused for a warm-up period and
            # resumes once more than ``frozen_warmup_epochs`` epochs have passed.
            frozen_epochs += 1
            if frozen_epochs > frozen_warmup_epochs:
                loss = nfq_agent.train((state_action_b, target_q_values, groups))

        (
            eval_episode_length_bg,
            eval_success_bg,
            eval_episode_cost_bg,
        ) = nfq_agent.evaluate(eval_env_bg, render=False)
        (
            eval_episode_length_fg,
            eval_success_fg,
            eval_episode_cost_fg,
        ) = nfq_agent.evaluate(eval_env_fg, render=False)

        # Sliding windows over the most recent evaluation outcomes.
        bg_success_queue = bg_success_queue[1:] + [1 if eval_success_bg else 0]
        fg_success_queue = fg_success_queue[1:] + [1 if eval_success_fg else 0]

        if sum(bg_success_queue) == success_window and not nfq_net.freeze_shared:
            if epochs_fg == 0:
                epochs_fg = epoch_idx
            nfq_net.freeze_shared = True
            if verbose:
                print("FREEZING SHARED")

            if is_contrastive:
                # Fixed: the original froze the fg layers and then handed those
                # same frozen layers to the new optimizer, so in contrastive
                # mode the fg layers were never trained in either phase.
                # Freeze the shared layers and make the fg layers trainable.
                for param in itertools.chain(
                    nfq_net.layers_shared.parameters(),
                    nfq_net.layers_last_shared.parameters(),
                ):
                    param.requires_grad = False
                for param in itertools.chain(
                    nfq_net.layers_fg.parameters(),
                    nfq_net.layers_last_fg.parameters(),
                ):
                    param.requires_grad = True
            else:
                # NOTE(review): original behaviour kept for the non-contrastive
                # case. The fg layers are frozen and the new optimizer covers
                # only them, so no parameter appears to be updated after this
                # point -- confirm this is the intended baseline.
                for param in itertools.chain(
                    nfq_net.layers_fg.parameters(),
                    nfq_net.layers_last_fg.parameters(),
                ):
                    param.requires_grad = False

            nfq_agent._optimizer = optim.Adam(
                itertools.chain(
                    nfq_net.layers_fg.parameters(),
                    nfq_net.layers_last_fg.parameters(),
                ),
                lr=learning_rate,
            )

        # Print current status
        if verbose:
            logger.info(
                "Epoch {:4d} | Eval BG {:4d} / {:4f} | Eval FG {:4d} / {:4f} | Train Loss {:.4f}".format(
                    epoch_idx,
                    eval_episode_length_bg,
                    eval_episode_cost_bg,
                    eval_episode_length_fg,
                    eval_episode_cost_fg,
                    loss,
                )
            )
        if sum(fg_success_queue) == success_window:
            break

    # Final evaluation with a longer horizon.
    eval_env_bg.step_number = 0
    eval_env_fg.step_number = 0
    eval_env_bg.max_steps = 1000
    eval_env_fg.max_steps = 1000

    num_steps_bg = []
    num_steps_fg = []
    for _ in range(evaluations):
        eval_episode_length_bg, eval_success_bg, _ = nfq_agent.evaluate(
            eval_env_bg, False
        )
        if verbose:
            print(eval_episode_length_bg, eval_success_bg)
        num_steps_bg.append(eval_episode_length_bg)

        eval_episode_length_fg, eval_success_fg, _ = nfq_agent.evaluate(
            eval_env_fg, False
        )
        if verbose:
            print(eval_episode_length_fg, eval_success_fg)
        num_steps_fg.append(eval_episode_length_fg)

    # Close the environments once, after the last evaluation. The original
    # closed them inside the loop and kept evaluating on closed environments.
    for env in (train_env_bg, eval_env_bg, train_env_fg, eval_env_fg):
        env.close()

    print("Fg trained after " + str(epochs_fg) + " epochs")
    print("BG stayed up for steps: ", num_steps_bg)
    print("FG stayed up for steps: ", num_steps_fg)


def _collect_rollouts(env_bg, env_fg, n_episodes):
    """Collect ``n_episodes`` rollouts per environment; return bg transitions followed by fg ones."""
    bg_transitions = []
    fg_transitions = []
    for _ in range(n_episodes):
        # ``None`` is passed where generate_rollout takes its first argument
        # (presumably the policy/agent -- verify against the environment class).
        rollout_bg, _cost_bg = env_bg.generate_rollout(None, render=False, group=0)
        rollout_fg, _cost_fg = env_fg.generate_rollout(None, render=False, group=1)
        bg_transitions.extend(rollout_bg)
        fg_transitions.extend(rollout_fg)
    return bg_transitions + fg_transitions

In [None]:
def getRhos(W=False, PD=True, vset='test'):
    """Compute per-episode importance weights of the evaluation policy vs. the behaviour policy.

    NOTE(review): this function reads ``self`` (``NV``, ``maxT``, ``visits``,
    ``getEpisode``, ``piB``, ``piE``) but does not take it as a parameter, so
    it only works if a global named ``self`` exists in the kernel. It looks
    like a method lifted out of its class -- confirm and re-attach it.

    Args:
        W: If true, normalise the weights per time step by their sum over
            episodes; otherwise divide by the number of episodes.
        PD: If true, use per-step cumulative probability ratios, clipped to
            [1e-3, 1e3]. If false, the weights keep their initial value of 1.
        vset: Key selecting the episode set in ``self.NV`` / ``self.visits``.

    Returns:
        A list with one dict per episode holding the keys ``'s'``, ``'a'``,
        ``'phi'``, ``'r'`` (as returned by ``self.getEpisode``) and ``'rho'``,
        the normalised weight vector of length ``self.maxT``.
    """
    n_actions = 4
    n_episodes = self.NV[vset]
    rhos = np.ones([n_episodes, self.maxT])
    epRhos = []

    # The file does ``import tqdm`` (the module), so calling ``tqdm(...)``
    # directly fails; fall back to the name itself in case it was rebound to
    # the progress-bar class elsewhere.
    progress = getattr(tqdm, 'tqdm', tqdm)

    # ``ep_idx`` is kept distinct from the inner loop variable: the original
    # reused ``i`` for both, so padded episodes wrote into the wrong row.
    for ep_idx, v in enumerate(progress(self.visits[vset])):

        # load episode
        states, actions, phis, rewards = self.getEpisode(v, vset)
        epT = len(states)
        T = min(epT, self.maxT)
        steps = np.arange(epT)

        # probability of the taken action under the behaviour policy
        prob_b = self.piB['cat'].predict_proba(states)[steps, actions][:T]

        # probability of the taken action under the evaluation policy
        probs_e = self.piE['cat'].predict_proba(states)
        if probs_e.shape[1] < n_actions:
            # The model knows fewer than ``n_actions`` classes: scatter its
            # columns into a full-width table, leaving unseen actions at 0.
            # Column order follows ``classes_`` when the model exposes it
            # (scikit-learn convention); otherwise fall back to the original
            # heuristic of using the unique predicted labels.
            classes = getattr(self.piE['cat'], 'classes_', None)
            if classes is None:
                classes = np.unique(self.piE['cat'].predict(states))
            # One row per step of the episode (the original allocated only T
            # rows, which broke for episodes longer than maxT).
            padded = np.zeros([epT, n_actions])
            for col, action in enumerate(classes):
                padded[:, action] = probs_e[:, col]
            probs_e = padded
        prob_e = probs_e[steps, actions][:T]

        # calculate importance weights
        if PD:
            # per-step cumulative weights, clipped to a sane range
            invprop = np.clip(np.cumprod(prob_e / prob_b, axis=0), 1e-3, 1e3)
            rhos[ep_idx, :T] = invprop
            if T > 0:
                # carry the last weight forward to pad up to maxT
                rhos[ep_idx, T:] = invprop[-1]

        # Recorded for every episode; the original only appended when PD was
        # true, which made the normalisation loop below fail for PD=False.
        epRhos.append({'s': states, 'a': actions, 'phi': phis, 'r': rewards})

    norm = np.sum(rhos, axis=0) if W else n_episodes
    for ep_idx, record in enumerate(epRhos):
        record['rho'] = rhos[ep_idx, :] / norm

    return epRhos

In [None]:
def find_feature_expectations(rhos, behav=False, vset='train'):
    """Estimate discounted feature expectations from per-episode records.

    NOTE(review): this function reads ``self`` (``gamma``, ``maxT``, ``NV``)
    but does not take it as a parameter, so it only works if a global named
    ``self`` exists in the kernel. It looks like a method lifted out of its
    class -- confirm and re-attach it.

    Args:
        rhos: Sequence of dicts, one per episode, each with a ``'phi'`` entry
            (per-step feature rows) and, when ``behav`` is false, a ``'rho'``
            entry (per-step importance weights).
        behav: If true, ignore the importance weights and average the
            discounted feature sums directly.
        vset: Key into ``self.NV`` giving the number of episodes to use.

    Returns:
        The mean over the first ``self.NV[vset]`` episodes of the discounted
        (and, unless ``behav``, importance-weighted) sum of feature rows.
        The discount applied to step ``t`` (0-based) is ``gamma ** (t + 1)``.
    """
    # NOTE(review): if ``rhos`` comes from getRhos, ``'rho'`` is already
    # divided by the episode count (or the per-step weight sum), and np.mean
    # below divides by the episode count again -- confirm this is intended.
    discounts = np.array([self.gamma ** (t + 1) for t in range(self.maxT)])

    if behav:
        print('Simple averaging')
    else:
        print('PDWIS estimate')

    per_episode = []
    for i in range(self.NV[vset]):
        # Truncate features to maxT steps. The discount and weight vectors are
        # at most maxT long, so the original raised a broadcast error for any
        # episode longer than maxT.
        phi = np.asarray(rhos[i]['phi'])[:self.maxT]
        T = len(phi)
        step_weights = discounts[:T]
        if not behav:
            step_weights = step_weights * np.asarray(rhos[i]['rho'])[:T]
        per_episode.append(np.sum(phi * step_weights[:, np.newaxis], axis=0))

    estimated_mu = np.mean(np.vstack(per_episode), axis=0)
    return estimated_mu

In [None]:
# NOTE(review): an empty `def find_feature_expectations():` stub stood in this cell.
# It had no body (a SyntaxError on execution) and, once given one, would have
# shadowed the three-argument `find_feature_expectations(rhos, behav, vset)`
# defined in the previous cell.
# TODO: delete this cell, or implement the intended helper under a distinct name.

In [None]:
# Outer loop: alternate FQI training with a gradient-style update of the reward weights.
epochs = 10
learning_rate = 0.5
init_w = [0.1]

# Initialize reward weights as a float array. The original kept `init_w` as a
# plain list, for which the `w += <float>` update below raises TypeError.
w_vecs = []
if init_w is None:
    # NOTE(review): `reward_weights` is not assigned before this point in the
    # visible cells; this branch needs it to exist already in the kernel.
    w = np.ones(len(reward_weights)) / float(len(reward_weights))
else:
    w = np.asarray(init_w, dtype=float)

muB = None

# TODO: the original carried an unfinished note here:
# "Find the difference between just training on the train set and ..."

for i in range(epochs):
    print('Epoch', i, '- Train pi with current w=', w)
    reward_weights = w
    # Store a copy so later updates cannot alter the recorded history.
    w_vecs.append(w.copy())
    try:
        runFQI(reward_weights=reward_weights)

        print('Evaluate feature expectations for pi')
        # This gives us importance sampling. We should do it for all samples instead.
        # epRhos = self.getRhos(vset='train')
        # NOTE(review): `self`, `epRhos` and `norm` are not defined in the
        # visible cells, so the lines below currently raise NameError and every
        # epoch falls through to the "skip update" branch. Confirm where they
        # are meant to come from.
        mu = self.find_feature_expectations(epRhos, behav=False, vset='train')
        print(mu)

        print('Initialize behaviour mu:')
        if muB is None:
            muB = self.find_feature_expectations(epRhos, behav=True, vset='train')
        print(muB)

        print('Gradient update for new w')
        grad = norm(muB) - norm(mu)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so Ctrl-C / kernel interrupt
        # still stops a long training run; the cause is reported, not hidden.
        print('Error - skip update:', repr(exc))
        grad = 0
    # Decayed step, then L1-normalise. Out-of-place so `w_vecs` entries stay intact.
    w = w + learning_rate * (0.95 ** i) * grad
    l1 = np.sum(np.abs(w))
    if l1 > 0:
        w = w / l1

# Final weights, displayed as the cell output. The original ended with a
# top-level `return w`, which is a SyntaxError outside a function.
w