### DQN confidence prediction using Conformal Prediction

In [1]:
# !pip install Box2D
# !pip install 'gym[all]'
# !pip install pyyaml
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict

import time
import sys
from tqdm import tqdm
import yaml
import re

from dqnetwork import DQNetwork
from agent import Agent

import torch
import torch.nn as nn
import torch.nn.functional as F

from os import listdir, getcwd
from os.path import isabs, join

In [2]:
## Load the environment
# `env` is kept at notebook scope: generate_trajectory() below reads it as a
# global for reset()/step(), so this cell must run before any rollout.
env_id = 'LunarLander-v2'
env = gym.make(env_id)

## Step 1. Load Expert & Evaluation Policies

In [3]:
# Build the two agents from saved DQN weight files. The paths are relative,
# presumably resolved against the notebook's working directory -- TODO confirm
# how Agent handles `path`.
behavior_agent = Agent(path="../../models/behavior_DQN_policy.pth") # behavior (expert) policy: drives the rollout
eval_agent   = Agent(path="../../models/evaluation_DQN_policy.pth") # evaluation policy: only queried on visited states

Model loaded into local and target networks!
Model loaded into local and target networks!


## Step 2. Run a Trajectory Rollout

In [4]:
def generate_trajectory(b_agent: "Agent", e_agent: "Agent", environment=None):
    """
    Roll out one episode with the behavior agent and record, for every visited
    state, what both the behavior and the evaluation agent would do.

    Only the behavior agent's action is applied to the environment; the
    evaluation agent is queried on the same states but never steps it.

    @Param:
    1. b_agent     - (Agent) Behavior policy; its actions drive the rollout.
    2. e_agent     - (Agent) Evaluation policy; queried on each visited state.
    3. environment - (optional) object exposing `reset()` and a `step(action)`
                     that returns `(next_state, reward, done, info)`.
                     Defaults to the notebook-level `env`.
    @Return (tuple, in this order):
    - states    - (np.array) state observed at each timestep.
    - e_actions - (np.array) action chosen by the evaluation agent per timestep.
    - b_actions - (np.array) action chosen (and executed) by the behavior agent.
    - e_probs   - (np.array) second value returned by `e_agent.get_action` per
                  timestep (presumably per-action probabilities -- confirm in Agent).
    - b_probs   - (np.array) same as above for the behavior agent.

    Side effect: prints the episode's total reward.
    """
    # Fall back to the notebook-global environment so existing two-argument
    # calls keep working unchanged.
    rollout_env = env if environment is None else environment

    states, e_actions, b_actions, e_probs, b_probs = [], [], [], [], []

    state = rollout_env.reset()
    total_reward = 0
    done = False  # terminal condition

    while not done:
        # Query both policies on the *same* state so their outputs are
        # directly comparable timestep by timestep.
        b_action, b_prob = b_agent.get_action(state, eps=0)
        e_action, e_prob = e_agent.get_action(state, eps=0)

        # The trajectory follows the behavior (expert) policy only.
        next_state, reward, done, _ = rollout_env.step(b_action)
        total_reward += reward

        # Record the state the decisions were made in, not the successor.
        states.append(state)
        e_actions.append(e_action)
        b_actions.append(b_action)
        b_probs.append(b_prob)
        e_probs.append(e_prob)

        state = next_state

    print("TOTAL REWARD FROM EXPERT POLICY", total_reward)

    return (
        np.array(states),
        np.array(e_actions),
        np.array(b_actions),
        np.array(e_probs),
        np.array(b_probs),
    )

In [5]:
# Unpack in generate_trajectory's return order:
#   feature_matrix <- visited states
#   Y_pred         <- evaluation agent's actions
#   Y_test         <- behavior agent's actions (the ones actually executed)
#   eval_prob / behv_prob <- second output of get_action for each agent
feature_matrix, Y_pred, Y_test, eval_prob, behv_prob = generate_trajectory(behavior_agent, eval_agent)

TOTAL REWARD FROM EXPERT POLICY 199.08916180202797


In [6]:
# Sanity check: every array should share the same leading (timestep) dimension.
feature_matrix.shape, Y_pred.shape, Y_test.shape, eval_prob.shape, behv_prob.shape

((851, 8), (851,), (851,), (851, 1, 4), (851, 1, 4))

## Step 3. Conformal Prediction

In [7]:
## TODO: the conformal prediction step is not implemented yet.