### DQN confidence prediction using Conformal Prediction

In [1]:
# !pip install Box2D
# !pip install 'gym[all]'
# !pip install pyyaml
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict

import time
import sys
from tqdm import tqdm
import yaml
import re

from dqnetwork import DQNetwork
from agent import Agent

import torch
import torch.nn as nn
import torch.nn.functional as F

from os import listdir, getcwd
from os.path import isabs, join

In [12]:
## Load the environment
env_id = 'LunarLander-v2'
env = gym.make(env_id)

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #Enable cuda if available

# Deep Double Q Network with Prioritized Experience Replay Buffers

### Step 1. Load Expert & Evaluation Policies

In [3]:
behavior_agent = Agent(path="../../models/behavior_DQN_policy.pth") #loads behavior policy
eval_agent   = Agent(path="../../models/evaluation_DQN_policy.pth") #loads evaluation policy

Model loaded into local and target networks!
Model loaded into local and target networks!


### Step 2. Running Trajectory rollout

In [23]:
def generate_trajectory(b_agent:Agent, e_agent:Agent):
    """
    Generate Probability for an evaluation agent w.r.t behavior agent.
    @Param:
    1. b_agent - (Agent) Behavior policy.
    2. e_agent - (Agent) Evaluation policy.
    @Return:
    - actions - (np.array[np.array]) set of actions taken by behavior and evaluation agent.
    - probs   - (np.array[np.array[np.array]]) set of probability (shape - env.action_size) 
                for each timestep both agents.
    - states  - (np.array[np.array]) set of states taken by behavior and evaluation agent.
    """
    
    #### 
    states, e_actions, b_actions, e_probs, b_probs = [], [], [], [], []
    ####
    
    state = env.reset()
    total_reward = 0
    done = False #terminal condition
    
    while not done:
        b_action, b_prob = b_agent.get_action(state, eps=0) #get action using behavior agent.
        e_action, e_prob = e_agent.get_action(state, eps=0) #get action using evaluation agent.
        
        next_state, reward, done, info = env.step(b_action) #rollout from Expert (behavior) policy
        
        total_reward += reward
        
        #### append ####
        states.append(state) #append states to feature matrix
        
        e_actions.append(e_action) #append evaluation policy actions
        b_actions.append(b_action) #append behavior policy actions
        
        b_probs.append(b_prob) #append stochastic actions of behavior agent
        e_probs.append(e_prob) #append stochastic actions of evaluation agent
        #### append ####
        
        state = next_state
        
    print("TOTAL REWARD FROM EXPERT POLICY", total_reward)
    assert(total_reward >= 200) #conditioned on truth
    return np.array(states), np.array(e_actions), np.array(b_actions), np.array(e_probs), np.array(b_probs)

In [24]:
feature_matrix, Y_eval_train, Y_behv_train, eval_prob, behv_prob = generate_trajectory(behavior_agent, eval_agent)

TOTAL REWARD FROM EXPERT POLICY 241.35759855984472


In [25]:
feature_matrix.shape, Y_pred.shape, Y_test.shape, eval_prob.shape, behv_prob.shape

((582, 8), (585,), (585,), (582, 1, 4), (582, 1, 4))

# Conformal Prediction

<p>
Steps for conformal prediction:
<ol>
    <li>Calculate nonconformal score using Nearest Centroid algorithm</li>
    <li>Calculate p-values corresponding to the current possible prediction/label</li>
    <li>Output j as predicted label of the current example with p-value $p_j$ if and only if $p_j > \epsilon $ </li>
</ol>
</p>

### Nearest Centroid algorithm

In [59]:
def euclid(point, centroid):
    """Calculate the distance between a test point and a centroid point"""
    #assuming point and centroid are tensors
    point = np.array(point)
    centroid = np.array(centroid)
    return np.linalg.norm(point - centroid)

In [71]:
def centroid(data):
    """Find the centroid for one class of object"""
    features, observations = data.shape
    if(features > observations):
        raise ValueError("too few observations")
    
    central = []
    for i in range(features):
        mean = np.mean(data[i])
        central.append(mean)
    return central

In [72]:
def split_classes(data, labels):
    """Split data into N classes based on corresponding label from evaluation agent"""
    hash_table = {}
    for i in range(data.shape[0]):
        label = labels[i].item() #get class label
        if(label in hash_table):
            hash_table[label].append(data[i])
        else:
            hash_table[label] = [data[i]]
    
    #convert each value to nd.array
    for key in hash_table:
        hash_table[key] = np.array(hash_table[key])
        
    return np.array(list(hash_table.values()))

In [73]:
action_classes = split_classes(feature_matrix, Y_eval_train) #each category is labeled as chronological class

In [74]:
action_centroids = []

for matrix in action_classes:
    action_centroids.append( centroid(matrix.T) )
    
action_centroids = np.array(action_centroids) #to numpy

In [76]:
action_centroids.shape

(4, 8)