In [4]:
import pandas as pd
import numpy as np
import pickle, os, csv, math, time, joblib
from joblib import Parallel, delayed
import datetime as dt
from datetime import date, datetime, timedelta
from collections import Counter
import copy as cp
import tqdm
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import log_loss, f1_score, precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import collections 
#import shap
import seaborn as sns
import random

# Simulation
* Simulate RL data from two different distributions, generate transition tuples

In [5]:
# Generate transition matrices, separate distributions for each one
shape, scale = 2., 1. 
transition_foreground = np.random.gamma(shape, scale, (12, 10))

mu, sigma = 0, 0.1 # mean and standard deviation
transition_background = np.random.normal(mu, sigma, (12, 10))

In [6]:
# Generate reward function
mu, sigma = 0, 2
reward_function = np.random.normal(mu, sigma, (12, 1))

In [7]:
# Params
exploit = 0.8
explore = 1-exploit
num_samples = 10000
actions = [[0, 0], [0, 1], [1, 0], [1, 1]]
mu, sigma = 0, 1

In [8]:
transition_tuples = []
for i in range(num_samples):
    # All initial states are generated from random normal
    s = np.random.normal(mu, sigma, (10, 1))
    
    flip = random.uniform(0, 1)
    # Exploit
    if flip < exploit:
        # Decide which transition matrix
        flip = np.random.choice(1)
        
        all_rewards = []
        for j, a in enumerate(actions):
            a = np.asarray(a)
            a = np.reshape(a, (2, 1))
            s_a = np.concatenate((s, a))
            reward = np.dot(reward_function.T, s_a)
            all_rewards.append(reward)
        
        noise = np.random.normal(0, 0.01, 1)
        all_rewards = np.asarray(all_rewards)
        a = actions[np.argmax(all_rewards)]
        reward = np.max(all_rewards) + noise
        
        if flip == 0:
            ns = np.dot(s_a.T, transition_foreground) 
        else:
            ns = np.dot(s_a.T, transition_background) 
        ns = np.add(ns , np.random.normal(0, 0.01, (1, 10))) # Add noise
    # Explore
    else:
        a = np.asarray(actions[np.random.choice(3)])
        a = np.reshape(a, (2, 1))
        s_a = np.concatenate((s, a)) # concatenate the state and action
        
        # Decide which transition matrix
        flip = np.random.choice(1)
        if flip == 0:
            ns = np.dot(s_a.T, transition_foreground)
        else:
            ns = np.dot(s_a.T, transition_background)
        reward = np.dot(reward_function.T, s_a) + np.random.normal(0, 0.01, 1)
        ns = np.add(ns , np.random.normal(0, 0.01, (1, 10))) # Add noise
    
    # Transition tuple includes state, action, next state, reward, indication of foreground/background
    # 1 if foreground 0 if background
    transition_tuples.append((s, list(a), ns, reward, 1 - flip))

In [15]:
split = int(0.8*len(transition_tuples))
train_tuples = transition_tuples[:split]
test_tuples = transition_tuples[split:]

In [16]:
train = {}
test = {}
elts = ['s', 'a', 'ns', 'r', 'ds']
for elt in elts:
    train[elt] = []
    test[elt] = []

for tup in train_tuples:
    train['s'].append(tup[0])
    train['a'].append(tup[1])
    train['ns'].append(tup[2])
    train['r'].append(tup[3])
    train['ds'].append(tup[4])

for tup in test_tuples:
    test['s'].append(tup[0])
    test['a'].append(tup[1])
    test['ns'].append(tup[2])
    test['r'].append(tup[3])
    test['ds'].append(tup[4])

# FQI

* Fit vanilla FQI algorithm, with linear regression on both data
* Do mixed model linear contrastive case
* What do we expect when we validate it?

In [45]:
# Define action space, the potential classes of action items. 
def a2c(a):
    actions = [[0, 0], [0, 1], [1, 0], [1, 1]]
    classes = []
    for action in a:
        c = np.where(actions==action)
        classes.append(c)
    
    return classes
        
# Mapping states to actions?        
def c2a(c):
    d = {0: [0, 0], 1: [0, 1], 2: [1, 0], 3: [1, 1]}
    return np.array([d[k] for k in c])

def random_weights(size=5):
    
    #w = 2*np.random.uniform(size=size) - 1
    w = norm(np.random.uniform(size=size))
    #w / np.sum(np.abs(w))
    
    return w

def norm(vec):
    return vec/np.sum(np.abs(vec))

def learnBehaviour(training_set, test_set):  
    floc = "simulated_fqi/behavior.pkl"
    behaviour_pi = LGBMClassifier(n_estimators=2000, class_weight='balanced', num_leaves=100)
    behaviour_pi.fit(np.vstack((training_set['s'], test_set['s'])),
                         a2c(np.vstack((training_set['a'], test_set['a']))))
    pickle.dump(behaviour_pi, open(floc, 'wb'))    
    
    return behaviour_pi

In [46]:
class FQIagent():
    def __init__(self, train_tuples, test_tuples, iters=100, gamma=0.99, batch_size=100, prioritize=True, estimator='gbm',
                 weights=np.array([1, 1, 1, 1, 1])/5., maxT=36):
        
        self.iters = iters
        self.gamma = gamma
        self.batch_size = batch_size
        self.prioritize_a = prioritize
        self.training_set = train_tuples
        self.test_set = test_tuples
        self.visits = {'train': len(train_tuples), 'test': len(test_tuples)}
        self.NV = {'train': len(train_tuples), 'test': len(test_tuples)}
        self.n_samples = len(test_tuples)
        self.actions, self.unique_actions, self.action_counts, self.n_actions = self.sub_actions()
        self.state_feats = [str(x) for x in range(10)]
        self.n_features = len(self.state_feats)
        self.reward_weights = weights
        self.maxT = maxT
        self.piB = learnBehaviour(self.training_set, self.test_set)
        
        if estimator == 'tree':
            self.q_est = {}
            for k in self.piB.keys():
                self.q_est[k] = ExtraTreesRegressor(n_estimators=50, max_depth=None, min_samples_leaf=10, min_samples_split=2,
                                             random_state=0)
        elif estimator == 'gbm':
            self.q_est = {}
            for k in self.piB.keys():
                self.q_est[k] = LGBMRegressor(n_estimators=50, silent=True)
        elif estimator == 'nn':
            self.q_est = None
            
        self.piE = {}
        for k in self.piB.keys():
            self.piE[k] = LGBMClassifier(n_estimators=50, silent=True)
        
        self.eval_est = LGBMRegressor(n_estimators=50, silent=True)

    def sub_actions(self):
        
        a = self.training_set['a']

        unique_actions = 0
        action_counts = 0
        n_actions = 0
        
        unique_actions, action_counts = np.unique(a, axis=0, return_counts=True)
        n_actions = len(unique_actions)
                
        return actions, unique_actions, action_counts, n_actions
        
    def sample_probs(self):
        
        if self.prioritize_a:
            sample_probs = 1.*np.ones(self.n_samples)/self.n_samples
            for i, a in enumerate(self.unique_actions[self.ak]):
                sample_probs[np.where((self.actions[self.ak]==a).all(axis=1))[0]] = 1./self.action_counts[self.ak][i]

            sample_probs = sample_probs/np.sum(sample_probs)
        else:
            sample_probs = None
            
        return sample_probs
    
    def sampleTuples(self):
    
        # Get batch of (prioritized) samples
        v = self.training_set['vnum']
        choose_from = list(set(range(self.n_samples)) - set(np.where(np.roll(v,1)!=v)[0] - 1))[:-1]
        self.batch_size = len(choose_from)
        probs = self.sample_probs()
        if probs is not None:
            probs = probs[choose_from]/np.sum(probs[choose_from])
        ids = np.random.choice(choose_from, self.batch_size, replace=False, p=probs)
        batch = {}
        for k in self.training_set.keys():
            batch[k] = self.training_set[k][ids]
        batch['a'] = self.actions[self.ak][ids]
        batch['r'] = np.dot(batch['phi']*[1, 1, 10, 10, 100], self.reward_weights)
        #print(batch['phi'], batch['r'], batch['a'])
        batch['s_ids'] = ids
        batch['ns_ids'] = ids + 1
    
        return batch
    
    def fitQ(self, batch, Q):
        
        # input = [state action]
        x =  np.hstack((batch['s'], batch['a']))
        
        # target = r + gamma * max_a(Q(s', a))      == r for first iteration
        y = batch['r'] + (self.gamma * np.max(Q[batch['ns_ids'], :], axis=1))
        
        self.q_est[self.ak].fit(x, y)   
    
    def updateQtable(self, Qtable, batch):
        
        for i, a in enumerate(self.unique_actions[self.ak]):
            #print(a, i)
            Qtable[batch['s_ids'], i] = self.q_est[self.ak].predict(np.hstack((batch['ns'], np.tile(a, (self.batch_size,1)))))
        return Qtable
    
    def runFQI(self, repeats=1):
        
        print('Learning policy:', self.ak)
        meanQtable = np.zeros((self.n_samples, self.n_actions[self.ak]))
        
        for r in range(repeats):
            print('Run', r, ':')
            print('Initialize: get batch, set initial Q')
            Qtable = np.zeros((self.n_samples, self.n_actions[self.ak]))
            Qdist = []

            #print('Run FQI')
            for iteration in range(self.iters):

                # copy q-table
                Qold = cp.deepcopy(Qtable)

                # sample batch  
                batch = self.sampleTuples() 

                # learn q_est with samples, targets from batch
                self.fitQ(batch, Qtable)

                # update Q table for all s given new estimator
                self.updateQtable(Qtable, batch)

                # check divergence from last estimate
                Qdist.append(mean_absolute_error(Qold, Qtable))
         
            #plt.plot(Qdist)
            meanQtable += Qtable
        
        meanQtable = meanQtable / repeats
        print('Learn policy')
        self.getPi(meanQtable)
        return Qdist
                    
    
    def getPi(self, Qtable):
        optA = np.argmax(Qtable, axis=1)
        #print("Fitting to training set")
        #print("Optimal actions: ", optA)
        self.piE[self.ak].fit(self.training_set['s'], optA)
        #print("Done Fitting")
        
        
    def testPi(self, vnum, behavior):

        if vnum in self.test_set['vnum']: data = self.test_set
        elif vnum in self.training_set['vnum']: data = self.training_set

        inds = np.where(data['vnum']==vnum)[0]
        #evalA = self.piE['cat'].predict(data['s'][inds])
        #behavA = a2c(data['a'][inds], 'cat')
        
        # actions based on policy we learn
        evalA = self.piE[self.ak].predict(data['s'][inds])
        # actual historical actions
        behavA = a2c(data['a'][inds], self.ak)
        # predicted actions based on historical actions model
        behavB = behavior[self.ak].predict(data['s'][inds])

        #plt.title(vnum)

        return inds, evalA, behavA, behavB
               
    def fitQeval(self, batch, Q):
        
        # input = [state action]
        x =  np.hstack((batch['s'], batch['a']))
                
        # target = r + gamma * (Q(s', pi(a)))   
        expected_a = self.piE['cat'].predict(batch['ns'])
        y = batch['r'] + (self.gamma * (Q[batch['ns_ids'], expected_a]))
        
        self.eval_est.fit(x, y)

    def updateQEtable(self, Qtable, batch):
        
        for i, a in enumerate(self.unique_actions[self.ak]):
            #print(a, i)
            Qtable[batch['s_ids'], i] = self.eval_est.predict(np.hstack((batch['ns'], np.tile(a, (self.batch_size,1)))))
        return Qtable
    
    def runFQE(self):
        
        print('Initialize: get batch, set initial Q')
        QEtable = np.random.rand(self.n_samples, self.n_actions[self.ak]) - 1
        QEdist = []
        
        print('Run FQE')
        for iteration in range(self.iters):
            
            # copy q-table
            QEold = cp.deepcopy(QEtable)
              
            # sample batch  
            batch = self.sampleTuples() 
             
            # learn q_est with samples, targets from batch
            self.fitQeval(batch, QEtable)
              
            # update Q table for all s given new estimator
            self.updateQEtable(QEtable, batch)
              
            # check divergence from last estimate
            QEdist.append(mean_absolute_error(QEold, QEtable))
                     
        plt.plot(QEdist)
        
        Vvals = np.max(QEtable, axis=1)
        Vest = LGBMRegressor(n_estimators=100, silent=False)
        Vest.fit(self.training_set['s'], Vvals)
        
        estimated_v = Vest.predict(self.test_set['s'])
        
        return estimated_v

    def getEpisode(self, v, vset='test'):
        
        if vset=='test': dataset = self.test_set
        else: dataset = self.training_set
        ep = {}
        inds = np.where(dataset['vnum'] == v)[0]
        for k in dataset.keys():
            ep[k] = dataset[k][inds]
            
        states = ep['s']
        actions = a2c(ep['a'], ak='cat')
        phis = ep['phi']
        rewards = np.dot(ep['phi'], self.reward_weights)
        
        return states, actions, phis, rewards

    def getRhos(self, W=False, PD=True, vset='test'):

        rhos = np.ones([self.NV[vset], self.maxT])
        epRhos = []

        for i, v in enumerate(self.visits[vset]):
            
            # load episode
            states, actions, phis, rewards = self.getEpisode(v, vset)
            epT = len(states)
            T = min(epT, self.maxT)            
            
            # load action probabilities (unclear why the T isn't appropriate here)
            #print(str(self.piB['cat'].predict_proba(states).shape))
            #print(str(np.arange(epT)))
            #print(str(actions))
            #print(str(np.arange(epT), actions))
            prob_b = self.piB['cat'].predict_proba(states)[np.arange(epT), actions][:T]
            #print(str(prob_b))
            if self.piE['cat'].predict_proba(states).shape[1] < 4:
                #print('Warning: < 4 classes')
                prob_e = np.zeros([T, 4])
                cols = np.unique(self.piE['cat'].predict(states))
                probs = self.piE['cat'].predict_proba(states)
                for i, a in enumerate(cols):
                    prob_e[:, a] = probs[:, i]
                prob_e = prob_e[np.arange(epT), actions][:T]
            else:
                prob_e = self.piE['cat'].predict_proba(states)[np.arange(epT), actions][:T]
            
            # calculate importance weights
            if PD:
                # per-step cumulative weights
                invprop = np.cumprod(prob_e/prob_b, axis=0)
                # clip importance weights
                invprop[invprop<1e-3] = 1e-3
                invprop[invprop>1e3] = 1e3
                
                rhos[i, :len(invprop)] = list(invprop)
                rhos[i, len(invprop):] = np.ones(self.maxT-len(invprop)) * rhos[i,len(invprop)-1]
                epRhos.append({'s': states, 'a': actions, 'phi': phis,'r': rewards})  
        norm = self.NV[vset]
        if W: norm = np.sum(rhos, axis=0)    
        for i in range(self.NV[vset]): 
            epRhos[i]['rho'] = rhos[i,:] / norm
            
        return epRhos
            
    def perdecisionIS(self, vset='test', W=False):
        
        rhos = self.getRhos(W=W)
        T = [len(rhos[i]['phi']) for i in range(len(rhos))]        
        gamma_vec = [self.gamma**(i+1) for i in range(self.maxT)]                    
        
        estimated_mu = np.vstack([np.sum(rhos[i]['phi'][:T[i]] * (gamma_vec[:T[i]] * rhos[i]['rho'][:T[i]])[:,np.newaxis],
                                         axis=0) for i in range(self.NV[vset])])
        
        estimated_V = np.dot(estimated_mu, np.array(self.reward_weights))
                
        return estimated_mu, estimated_V        
    

    def find_feature_expectations(self, rhos, behav=False, vset='train'):

        gamma_vec = [self.gamma**(i+1) for i in range(self.maxT)]
        T = [len(rhos[i]['phi']) for i in range(len(rhos))]  

        if behav:
            print('Simple averaging')
            estimated_mu = np.mean(np.vstack([np.sum(rhos[i]['phi'] * np.array(gamma_vec[:T[i]])[:,np.newaxis], axis=0) 
                                      for i in range(self.NV[vset])]), axis=0)
        else:
            print('PDWIS estimate')
            estimated_mu = np.mean(np.vstack([np.sum(rhos[i]['phi'] * (gamma_vec[:T[i]] * rhos[i]['rho'][:T[i]])[:,np.newaxis],
                                             axis=0) for i in range(self.NV[vset])]), axis=0)

        return estimated_mu


    def irl(self, epochs=10, learning_rate=0.5, init_w=None):

        # Initialize reward weights:
        w_vecs = []
        if init_w is None:
            #w = random_weights(len(self.reward_weights))
            w = np.ones(len(self.reward_weights))/float(len(self.reward_weights))
        else:
            w = init_w
        feature_matrix = self.training_set['phi']
        muB = None

        for i in range(epochs):

            print('Epoch', i, '- Train pi with current w=', w)
            #r = np.dot(feature_matrix, w)
            self.reward_weights = w
            w_vecs.append(w)
            #try:
            self.runFQI()
            print('piE:', np.unique(self.piE['cat'].predict(self.training_set['s']), return_counts=True))

            print('Evaluate feature expectations for pi')
            epRhos = self.getRhos(vset='train')
            mu = self.find_feature_expectations(epRhos, behav=False, vset='train')
            print(mu)

            print('Initialize behaviour mu:')
            if muB is None:
                muB = self.find_feature_expectations(epRhos, behav=True, vset='train')
            print(muB)

            print('Gradient update for new w')
            grad = norm(muB) - norm(mu)
#             except:
#                 print('Error - skip update')
#                 grad = 0
            w += learning_rate*(0.95**i) * grad
            w = w/np.sum(np.abs(w))
            

        return w

In [47]:
agent = FQIagent(train_tuples=train, test_tuples=test, weights=[1, 1, 1, 1, 1])
Q_dist = agent.runFQI(repeats=1)
plt.plot(Q_dist, label= "Vanilla FQI")
plt.xlabel("Iteration")
plt.ylabel("Q Estimate")
plt.legend()
plt.show()

ValueError: Unknown label type: 'unknown'