In [10]:
import os, sys
import json
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Simulation
* Simulate RL data from two different transition distributions (foreground and background) and generate transition tuples

In [7]:
# Build the two transition matrices. Each has 12 rows (10 state entries plus
# 2 action entries) and 10 columns (next-state entries), and each is drawn
# from its own distribution.

# Foreground: entries sampled from a Gamma(shape, scale) distribution.
shape = 2.
scale = 1.
transition_foreground = np.random.gamma(shape, scale, size=(12, 10))

# Background: entries sampled from a Normal(mu, sigma) distribution
# (mu is the mean, sigma the standard deviation).
mu = 0
sigma = 0.1
transition_background = np.random.normal(loc=mu, scale=sigma, size=(12, 10))

In [8]:
# Linear reward weights: one Normal(mu, sigma) coefficient for each of the
# 12 entries of the concatenated state/action vector.
mu = 0
sigma = 2
reward_function = np.random.normal(loc=mu, scale=sigma, size=(12, 1))

In [30]:
# Simulation parameters
exploit = 0.8          # probability of taking the reward-maximising action
explore = 1 - exploit  # complement of `exploit`
num_samples = 1000     # number of transition tuples to generate
# Every combination of two binary action components: [0, 0], [0, 1], [1, 0], [1, 1]
actions = [[first, second] for first in (0, 1) for second in (0, 1)]
# Mean and standard deviation of the normal distribution initial states are drawn from
mu = 0
sigma = 1

In [42]:
# Simulate `num_samples` one-step transitions.
#
# For every sample:
#   1. draw an initial state `s` (10 x 1) from Normal(mu, sigma);
#   2. pick an action: with probability `exploit` the action with the highest
#      linear reward, otherwise one chosen uniformly from `actions`;
#   3. pick the foreground or background transition matrix with equal probability;
#   4. compute the noisy next state and noisy reward for the chosen (state, action).
#
# Fixes relative to the previous version:
#   * `np.random.choice(1)` can only return 0, so the background matrix was never
#     used and every tuple was labelled foreground; `np.random.choice(2)` is used.
#   * `np.random.choice(3)` could never explore the last action; the draw now
#     covers `len(actions)`.
#   * In the exploit branch the next state was computed from the last candidate
#     action's `s_a` rather than the selected action's; `s_a` is now rebuilt from
#     the chosen action.
#   * The stored action is always a plain list of ints and the reward always has
#     shape (1,), regardless of branch.
transition_tuples = []
for i in range(num_samples):
    # All initial states are generated from random normal
    s = np.random.normal(mu, sigma, (10, 1))

    if random.uniform(0, 1) < exploit:
        # Exploit: score every candidate action with the linear reward model
        # and keep the index of the best one.
        all_rewards = [
            np.dot(
                reward_function.T,
                np.concatenate((s, np.reshape(candidate, (2, 1)))),
            ).item()
            for candidate in actions
        ]
        action_index = int(np.argmax(all_rewards))
    else:
        # Explore: sample uniformly over *all* available actions.
        action_index = np.random.choice(len(actions))

    # Copy so the tuple never aliases the shared `actions` entries.
    a = list(actions[action_index])
    # Concatenate the state and the chosen action into a (12, 1) column vector.
    s_a = np.concatenate((s, np.reshape(a, (2, 1))))

    # Decide which transition matrix: 0 -> foreground, 1 -> background.
    flip = np.random.choice(2)
    transition = transition_foreground if flip == 0 else transition_background

    # Next state and reward for the chosen action, each with small Gaussian noise.
    ns = np.dot(s_a.T, transition) + np.random.normal(0, 0.01, (1, 10))
    reward = np.dot(reward_function.T, s_a).ravel() + np.random.normal(0, 0.01, 1)

    # Transition tuple includes state, action, next state, reward, indication of foreground/background
    # 1 if foreground 0 if background
    transition_tuples.append((s, a, ns, reward, 1 - flip))

In [43]:
# Inspect the first simulated tuple: (state, action, next state, reward, foreground flag)
transition_tuples[0]

(array([[ 0.73812211],
        [ 0.96647785],
        [ 0.39174267],
        [ 0.79444951],
        [ 1.51031674],
        [-0.79999296],
        [-1.09701259],
        [-0.25926689],
        [ 0.63007354],
        [ 0.28462285]]),
 [1, 1],
 array([[ 4.56287606,  1.63101572, 18.00211842,  9.84517179, 17.63163768,
         12.81082134, 11.40057412, 15.23515288,  7.19980815, 14.58702076]]),
 array([4.87665246]),
 1)

# FQI