In [10]:
import os, sys
import json
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Simulation
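* Simulate RL data under two different transition distributions (a gamma-distributed "foreground" matrix and a normally distributed "background" matrix), then generate transition tuples `(s, a, s', r)`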

In [7]:
# Seed both random number generators used in this notebook (NumPy's global
# generator and the stdlib `random` module) before the first random draw, so
# that Restart & Run All reproduces the same simulated data.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Generate transition matrices, separate distributions for each one.
# Both matrices have shape (12, 10): they are right-multiplied onto a
# transposed 12-dim (10 state + 2 action) column vector to yield a
# 10-dim next state.
shape, scale = 2., 1.
transition_foreground = np.random.gamma(shape, scale, (12, 10))

mu, sigma = 0, 0.1 # mean and standard deviation
transition_background = np.random.normal(mu, sigma, (12, 10))

In [8]:
# Linear reward weights: one normally distributed coefficient per entry of the
# 12-dim state-action vector, stored as a (12, 1) column.
# NOTE: this rebinds the notebook-level `mu` / `sigma`.
mu = 0
sigma = 2
reward_function = np.random.normal(loc=mu, scale=sigma, size=(12, 1))

In [30]:
# Simulation parameters
exploit = 0.8              # probability of taking the greedy action
explore = 1 - exploit      # probability of taking a random action
num_samples = 1_000        # number of transition tuples to generate

# Every combination of two binary action components, in lexicographic order.
actions = [[first, second] for first in (0, 1) for second in (0, 1)]

# Mean / standard deviation used when drawing initial states
# (rebinds the notebook-level `mu` / `sigma`).
mu = 0
sigma = 1

In [31]:
# Roll out `num_samples` independent one-step transitions.
# Each stored tuple is (s, a, ns, reward):
#   s      -- (10, 1) initial state drawn from N(mu, sigma)
#   a      -- the chosen action as a plain list of two ints
#   ns     -- (1, 10) next state, s_a.T @ <transition matrix>
#   reward -- Python float, reward_function.T @ s_a
transition_tuples = []
for _ in range(num_samples):
    # All initial states are generated from random normal
    s = np.random.normal(mu, sigma, (10, 1))

    # One (12, 1) state-action column vector per candidate action.
    candidates = [
        np.concatenate((s, np.reshape(np.asarray(a), (2, 1))))
        for a in actions
    ]

    if random.uniform(0, 1) < exploit:
        # Exploit: pick the action with the highest immediate reward.
        all_rewards = [np.dot(reward_function.T, s_a).item() for s_a in candidates]
        action_idx = int(np.argmax(all_rewards))
    else:
        # Explore: pick uniformly among ALL actions. `len(actions)` rather than
        # a hard-coded 3, which could never select the last action.
        action_idx = np.random.choice(len(actions))

    # Use the state-action vector of the action actually chosen, so the reward
    # and the next state are consistent with the recorded action.
    a = actions[action_idx]
    s_a = candidates[action_idx]
    reward = np.dot(reward_function.T, s_a).item()

    # Decide which transition matrix. np.random.choice(2) samples from {0, 1};
    # np.random.choice(1) would always return 0 and never use the background.
    if np.random.choice(2) == 0:
        ns = np.dot(s_a.T, transition_foreground)
    else:
        ns = np.dot(s_a.T, transition_background)

    # list(a) copies the action so stored tuples do not alias `actions`.
    transition_tuples.append((s, list(a), ns, reward))

In [32]:
# Inspect the first sampled transition; each entry is a tuple of
# (state, action, next state, reward) as appended by the sampling loop.
transition_tuples[0]

(array([[-0.1008517 ],
        [-1.57502093],
        [ 0.69657444],
        [ 1.14820428],
        [ 0.40980247],
        [-0.25693566],
        [ 0.46171094],
        [ 0.81657324],
        [ 1.16787237],
        [ 1.3785051 ]]),
 [1, 1],
 array([[ 9.17417014, 10.36570878,  7.4759017 , 16.7367186 , 13.26857162,
         11.31506481, 21.0419554 ,  5.39447119, 16.19687638, 17.06996801]]),
 -1.6607431292157382)

# FQI