############################################################################################

In [None]:
# Minimal pattern for timing a section of code with timeit's default clock.
from timeit import default_timer

start = default_timer()  # clock reading taken just before the timed section

# do stuff  <- placeholder: put the code to be timed here

duration = default_timer() - start  # elapsed time (seconds) between the two clock readings

In [None]:
import time

# Measure an elapsed interval with perf_counter(): unlike time.time() (wall
# clock, which can jump when the system clock is adjusted), it is a monotonic,
# high-resolution clock meant for timing short durations.
t0 = time.perf_counter()
#code_block  <- placeholder: put the code to be timed here
t1 = time.perf_counter()

total = t1-t0  # elapsed seconds between the two readings

Normal distribution: about 95% of the probability mass lies in $[\mu - 2\sigma, \mu + 2\sigma]$, and about 99.7% lies in $[\mu - 3\sigma, \mu + 3\sigma]$.

Now let's compute our robust version of the Q-Learning algorithm (in finite spaces).

Import all useful libraries

In [1]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

Import the q-learning function

In [2]:
from q_learning import *

"Robust" Settings

In [3]:
# Problem size and the finite state / action grids.
nr_coins = 10
# States: nr_coins + 1 evenly spaced floats from 0 to nr_coins (0.0, 1.0, ..., 10.0).
X = np.linspace(start=0, stop=nr_coins, num=nr_coins + 1)
# Actions: the three values -1, 0 and 1.
A = np.array((-1, 0, 1))

def r(x, a, y):
    """Reward for taking action ``a`` in state ``x`` and landing in state ``y``.

    The value is ``+a`` when ``y > x``, ``-a`` when ``y < x`` and ``-|a|``
    when ``y == x``.  Hence ``a = 1`` pays off when the next state is higher,
    ``a = -1`` pays off when it is lower, any non-zero action loses on a tie,
    and ``a = 0`` always yields 0.
    """
    payoff_if_up = a * (y > x)
    payoff_if_down = a * (y < x)
    tie_penalty = np.abs(a) * (x == y)
    return payoff_if_up - payoff_if_down - tie_penalty

def P1_0(x, a):
    """Sample the next state under the baseline "fair coin" model.

    Returns one draw from Binomial(nr_coins, 0.5).  The arguments ``x`` and
    ``a`` are accepted but not used: the draw does not depend on them.
    """
    fair_coin_p = 0.5
    return binom.rvs(nr_coins, fair_coin_p)

# Two perturbed versions of the "fair coin" model: the success probability is
# shifted up (Pp_0) or down (Pm_0) by eps.
eps = 0.1

def Pp_0(x, a):
    """Draw from Binomial(nr_coins, 0.5 + eps); ``x`` and ``a`` are not used."""
    p_shifted_up = 0.5 + eps
    return binom.rvs(nr_coins, p_shifted_up)

def Pm_0(x, a):
    """Draw from Binomial(nr_coins, 0.5 - eps); ``x`` and ``a`` are not used."""
    p_shifted_down = 0.5 - eps
    return binom.rvs(nr_coins, p_shifted_down)

# Scalar settings handed to robust_q_learning in the training cell.
alpha      = 0.95 # Discount factor
x_0        = 5    # Initial state (one of the values in X)
eps_greedy = 0.1  # Epsilon of the epsilon-greedy policy (presumably the exploration probability -- confirm in q_learning)

In [4]:
Nr_iter = 100_000
SEED    = 0  # fixed seed so that a fresh "Restart & Run All" reproduces the same Q-table

# The transition kernels sample through scipy's binom.rvs, which draws from
# NumPy's global random state; seed it before training.  The stdlib generator
# is seeded as well in case robust_q_learning uses it (cannot tell from here).
np.random.seed(SEED)
random.seed(SEED)

# Robust Q-learning over the three candidate kernels (fair coin, p + eps, p - eps).
Q_opt_robust = robust_q_learning(
    X, A, r,
    np.array([P1_0, Pp_0, Pm_0]),
    alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde = lambda t: 1/(t+1),   # presumably the step-size schedule -- confirm in q_learning
    Q_0 = np.ones([len(X),len(A)]),      # all-ones array with one row per state and one column per action
)

 93%|█████████▎| 92782/100000 [04:08<00:17, 413.37it/s]

In [None]:
# Build 2-D "one row per element" views of A and X; the index helpers
# a_index / x_index look elements up by comparing against these rows.
def _as_rows(values):
    """Return ``values`` unchanged if it has more than one dimension, else wrap each element in its own row."""
    if np.ndim(values) > 1:
        return values
    return np.array([[v] for v in values])

A_list = _as_rows(A)
X_list = _as_rows(X)

def a_index(a):
    """Return the position of the first row of ``A_list`` that equals ``a`` in every component.

    Raises IndexError when no row matches.
    """
    row_matches = (a == A_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def x_index(x):
    """Return the position of the first row of ``X_list`` that equals ``x`` in every component.

    Raises IndexError when no row matches.
    """
    row_matches = (x == X_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]

# Read the greedy action off a Q-table: for a state x, pick the action whose
# Q-value is largest in the row belonging to x.
def a_opt(x, Q_opt):
    """Return the element of ``A`` with the highest value in the row of ``Q_opt`` for state ``x``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal column wins.
    """
    q_row = Q_opt[x_index(x), :]
    best_column = np.argmax(q_row)
    return A[best_column]

In [None]:
# One-row summary table: the greedy action for every state in X, with a
# leading "State" column holding the label of the method.
greedy_actions = [a_opt(x, Q_opt_robust) for x in X]
df = pd.DataFrame(np.array([greedy_actions]))
df.insert(0, "State", ["Robust, finite spaces"])
df

In [None]:
EPS = [1, 2, 0.5]  # only EPS[0] is read below (divided by nr_coins to get the perturbation size)

nr_coins = 10
X        = np.linspace(0, nr_coins, nr_coins+1)        # States
A        = np.array([-1, 0, 1])                        # Actions

def r(x,a,y):
    """Reward for action ``a`` in state ``x`` when the next state is ``y``.

    Equals ``+a`` if ``y > x``, ``-a`` if ``y < x`` and ``-|a|`` if ``y == x``.
    """
    return(a * (y>x) - a * (y<x) - np.abs(a) * (x==y)) # Reward function

def P1_0(x,a):
    """Draw the next state from Binomial(nr_coins, 0.5); ``x`` and ``a`` are not used."""
    return binom.rvs(nr_coins, 0.5) # Assumption that is a fair coin


def _make_binomial_kernel(p):
    """Build a sampler ``(x, a) -> draw from Binomial(nr_coins, p)`` with ``p`` fixed now.

    A factory is required here: a function defined directly inside the loop
    below would look up the loop variable only when it is *called*, i.e. after
    the loop has finished, so every kernel would silently use the last value
    of ``n`` (and whatever ``eps`` is bound to at call time).
    """
    def kernel(x, a):
        return binom.rvs(nr_coins, p)
    return kernel


# Adding some robustness to the model of a "fair coin":
# a grid of success probabilities 0.5 +/- eps * n / (nr_prob//2), n = 0..nr_prob//2.
eps     = EPS[0] / nr_coins
nr_prob = 50
P       = []
for n in range(nr_prob//2 + 1):
    shift = eps * n / (nr_prob//2)
    Pp = _make_binomial_kernel(0.5 + shift)
    Pm = _make_binomial_kernel(0.5 - shift)
    # n = 0 contributes the fair coin twice, so len(P) == 2 * (nr_prob//2 + 1).
    P += [Pp, Pm]

alpha      = 0.95 # Discount Factor
x_0        = 5    # Initial state
eps_greedy = 0.1  # Epsilon greedy policy

In [None]:
Nr_iter = 10_000
SEED    = 0  # fixed seed so that a fresh "Restart & Run All" reproduces the same Q-table

# The kernels in P sample through scipy's binom.rvs, which draws from NumPy's
# global random state; seed it before training.  The stdlib generator is seeded
# as well in case robust_q_learning uses it (cannot tell from here).
np.random.seed(SEED)
random.seed(SEED)

# Robust Q-learning over the whole list of candidate kernels P.
Q_opt_robust = robust_q_learning(
    X, A, r,
    np.array(P),
    alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde = lambda t: 1/(t+1),   # presumably the step-size schedule -- confirm in q_learning
    Q_0 = np.ones([len(X),len(A)]),      # all-ones array with one row per state and one column per action
)

In [None]:
# Build 2-D "one row per element" views of A and X; the index helpers
# a_index / x_index look elements up by comparing against these rows.
def _as_rows(values):
    """Return ``values`` unchanged if it has more than one dimension, else wrap each element in its own row."""
    if np.ndim(values) > 1:
        return values
    return np.array([[v] for v in values])

A_list = _as_rows(A)
X_list = _as_rows(X)

def a_index(a):
    """Return the position of the first row of ``A_list`` that equals ``a`` in every component.

    Raises IndexError when no row matches.
    """
    row_matches = (a == A_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]
def x_index(x):
    """Return the position of the first row of ``X_list`` that equals ``x`` in every component.

    Raises IndexError when no row matches.
    """
    row_matches = (x == X_list).all(axis=1)
    return np.flatnonzero(row_matches)[0]

# Read the greedy action off a Q-table: for a state x, pick the action whose
# Q-value is largest in the row belonging to x.
def a_opt(x, Q_opt):
    """Return the element of ``A`` with the highest value in the row of ``Q_opt`` for state ``x``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal column wins.
    """
    q_row = Q_opt[x_index(x), :]
    best_column = np.argmax(q_row)
    return A[best_column]

In [None]:
# One-row summary table: the greedy action for every state in X, with a
# leading "State" column holding the label of the method.
greedy_actions = [a_opt(x, Q_opt_robust) for x in X]
df = pd.DataFrame(np.array([greedy_actions]))
df.insert(0, "State", ["Robust, finite spaces"])
df