First running of q-learning algorithm

Import all useful libraries

In [1]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

Import the q-learning function

In [2]:
from q_learning import *

Settings

In [3]:
nr_coins = 10
X        = np.linspace(0, nr_coins, nr_coins+1)        # States
A        = np.array([-1, 0, 1])                        # Actions

def r(x,a,y):
    return(a * (y>x) - a * (y<x) - np.abs(a) * (x==y)) # Reward function

def P_0(x,a):
    return binom.rvs(nr_coins, 0.5) # Assumption that is a fair coin

alpha      = 0.95 # Discount Factor
x_0        = 5    # Initial Value
eps_greedy = 0.1  # Epsilon greedy policy


Training

In [4]:
Nr_iter = 10_000

Q_opt_nonrobust = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))

100%|██████████| 10000/10000 [00:05<00:00, 1772.96it/s]


In [5]:
# Build the functions that allow us to get the index of an element a (reps. x) in A (resp. X)
if np.ndim(A)>1:
    A_list = A
else:
    A_list = np.array([[a] for a in A])
if np.ndim(X)>1:
    X_list = X
else:
    X_list = np.array([[x] for x in X])

def a_index(a):
    return np.flatnonzero((a==A_list).all(1))[0]
def x_index(x):
    return np.flatnonzero((x==X_list).all(1))[0]

# Get the result of the Q-Learning algorithm,
# Get the optimal results for each x in X
def a_opt_nonrobust(x):
    return A[np.argmax(Q_opt_nonrobust[x_index(x),:])]

In [6]:
df = pd.DataFrame(np.array([[a_opt_nonrobust(x) for x in X]]))
df["State"]=["Non-Robust"]
df = df.set_index("State").reset_index()
df

Unnamed: 0,State,0,1,2,3,4,5,6,7,8,9,10
0,Non-Robust,-1,1,1,1,1,0,-1,-1,-1,-1,-1


For each states we get the best "bet" we could make.
This means that from each state we are we have an idea of what to do as an "action" to get the best result.

Some observation of the sensitivities:

In [7]:
Nr_iter    = 10_000
eps_greedy = 0.1

Q_opt_nonrobust_alpha025 = q_learning(X, A, r, P_0, 0.25, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_alpha05  = q_learning(X, A, r, P_0, 0.5, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_alpha075 = q_learning(X, A, r, P_0, 0.75, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_alpha09  = q_learning(X, A, r, P_0, 0.9, x_0, eps_greedy, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_alpha095 = q_learning(X, A, r, P_0, 0.95, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_alpha099 = q_learning(X, A, r, P_0, 0.99, x_0, eps_greedy, Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))

100%|██████████| 10000/10000 [00:04<00:00, 2030.96it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1438.86it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1540.52it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1596.03it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1720.33it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1577.15it/s]


In [8]:
def a_opt_nonrobust(x, Q_opt):
    return A[np.argmax(Q_opt[x_index(x),:])]

In [9]:
df = pd.DataFrame(np.array([[a_opt_nonrobust(x, Q_opt_nonrobust_alpha025) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_alpha05) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_alpha075) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_alpha09) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_alpha095) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_alpha099) for x in X]]))
df["State"]=["Non-Robust, alpha = 0.25","Non-Robust, alpha = 0.5","Non-Robust, alpha = 0.75","Non-Robust, alpha = 0.9", "Non-Robust, alpha = 0.95", "Non-Robust, alpha = 0.99"]
df = df.set_index("State").reset_index()
df


Unnamed: 0,State,0,1,2,3,4,5,6,7,8,9,10
0,"Non-Robust, alpha = 0.25",1,1,1,1,1,0,-1,-1,-1,-1,-1
1,"Non-Robust, alpha = 0.5",1,1,1,1,0,0,-1,-1,-1,-1,-1
2,"Non-Robust, alpha = 0.75",1,1,1,1,1,0,-1,-1,-1,-1,-1
3,"Non-Robust, alpha = 0.9",-1,1,1,1,1,0,-1,-1,-1,-1,-1
4,"Non-Robust, alpha = 0.95",1,1,1,1,1,0,-1,-1,-1,-1,-1
5,"Non-Robust, alpha = 0.99",1,1,1,1,1,0,-1,-1,-1,-1,-1


In [10]:
Nr_iter = 10_000
alpha   = 0.95

Q_opt_nonrobust_epsilon0    = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy = 0.1, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_epsilon001  = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy = 0.01, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_epsilon005  = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy = 0.05, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_epsilon01   = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy = 0.1, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))
Q_opt_nonrobust_epsilon05   = q_learning(X, A, r, P_0, alpha, x_0, eps_greedy = 0.5, Nr_iter = Nr_iter, gamma_t_tilde = lambda t: 1/(t+1), Q_0 = np.ones([len(X),len(A)]))

100%|██████████| 10000/10000 [00:05<00:00, 1821.66it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1973.84it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1568.28it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1751.16it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1825.83it/s]


In [11]:
df = pd.DataFrame(np.array([[a_opt_nonrobust(x, Q_opt_nonrobust_epsilon0) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_epsilon001) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_epsilon005) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_epsilon01) for x in X],
                            [a_opt_nonrobust(x, Q_opt_nonrobust_epsilon05) for x in X]]))
df["State"]=["Non-Robust, epsilon_greedy = 0","Non-Robust, epsilon_greedy = 0.01", "Non-Robust, epsilon_greedy = 0.05", "Non-Robust, epsilon_greedy = 0.1", "Non-Robust, epsilon_greedy = 0.5"]
df = df.set_index("State").reset_index()
df

Unnamed: 0,State,0,1,2,3,4,5,6,7,8,9,10
0,"Non-Robust, epsilon_greedy = 0",0,1,1,1,1,0,-1,-1,-1,-1,-1
1,"Non-Robust, epsilon_greedy = 0.01",0,0,1,1,1,0,0,0,-1,-1,-1
2,"Non-Robust, epsilon_greedy = 0.05",-1,1,1,1,1,0,-1,-1,-1,-1,-1
3,"Non-Robust, epsilon_greedy = 0.1",0,1,1,1,1,0,-1,-1,-1,-1,-1
4,"Non-Robust, epsilon_greedy = 0.5",0,1,1,1,1,0,-1,-1,-1,-1,-1
