In [None]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [None]:
# Fixed seed so that a Restart & Run All reproduces the same draws.
# The notebook draws from three sources: this Generator (rng.choice),
# NumPy's legacy global state (np.random.uniform) and, possibly, the
# stdlib `random` module imported above -- all three are seeded here.
SEED = 0
rng = np.random.default_rng(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
from robust_q_learning_v2 import *
from q_learning import *

In [None]:
# State space: every pair (first, second) with both components on the grid
# {0, 5, 10}, enumerated with the first component varying slowest.
_levels = (0, 5, 10)
X = np.array([(first, second) for first in _levels for second in _levels])

# Action space
A = np.array([0, 5, 10])

In [None]:
# Index helpers: map an action a (resp. a state x) to its position in A (resp. X).
# A one-dimensional space is lifted to one single-element row per entry so that
# both lookups can compare complete rows.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])


def a_index(a):
    """Return the position of the first row of A_list equal to a.

    Raises IndexError when no row matches.
    """
    row_hits = (a == A_list).all(axis=1)
    return np.flatnonzero(row_hits)[0]


def x_index(x):
    """Return the position of the first row of X_list equal to x.

    Raises IndexError when no row matches.
    """
    row_hits = (x == X_list).all(axis=1)
    return np.flatnonzero(row_hits)[0]

In [None]:
phi_price = 1


def r(x, a, y):
    """Reward function.

    Pays phi_price * a when the action a does not exceed the second component
    of x, and charges ten times that amount otherwise. The successor y does
    not enter the value; it is only unpacked into two components.
    """
    _xj, xa = x
    _yj, _ya = y
    leftover = xa - a
    gain = phi_price * a * (leftover >= 0)
    penalty = phi_price * a * 10 * (leftover < 0)
    return gain - penalty


# Epsilon greedy policy (the same value is also the random-branch threshold
# inside the transition kernels)
eps_greedy = 0.1
# Discount factor
alpha = 0.95
# Initial value
x_0 = (10, 10)
# Initial index of the corresponding MDP, starting with the central proba of 1/2
k_0 = 0

In [None]:
# Closed-system transition kernels P1..P6.
# All six apply the same update rule and differ only in the module-level
# rates (rrK, drK, grK); the shared rule lives in _closed_system_step.
def _closed_system_step(x, a, rr, dr, gr):
    """Draw one successor state for the closed system.

    Parameters
    ----------
    x : pair (xj, xa)
        Current state.
    a : number
        Action; it is subtracted from the second state component.
    rr, dr, gr : numbers
        Rates entering the update of the two components (see the body).

    Returns
    -------
    The pair (yj, ya), the pair (0, 0) when the update is infeasible, or one
    row of X when the random branch is taken.
    """
    # A single uniform draw on [0, 1) selects between the structured move and
    # a jump to a uniformly chosen row of X.
    unif = np.random.uniform(0.0, 1.0)
    if unif <= eps_greedy:
        return rng.choice(X)

    xj, xa = x
    xa_ = xa - a  # second component once the action has been applied
    yj = min(rr * xa_, 10)
    ya = min(max(xa_ - 5 * dr + gr * xj, 0), 10)
    # Boolean `and`, not bitwise `&`: `&` binds tighter than `>=`, so
    # `xa_>=0 & yj>=0 & ya>=0` parses as a chained comparison that only ever
    # tests xa_. For non-negative rates the three checks agree with the
    # single test `xa_ >= 0`.
    if xa_ >= 0 and yj >= 0 and ya >= 0:
        return (yj, ya)
    return (0, 0)


# First probability
rr1 = 1
dr1 = 0
gr1 = 1
def P1(x, a):
    """Closed-system kernel with rates (rr1, dr1, gr1)."""
    return _closed_system_step(x, a, rr1, dr1, gr1)


# Other probabilities
rr2 = 2
dr2 = 0
gr2 = 1
def P2(x, a):
    """Closed-system kernel with rates (rr2, dr2, gr2)."""
    return _closed_system_step(x, a, rr2, dr2, gr2)


rr3 = 1
dr3 = 1
gr3 = 1
def P3(x, a):
    """Closed-system kernel with rates (rr3, dr3, gr3)."""
    return _closed_system_step(x, a, rr3, dr3, gr3)


rr4 = 2
dr4 = 1
gr4 = 1
def P4(x, a):
    """Closed-system kernel with rates (rr4, dr4, gr4)."""
    return _closed_system_step(x, a, rr4, dr4, gr4)


rr5 = 1
dr5 = 2
gr5 = 1
def P5(x, a):
    """Closed-system kernel with rates (rr5, dr5, gr5)."""
    return _closed_system_step(x, a, rr5, dr5, gr5)


rr6 = 2
dr6 = 2
gr6 = 1
def P6(x, a):
    """Closed-system kernel with rates (rr6, dr6, gr6)."""
    return _closed_system_step(x, a, rr6, dr6, gr6)

# CREATE THE PROBABILITY MEASURE OUT OF THE RANDOM VARIABLE
# Empirical transition tables for P1..P6: entry [i, j, k] of each table is the
# relative frequency with which the kernel moved from state X[i] to state X[k]
# under action A[j].
nr = 1_000  # draws per (state, action) pair and per kernel

_closed_kernels = (P1, P2, P3, P4, P5, P6)
_closed_counts = [np.zeros([len(X), len(A), len(X)]) for _ in _closed_kernels]

for n in range(nr):
    for x in X:
        for a in A:
            # The kernels are sampled in the fixed order P1, ..., P6 for each pair.
            for _tally, _kernel in zip(_closed_counts, _closed_kernels):
                _tally[x_index(x), a_index(a), x_index(_kernel(x, a))] += 1

# Turn counts into frequencies for every table. p1_ must be normalised under
# its own name: binding the result to any other name would leave p1_ holding
# raw counts, and p1(x, a, y) would then not return a probability.
p1_, p2_, p3_, p4_, p5_, p6_ = (_tally / nr for _tally in _closed_counts)
def p1(x, a, y):
    """Entry of the table p1_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p1_[cell]


def p2(x, a, y):
    """Entry of the table p2_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p2_[cell]


def p3(x, a, y):
    """Entry of the table p3_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p3_[cell]


def p4(x, a, y):
    """Entry of the table p4_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p4_[cell]


def p5(x, a, y):
    """Entry of the table p5_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p5_[cell]


def p6(x, a, y):
    """Entry of the table p6_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p6_[cell]

In [None]:
# Q-learning against the single kernel P1, started from an all-ones Q-table.
# NOTE(review): q_learning is star-imported from a project module; the meaning
# of each argument follows its signature, which is not visible here.
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P1, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)

In [None]:
# Read the greedy action off a Q-table, one state at a time.
def a_opt(x, Q_opt):
    """Return the element of A whose column holds the largest value in the row of Q_opt for state x."""
    q_row = Q_opt[x_index(x), :]
    best_column = np.argmax(q_row)
    return A[best_column]

In [None]:
# One-row table: the greedy action of Q_0_ for each state, columns in the order of X.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# One action per state, in the row order of X (the evaluation loop looks the
# action up with x_index).
# NOTE(review): presumably transcribed from the table shown for the P1 run -- confirm.
a_P1 = [
    0, 0, 5,     # states (0,0), (0,5), (0,10)
    0, 0, 5,     # states (5,0), (5,5), (5,10)
    0, 5, 10,    # states (10,0), (10,5), (10,10)
]

In [None]:
# Robust Q-learning over the closed-system kernels P1..P6, followed by a
# one-row table of the resulting greedy action per state.
# NOTE(review): robust_q_learning_v2 is star-imported from a project module;
# argument meanings follow its signature, which is not visible here.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2, P3, P4, P5, P6]),
    np.array([p1, p2, p3, p4, p5, p6]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# One action per state, in the row order of X (the evaluation loop looks the
# action up with x_index).
# NOTE(review): presumably transcribed from the table shown for the robust P1..P6 run -- confirm.
a_robust_P1P6 = [
    0, 5, 10,    # states (0,0), (0,5), (0,10)
    0, 5, 10,    # states (5,0), (5,5), (5,10)
    0, 5, 10,    # states (10,0), (10,5), (10,10)
]

In [None]:
# Open-system transition kernels P7..P18.
# They reuse the closed-system rates (rrK, drK, grK), K = 1..6, and add an
# inflow term to the first component: P7..P12 use the multiplier nr1,
# P13..P18 use nr2. The shared update rule lives in _open_system_step.
def _open_system_step(x, a, rr, dr, gr, nr_factor):
    """Draw one successor state for the open system.

    Parameters
    ----------
    x : pair (xj, xa)
        Current state.
    a : number
        Action; it is subtracted from the second state component.
    rr, dr, gr : numbers
        Rates entering the update of the two components (see the body).
    nr_factor : number
        Multiplier of the inflow; nr_factor * 5 is added to the first
        component whenever the uniform draw exceeds eps_greedy.

    Returns
    -------
    The pair (yj, ya), or (0, 0) when the update is infeasible.
    """
    unif = np.random.uniform(0.0, 1.0)
    xj, xa = x
    xa_ = xa - a  # second component once the action has been applied
    inflow = (nr_factor * 5) * (unif > eps_greedy)
    yj = min(rr * xa_ + inflow, 10)
    ya = min(max(xa_ - 5 * dr + gr * xj, 0), 10)
    # Boolean `and`, not bitwise `&`: `&` binds tighter than `>=`, so
    # `xa_>=0 & yj>=0 & ya>=0` parses as a chained comparison that only ever
    # tests xa_. For non-negative rates the three checks agree with the
    # single test `xa_ >= 0`.
    if xa_ >= 0 and yj >= 0 and ya >= 0:
        return (yj, ya)
    return (0, 0)


# Open system, first probability
nr1 = 1
def P7(x, a):
    """Open-system kernel with rates (rr1, dr1, gr1) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr1, dr1, gr1, nr1)


# Other probabilities
def P8(x, a):
    """Open-system kernel with rates (rr2, dr2, gr2) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr2, dr2, gr2, nr1)


def P9(x, a):
    """Open-system kernel with rates (rr3, dr3, gr3) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr3, dr3, gr3, nr1)


def P10(x, a):
    """Open-system kernel with rates (rr4, dr4, gr4) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr4, dr4, gr4, nr1)


def P11(x, a):
    """Open-system kernel with rates (rr5, dr5, gr5) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr5, dr5, gr5, nr1)


def P12(x, a):
    """Open-system kernel with rates (rr6, dr6, gr6) and inflow multiplier nr1."""
    return _open_system_step(x, a, rr6, dr6, gr6, nr1)


nr2 = 2
def P13(x, a):
    """Open-system kernel with rates (rr1, dr1, gr1) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr1, dr1, gr1, nr2)


# Other probabilities
def P14(x, a):
    """Open-system kernel with rates (rr2, dr2, gr2) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr2, dr2, gr2, nr2)


def P15(x, a):
    """Open-system kernel with rates (rr3, dr3, gr3) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr3, dr3, gr3, nr2)


def P16(x, a):
    """Open-system kernel with rates (rr4, dr4, gr4) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr4, dr4, gr4, nr2)


def P17(x, a):
    """Open-system kernel with rates (rr5, dr5, gr5) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr5, dr5, gr5, nr2)


def P18(x, a):
    """Open-system kernel with rates (rr6, dr6, gr6) and inflow multiplier nr2."""
    return _open_system_step(x, a, rr6, dr6, gr6, nr2)

# CREATE THE PROBABILITY MEASURE OUT OF THE RANDOM VARIABLE
# Empirical transition tables for P7..P18: entry [i, j, k] of each table is the
# relative frequency with which the kernel moved from state X[i] to state X[k]
# under action A[j].
nr = 1_000  # draws per (state, action) pair and per kernel

_open_kernels = (P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18)
_open_counts = [np.zeros([len(X), len(A), len(X)]) for _ in _open_kernels]

for n in range(nr):
    for x in X:
        for a in A:
            # The kernels are sampled in the fixed order P7, ..., P18 for each pair.
            for _tally, _kernel in zip(_open_counts, _open_kernels):
                _tally[x_index(x), a_index(a), x_index(_kernel(x, a))] += 1

# Counts -> relative frequencies, one table per kernel.
(p7_, p8_, p9_, p10_, p11_, p12_,
 p13_, p14_, p15_, p16_, p17_, p18_) = (_tally / nr for _tally in _open_counts)
def p7(x, a, y):
    """Entry of the table p7_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p7_[cell]


def p8(x, a, y):
    """Entry of the table p8_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p8_[cell]


def p9(x, a, y):
    """Entry of the table p9_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p9_[cell]


def p10(x, a, y):
    """Entry of the table p10_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p10_[cell]


def p11(x, a, y):
    """Entry of the table p11_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p11_[cell]


def p12(x, a, y):
    """Entry of the table p12_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p12_[cell]


def p13(x, a, y):
    """Entry of the table p13_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p13_[cell]


def p14(x, a, y):
    """Entry of the table p14_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p14_[cell]


def p15(x, a, y):
    """Entry of the table p15_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p15_[cell]


def p16(x, a, y):
    """Entry of the table p16_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p16_[cell]


def p17(x, a, y):
    """Entry of the table p17_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p17_[cell]


def p18(x, a, y):
    """Entry of the table p18_ for the move from state x to state y under action a."""
    cell = (x_index(x), a_index(a), x_index(y))
    return p18_[cell]

In [None]:
# Q-learning against the single open-system kernel P7, started from an
# all-ones Q-table. This rebinds Q_0_ and V from the earlier P1 run.
# NOTE(review): q_learning is star-imported from a project module; the meaning
# of each argument follows its signature, which is not visible here.
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P7, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)

In [None]:
# One-row table: the greedy action of Q_0_ for each state, columns in the order of X.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# One action per state, in the row order of X (the evaluation loop looks the
# action up with x_index).
# NOTE(review): presumably transcribed from the table shown for the P7 run -- confirm.
a_P7 = [
    0, 0, 5,     # states (0,0), (0,5), (0,10)
    0, 0, 5,     # states (5,0), (5,5), (5,10)
    0, 5, 10,    # states (10,0), (10,5), (10,10)
]

In [None]:
# Robust Q-learning over the open-system kernels P7..P12, followed by a
# one-row table of the resulting greedy action per state.
# NOTE(review): robust_q_learning_v2 is star-imported from a project module;
# argument meanings follow its signature, which is not visible here.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P7, P8, P9, P10, P11, P12]),
    np.array([p7, p8, p9, p10, p11, p12]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# One action per state, in the row order of X (the evaluation loop looks the
# action up with x_index).
# NOTE(review): presumably transcribed from the table shown for the robust P7..P12 run -- confirm.
a_P7P12 = [
    0, 5, 5,     # states (0,0), (0,5), (0,10)
    0, 5, 10,    # states (5,0), (5,5), (5,10)
    0, 5, 5,     # states (10,0), (10,5), (10,10)
]

In [None]:
# Robust Q-learning over all twelve open-system kernels P7..P18, followed by a
# one-row table of the resulting greedy action per state.
# NOTE(review): robust_q_learning_v2 is star-imported from a project module;
# argument meanings follow its signature, which is not visible here.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18]),
    np.array([p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# Policy for the open-system ambiguity set P7..P18: one action per state, in
# the row order of X. No recorded values exist for this run, so the policy is
# read directly off Q_opt_robust as left by the preceding training cell (it
# must be captured here, before the next training cell rebinds Q_opt_robust).
# NOTE(review): replace with a hard-coded list if a specific run is to be pinned.
a_open = [a_opt(x, Q_opt_robust) for x in X]

In [None]:
# Robust Q-learning over all eighteen kernels P1..P18 (closed and open
# system), followed by a one-row table of the resulting greedy action per state.
# NOTE(review): robust_q_learning_v2 is star-imported from a project module;
# argument meanings follow its signature, which is not visible here.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2, P3, P4, P5, P6,
              P7, P8, P9, P10, P11, P12,
              P13, P14, P15, P16, P17, P18]),
    np.array([p1, p2, p3, p4, p5, p6,
              p7, p8, p9, p10, p11, p12,
              p13, p14, p15, p16, p17, p18]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1/(t+1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# Policy for the full ambiguity set P1..P18: one action per state, in the row
# order of X. No recorded values exist for this run, so the policy is read
# directly off Q_opt_robust as left by the preceding training cell.
# NOTE(review): replace with a hard-coded list if a specific run is to be pinned.
a_all = [a_opt(x, Q_opt_robust) for x in X]

In [None]:
# Policies to compare, each a list with one action per state in the row order of X.
policies = [
    a_P1,
    a_robust_P1P6,
    a_P7,
    a_P7P12,
    a_open,
    a_all,
]

In [None]:
# Every transition kernel defined in this notebook, in numbering order.
P_ = [
    P1, P2, P3, P4, P5, P6,            # closed system
    P7, P8, P9, P10, P11, P12,         # open system, inflow multiplier nr1
    P13, P14, P15, P16, P17, P18,      # open system, inflow multiplier nr2
]

In [None]:
# Number of simulated steps per (policy, kernel) pair in the evaluation loop
# that follows; this rebinds the Nr_iter = 1_000_000 used by the training cells.
Nr_iter = 100_000

In [None]:
def _cumulative_reward(policy, kernel, n_steps):
    """Sum of r over n_steps simulated transitions.

    The trajectory starts in x_0; at every step the action is policy[x_index(state)]
    and the successor is drawn from kernel(state, action). The sum is undiscounted.
    """
    total = 0
    state = x_0
    for _step in range(n_steps):
        action = policy[x_index(state)]
        successor = kernel(state, action)
        total += r(state, action, successor)
        state = successor
    return total


# CR[i, j]: cumulative reward of policies[i] simulated under kernel P_[j].
# Kernels vary fastest, so the simulations run in the order (policy, kernel).
CR = np.array(
    [[_cumulative_reward(a_tilde, P, Nr_iter) for P in P_] for a_tilde in policies]
)