This notebook models the impact of fishing quotas on a fish population while taking food availability into account.

Import all useful libraries

In [1]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

Import the Q-learning functions (standard and robust)

In [2]:
from robust_q_learning_v3 import *
from q_learning_v2 import *

First simplified case: one species, two size classes

In [3]:
X = np.array([(0, 0), (0, 5), (5, 0), (5, 5)])  # States: pairs (x1, x2), one entry per size class
A = np.array([0, 5])  # Actions: amount taken out of the second class

c1 = 1 / 5  # Reward per unit of a feasible catch


def r(x, a, y):
    """Reward for applying action ``a`` in state ``x`` and reaching state ``y``.

    A feasible catch (``a`` not larger than the second component of ``x``)
    earns ``c1 * a``; an infeasible one is penalised with ``-10``.
    ``y`` must be a pair but does not influence the value.
    """
    _first, second = x
    _next_first, _next_second = y  # unpacked only to keep the 2-component contract
    remaining = second - a
    gain = c1 * (remaining >= 0) * a
    penalty = 10 * (remaining < 0)
    return gain - penalty

# Coefficients read by P (presumably reproduction, death and growth rates plus a
# food/production term -- TODO confirm the naming against the model description).
rr, dr, gr, npp = 1, 0, 1, 0


def P(x, a):
    """Deterministic one-step transition of the two-class population.

    Parameters
    ----------
    x : pair (x1, x2)
        Current state.
    a : number
        Amount removed from the second class before the update.

    Returns
    -------
    tuple
        Next state ``(y1, y2)``; each component is capped at 5.
    """
    x1, x2 = x
    left_over = max(x2 - a, 0)  # the second class cannot become negative
    next_first = min(rr * left_over + npp * 5, 5)
    next_second = min(left_over - 5 * dr + gr * x1, 5)
    return (next_first, next_second)
    
eps1 = 0.1  # probability of the alternative food term npp2
npp2 = 1    # food term used with probability eps1 (npp is used otherwise)


def P2(x, a):
    """Random one-step transition: like the deterministic model, but the food
    term is ``npp2`` with probability ``eps1`` and ``npp`` otherwise.

    Draws exactly one uniform number on [0, 1) per call and returns the next
    state ``(y1, y2)``, each component capped at 5.
    """
    x1, x2 = x
    draw = np.random.uniform(0)  # low=0 with the default high=1.0
    left_over = max(x2 - a, 0)
    food = npp2 if draw <= eps1 else npp
    next_first = min(rr * left_over + food * 5, 5)
    next_second = min(left_over - 5 * dr + gr * x1, 5)
    return (next_first, next_second)

alpha = 0.95      # discount factor
x_0 = (5, 5)      # initial state
eps_greedy = 0.1  # epsilon of the epsilon-greedy policy

In [4]:
# Index helpers: map an action a (resp. a state x) to its row position in A (resp. X).
# One-dimensional inputs are wrapped into length-1 rows so both lookups compare row-wise.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])


def a_index(a):
    """Position of the first row of A_list that equals ``a``."""
    row_matches = (a == A_list).all(1)
    return np.flatnonzero(row_matches)[0]


def x_index(x):
    """Position of the first row of X_list that equals ``x``."""
    row_matches = (x == X_list).all(1)
    return np.flatnonzero(row_matches)[0]

First non-robust runs

In [5]:
# Q-learning on the deterministic kernel P, started from the current x_0.
Nr_iter = 1_000_000

Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

100%|██████████| 1000000/1000000 [00:29<00:00, 33957.25it/s]


In [6]:
# Read the greedy policy off a learned Q-table.
def a_opt(x, Q_opt):
    """Return the action of A with the largest Q-value in state ``x``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal entry wins.
    """
    q_row = Q_opt[x_index(x), :]
    best = np.argmax(q_row)
    return A[best]

In [7]:
# One-row table: greedy action for every state.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [8]:
# Second value returned by q_learning. In the recorded output its entries sum to
# Nr_iter, so it looks like per-(state, action) visit counts -- confirm against q_learning.
V

array([[368582.,  19347.],
       [281115.,  14926.],
       [ 23875.,   1234.],
       [ 14641., 276280.]])

In [9]:
# Q-learning on kernel P from the start state (5, 0), followed by the greedy-action table.
x_0 = (5, 0)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:29<00:00, 34120.37it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [10]:
# Q-learning on kernel P from the start state (0, 5), followed by the greedy-action table.
x_0 = (0, 5)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:29<00:00, 33498.05it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [11]:
# Q-learning on kernel P from the start state (0, 0), followed by the greedy-action table.
x_0 = (0, 0)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:29<00:00, 33794.55it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [12]:
# Q-learning on the random kernel P2 from the start state (0, 0), then the greedy-action table.
x_0 = (0, 0)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P2, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:34<00:00, 29259.13it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [13]:
# Q-learning on the random kernel P2 from the start state (0, 5), then the greedy-action table.
x_0 = (0, 5)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P2, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:35<00:00, 28081.01it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [14]:
# Q-learning on the random kernel P2 from the start state (5, 0), then the greedy-action table.
x_0 = (5, 0)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P2, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:35<00:00, 27895.35it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [15]:
# Q-learning on the random kernel P2 from the start state (5, 5), then the greedy-action table.
x_0 = (5, 5)
Nr_iter = 1_000_000
Q_0_, V = q_learning(
    X, A, r, P2, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [00:36<00:00, 27560.44it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


Checking the previous results against the robust Q-learning function

In [16]:
X = np.array([(0,0), (0,5), (5,0), (5,5)]) # States
A = np.array([0, 5]) # Actions

c1 = 1 / 5 # Reward per unit of a feasible catch

def r(x,a,y):
    """Reward for applying action ``a`` in state ``x`` and reaching ``y``.

    A feasible catch (``a`` not larger than the second component of ``x``)
    earns ``c1 * a``; an infeasible one costs 10. ``y`` must be a pair but
    does not influence the value.
    """
    x1, x2 = x
    y1, y2 = y
    # The penalty is 10, as in the reward of the non-robust runs this section is
    # compared against (it was 1 here, so the two experiments optimised
    # different objectives).
    return(c1 * (x2 - a >= 0) * a - 10 * (x2 - a < 0)) # Reward function

# Coefficients read by P1 and P2 (presumably reproduction, death and growth rates
# plus a food/production term -- TODO confirm the naming).
rr, dr, gr, npp = 1, 0, 1, 0


def P1(x, a):
    """Deterministic one-step transition of the two-class population.

    ``a`` is removed from the second component of ``x`` (floored at 0) and the
    next state ``(y1, y2)`` is returned with each component capped at 5.
    """
    x1, x2 = x
    left_over = max(x2 - a, 0)
    next_first = min(rr * left_over + npp * 5, 5)
    next_second = min(left_over - 5 * dr + gr * x1, 5)
    return (next_first, next_second)
    
eps1 = 0.1  # probability of the alternative food term npp2
npp2 = 1    # food term used with probability eps1 (npp is used otherwise)


def P2(x, a):
    """Random one-step transition: the food term is ``npp2`` with probability
    ``eps1`` and ``npp`` otherwise; everything else matches the deterministic model.

    Draws exactly one uniform number on [0, 1) per call and returns the next
    state ``(y1, y2)``, each component capped at 5.
    """
    x1, x2 = x
    draw = np.random.uniform(0)  # low=0 with the default high=1.0
    left_over = max(x2 - a, 0)
    food = npp2 if draw <= eps1 else npp
    next_first = min(rr * left_over + food * 5, 5)
    next_second = min(left_over - 5 * dr + gr * x1, 5)
    return (next_first, next_second)

    
# Estimate the transition probabilities of P1 and P2 by repeated sampling:
# entry [i, j, k] is the empirical frequency of reaching state k from state i under action j.
nr = 1_000
p1_ = np.zeros([len(X), len(A), len(X)])
p2_ = np.zeros([len(X), len(A), len(X)])
for n in range(nr):
    for x in X:
        for a in A:
            src, act = x_index(x), a_index(a)
            # P1 is sampled before P2, so P2's random draws are consumed in the same order.
            p1_[src, act, x_index(P1(x, a))] += 1
            p2_[src, act, x_index(P2(x, a))] += 1
p1_ /= nr
p2_ /= nr


def p1(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a`` for kernel P1."""
    return p1_[x_index(x), a_index(a), x_index(y)]


def p2(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a`` for kernel P2."""
    return p2_[x_index(x), a_index(a), x_index(y)]

alpha = 0.95      # discount factor
x_0 = (5, 5)      # initial state
k_0 = 0           # initial index of the corresponding MDP
eps_greedy = 0.1  # epsilon of the epsilon-greedy policy

In [17]:
# Robust Q-learning over the kernels {P1, P2}, started from the current x_0.
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2]), np.array([p1, p2]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

100%|██████████| 1000000/1000000 [03:26<00:00, 4840.34it/s]


In [18]:
# One-row table: greedy action for every state under the robust Q-table.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [19]:
# Second value returned by robust_q_learning_v2. The recorded entries are integer
# totals, so it is presumably a per-(state, action) visit count -- confirm against the module.
V

array([[368525.,  19456.],
       [281248.,  14939.],
       [ 23344.,   1297.],
       [ 14703., 276488.]])

In [20]:
# Robust Q-learning over {P1, P2} from the start state (5, 0), then the greedy-action table.
x_0 = (5, 0)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2]), np.array([p1, p2]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [03:31<00:00, 4721.21it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [21]:
# Robust Q-learning over {P1, P2} from the start state (0, 5), then the greedy-action table.
x_0 = (0, 5)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2]), np.array([p1, p2]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [03:28<00:00, 4802.79it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [22]:
# Robust Q-learning over {P1, P2} from the start state (0, 0), then the greedy-action table.
x_0 = (0, 0)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2]), np.array([p1, p2]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [03:26<00:00, 4836.57it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [23]:
# Robust Q-learning with the single kernel P1 from the start state (0, 0), then the greedy-action table.
x_0 = (0, 0)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1]), np.array([p1]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [01:52<00:00, 8865.04it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [24]:
# Robust Q-learning with the single kernel P1 from the start state (0, 5), then the greedy-action table.
x_0 = (0, 5)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1]), np.array([p1]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [01:55<00:00, 8631.77it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [25]:
# Robust Q-learning with the single kernel P1 from the start state (5, 0), then the greedy-action table.
x_0 = (5, 0)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1]), np.array([p1]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [01:55<00:00, 8688.54it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


In [26]:
# Robust Q-learning with the single kernel P1 from the start state (5, 5), then the greedy-action table.
x_0 = (5, 5)
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1]), np.array([p1]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(5,0)', '(5,5)'],
)
df

100%|██████████| 1000000/1000000 [01:52<00:00, 8893.58it/s]


Unnamed: 0,"(0,0)","(0,5)","(5,0)","(5,5)"
0,0,0,0,5


Let's now add more possible states

In [27]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [28]:
from robust_q_learning_v2 import *
from q_learning import *

In [29]:
# States: every pair (x1, x2) with both components in {0, 5, 10}, first component varying slowest.
X = np.array([(first, second) for first in (0, 5, 10) for second in (0, 5, 10)])
# Actions: amount taken out of the second component.
A = np.array([0, 5, 10])

In [30]:
# Index helpers: map an action a (resp. a state x) to its row position in A (resp. X).
# One-dimensional inputs are wrapped into length-1 rows so both lookups compare row-wise.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])


def a_index(a):
    """Position of the first row of A_list that equals ``a``."""
    row_matches = (a == A_list).all(1)
    return np.flatnonzero(row_matches)[0]


def x_index(x):
    """Position of the first row of X_list that equals ``x``."""
    row_matches = (x == X_list).all(1)
    return np.flatnonzero(row_matches)[0]

In [31]:
c1 = 1 / 10  # Reward per unit of a feasible catch


def r(x, a, y):
    """Reward for applying action ``a`` in state ``x`` and reaching state ``y``.

    A feasible catch (``a`` not larger than the second component of ``x``)
    earns ``c1 * a``; an infeasible one is penalised with ``-10``.
    ``y`` must be a pair but does not influence the value.
    """
    _first, second = x
    _next_first, _next_second = y  # unpacked only to keep the 2-component contract
    remaining = second - a
    gain = c1 * (remaining >= 0) * a
    penalty = 10 * (remaining < 0)
    return gain - penalty

# Coefficients read by P (presumably reproduction, death and growth rates plus a
# food/production term -- TODO confirm the naming against the model description).
rr, dr, gr, npp = 1, 0, 1, 0


def P(x, a):
    """Deterministic one-step transition of the two-class population.

    ``a`` is removed from the second component of ``x`` (floored at 0) and the
    next state ``(y1, y2)`` is returned with each component capped at 10.
    """
    x1, x2 = x
    left_over = max(x2 - a, 0)
    next_first = min(rr * left_over + npp * 5, 10)
    next_second = min(left_over - 5 * dr + gr * x1, 10)
    return (next_first, next_second)

# Estimate the transition probabilities of P by repeated sampling:
# p_[i, j, k] is the empirical frequency of reaching state k from state i under action j.
nr = 1_000
p_ = np.zeros([len(X), len(A), len(X)])
for n in range(nr):
    for x in X:
        for a in A:
            p_[x_index(x), a_index(a), x_index(P(x, a))] += 1
p_ /= nr


def p(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a``."""
    return p_[x_index(x), a_index(a), x_index(y)]

alpha = 0.95      # discount factor
x_0 = (10, 10)    # initial state
# Initial index of the corresponding MDP. (The earlier note about "the central proba
# of 1/2" has no visible counterpart in this cell -- confirm whether it still applies.)
k_0 = 0
eps_greedy = 0.1  # epsilon of the epsilon-greedy policy

In [32]:
# Q-learning on the deterministic kernel P, started from the current x_0.
Nr_iter = 1_000_000

Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

100%|██████████| 1000000/1000000 [00:31<00:00, 31260.75it/s]


In [33]:
# Read the greedy policy off a learned Q-table.
def a_opt(x, Q_opt):
    """Return the action of A with the largest Q-value in state ``x``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal entry wins.
    """
    q_row = Q_opt[x_index(x), :]
    best = np.argmax(q_row)
    return A[best]

In [34]:
# One-row table: greedy action for every state.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_0_) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

Unnamed: 0,"(0,0)","(0,5)","(0,10)","(5,0)","(5,5)","(5,10)","(10,0)","(10,5)","(10,10)"
0,0,5,0,0,0,10,0,0,5


In [35]:
# Second value returned by q_learning (presumably visit counts -- confirm).
# NOTE(review): in the recorded output almost every count sits in the first row, so the
# other states were hardly visited and their greedy actions above are not reliable.
V

array([[9.33399e+05, 3.31290e+04, 3.34640e+04],
       [1.00000e+00, 1.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00, 2.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 2.00000e+00, 0.00000e+00]])

In [36]:
# Robust Q-learning with the single kernel P, started from the current x_0.
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P]), np.array([p]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

100%|██████████| 1000000/1000000 [03:42<00:00, 4485.84it/s]


In [37]:
# One-row table: greedy action for every state under the robust Q-table.
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

Unnamed: 0,"(0,0)","(0,5)","(0,10)","(5,0)","(5,5)","(5,10)","(10,0)","(10,5)","(10,10)"
0,0,0,0,0,0,0,0,0,10


In [38]:
# Second value returned by robust_q_learning_v2 (presumably visit counts -- confirm).
# NOTE(review): as in the non-robust run, the recorded counts are concentrated in the first row.
V

array([[9.33185e+05, 3.33610e+04, 3.34270e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [3.00000e+00, 0.00000e+00, 1.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [9.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [1.00000e+00, 9.00000e+00, 4.00000e+00]])

In [39]:
# Q-table returned by the non-robust run. Q_0 was initialised to ones, and in the recorded
# output several rows are still exactly 1, i.e. those states were never updated.
Q_0_

array([[ 0.48758131, -9.5124992 , -9.51217237],
       [ 0.95      ,  1.45      ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.8525    ,  1.        ,  1.        ],
       [ 1.3775    ,  1.        ,  1.95      ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.6293125 ,  1.        ]])

In [40]:
# Q-table returned by the robust run; rows still equal to the initial value 1 were never updated.
Q_opt_robust

array([[ 0.4875869 , -9.5123191 , -9.51231086],
       [ 1.        ,  1.        ,  1.        ],
       [ 2.17099178,  1.        ,  1.95      ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.68061822,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ],
       [ 0.95      ,  1.91833269,  2.67873006]])

In [41]:
c1 = 1 / 10  # Reward per unit of a feasible catch


def r(x, a, y):
    """Reward for applying action ``a`` in state ``x`` and reaching state ``y``.

    A feasible catch (``a`` not larger than the second component of ``x``)
    earns ``c1 * a``; an infeasible one is penalised with ``-10``.
    ``y`` must be a pair but does not influence the value.
    """
    _first, second = x
    _next_first, _next_second = y  # unpacked only to keep the 2-component contract
    remaining = second - a
    gain = c1 * (remaining >= 0) * a
    penalty = 10 * (remaining < 0)
    return gain - penalty

# Coefficients shared by the transition models of this cell (presumably reproduction,
# death and growth rates plus a food/production term -- TODO confirm the naming).
rr, dr, gr, npp = 1, 0, 1, 0


def P1(x, a):
    """Deterministic one-step transition of the two-class population.

    ``a`` is removed from the second component of ``x`` (floored at 0) and the
    next state ``(y1, y2)`` is returned with each component capped at 10.
    """
    x1, x2 = x
    left_over = max(x2 - a, 0)
    next_first = min(rr * left_over + npp * 5, 10)
    next_second = min(left_over - 5 * dr + gr * x1, 10)
    return (next_first, next_second)

eps1 = 0.1  # probability of the alternative food term npp2
eps2 = 0.2  # defined here but not read by any function in this cell
npp2 = 1    # food term used with probability eps1 (npp is used otherwise)


def P2(x, a):
    """Random one-step transition: the food term is ``npp2`` with probability
    ``eps1`` and ``npp`` otherwise; everything else matches the deterministic model.

    Draws exactly one uniform number on [0, 1) per call and returns the next
    state ``(y1, y2)``, each component capped at 10.
    """
    x1, x2 = x
    draw = np.random.uniform(0)  # low=0 with the default high=1.0
    left_over = max(x2 - a, 0)
    food = npp2 if draw <= eps1 else npp
    next_first = min(rr * left_over + food * 5, 10)
    next_second = min(left_over - 5 * dr + gr * x1, 10)
    return (next_first, next_second)

eps3 = 0.5  # probability of the alternative death term dr3
dr3  = 1    # death term used with probability eps3 (dr is used otherwise)
# Rate used by P3 only. It has its own name (like dr3) so that the shared `rr`
# read by P1 and P2 is no longer overwritten when this cell runs.
rr3  = 2
def P3(x,a):
    """Random one-step transition with a random death term.

    ``a`` is removed from the second component of ``x`` (floored at 0). The
    first component of the next state uses the rate ``rr3``; the second one
    loses ``5 * dr3`` with probability ``eps3`` and ``5 * dr`` otherwise
    (floored at 0) before the first component of ``x`` is added.
    Draws exactly one uniform number per call; both outputs are capped at 10.
    """
    x1, x2 = x
    unif = np.random.uniform(0)
    x2_  = max(x2 - a, 0)
    y1   = min(rr3 * x2_ + npp * 5 , 10)
    x2__ = max(x2_ - ((unif > eps3) * dr + (unif <= eps3) * dr3) * 5, 0)
    y2   = min(x2__ + gr * x1, 10)
    return(y1, y2)

    
# Estimate the transition probabilities of P1, P2 and P3 by repeated sampling:
# entry [i, j, k] is the empirical frequency of reaching state k from state i under action j.
nr = 1_000
p1_ = np.zeros([len(X), len(A), len(X)])
p2_ = np.zeros([len(X), len(A), len(X)])
p3_ = np.zeros([len(X), len(A), len(X)])
for n in range(nr):
    for x in X:
        for a in A:
            src, act = x_index(x), a_index(a)
            # Sampling order P1, P2, P3 is kept so the random draws are consumed as before.
            p1_[src, act, x_index(P1(x, a))] += 1
            p2_[src, act, x_index(P2(x, a))] += 1
            p3_[src, act, x_index(P3(x, a))] += 1
p1_ /= nr
p2_ /= nr
p3_ /= nr


def p1(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a`` for kernel P1."""
    return p1_[x_index(x), a_index(a), x_index(y)]


def p2(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a`` for kernel P2."""
    return p2_[x_index(x), a_index(a), x_index(y)]


def p3(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a`` for kernel P3."""
    return p3_[x_index(x), a_index(a), x_index(y)]

alpha = 0.95      # discount factor
x_0 = (10, 10)    # initial state
# Initial index of the corresponding MDP. (The earlier note about "the central proba
# of 1/2" has no visible counterpart in this cell -- confirm whether it still applies.)
k_0 = 0
eps_greedy = 0.1  # epsilon of the epsilon-greedy policy

In [42]:
# Robust Q-learning over the kernels {P1, P2}, then the greedy-action table.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2]), np.array([p1, p2]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

 37%|███▋      | 365694/1000000 [02:21<03:47, 2785.68it/s]

In [None]:
# Robust Q-learning over the kernels {P1, P3}, then the greedy-action table.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P3]), np.array([p1, p3]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

In [None]:
# Robust Q-learning over the kernels {P1, P2, P3}, then the greedy-action table.
Nr_iter = 1_000_000
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P1, P2, P3]), np.array([p1, p2, p3]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)
df = pd.DataFrame(
    np.array([[a_opt(x, Q_opt_robust) for x in X]]),
    columns=['(0,0)', '(0,5)', '(0,10)', '(5,0)', '(5,5)', '(5,10)', '(10,0)', '(10,5)', '(10,10)'],
)
df

Now let's add a second species, to model the desire to preserve a certain level of biodiversity

In [None]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [None]:
from robust_q_learning_v2 import *
from q_learning import *

In [None]:
# Species 1: catch sizes, and per-class levels of its states, take values in {0, 5}.
A_s1 = np.array([0, 5])
X_s1 = np.array([(x1, x2) for x1 in A_s1 for x2 in A_s1])  # species-1 states

# Species 2: same construction with values in {0, 5, 10}.
A_s2 = np.array([0, 5, 10])
X_s2 = np.array([(x1, x2) for x1 in A_s2 for x2 in A_s2])  # species-2 states

# Joint actions (a_s1, a_s2) and joint states (x_s1, x_s2); species 1 varies slowest.
A = np.array([(a1, a2) for a1 in A_s1 for a2 in A_s2])
X = np.array([(x1, x2) for x1 in X_s1 for x2 in X_s2])  # shape (4 * 9, 2, 2)

In [None]:
# Lookup tables used to find the index of an action a (resp. a state x) in A (resp. X).
# One-dimensional inputs are wrapped into length-1 rows; higher-dimensional ones are used as is.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])

def a_index(a):
    """Return the position of the first entry of A_list equal to ``a``.

    All trailing axes of the comparison are flattened before reducing, so an
    entry only matches when every one of its components is equal.
    Raises IndexError if ``a`` is not in A_list.
    """
    hits = (np.asarray(a) == A_list).reshape(len(A_list), -1).all(axis=1)
    return np.flatnonzero(hits)[0]
def x_index(x):
    """Return the position of the first entry of X_list equal to ``x``.

    States of the two-species model are 2x2 blocks, so X_list is 3-dimensional.
    Reducing over axis 1 alone leaves one mask value per column and makes
    ``flatnonzero`` index that (n, 2) mask instead of the states; flattening the
    trailing axes first yields exactly one boolean per state.
    Raises IndexError if ``x`` is not in X_list.
    """
    hits = (np.asarray(x) == X_list).reshape(len(X_list), -1).all(axis=1)
    return np.flatnonzero(hits)[0]

In [None]:
c1 = 2 / 5    # reward per unit of a feasible catch of species 1
c2 = 1 / 10   # reward per unit of a feasible catch of species 2
psy = 1       # weight of the biodiversity term


def r(x, a, y):
    """Reward of the two-species model.

    ``x`` and ``y`` are pairs of per-species states ``((x1, x2), (x1, x2))`` and
    ``a`` is the pair of catches ``(a_s1, a_s2)``. The value is the sum of
    - one food term per species: ``c * catch`` when the catch does not exceed
      the second component of that species' state, ``-10`` otherwise;
    - ``psy`` when both species have a positive total in the next state ``y``.
    """
    (_x1_s1, stock_s1), (_x1_s2, stock_s2) = x
    (y1_s1, y2_s1), (y1_s2, y2_s2) = y
    catch_s1, catch_s2 = a

    left_s1 = stock_s1 - catch_s1
    left_s2 = stock_s2 - catch_s2
    food_s1 = c1 * (left_s1 >= 0) * catch_s1 - 10 * (left_s1 < 0)
    food_s2 = c2 * (left_s2 >= 0) * catch_s2 - 10 * (left_s2 < 0)

    both_alive = (y1_s1 + y2_s1 > 0) * (y1_s2 + y2_s2 > 0)
    biod = psy * both_alive
    return food_s1 + food_s2 + biod

# Coefficients read by P (presumably reproduction, death and growth rates plus a
# food/production term -- TODO confirm the naming against the model description).
rr, dr, gr, npp = 1, 0, 1, 0


def P(x, a):
    """Deterministic one-step transition of the two-species model.

    ``x`` is ``((x1_s1, x2_s1), (x1_s2, x2_s2))`` and ``a`` is ``(a_s1, a_s2)``.
    Each species evolves independently with the same rule; components are
    capped at 5 for species 1 and at 10 for species 2.
    Returns the next state as a pair of pairs.
    """
    (x1_s1, x2_s1), (x1_s2, x2_s2) = x
    a_s1, a_s2 = a

    # Species 1
    left_s1 = max(x2_s1 - a_s1, 0)
    next_s1 = (
        min(rr * left_s1 + npp * 5, 5),
        min(left_s1 - 5 * dr + gr * x1_s1, 5),
    )

    # Species 2
    left_s2 = max(x2_s2 - a_s2, 0)
    next_s2 = (
        min(rr * left_s2 + npp * 5, 10),
        min(left_s2 - 5 * dr + gr * x1_s2, 10),
    )

    return (next_s1, next_s2)

# Estimate the transition probabilities of P by repeated sampling:
# p_[i, j, k] is the empirical frequency of reaching state k from state i under action j.
nr = 1_000
p_ = np.zeros([len(X), len(A), len(X)])
for n in range(nr):
    for x in X:
        for a in A:
            p_[x_index(x), a_index(a), x_index(P(x, a))] += 1
p_ /= nr


def p(x, a, y):
    """Estimated probability of moving from ``x`` to ``y`` under action ``a``."""
    return p_[x_index(x), a_index(a), x_index(y)]

alpha = 0.95                # discount factor
x_0 = ((5, 5), (10, 10))    # initial state: (species-1 state, species-2 state)
# Initial index of the corresponding MDP. (The earlier note about "the central proba
# of 1/2" has no visible counterpart in this cell -- confirm whether it still applies.)
k_0 = 0
eps_greedy = 0.1            # epsilon of the epsilon-greedy policy

In [None]:
# Q-learning on the two-species kernel P, started from the current x_0.
Nr_iter = 10_000_000

Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

In [None]:
# Read the greedy policy off a learned Q-table.
def a_opt(x, Q_opt):
    """Return the entry of A with the largest Q-value in state ``x``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal entry wins.
    """
    q_row = Q_opt[x_index(x), :]
    best = np.argmax(q_row)
    return A[best]

In [None]:
# Greedy action (a_s1, a_s2) for every state, one row per state.
Result = np.array([a_opt(x, Q_0_) for x in X])

In [None]:
# Table with one column per state (labelled by the state's string form) and one row per action component.
df = pd.DataFrame(Result.T, columns=[str(x) for x in X])
df

In [None]:
# Save the non-robust policy table as a CSV file in the current working directory.
df.to_csv("df_bio1_nonrobust.csv")

In [None]:
# Second value returned by q_learning (presumably visit counts -- confirm); worth checking
# that every state was visited before trusting the policy table.
V

In [None]:
# Robust Q-learning with the single two-species kernel P, started from the current x_0.
Q_opt_robust, V = robust_q_learning_v2(
    X, A, r,
    np.array([P]), np.array([p]),
    alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

In [None]:
# Greedy action (a_s1, a_s2) for every state under the robust Q-table, one row per state.
Result_r = np.array([a_opt(x, Q_opt_robust) for x in X])

In [None]:
# Table with one column per state (labelled by the state's string form) and one row per action component.
df = pd.DataFrame(Result_r.T, columns=[str(x) for x in X])
df

In [None]:
# Save the robust policy table as a CSV file in the current working directory.
df.to_csv("df_bio1_robust.csv")

In [None]:
# Second value returned by robust_q_learning_v2 (presumably visit counts -- confirm).
V

In [None]:
# Same non-robust experiment with ten times more iterations, followed by the policy table.
Nr_iter = 100_000_000

Q_0_, V = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde=lambda t: 1 / (t + 1),
    Q_0=np.ones([len(X), len(A)]),
)

# Greedy action (a_s1, a_s2) for every state, one row per state.
Result = np.array([a_opt(x, Q_0_) for x in X])

df = pd.DataFrame(Result.T, columns=[str(x) for x in X])
df

In [None]:
# Save the policy table of the longer non-robust run as a CSV file in the current working directory.
df.to_csv("df_bio1_nonrobust_moreit.csv")

In [None]:
# Second value returned by the longer q_learning run (presumably visit counts -- confirm).
V