The idea is to model the impact of fishing quotas on a fish population while taking food availability into account.

Import all useful libraries

In [1]:
#%load_ext autoreload
#%autoreload 2
import numpy as np
import copy 
import random
import pandas as pd
from tqdm import tqdm
from scipy.stats import binom
from scipy.optimize import minimize
import matplotlib.pyplot as plt

Import the Q-learning functions (standard and robust)

In [2]:
from robust_q_learning_modif import *
from q_learning import *

First simple settings

In [4]:
# States: every pair (x1, x2) with both components in {0, 5, 10}; the first component varies slowest
X = np.array([(x1, x2) for x1 in (0, 5, 10) for x2 in (0, 5, 10)])
# Actions
A = np.array([0, 5, 10])

# Weights of the two terms of the reward function
c1, c2 = 1, 1

def r(x, a, y):
    """Reward function for moving from state x to state y under action a.

    The reward is the sum of two indicator terms:
      * c1 if the second component of x is at least a,
      * c2 if the components of y sum to a strictly positive value.

    x and y must each unpack into exactly two components.
    """
    _, x_second = x
    y_first, y_second = y
    action_covered = x_second - a >= 0
    next_state_nonempty = y_first + y_second > 0
    return c1 * action_covered + c2 * next_state_nonempty

def P(x, a):
    """Deterministic transition: return the successor state of x = (x1, x2) under action a.

    * a == 0  -> (x2, min(x1 + x2, 10))
    * a == 5  -> (rem, min(x1 + rem, 10)) with rem = max(x2 - 5, 0)
    * a == 10 -> (0, x1)

    Any other action value falls through and yields None.
    """
    first, second = x
    if a == 10:
        return (0, first)
    if a == 5:
        # What is left of the second component after removing 5, floored at 0
        remaining = max(second - 5, 0)
        return (remaining, min(first + remaining, 10))
    if a == 0:
        return (second, min(first + second, 10))
    return None

def p(x, a, y):
    """Transition probability of reaching y = (y1, y2) from x = (x1, x2) under action a.

    The dynamics are deterministic, so the result is 1 when y is exactly the
    successor state defined below (the same formulas as the transition function P)
    and 0 otherwise:

    * a == 0  -> (x2, min(x1 + x2, 10))
    * a == 5  -> (rem, min(x1 + rem, 10)) with rem = max(x2 - 5, 0)
    * a == 10 -> (0, x1)

    Any other action has probability 0 for every y.
    """
    x1, x2 = x
    y1, y2 = y

    if a == 0:
        successor = (x2, min(x1 + x2, 10))
    elif a == 5:
        remaining = max(x2 - 5, 0)
        successor = (remaining, min(x1 + remaining, 10))
    elif a == 10:
        successor = (0, x1)
    else:
        return 0

    # Use the logical `and`, not the bitwise `&`: `&` binds tighter than `==`,
    # so `a==0 & y1==x2 & ...` would be parsed as a chained comparison of bitwise ANDs.
    return 1 if (y1 == successor[0] and y2 == successor[1]) else 0

# Discount factor
alpha = 0.95
# Initial state
x_0 = (5, 5)
# Initial index of the corresponding MDP (original note: "starting with the central proba of 1/2").
# NOTE(review): only a single transition model is passed to the robust learner in this notebook — confirm the note still applies.
k_0 = 0
# Epsilon of the epsilon-greedy policy
eps_greedy = 0.1

In [5]:
# Iteration count handed to both learners
Nr_iter = 1_000_000

# Standard Q-learning, started from an all-ones table of shape (len(X), len(A)).
# gamma_t_tilde is presumably the step-size schedule — confirm in q_learning.
Q_0_ = q_learning(
    X, A, r, P, alpha, x_0, eps_greedy, Nr_iter,
    gamma_t_tilde = lambda t: 1/(t+1),
    Q_0 = np.ones([len(X), len(A)]),
)

# Robust Q-learning with a single transition model (P) and its kernel (p),
# started from the table produced by the standard run above.
Q_opt_robust, V = robust_q_learning_modif(
    X, A, r, np.array([P]), np.array([p]), alpha, x_0, k_0, eps_greedy, Nr_iter,
    gamma_t_tilde = lambda t: 1/(t+1),
    Q_0 = Q_0_,
)

100%|██████████| 1000000/1000000 [00:37<00:00, 26570.93it/s]
100%|██████████| 1000000/1000000 [01:44<00:00, 9599.64it/s]


In [6]:
# Build the 2-D lookup tables used to find the index of an element a (resp. x) in A (resp. X):
# an array that already has more than one dimension is used as is,
# a flat one is turned into a column of one-element rows.
A_list = A if np.ndim(A) > 1 else np.array([[a] for a in A])
X_list = X if np.ndim(X) > 1 else np.array([[x] for x in X])

def a_index(a):
    """Return the position of the first row of A_list that equals a in every column.

    Raises IndexError when no row matches.
    """
    row_matches = (A_list == a).all(axis=1)
    return np.nonzero(row_matches)[0][0]
def x_index(x):
    """Return the position of the first row of X_list that equals x in every column.

    Raises IndexError when no row matches.
    """
    row_matches = (X_list == x).all(axis=1)
    return np.nonzero(row_matches)[0][0]

# Read the result of the Q-learning algorithm:
# the optimal (greedy) action for a given state x in X
def a_opt(x, Q_opt):
    """Return the action in A whose Q-value is largest in the row of Q_opt belonging to x.

    The row is looked up with x_index(x); ties resolve to the first maximum (np.argmax).
    """
    q_row = Q_opt[x_index(x), :]
    best_column = np.argmax(q_row)
    return A[best_column]

In [7]:
# One-row table with the greedy action of the robust Q-table for every state in X.
df = pd.DataFrame(np.array([[a_opt(x, Q_opt_robust) for x in X]]))
# Column labels are built from X itself ("(x1,x2)") instead of being typed out by hand,
# so they cannot get out of sync with the state list.
df.columns = ["(" + ",".join(str(v) for v in np.atleast_1d(x)) + ")" for x in X]
df

Unnamed: 0,"(0,0)","(0,5)","(0,10)","(5,0)","(5,5)","(5,10)","(10,0)","(10,5)","(10,10)"
0,0,0,0,0,0,5,0,0,0


In [8]:
# Display V, the second value returned by robust_q_learning_modif in the training cell.
# NOTE(review): presumably one row per state in X and one column per action in A
# (e.g. visit counts) — confirm against robust_q_learning_modif.
V

array([[9.32095e+05, 3.32520e+04, 3.36700e+04],
       [2.10000e+01, 0.00000e+00, 1.00000e+00],
       [2.00000e+01, 1.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [2.10000e+01, 0.00000e+00, 2.00000e+00],
       [4.00000e+01, 4.53000e+02, 2.00000e+01],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00],
       [3.44000e+02, 3.90000e+01, 2.10000e+01]])