In [1]:
import numpy as np

# Q-Learning hyperparameters: gamma is the discount factor applied to the best
# next-state Q-value, alpha is the learning rate applied to the temporal difference.
gamma = 0.75
alpha = 0.9

# Warehouse layout ('#' marks a wall):
######################
# A    B     C  #  D #
#####                #
# E #  F  #  G     H #
#   #     #####      #
# I    J     K     L #
######################

# States: every warehouse location is mapped to its row/column index in R.
states = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
          'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11}

# Actions: action k means "move to the state with index k".
actions = list(range(len(states)))

# Rewards
# (rows = current state, columns = action leading to a new state;
#  1: the move is possible, 0: it is not).
# The diagonal is all zeros: staying in place is never a move. A target
# location gets its own large self-reward later, on a copy of this matrix.
R = np.array([[0,1,0,0,0,0,0,0,0,0,0,0],    # A -> B
              [1,0,1,0,0,1,0,0,0,0,0,0],    # B -> A, C, F
              [0,1,0,0,0,0,1,0,0,0,0,0],    # C -> B, G
              [0,0,0,0,0,0,0,1,0,0,0,0],    # D -> H
              [0,0,0,0,0,0,0,0,1,0,0,0],    # E -> I
              [0,1,0,0,0,0,0,0,0,1,0,0],    # F -> B, J
              [0,0,1,0,0,0,0,1,0,0,0,0],    # G -> C, H  (no G -> G self-loop)
              [0,0,0,1,0,0,1,0,0,0,0,1],    # H -> D, G, L
              [0,0,0,0,1,0,0,0,0,1,0,0],    # I -> E, J
              [0,0,0,0,0,1,0,0,1,0,1,0],    # J -> F, I, K
              [0,0,0,0,0,0,0,0,0,1,0,1],    # K -> J, L
              [0,0,0,0,0,0,0,1,0,0,1,0]])   # L -> H, K

In [2]:
# Reverse lookup: state index -> location name (the inverse of `states`)
state_to_location = dict(zip(states.values(), states.keys()))

In [3]:
# Q-Learning training loop
def calculate_Q_values(R, n_iterations=1000, discount=None, learning_rate=None):
    """Learn a Q-table for the reward matrix ``R`` with tabular Q-Learning.

    At every iteration a state is drawn uniformly at random, one of the moves
    allowed from it (entries of its row in ``R`` that are > 0) is drawn
    uniformly at random, and the matching Q-value is updated with the
    temporal-difference rule.

    Parameters
    ----------
    R : numpy.ndarray
        Square reward matrix; ``R[s, a] > 0`` means that from state ``s`` the
        move to state ``a`` is allowed and yields that reward.
    n_iterations : int, optional
        Number of random updates to perform (default 1000).
    discount : float, optional
        Discount factor; when omitted the module-level ``gamma`` is used.
    learning_rate : float, optional
        Learning rate; when omitted the module-level ``alpha`` is used.

    Returns
    -------
    numpy.ndarray
        Float Q-table with the same shape as ``R``. Entries for moves that
        ``R`` does not allow are never updated and stay at 0.
    """
    # Fall back to the notebook-level hyperparameters when none are given.
    if discount is None:
        discount = gamma
    if learning_rate is None:
        learning_rate = alpha

    # Start with every Q-value at 0.
    Q = np.zeros(R.shape)
    n_states = R.shape[0]  # rows are states (columns are actions)

    for _ in range(n_iterations):
        # Pick a random state to update.
        current_state = np.random.randint(0, n_states)
        # Moves that can be played from that state.
        playable_actions = np.flatnonzero(R[current_state] > 0)
        if playable_actions.size == 0:
            # Dead-end state: nothing to learn (np.random.choice would raise
            # on an empty sequence), so skip this draw.
            continue
        next_state = np.random.choice(playable_actions)
        # Temporal difference: reward + discounted best next Q-value - current estimate.
        TD = (R[current_state, next_state]
              + discount * Q[next_state].max()
              - Q[current_state, next_state])
        # Move the Q-value towards the target.
        Q[current_state, next_state] += learning_rate * TD
    return Q

In [4]:
# Compute the route from the start location to the end location
def route(starting_location, ending_location):
    """Return the list of locations leading from start to end.

    A copy of the reward matrix ``R`` gets a large self-reward (1000) on the
    target state, a Q-table is trained on it with ``calculate_Q_values``, and
    the route is built by repeatedly following the highest Q-value from the
    current location.

    Parameters
    ----------
    starting_location : str
        Key of ``states`` where the route begins.
    ending_location : str
        Key of ``states`` where the route must end.

    Returns
    -------
    list of str
        Locations visited, starting with ``starting_location`` and ending with
        ``ending_location`` (a single-element list when both are the same).

    Raises
    ------
    KeyError
        If a location is not a key of ``states``.
    RuntimeError
        If the learned Q-table does not lead to the target: either no positive
        Q-value exists for the current location, or following the best
        Q-values comes back to a location already on the route.
    """
    # Give the target location a large reward (on a copy, R itself is untouched).
    R_new = np.copy(R)
    ending_state = states[ending_location]
    R_new[ending_state, ending_state] = 1000
    Q = calculate_Q_values(R_new)

    # Initialise the route.
    path = [starting_location]
    visited = {starting_location}
    current_location = starting_location

    # Follow the best learned move until the target is reached.
    while current_location != ending_location:
        current_state = states[current_location]       # location -> state index
        next_state = int(np.argmax(Q[current_state, :]))
        if Q[current_state, next_state] <= 0:
            # No positive Q-value in this row: argmax would just return the
            # first column, which need not be a move permitted by R.
            raise RuntimeError(
                "No learned move from location %r towards %r"
                % (current_location, ending_location))
        next_location = state_to_location[next_state]  # state index -> location
        if next_location in visited:
            # The choice depends only on the current location, so coming back
            # to a visited location would repeat the same cycle forever.
            raise RuntimeError(
                "Learned Q-values cycle back to %r before reaching %r"
                % (next_location, ending_location))
        path.append(next_location)
        visited.add(next_location)
        current_location = next_location
    return path

In [6]:
# Example: learned route from B to J. The bare call on the last line is left
# un-printed so the notebook shows its return value as the cell result.
print('Route From -> To')
route('B', 'J')

Route From -> To


['B', 'F', 'J']

In [7]:
# Route that goes through an intermediate location before the end location
def two_leg_route(starting_location, intermediate_location, ending_location):
    """Return the route start -> intermediate -> end as a single list.

    Both legs are computed independently with ``route`` and concatenated.
    """
    first_leg = route(starting_location, intermediate_location)
    second_leg = route(intermediate_location, ending_location)
    # The intermediate location closes the first leg and opens the second:
    # drop it from the first so it appears only once.
    return first_leg[:-1] + second_leg

In [9]:
# Example: route from B to J that is forced to pass through E.
print('Route From -> by -> To')
print(two_leg_route('B', 'E', 'J'))

Route From -> by -> To
['B', 'F', 'J', 'I', 'E', 'I', 'J']
