# Q-Learning Algorithm


$$
Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha [R_t + \gamma \max_{a} Q(s_{t+1}, a) - Q(s_t, a_t)]
$$


In [292]:
import numpy as np

In [293]:
def build_grid_graph(size: int) -> dict[str, list[tuple[str, str]]]:
    """Build the adjacency table of a ``size`` x ``size`` grid world.

    Cells are labelled "1" .. str(size * size) in row-major order, starting at the
    top-left corner.

    Args:
        size (int): number of rows (and columns) of the grid. Must be at least 1.

    Returns:
        dict[str, list[tuple[str, str]]]: for every cell label, the list of
            ``(neighbor_label, action)`` pairs reachable in one move. Neighbors are
            listed in the fixed order up, left, right, down; moves that would leave
            the grid are omitted.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"size must be at least 1, got {size}")

    # (row offset, column offset, action name). The order matters: it is the order of
    # each neighbor list, and therefore how ties are broken when picking by argmax.
    moves = ((-1, 0, "up"), (0, -1, "left"), (0, 1, "right"), (1, 0, "down"))

    graph = {}
    for row in range(size):
        for col in range(size):
            neighbors = []
            for d_row, d_col, action in moves:
                n_row, n_col = row + d_row, col + d_col
                if 0 <= n_row < size and 0 <= n_col < size:
                    neighbors.append((str(n_row * size + n_col + 1), action))
            graph[str(row * size + col + 1)] = neighbors
    return graph


# Grid 4x4 (use build_grid_graph(3) or build_grid_graph(5) for the other sizes)
grid_graph = build_grid_graph(4)

# Grid 3x3
# grid_graph = {
#     "1": [("2", "right"), ("4", "down")],
#     "2": [("1", "left"), ("3", "right"), ("5", "down")],
#     "3": [("2", "left"), ("6", "down")],

#     "4": [("1", "up"), ("5", "right"), ("7", "down")],
#     "5": [("2", "up"), ("4", "left"), ("6", "right"), ("8", "down")],
#     "6": [("3", "up"), ("5", "left"), ("9", "down")],

#     "7": [("4", "up"), ("8", "right")],
#     "8": [("5", "up"), ("7", "left"), ("9", "right")],
#     "9": [("6", "up"), ("8", "left")]
# }

# Grid 5x5
# grid_graph = {
#     "1": [("2", "right"), ("6", "down")],
#     "2": [("1", "left"), ("3", "right"), ("7", "down")],
#     "3": [("2", "left"), ("4", "right"), ("8", "down")],
#     "4": [("3", "left"), ("5", "right"), ("9", "down")],
#     "5": [("4", "left"), ("10", "down")],

#     "6": [("1", "up"), ("7", "right"), ("11", "down")],
#     "7": [("2", "up"), ("6", "left"), ("8", "right"), ("12", "down")],
#     "8": [("3", "up"), ("7", "left"), ("9", "right"), ("13", "down")],
#     "9": [("4", "up"), ("8", "left"), ("10", "right"), ("14", "down")],
#     "10": [("5", "up"), ("9", "left"), ("15", "down")],

#     "11": [("6", "up"), ("12", "right"), ("16", "down")],
#     "12": [("7", "up"), ("11", "left"), ("13", "right"), ("17", "down")],
#     "13": [("8", "up"), ("12", "left"), ("14", "right"), ("18", "down")],
#     "14": [("9", "up"), ("13", "left"), ("15", "right"), ("19", "down")],
#     "15": [("10", "up"), ("14", "left"), ("20", "down")],

#     "16": [("11", "up"), ("17", "right"), ("21", "down")],
#     "17": [("12", "up"), ("16", "left"), ("18", "right"), ("22", "down")],
#     "18": [("13", "up"), ("17", "left"), ("19", "right"), ("23", "down")],
#     "19": [("14", "up"), ("18", "left"), ("20", "right"), ("24", "down")],
#     "20": [("15", "up"), ("19", "left"), ("25", "down")],

#     "21": [("16", "up"), ("22", "right")],
#     "22": [("17", "up"), ("21", "left"), ("23", "right")],
#     "23": [("18", "up"), ("22", "left"), ("24", "right")],
#     "24": [("19", "up"), ("23", "left"), ("25", "right")],
#     "25": [("20", "up"), ("24", "left")]
# }


In [294]:
# Q-table: one value per (state, action) pair, all initialised to 0.
# Keyed by the states of ``grid_graph`` so the table always matches the grid in use
# (it was previously hard-coded to 25 states regardless of the grid size).
Q_states = {
    state: {"right": 0, "up": 0, "left": 0, "down": 0} for state in grid_graph
}


# Every state has a reward of -1, except state "16" (the goal state used by the
# training loop), which has a reward of 100.
rewards = {str(i): -1 for i in range(1, len(grid_graph) + 1)}
rewards["1"] = -1
rewards["16"] = 100


def get_neighbor_states(state: str) -> list[tuple[str, str]]:
    """Look up the moves available from ``state`` in ``grid_graph``.

    Args:
        state (str): label of a grid cell; must be a key of ``grid_graph``.

    Returns:
        list[tuple[str, str]]: the ``(neighbor_state, action)`` pairs stored for
            ``state``. This is the list object held by ``grid_graph`` itself, not a
            copy.

    Raises:
        KeyError: if ``state`` is not a key of ``grid_graph``.
    """
    reachable = grid_graph[state]
    return reachable


def get_state_Q(state: str, action: str) -> float:
    """Read the current Q-value of taking ``action`` in ``state``.

    Args:
        state (str): state label; must be a key of ``Q_states``.
        action (str): action name; must be a key of ``Q_states[state]``.

    Returns:
        float: the stored Q-value. Entries start as the integer 0 and hold whatever
            value was last written with ``set_state_Q``.

    Raises:
        KeyError: if ``state`` or ``action`` is not present in ``Q_states``.
    """
    return Q_states[state][action]


def set_state_Q(state: str, action: str, new_value: float) -> None:
    """Overwrite the Q-value of taking ``action`` in ``state``.

    Args:
        state (str): state label; must already be a key of ``Q_states``.
        action (str): action name under which the value is stored.
        new_value (float): the Q-value to store.

    Raises:
        KeyError: if ``state`` is not a key of ``Q_states``.
    """
    Q_states[state][action] = new_value


def get_reward(state: str, goal_state: str, distance_reward: bool = False) -> int:
    """Return the reward attached to ``state``, optionally shaped by goal distance.

    Args:
        state (str): state whose reward is requested; must be a key of ``rewards``.
        goal_state (str): label of the goal state. Only used when
            ``distance_reward`` is true.
        distance_reward (bool, optional): when true, subtract a distance penalty
            from the base reward. Defaults to False.

    Returns:
        int: ``rewards[state]``, minus ``abs(int(state) - int(goal_state))`` when
            ``distance_reward`` is true.
    """
    if not distance_reward:
        return rewards[state]

    # NOTE(review): the penalty is the gap between the numeric state labels, not the
    # number of moves on the grid (e.g. vertically adjacent cells differ by a whole
    # row width) - confirm this shaping is intended.
    label_gap = abs(int(state) - int(goal_state))
    return rewards[state] - label_gap

In [295]:
def get_Q_update(
    state_Q: float,
    state_reward: float,
    neighbor_states: list[tuple[str, str]],
    learning_rate: float = 0.5,
    discount_factor: float = 0.5,
    step_penalty: float = 0,
) -> float:
    """Compute the Q-learning update for a single (state, action) pair.

    Implements
    ``Q + learning_rate * ((reward + step_penalty) + discount_factor * max_Q - Q)``
    where ``max_Q`` is the largest Q-value among the ``(state, action)`` pairs in
    ``neighbor_states``.

    Args:
        state_Q (float): current Q-value of the pair being updated.
        state_reward (float): reward received for the transition.
        neighbor_states (list[tuple[str, str]]): ``(state, action)`` pairs whose
            Q-values are the candidates for the bootstrap maximum. For the standard
            Q-learning target these are ``(next_state, a)`` for every action ``a``
            available in the next state. May be empty, in which case the bootstrap
            term is 0.
        learning_rate (float, optional): how much to update the Q-value. Defaults to .5.
        discount_factor (float, optional): how much future rewards are taken into
            account. Defaults to .5.
        step_penalty (float, optional): extra amount added to the reward on every
            step (pass a negative value to penalise long paths). Defaults to 0.

    Returns:
        float: the updated Q-value.
    """
    # ``default`` keeps an empty candidate list (no further action available) from
    # raising ValueError; such a state contributes no future value.
    best_future_Q = max(
        (get_state_Q(state, action) for state, action in neighbor_states),
        default=0.0,
    )
    td_target = (state_reward + step_penalty) + discount_factor * best_future_Q
    return state_Q + learning_rate * (td_target - state_Q)

In [296]:
def get_neighbor_states_Q(current_state, neighbor_states):
    """List the Q-values of the actions available from ``current_state``.

    Args:
        current_state: label of the state the actions are taken from.
        neighbor_states: iterable of ``(neighbor_state, action)`` pairs; only the
            action of each pair is used.

    Returns:
        list: ``get_state_Q(current_state, action)`` for each pair, in the same
            order as ``neighbor_states``.
    """
    return [get_state_Q(current_state, action) for _, action in neighbor_states]

In [297]:
def get_next_state(current_state, neighbor_states):
    """Pick the greedy move from ``current_state``.

    Args:
        current_state: label of the state the agent is in.
        neighbor_states: list of ``(neighbor_state, action)`` pairs available from
            ``current_state``.

    Returns:
        The ``(neighbor_state, action)`` pair whose action has the highest Q-value
        in ``current_state``. ``np.argmax`` returns the first maximum, so ties go to
        the earliest pair in ``neighbor_states``.
    """
    q_values = get_neighbor_states_Q(current_state, neighbor_states)
    best_index = np.argmax(q_values)
    return neighbor_states[best_index]

In [298]:
def choose_action(current_state, neighbor_states, epsilon=0):
    """Choose the next move with an epsilon-greedy policy.

    Args:
        current_state: label of the state the agent is in.
        neighbor_states: list of ``(neighbor_state, action)`` pairs available from
            ``current_state``. The list is not modified.
        epsilon (float, optional): probability of picking a uniformly random move
            instead of the greedy one. Defaults to 0 (always greedy).

    Returns:
        The chosen ``(neighbor_state, action)`` pair.
    """
    if np.random.random() < epsilon:
        print("Random action")
        # Draw a random index instead of shuffling: callers pass the list stored in
        # ``grid_graph`` (via ``get_neighbor_states``), and shuffling it in place
        # would permanently reorder the graph and change argmax tie-breaking later.
        random_index = np.random.randint(len(neighbor_states))
        return neighbor_states[random_index]
    return get_next_state(current_state, neighbor_states)

In [299]:
# Single training episode: walk from the start state towards the goal state,
# updating the Q-table after every move.
current_state = "1"
goal_state = "16"
states_follow_up = [current_state]  # trajectory of visited states, start included

epsilon = .9  # initial exploration rate, decayed after every step

for i in range(1_000):
    print(f"Step {i+1}:")
    print(f"\tcurrent state: {current_state}")

    # Compare against ``goal_state`` rather than a hard-coded label so that changing
    # the goal above is enough.
    if current_state == goal_state:
        print(f"Goal state found at {i+1} iteration")
        break

    neighbor_states = get_neighbor_states(state=current_state)
    neighbor_states_Q = get_neighbor_states_Q(
        current_state=current_state, neighbor_states=neighbor_states
    )

    print(f"\tneighbor states: {neighbor_states} ")
    print(f"\tneighbor states Q values: {neighbor_states_Q} ")

    # ``next_state`` is a (state, action) pair.
    next_state = choose_action(
        current_state=current_state, neighbor_states=neighbor_states, epsilon=epsilon
    )

    # Multiplicative epsilon decay; stops once epsilon is at or below 0.01.
    if epsilon > 0.01:
        epsilon = epsilon * .8

    next_action_state_Q = get_state_Q(state=current_state, action=next_state[1])
    next_state_reward = get_reward(next_state[0], goal_state, distance_reward=True)
    print(
        f'\tNext action: "{next_state[1]}" to state {next_state[0]} with Q value = {next_action_state_Q} and reward = {next_state_reward}'
    )

    # Bootstrap from the state we are moving INTO: max over a of Q(s_{t+1}, a).
    # The candidates are therefore (next state, action) for every action available
    # there, not the neighbors of the current state.
    next_state_actions = [
        (next_state[0], action)
        for _, action in get_neighbor_states(state=next_state[0])
    ]

    updated_Q = get_Q_update(
        state_Q=next_action_state_Q,
        state_reward=next_state_reward,
        neighbor_states=next_state_actions,
        learning_rate=0.7,
        discount_factor=0.9,
        step_penalty=-2,
    )

    set_state_Q(state=current_state, action=next_state[1], new_value=updated_Q)
    print(
        f"\tUpdates Q value: {get_state_Q(state=current_state, action=next_state[1])}"
    )

    current_state = next_state[0]
    states_follow_up.append(current_state)

Step 1:
	current state: 1
	neighbor states: [('2', 'right'), ('5', 'down')] 
	neighbor states Q values: [0, 0] 
Random action
	Next action: "right" to state 2 with Q value = 0 and reward = -15
	Updates Q value: -11.899999999999999
Step 2:
	current state: 2
	neighbor states: [('1', 'left'), ('3', 'right'), ('6', 'down')] 
	neighbor states Q values: [0, 0, 0] 
	Next action: "left" to state 1 with Q value = 0 and reward = -16
	Updates Q value: -12.6
Step 3:
	current state: 1
	neighbor states: [('2', 'right'), ('5', 'down')] 
	neighbor states Q values: [-11.899999999999999, 0] 
Random action
	Next action: "right" to state 2 with Q value = -11.899999999999999 and reward = -15
	Updates Q value: -15.469999999999999
Step 4:
	current state: 2
	neighbor states: [('1', 'left'), ('3', 'right'), ('6', 'down')] 
	neighbor states Q values: [-12.6, 0, 0] 
Random action
	Next action: "right" to state 3 with Q value = 0 and reward = -14
	Updates Q value: -11.2
Step 5:
	current state: 3
	neighbor states: