In [1]:
import numpy as np
import random

In [2]:
class Grid():
    """Square grid world with a single agent and a single goal cell.

    Positions are ``(x, y)`` tuples. ``'up'``/``'down'`` decrement/increment
    the first coordinate and ``'left'``/``'right'`` decrement/increment the
    second one. Moves that would leave the grid are clamped, so the agent
    stays where it is.
    """

    def __init__(self,size=5):
        """Create a ``size`` x ``size`` grid and place the agent on the start cell.

        Args:
            size: Number of cells along each side of the grid.
        """
        self.size = size
        self.start = (0,0)
        # The goal is the corner opposite the start. Deriving it from `size`
        # (instead of hard-coding (4,4)) keeps it inside the grid, and
        # reachable, for every grid size; for the default size it is (4,4).
        self.goal = (size-1, size-1)
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return that position."""
        self.agent_pos = self.start
        return self.agent_pos

    def state(self):
        """Return the agent's current ``(x, y)`` position."""
        return self.agent_pos

    def goal_state(self):
        """Return the ``(x, y)`` position of the goal cell."""
        return self.goal

    def step(self, action):
        """Apply one move and report the outcome.

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
                Any other value leaves the position unchanged (the step is
                still rewarded like a normal non-goal move).

        Returns:
            ``(position, reward, done)`` where ``reward`` is 10 when the agent
            is on the goal after the move and -1 otherwise, and ``done`` is
            True exactly when the agent is on the goal.
        """
        x,y = self.agent_pos

        # Clamp every move to the grid bounds [0, size-1].
        if action == 'up':
            x = max(0, x-1)

        elif action == 'down':
            x = min(self.size-1, x+1)

        elif action == 'left':
            y = max(0, y-1)

        elif action == 'right':
            y = min(self.size-1, y+1)

        self.agent_pos = (x,y)

        done = self.agent_pos == self.goal
        reward = 10 if done else -1
        return self.agent_pos, reward, done
    

# Moves

In [5]:
# Human-readable move names understood by Grid.step(); an action is the
# integer index of its name in this list (0=up, 1=down, 2=left, 3=right).
action_names = ['up', 'down', 'left', 'right']
actions = list(range(len(action_names)))

# The grid world the agent is trained in (default size).
environment = Grid()

# Q-Table
    Table format: Q[x, y, action]. Example: Q[x, y, 3] is the value of action 3 ('right') in cell (x, y); the last index is always the action.

In [6]:
# Tabular action values indexed as Q[x, y, action]; every estimate starts at 0.
q_shape = (environment.size, environment.size, len(actions))
Q = np.zeros(q_shape)

# Parameters

In [7]:
# Hyperparameters for tabular Q-learning.
alpha = 0.1 # learning rate
gamma = 0.9 # discount factor
epsilon = 0.2  # exploration rate
episodes = 100  # number of training episodes

# Seed the stdlib RNG that drives ε-greedy exploration so that a fresh
# "Restart Kernel -> Run All" reproduces the same training run.
seed = 42
random.seed(seed)

In [None]:
for episode in range(episodes):
    # Each episode restarts the agent on the environment's start cell.
    state = environment.reset()
    total_reward, steps, done = 0, 0, False

    while not done:
        x, y = state

        # ε-greedy policy: with probability epsilon try a random action,
        # otherwise take the action with the highest current estimate.
        explore = random.random() < epsilon
        action = random.choice(actions) if explore else np.argmax(Q[x, y])

        next_state, reward, done = environment.step(action_names[action])
        next_x, next_y = next_state

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + α * [R + γ * max_a' Q(s', a') - Q(s, a)]
        # i.e. move the estimate towards the immediate reward plus the
        # discounted best value reachable from the new state.
        best_next_q = np.max(Q[next_x, next_y])
        td_target = reward + gamma * best_next_q
        td_error = td_target - Q[x, y, action]
        Q[x, y, action] = Q[x, y, action] + alpha * td_error

        # Bookkeeping for the per-episode summary.
        total_reward += reward
        steps += 1
        state = next_state

    print(f"Episode {episode+1}: Total Reward: {total_reward}, Steps: {steps}")

Episode 1: Total Reward: -127, Steps: 138
Episode 2: Total Reward: -44, Steps: 55
Episode 3: Total Reward: -41, Steps: 52
Episode 4: Total Reward: -5, Steps: 16
Episode 5: Total Reward: -14, Steps: 25
Episode 6: Total Reward: -18, Steps: 29
Episode 7: Total Reward: -20, Steps: 31
Episode 8: Total Reward: -40, Steps: 51
Episode 9: Total Reward: -1, Steps: 12
Episode 10: Total Reward: -57, Steps: 68
Episode 11: Total Reward: -24, Steps: 35
Episode 12: Total Reward: -40, Steps: 51
Episode 13: Total Reward: -7, Steps: 18
Episode 14: Total Reward: -12, Steps: 23
Episode 15: Total Reward: -4, Steps: 15
Episode 16: Total Reward: -14, Steps: 25
Episode 17: Total Reward: -17, Steps: 28
Episode 18: Total Reward: -8, Steps: 19
Episode 19: Total Reward: -16, Steps: 27
Episode 20: Total Reward: -9, Steps: 20
Episode 21: Total Reward: 1, Steps: 10
Episode 22: Total Reward: -11, Steps: 22
Episode 23: Total Reward: 0, Steps: 11
Episode 24: Total Reward: 0, Steps: 11
Episode 25: Total Reward: -10, Step

In [14]:
def print_q_table(Q, action_names=('up', 'down', 'left', 'right')):
    """Print every state's action values, one state per block.

    The grid dimensions and the number of actions are taken from ``Q.shape``
    rather than from global state, so the function works for any table that
    is passed in.

    Args:
        Q: 3-D array indexed as ``Q[x, y, action]``.
        action_names: Label for each action index; must provide at least
            ``Q.shape[2]`` entries.
    """
    n_rows, n_cols, n_actions = Q.shape
    for i in range(n_rows):
        for j in range(n_cols):
            print(f"State ({i},{j}):")
            for a in range(n_actions):
                print(f"  Action {action_names[a]}: {Q[i,j,a]:.2f}")
            print("-" * 30)

# Dump the learned action values for every grid cell under a heading.
print("Q-Table (State, Action Values):")
print_q_table(Q)


Q-Table (State, Action Values):
State (0,0):
  Action up: -2.97
  Action down: -2.52
  Action left: -2.93
  Action right: -2.83
------------------------------
State (0,1):
  Action up: -2.27
  Action down: -2.17
  Action left: -2.20
  Action right: -1.96
------------------------------
State (0,2):
  Action up: -1.55
  Action down: -0.41
  Action left: -1.68
  Action right: -1.61
------------------------------
State (0,3):
  Action up: -1.06
  Action down: -1.00
  Action left: -1.42
  Action right: -1.11
------------------------------
State (0,4):
  Action up: -0.84
  Action down: -0.91
  Action left: -0.84
  Action right: -0.86
------------------------------
State (1,0):
  Action up: -2.42
  Action down: -2.18
  Action left: -2.29
  Action right: -1.26
------------------------------
State (1,1):
  Action up: -2.00
  Action down: -1.62
  Action left: -1.96
  Action right: 0.44
------------------------------
State (1,2):
  Action up: -1.37
  Action down: 2.51
  Action left: -1.37
  Actio