In [1]:
import numpy as np
import random

class GridWorld:
    """Deterministic grid world with a single goal cell.

    States are ``(x, y)`` tuples with ``0 <= x < rows`` and ``0 <= y < cols``.
    Actions are the integers listed in ``self.actions``:

    * ``0`` decreases ``x`` by one
    * ``1`` increases ``y`` by one
    * ``2`` increases ``x`` by one
    * ``3`` decreases ``y`` by one

    A move that would leave the grid is clamped at the edge.

    Args:
        rows: Number of rows in the grid (default 4).
        cols: Number of columns in the grid (default 4).
        start: State every episode starts from (default ``(0, 0)``).
        goal: Terminal state. Defaults to the last cell,
            ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If the grid is empty or ``start``/``goal`` lie outside it.
    """

    def __init__(self, rows=4, cols=4, start=(0, 0), goal=None):
        if rows < 1 or cols < 1:
            raise ValueError("rows and cols must be at least 1")
        self.rows = rows
        self.cols = cols
        self.start = tuple(start)
        self.goal = (rows - 1, cols - 1) if goal is None else tuple(goal)
        for label, (x, y) in (("start", self.start), ("goal", self.goal)):
            if not (0 <= x < rows and 0 <= y < cols):
                raise ValueError(f"{label} {(x, y)} lies outside the {rows}x{cols} grid")
        self.state = self.start
        self.actions = [0, 1, 2, 3]

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        The reward is ``10`` with ``done=True`` when the goal is reached and
        ``-1`` with ``done=False`` otherwise. An action outside ``0..3``
        leaves the position unchanged but is still charged the ``-1`` reward.
        """
        x, y = self.state
        if action == 0:
            x = max(0, x - 1)
        elif action == 1:
            y = min(self.cols - 1, y + 1)
        elif action == 2:
            x = min(self.rows - 1, x + 1)
        elif action == 3:
            y = max(0, y - 1)
        self.state = (x, y)
        if self.state == self.goal:
            return self.state, 10, True
        return self.state, -1, False

def run_q_learning(env=None, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, seed=None):
    """Train a tabular Q-learning agent and return the learned Q-table.

    Behaviour policy is epsilon-greedy; the update target uses the greedy
    (max) value of the next state, which makes the method off-policy.

    Args:
        env: Environment exposing ``rows``, ``cols``, ``actions``, ``reset()``
            and ``step(action) -> (next_state, reward, done)`` with
            ``(x, y)`` tuple states. Defaults to a fresh ``GridWorld()``.
            Actions are assumed to be ``0 .. len(env.actions) - 1`` because
            the greedy action is an index into the Q-table.
        episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a uniformly random action.
        seed: If given, seeds the global ``random`` module first so the run
            is reproducible. ``None`` leaves the RNG state untouched.

    Returns:
        ``numpy.ndarray`` of shape ``(env.rows, env.cols, len(env.actions))``.
    """
    if env is None:
        env = GridWorld()
    if seed is not None:
        random.seed(seed)

    # Size the table from the environment instead of repeating its dimensions.
    Q = np.zeros((env.rows, env.cols, len(env.actions)))

    print("Running Q-Learning (Off-Policy)...")
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # Behaviour policy: explore with probability epsilon, else act greedily.
            if random.random() < epsilon:
                action = random.choice(env.actions)
            else:
                action = int(np.argmax(Q[state]))

            next_state, reward, done = env.step(action)

            # Target policy is greedy; a terminal transition has no future value.
            future = 0.0 if done else gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (reward + future - Q[state][action])

            state = next_state

    print("Q-Learning Q-Table Sample (State 0,0):", np.round(Q[0, 0], 2))
    return Q

# Entry point: run the Q-learning demo only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_q_learning()

Running Q-Learning (Off-Policy)...
Q-Learning Q-Table Sample (State 0,0): [0.33 1.81 1.24 0.3 ]


In [2]:
# exp8_q_learning_generic.py

import numpy as np
import random

class GridWorld4x4:
    """4x4 grid world whose states are integers in row-major order.

    State ``s`` maps to ``(row, col) = (s // n_cols, s % n_cols)``. Actions
    are ``0`` (row - 1), ``1`` (col + 1), ``2`` (row + 1) and ``3`` (col - 1);
    moves that would leave the grid are clamped at the edge.

    Args:
        start_state: State every episode starts from.
        goal_state: State that ends the episode when entered.
        max_steps: Episode is also ended once this many steps were taken.
    """

    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        # Initialise the episode state here so step() is usable on a fresh
        # instance; previously these attributes only existed after reset().
        self.state = start_state
        self.steps = 0

    def state_to_xy(self, s):
        """Convert a state index to its ``(row, col)`` position."""
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        """Convert a ``(row, col)`` position to its state index."""
        return r * self.n_cols + c

    def reset(self):
        """Start a new episode and return the start state."""
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        The reward is ``0`` when the goal state is entered and ``-1``
        otherwise. ``done`` is true when the goal is reached or the step
        budget ``max_steps`` is used up. ``info`` is always an empty dict.
        An action outside ``0..3`` leaves the position unchanged but still
        counts as a step.
        """
        r, c = self.state_to_xy(self.state)
        if action == 0:
            r = max(0, r - 1)
        elif action == 1:
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        self.state = ns
        self.steps += 1
        at_goal = ns == self.goal_state
        done = at_goal or (self.steps >= self.max_steps)
        reward = 0 if at_goal else -1
        return ns, reward, done, {}

def epsilon_greedy(Q, s, n_actions, eps):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Draws one uniform random number; if it is below ``eps`` a uniformly
    random action index in ``range(n_actions)`` is returned, otherwise the
    index of the largest entry of ``Q[s]`` (as a plain ``int``).
    """
    explore = random.random() < eps
    if not explore:
        greedy_action = np.argmax(Q[s])
        return int(greedy_action)
    return random.randrange(n_actions)

def q_learning(env, num_episodes=2000, alpha=0.5, gamma=1.0, eps=0.1, seed=None):
    """Run tabular Q-learning on ``env`` and return ``(Q, policy)``.

    Args:
        env: Environment exposing ``n_states``, ``n_actions``, ``reset()`` and
            ``step(action) -> (next_state, reward, done, info)`` with integer
            states.
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps: Exploration probability passed to ``epsilon_greedy``.
        seed: If given, seeds the global ``random`` module before training so
            the result is reproducible. ``None`` (default) leaves the RNG
            state untouched.

    Returns:
        ``Q``: array of shape ``(env.n_states, env.n_actions)``.
        ``policy``: array of length ``env.n_states`` holding the greedy action
        per state (``argmax`` over each row of ``Q``).
    """
    if seed is not None:
        # epsilon_greedy draws from the global ``random`` module.
        random.seed(seed)

    Q = np.zeros((env.n_states, env.n_actions))
    for _ in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            a = epsilon_greedy(Q, s, env.n_actions, eps)
            ns, r, done, _info = env.step(a)
            # The target always bootstraps from Q[ns], even when ``done``.
            # NOTE(review): this relies on rows of terminal states staying at
            # zero (never updated because the episode ends on arrival) --
            # confirm this holds for the environment in use.
            td_target = r + gamma * np.max(Q[ns])
            Q[s, a] += alpha * (td_target - Q[s, a])
            s = ns
    policy = np.argmax(Q, axis=1)
    return Q, policy

if __name__ == "__main__":
    # Train on the default grid, then show the learned values and the greedy
    # policy derived from them.
    env = GridWorld4x4()
    Q, pi = q_learning(env)
    print("Q-learning Q-table:")
    print(Q)
    print("\nQ-learning policy (0:U,1:R,2:D,3:L):")
    # Lay the per-state policy out like the grid; take the shape from the env
    # so the display cannot drift from the actual grid size.
    print(pi.reshape(env.n_rows, env.n_cols))


Q-learning Q-table:
[[-6.         -5.         -5.         -6.        ]
 [-5.         -4.         -4.         -6.        ]
 [-4.         -3.         -3.         -5.        ]
 [-3.         -3.         -2.         -4.        ]
 [-5.06999397 -4.         -4.         -4.98055097]
 [-4.96489239 -3.         -3.         -4.98587583]
 [-3.99279733 -2.         -2.         -3.75488252]
 [-3.         -2.         -1.         -3.        ]
 [-4.52011585 -3.         -3.         -3.74998051]
 [-3.51390183 -2.         -2.         -3.62974533]
 [-2.96093733 -1.         -1.         -2.91226196]
 [-2.         -1.          0.         -2.        ]
 [-3.88082644 -2.         -2.49811935 -2.        ]
 [-2.77339455 -1.         -1.87499618 -2.74765635]
 [-1.81249237  0.         -0.99609375 -1.74999999]
 [ 0.          0.          0.          0.        ]]

Q-learning policy (0:U,1:R,2:D,3:L):
[[1 1 1 2]
 [1 1 1 2]
 [1 1 1 2]
 [1 1 1 0]]
