In [None]:
import numpy as np
import random

class GridWorld:
    """Deterministic rectangular grid world with one terminal goal cell.

    A state is a ``(row, col)`` tuple. Actions are the integers
    0 = Up, 1 = Right, 2 = Down, 3 = Left. A move that would leave the grid
    is clamped, so the agent stays on the edge cell it was already on.

    Args:
        rows: Number of grid rows. Defaults to 4.
        cols: Number of grid columns. Defaults to 4.
        start: ``(row, col)`` cell the agent is placed on at construction and
            by ``reset()``. Defaults to the top-left corner ``(0, 0)``.
        goal: ``(row, col)`` terminal cell. Defaults to the bottom-right
            corner ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If the grid has no cells, or ``start``/``goal`` lies
            outside the grid.
    """

    def __init__(self, rows=4, cols=4, start=(0, 0), goal=None):
        if rows < 1 or cols < 1:
            raise ValueError(f"grid must be at least 1x1, got {rows}x{cols}")
        self.rows = rows
        self.cols = cols
        self.start = tuple(start)
        self.goal = (rows - 1, cols - 1) if goal is None else tuple(goal)
        for label, cell in (("start", self.start), ("goal", self.goal)):
            inside = (
                len(cell) == 2
                and 0 <= cell[0] < rows
                and 0 <= cell[1] < cols
            )
            if not inside:
                raise ValueError(
                    f"{label} cell {cell} is outside the {rows}x{cols} grid"
                )
        self.state = self.start
        self.actions = [0, 1, 2, 3]  # Up, Right, Down, Left

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        The reward is 10 with ``done=True`` when the move lands on the goal,
        otherwise -1 with ``done=False``. An action outside 0..3 leaves the
        position unchanged but is still scored like any other move.
        """
        row, col = self.state
        if action == 0:      # Up
            row = max(0, row - 1)
        elif action == 1:    # Right
            col = min(self.cols - 1, col + 1)
        elif action == 2:    # Down
            row = min(self.rows - 1, row + 1)
        elif action == 3:    # Left
            col = max(0, col - 1)
        self.state = (row, col)
        if self.state == self.goal:
            return self.state, 10, True
        return self.state, -1, False

def run_simple_q_learning(num_episodes=500, alpha=0.1, gamma=0.9,
                          epsilon=0.1, seed=None):
    """Train a tabular Q-learning agent on ``GridWorld`` and print the result.

    Each episode starts from ``env.reset()`` and runs until the environment
    reports ``done``. Actions are chosen epsilon-greedily and the table is
    updated with the one-step Q-learning rule.

    Args:
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of taking a uniformly random action.
        seed: Optional seed for a private ``random.Random`` that drives the
            exploration. When ``None`` the global ``random`` module is used,
            so a caller-side ``random.seed(...)`` still takes effect.

    Returns:
        The learned Q-table, a float array indexed as ``Q[row, col, action]``.
    """
    rng = random.Random(seed) if seed is not None else random
    env = GridWorld()
    # Size the table from the environment instead of repeating its dimensions.
    Q = np.zeros((env.rows, env.cols, len(env.actions)))

    print("Running Simple Q-Learning Simulation...")
    for _ in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            # Epsilon-greedy: explore with probability epsilon, else exploit.
            if rng.random() < epsilon:
                action = rng.choice(env.actions)
            else:
                # state is a (row, col) tuple, so Q[state] is that cell's
                # vector of action values.
                action = int(np.argmax(Q[state]))

            next_state, reward, done = env.step(action)

            # A transition that ends the episode has no future value to
            # bootstrap from, so its target is the reward alone.
            if done:
                td_target = reward
            else:
                td_target = reward + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            state = next_state

    print("Learned Q-Table (State 0,0):\n", Q[0, 0])
    return Q

if __name__ == "__main__":
    # Exploration in run_simple_q_learning draws from the stdlib `random`
    # module; fixing its seed makes the printed Q-values repeatable when the
    # cell is re-run on a fresh kernel.
    random.seed(42)
    run_simple_q_learning()

Running Simple Q-Learning Simulation...
Learned Q-Table (State 0,0):
 [-0.4274438   1.8098      0.38604909  0.17719916]


In [27]:
# exp4_q_learning_unknown_model.py

import numpy as np
import random

class GridWorld4x4:
    """4x4 deterministic grid world with integer-encoded states.

    States are numbered row-major from 0 to 15 (``state = row * 4 + col``).
    Actions are 0 = Up, 1 = Right, 2 = Down, 3 = Left; a move that would
    leave the grid is clamped so the agent stays where it is.

    Args:
        start_state: State the agent occupies at construction and after
            ``reset()``.
        goal_state: State that ends the episode with reward 0.
        max_steps: Step cap; the episode is also reported as done once this
            many steps have been taken since the last reset.

    Raises:
        ValueError: If ``start_state`` is not a valid state index.
    """

    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        if not 0 <= start_state < self.n_states:
            raise ValueError(
                f"start_state must be in [0, {self.n_states}), got {start_state}"
            )
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        # Initialise the episode state here as well, so step() is usable on a
        # fresh instance instead of failing on missing attributes.
        self.state = self.start_state
        self.steps = 0

    def state_to_xy(self, s):
        """Return the ``(row, col)`` pair for state index ``s``."""
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        """Return the state index for grid cell ``(r, c)``."""
        return r * self.n_cols + c

    def reset(self):
        """Start a new episode: return to the start state and zero the step count."""
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        The reward is 0 when the move lands on the goal and -1 otherwise.
        ``done`` is true on reaching the goal or on hitting ``max_steps``.
        ``info`` is always an empty dict. An action outside 0..3 leaves the
        position unchanged but still counts as a step.
        """
        r, c = self.state_to_xy(self.state)
        if action == 0:      # Up
            r = max(0, r - 1)
        elif action == 1:    # Right
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:    # Down
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:    # Left
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        self.state = ns
        self.steps += 1
        reached_goal = ns == self.goal_state
        done = reached_goal or (self.steps >= self.max_steps)
        reward = 0 if reached_goal else -1
        return ns, reward, done, {}

def q_learning(env, num_episodes=2000, alpha=0.5, gamma=1.0,
               eps_start=1.0, eps_end=0.05, eps_decay=0.995, seed=None):
    """Tabular Q-learning with an exponentially decaying epsilon-greedy policy.

    Args:
        env: Environment exposing ``n_states``, ``n_actions``, ``reset()``
            returning an integer state, and ``step(a)`` returning
            ``(next_state, reward, done, info)``.
        num_episodes: Number of training episodes.
        alpha: Learning rate.
        gamma: Discount factor.
        eps_start: Exploration probability used in the first episode.
        eps_end: Lower bound on the exploration probability.
        eps_decay: Factor applied to epsilon after every episode.
        seed: Optional seed for a private ``random.Random`` that drives the
            exploration, making the run reproducible without touching global
            RNG state. When ``None`` the global ``random`` module is used.

    Returns:
        ``(Q, policy)``: ``Q`` is an ``(n_states, n_actions)`` float array and
        ``policy`` holds the greedy action index for each state.
    """
    rng = random.Random(seed) if seed is not None else random
    Q = np.zeros((env.n_states, env.n_actions))
    eps = eps_start
    for _episode in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            # Explore with probability eps, otherwise act greedily on Q.
            if rng.random() < eps:
                a = rng.randrange(env.n_actions)
            else:
                a = int(np.argmax(Q[s]))
            ns, r, done, _info = env.step(a)
            # The target bootstraps from ns even on the last step of an
            # episode. A state that ends the episode is never updated as `s`,
            # so its row keeps the zero initialisation and adds nothing
            # (assumes episodes do not start in such a state).
            Q[s, a] += alpha * (r + gamma * np.max(Q[ns]) - Q[s, a])
            s = ns
        eps = max(eps_end, eps * eps_decay)
    policy = np.argmax(Q, axis=1)
    return Q, policy

if __name__ == "__main__":
    # Exploration in q_learning draws from the stdlib `random` module; fixing
    # its seed makes the printed table and policy repeatable when the cell is
    # re-run on a fresh kernel.
    random.seed(42)
    env = GridWorld4x4()
    Q, pi = q_learning(env)
    print("Learned Q-table (shape:", Q.shape, ")")
    print(Q)
    print("\nDerived policy (0:U,1:R,2:D,3:L):")
    # Lay the per-state actions out on the grid using the environment's own
    # dimensions. The goal cell's entry is an argmax over a row that learning
    # never updates here, so the action shown for it carries no meaning.
    print(pi.reshape(env.n_rows, env.n_cols))


Learned Q-table (shape: (16, 4) )
[[-6.         -5.         -5.         -6.        ]
 [-5.         -4.         -4.         -6.        ]
 [-4.         -3.         -3.         -5.        ]
 [-3.         -3.         -2.         -4.        ]
 [-6.         -4.         -4.         -5.        ]
 [-5.         -3.         -3.         -4.99999997]
 [-4.         -2.         -2.         -4.        ]
 [-3.         -2.         -1.         -3.        ]
 [-4.99999213 -3.         -3.         -3.99999991]
 [-4.         -2.         -2.         -4.        ]
 [-3.         -1.         -1.         -3.        ]
 [-2.         -1.          0.         -2.        ]
 [-3.99999886 -2.         -2.99999495 -3.        ]
 [-2.99999999 -1.         -2.         -3.        ]
 [-2.          0.         -1.         -1.9999995 ]
 [ 0.          0.          0.          0.        ]]

Derived policy (0:U,1:R,2:D,3:L):
[[1 1 1 2]
 [1 1 1 2]
 [1 1 1 2]
 [1 1 1 0]]
