In [3]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

class GridWorld:
    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.state = (0, 0)
        self.goal = (3, 3)
        self.actions = [0, 1, 2, 3] # Up, Right, Down, Left

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 0:   x = max(0, x - 1)
        elif action == 1: y = min(self.cols - 1, y + 1)
        elif action == 2: x = min(self.rows - 1, x + 1)
        elif action == 3: y = max(0, y - 1)
        
        self.state = (x, y)
        if self.state == self.goal: return self.state, 10, True
        else: return self.state, -1, False

def run_dqn():
    env = GridWorld()
    
    # Define Model
    model = nn.Sequential(
        nn.Linear(2, 64),
        nn.ReLU(),
        nn.Linear(64, 4)
    )
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()
    
    gamma = 0.9
    epsilon = 0.3 # Increased slightly for better initial exploration
    episodes = 500 # Increased slightly to ensure convergence
    
    print("-----------------------------------")
    print(f"Starting Training for {episodes} episodes...")
    print("-----------------------------------")
    
    # --- TRAINING LOOP ---
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        total_loss = 0
        
        while not done:
            state_t = torch.FloatTensor(state)
            
            # Epsilon Greedy with decay (optional, but helps)
            current_epsilon = max(0.01, epsilon * (0.99 ** episode))
            
            if random.random() < current_epsilon:
                action = random.choice(env.actions)
            else:
                with torch.no_grad():
                    q_values = model(state_t)
                    action = torch.argmax(q_values).item()
            
            next_state, reward, done = env.step(action)
            next_state_t = torch.FloatTensor(next_state)
            
            # --- DQN Update ---
            # 1. Calculate Target
            with torch.no_grad():
                target_max = model(next_state_t).max()
                target = reward + gamma * target_max * (not done)
            
            # 2. Calculate Prediction
            q_values = model(state_t)
            q_pred = q_values[action]
            
            # 3. Optimize
            loss = loss_fn(q_pred, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            state = next_state
            total_reward += reward
            total_loss += loss.item()

        # Print progress every 50 episodes
        if (episode + 1) % 50 == 0:
            print(f"Episode {episode+1}: Total Reward = {total_reward}, Loss = {total_loss:.4f}")

    print("\nTraining Finished.")
    
    # --- TESTING PHASE ---
    print("\n-----------------------------------")
    print("Testing Learned Policy (Greedy Mode)")
    print("-----------------------------------")
    
    state = env.reset()
    path = [state]
    done = False
    steps = 0
    
    while not done and steps < 20:
        state_t = torch.FloatTensor(state)
        with torch.no_grad():
            # Strictly Greedy (No epsilon)
            q_values = model(state_t)
            action = torch.argmax(q_values).item()
            
        state, _, done = env.step(action)
        path.append(state)
        steps += 1
        
    print("Final Path Taken:", path)
    
    if path[-1] == (3,3):
        print("RESULT: SUCCESS - Goal Reached!")
    else:
        print("RESULT: FAILED - Did not reach goal.")

if __name__ == "__main__":
    run_dqn()

-----------------------------------
Starting Training for 500 episodes...
-----------------------------------
Episode 50: Total Reward = 5, Loss = 2.4373
Episode 100: Total Reward = 4, Loss = 3.4601
Episode 150: Total Reward = 2, Loss = 12.3485
Episode 200: Total Reward = 5, Loss = 0.0437
Episode 250: Total Reward = 4, Loss = 1.0288
Episode 300: Total Reward = 5, Loss = 0.0090
Episode 350: Total Reward = 5, Loss = 0.0048
Episode 400: Total Reward = 5, Loss = 0.0025
Episode 450: Total Reward = 5, Loss = 0.0012
Episode 500: Total Reward = 5, Loss = 0.0153

Training Finished.

-----------------------------------
Testing Learned Policy (Greedy Mode)
-----------------------------------
Final Path Taken: [(0, 0), (0, 1), (0, 2), (0, 3), (1, 3), (2, 3), (3, 3)]
RESULT: SUCCESS - Goal Reached!


In [2]:
# exp9_dqn_gridworld.py

import numpy as np
import random
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim

class GridWorld4x4:
    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps

    def state_to_xy(self, s):
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        return r * self.n_cols + c

    def reset(self):
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        r, c = self.state_to_xy(self.state)
        if action == 0:
            r = max(0, r - 1)
        elif action == 1:
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:
            c = max(0, c - 1)
        ns = self.xy_to_state(r, c)
        self.state = ns
        self.steps += 1
        done = (ns == self.goal_state) or (self.steps >= self.max_steps)
        reward = 0 if ns == self.goal_state else -1
        return ns, reward, done, {}

class DQNNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim)
        )
    def forward(self, x):
        return self.fc(x)

def one_hot_state(s, n_states):
    v = np.zeros(n_states, dtype=np.float32)
    v[s] = 1.0
    return v

def train_dqn(env, episodes=1000, gamma=1.0, batch_size=32, lr=1e-3):
    device = torch.device("cpu")
    net = DQNNet(env.n_states, env.n_actions).to(device)
    target_net = DQNNet(env.n_states, env.n_actions).to(device)
    target_net.load_state_dict(net.state_dict())

    optimizer = optim.Adam(net.parameters(), lr=lr)
    replay_buffer = deque(maxlen=5000)

    eps = 1.0
    eps_end = 0.05
    eps_decay = 0.995
    update_target_every = 50

    for ep in range(episodes):
        s = env.reset()
        done = False
        while not done:
            if random.random() < eps:
                a = random.randrange(env.n_actions)
            else:
                with torch.no_grad():
                    inp = torch.tensor(one_hot_state(s, env.n_states), dtype=torch.float32)
                    q_vals = net(inp)
                    a = int(torch.argmax(q_vals).item())
            ns, r, done, _ = env.step(a)
            replay_buffer.append((s, a, r, ns, done))
            s = ns

            if len(replay_buffer) >= batch_size:
                batch = random.sample(replay_buffer, batch_size)
                states = torch.tensor([one_hot_state(b[0], env.n_states) for b in batch], dtype=torch.float32)
                actions = torch.tensor([b[1] for b in batch], dtype=torch.int64).unsqueeze(1)
                rewards = torch.tensor([b[2] for b in batch], dtype=torch.float32)
                next_states = torch.tensor([one_hot_state(b[3], env.n_states) for b in batch], dtype=torch.float32)
                dones = torch.tensor([b[4] for b in batch], dtype=torch.float32)

                q_values = net(states).gather(1, actions).squeeze(1)
                with torch.no_grad():
                    next_q_values = target_net(next_states).max(1)[0]
                    targets = rewards + gamma * next_q_values * (1 - dones)

                loss = nn.MSELoss()(q_values, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        eps = max(eps_end, eps * eps_decay)
        if (ep + 1) % update_target_every == 0:
            target_net.load_state_dict(net.state_dict())
            print(f"Episode {ep+1}, eps={eps:.3f}")

    # extract policy
    policy = np.zeros(env.n_states, dtype=int)
    for s in range(env.n_states):
        with torch.no_grad():
            inp = torch.tensor(one_hot_state(s, env.n_states), dtype=torch.float32)
            q_vals = net(inp)
            policy[s] = int(torch.argmax(q_vals).item())
    return net, policy

if __name__ == "__main__":
    env = GridWorld4x4()
    net, pi = train_dqn(env, episodes=800)
    print("DQN-derived policy (0:U,1:R,2:D,3:L):")
    print(pi.reshape(4, 4))


  states = torch.tensor([one_hot_state(b[0], env.n_states) for b in batch], dtype=torch.float32)


Episode 50, eps=0.778
Episode 100, eps=0.606
Episode 150, eps=0.471
Episode 200, eps=0.367
Episode 250, eps=0.286
Episode 300, eps=0.222
Episode 350, eps=0.173
Episode 400, eps=0.135
Episode 450, eps=0.105
Episode 500, eps=0.082
Episode 550, eps=0.063
Episode 600, eps=0.050
Episode 650, eps=0.050
Episode 700, eps=0.050
Episode 750, eps=0.050
Episode 800, eps=0.050
DQN-derived policy (0:U,1:R,2:D,3:L):
[[1 2 2 2]
 [1 1 2 2]
 [2 1 2 2]
 [1 1 1 1]]
