<a href="https://colab.research.google.com/github/2303A51734/RL_B-11_1734/blob/main/sasra_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

class GridWorld:
    def __init__(self, grid_size=4):
        self.grid_size = grid_size
        self.state = (0, 0)
        self.terminal_state = (grid_size - 1, grid_size - 1)
        self.actions = ['up', 'down', 'left', 'right']

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        i, j = self.state
        if self.state == self.terminal_state:
            return self.state, 0, True

        if action == 'up':
            i = max(i - 1, 0)
        elif action == 'down':
            i = min(i + 1, self.grid_size - 1)
        elif action == 'left':
            j = max(j - 1, 0)
        elif action == 'right':
            j = min(j + 1, self.grid_size - 1)

        self.state = (i, j)
        reward = 0 if self.state == self.terminal_state else -1
        done = self.state == self.terminal_state

        return self.state, reward, done


In [2]:
def td_zero(env, episodes=500, alpha=0.1, gamma=1.0):
    V = np.zeros((env.grid_size, env.grid_size))

    for ep in range(episodes):
        state = env.reset()
        done = False

        while not done:
            action = random.choice(env.actions)
            next_state, reward, done = env.step(action)
            s = state
            s_prime = next_state

            V[s] += alpha * (reward + gamma * V[s_prime] - V[s])
            state = next_state

    return V


In [3]:
def epsilon_greedy(Q, state, actions, epsilon):
    if random.random() < epsilon:
        return random.choice(actions)
    else:
        return actions[np.argmax([Q[state][a] for a in actions])]

def sarsa(env, episodes=5000, alpha=0.1, gamma=0.99, epsilon=0.1):
    Q = {}
    for i in range(env.grid_size):
        for j in range(env.grid_size):
            Q[(i, j)] = {a: 0.0 for a in env.actions}

    for ep in range(episodes):
        state = env.reset()
        action = epsilon_greedy(Q, state, env.actions, epsilon)
        done = False

        while not done:
            next_state, reward, done = env.step(action)
            next_action = epsilon_greedy(Q, next_state, env.actions, epsilon)

            # Update Q(s, a)
            Q[state][action] += alpha * (
                reward + gamma * Q[next_state][next_action] - Q[state][action]
            )

            state, action = next_state, next_action

    return Q


In [4]:
def extract_policy(Q, env):
    policy = {}
    for state in Q:
        best_action = max(Q[state], key=Q[state].get)
        policy[state] = best_action
    return policy


In [5]:
env = GridWorld(grid_size=4)

# TD(0)
V = td_zero(env, episodes=500)
print("State-value function from TD(0):")
print(np.round(V, 2))

# SARSA
Q = sarsa(env, episodes=5000)
policy = extract_policy(Q, env)
print("Learned policy from SARSA:")
for i in range(env.grid_size):
    print([policy[(i, j)] for j in range(env.grid_size)])


State-value function from TD(0):
[[-55.59 -53.23 -50.88 -47.95]
 [-53.78 -52.07 -47.68 -41.  ]
 [-51.46 -48.65 -44.67 -28.54]
 [-50.38 -44.55 -38.44   0.  ]]
Learned policy from SARSA:
['down', 'right', 'down', 'down']
['right', 'right', 'down', 'down']
['right', 'right', 'right', 'down']
['right', 'right', 'right', 'up']
