<a href="https://colab.research.google.com/github/230303A51909/2303A51909-APPLICATIONS-OF-DATAMINING/blob/main/RL_lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# GridWorld parameters
grid_size = 4  # Side length of the square grid
# Every (row, column) cell, enumerated row by row
states = [(row, col) for row in range(grid_size) for col in range(grid_size)]
actions = list('UDLR')  # Up, Down, Left, Right
gamma = 0.9  # Discount factor
theta = 1e-4  # Convergence threshold

# Transition dynamics
def step(state, action):
    """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

    The top-left and bottom-right corners of the grid are terminal: any
    action taken there leaves the state unchanged and yields reward 0.
    Everywhere else a move costs -1, and a move that would leave the grid
    is clamped so the agent stays on the border cell.

    Args:
        state: ``(row, column)`` tuple identifying the current cell.
        action: One of ``'U'``, ``'D'``, ``'L'``, ``'R'``. Any other value
            leaves the position unchanged but still costs -1.

    Returns:
        A ``(next_state, reward)`` tuple.
    """
    i, j = state
    last = grid_size - 1
    # Derive the bottom-right terminal from grid_size so the terminal cells
    # stay on the corners if the grid is resized (identical for a 4x4 grid).
    if state in ((0, 0), (last, last)):
        return state, 0  # Terminal state
    if action == 'U':
        i = max(i - 1, 0)
    elif action == 'D':
        i = min(i + 1, last)
    elif action == 'L':
        j = max(j - 1, 0)
    elif action == 'R':
        j = min(j + 1, last)
    return (i, j), -1

# Initialize policy and value function:
# a uniformly random starting action per cell, and a zero value estimate.
policy = {cell: np.random.choice(actions) for cell in states}
V = {cell: 0 for cell in states}

# POLICY EVALUATION
def policy_evaluation(policy, V):
    """Evaluate a deterministic policy by repeated in-place sweeps.

    Each sweep visits the states in the order given by ``states`` and
    overwrites ``V[state]`` with the one-step backup
    ``reward + gamma * V[next_state]`` for the action the policy selects.
    Sweeping stops once the largest change seen in a sweep falls below
    ``theta``.

    Args:
        policy: Mapping from state to the action chosen in that state.
        V: Mapping from state to its current value estimate; it is
            updated in place.

    Returns:
        The same ``V`` mapping that was passed in, after convergence.
    """
    converged = False
    while not converged:
        largest_change = 0
        for state in states:
            previous = V[state]
            successor, r = step(state, policy[state])
            # In-place update: later states in this sweep see the new value.
            V[state] = r + gamma * V[successor]
            change = abs(previous - V[state])
            if change > largest_change:
                largest_change = change
        converged = largest_change < theta
    return V

# POLICY IMPROVEMENT
def policy_improvement(V):
    """Make the module-level ``policy`` greedy with respect to ``V``.

    For every state the one-step backup ``reward + gamma * V[next_state]``
    is computed for each action, and the policy entry is replaced by the
    action with the highest backup (the first such action in ``actions``
    order when several tie). The module-level ``policy`` dict is modified
    in place.

    Args:
        V: Mapping from state to its value estimate.

    Returns:
        A ``(policy, policy_stable)`` tuple, where ``policy_stable`` is
        ``True`` only if no state's action changed during this pass.
    """
    def backed_up_value(cell, move):
        # One-step lookahead value of taking `move` in `cell`.
        successor, r = step(cell, move)
        return r + gamma * V[successor]

    any_change = False
    for cell in states:
        previous = policy[cell]
        scores = {move: backed_up_value(cell, move) for move in actions}
        greedy = max(scores, key=scores.get)
        policy[cell] = greedy
        if previous != greedy:
            any_change = True
    return policy, not any_change

# POLICY ITERATION
def policy_iteration():
    """Alternate policy evaluation and improvement until the policy is stable.

    Operates on the module-level ``V`` and ``policy``, rebinding both as
    it goes, and stops after the first improvement pass that changes no
    action.

    Returns:
        A ``(policy, V)`` tuple holding the final policy and its values.
    """
    global V, policy
    stable = False
    while not stable:
        V = policy_evaluation(policy, V)
        policy, stable = policy_improvement(V)
    return policy, V

# VALUE ITERATION
def value_iteration():
    """Run value iteration from a zero value function.

    Each sweep visits the non-terminal states in ``states`` order and
    replaces ``V[s]`` in place with the best one-step backup
    ``reward + gamma * V[next_state]`` over all actions, recording the
    maximising action (the first in ``actions`` order on ties) as the
    policy for that state. Sweeping stops once the largest change seen in
    a sweep falls below ``theta``.

    Terminal states are skipped, so they keep value 0 and the placeholder
    action ``actions[0]``.

    Returns:
        A ``(policy, V)`` tuple of freshly built dicts; the module-level
        ``policy`` and ``V`` are not touched.
    """
    last = grid_size - 1
    # Derive the terminal corners from grid_size (rather than a literal
    # (3, 3)) and build the lookup once instead of on every state visit.
    terminal_states = {(0, 0), (last, last)}

    V = {s: 0 for s in states}
    policy = {s: actions[0] for s in states}
    while True:
        delta = 0
        for s in states:
            if s in terminal_states:
                continue
            action_values = {}
            for a in actions:
                next_state, reward = step(s, a)
                action_values[a] = reward + gamma * V[next_state]
            best_action = max(action_values, key=action_values.get)
            max_value = action_values[best_action]
            delta = max(delta, abs(V[s] - max_value))
            V[s] = max_value
            policy[s] = best_action
        if delta < theta:
            break
    return policy, V

# Run Policy Iteration and print the resulting action grid, one row per line
print("=== Policy Iteration ===")
policy_pi, V_pi = policy_iteration()
for i in range(grid_size):
    print([policy_pi[i, j] for j in range(grid_size)])

# Run Value Iteration and print its action grid in the same layout
print("\n=== Value Iteration ===")
policy_vi, V_vi = value_iteration()
for i in range(grid_size):
    print([policy_vi[i, j] for j in range(grid_size)])


=== Policy Iteration ===
['U', 'L', 'L', 'D']
['U', 'U', 'U', 'D']
['U', 'U', 'D', 'D']
['U', 'R', 'R', 'U']

=== Value Iteration ===
['U', 'L', 'L', 'D']
['U', 'U', 'U', 'D']
['U', 'U', 'D', 'D']
['U', 'R', 'R', 'U']
