<a href="https://colab.research.google.com/github/AjayTeja641/Flappy-Learner-WIDS-Project/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np


# Side length of the square grid; states are (row, col) tuples with 0-based indices.
GRID_SIZE = 5

# Action id -> (row delta, col delta) added to the current state.
ACTIONS = {
    0: (-1, 0),  # Up
    1: (1, 0),   # Down
    2: (0, -1),  # Left
    3: (0, 1)    # Right
}

# Terminal state: the bottom-right corner. Derived from GRID_SIZE so that
# resizing the grid cannot leave the goal outside (or away from the corner of) it.
GOAL_STATE = (GRID_SIZE - 1, GRID_SIZE - 1)
# Discount factor applied to the value of the successor state.
GAMMA = 0.9


In [3]:
def step(state, action):
    """Apply ``action`` to ``state`` and return ``(next_state, reward)``.

    The goal state is absorbing: from it, the same state is returned with
    reward 0. From any other state the move is clamped to the grid, so a move
    into a wall leaves that coordinate unchanged. Landing on the goal pays 10;
    every other transition pays -1.

    Raises:
        KeyError: if ``action`` is not a key of ``ACTIONS``.
    """
    if state != GOAL_STATE:
        d_row, d_col = ACTIONS[action]
        last_index = GRID_SIZE - 1
        # Clamp each coordinate into [0, last_index].
        row = min(max(state[0] + d_row, 0), last_index)
        col = min(max(state[1] + d_col, 0), last_index)
        landed = (row, col)
        if landed == GOAL_STATE:
            return landed, 10
        return landed, -1

    return state, 0


In [4]:
def value_iteration(theta=1e-4):
    """Compute state values with in-place value-iteration sweeps.

    Each sweep visits every non-goal cell in row-major order and replaces its
    value with the best one-step backup ``reward + GAMMA * V[next_state]``
    over all actions, using values already updated earlier in the same sweep.

    Args:
        theta: Convergence threshold; sweeping stops after the first sweep
            whose largest single-cell change is below this value.

    Returns:
        A (GRID_SIZE, GRID_SIZE) float array of state values. The goal cell is
        never updated and keeps its initial value of 0.
    """
    values = np.zeros((GRID_SIZE, GRID_SIZE))
    cells = [
        (row, col)
        for row in range(GRID_SIZE)
        for col in range(GRID_SIZE)
        if (row, col) != GOAL_STATE
    ]

    while True:
        largest_change = 0
        for cell in cells:
            previous = values[cell]
            backups = [
                reward + GAMMA * values[successor]
                for successor, reward in (step(cell, action) for action in ACTIONS)
            ]
            values[cell] = max(backups)
            largest_change = max(largest_change, abs(previous - values[cell]))

        if largest_change < theta:
            return values


In [5]:
def extract_policy(V):
    """Return the greedy policy for the value table ``V``.

    For every non-goal cell, the one-step backup
    ``reward + GAMMA * V[next_state]`` is computed for each action and the
    best one is stored.

    Args:
        V: Array of state values indexable by ``(row, col)`` tuples.

    Returns:
        A (GRID_SIZE, GRID_SIZE) integer array of action ids. The goal cell is
        left at its initial value of 0.
    """
    greedy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=int)

    for row in range(GRID_SIZE):
        for col in range(GRID_SIZE):
            cell = (row, col)
            if cell != GOAL_STATE:
                backups = [
                    reward + GAMMA * V[successor]
                    for successor, reward in (step(cell, action) for action in ACTIONS)
                ]
                # argmax yields a position in `backups`; it doubles as the
                # action id because the ACTIONS keys are 0..3 in order.
                greedy[cell] = np.argmax(backups)

    return greedy


In [6]:
# Solve the gridworld with value iteration, then read the greedy policy off the values.
V_optimal = value_iteration()
policy_optimal = extract_policy(V_optimal)

# Policy entries are the action ids from ACTIONS (0=Up, 1=Down, 2=Left, 3=Right).
print("Optimal Value Function:\n", V_optimal)
print("\nOptimal Policy:\n", policy_optimal)


Optimal Value Function:
 [[-0.434062  0.62882   1.8098    3.122     4.58    ]
 [ 0.62882   1.8098    3.122     4.58      6.2     ]
 [ 1.8098    3.122     4.58      6.2       8.      ]
 [ 3.122     4.58      6.2       8.       10.      ]
 [ 4.58      6.2       8.       10.        0.      ]]

Optimal Policy:
 [[1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [3 3 3 3 0]]


In [7]:
def policy_evaluation(policy, gamma=0.9, theta=1e-3, max_iter=10):
    """Estimate the state values of a deterministic ``policy``.

    Starting from all zeros, runs at most ``max_iter`` in-place sweeps. In each
    sweep every non-goal cell is set to ``reward + gamma * V[next_state]`` for
    the single action the policy prescribes there.

    Args:
        policy: Array of action ids indexable by ``(row, col)``.
        gamma: Discount factor applied to the successor's value.
        theta: Sweeping stops early after a sweep whose largest single-cell
            change is below this value.
        max_iter: Upper bound on the number of sweeps; the returned values may
            not have converged if this bound is hit first.

    Returns:
        A (GRID_SIZE, GRID_SIZE) float array; the goal cell stays at 0.
    """
    values = np.zeros((GRID_SIZE, GRID_SIZE))
    cells = [
        (row, col)
        for row in range(GRID_SIZE)
        for col in range(GRID_SIZE)
        if (row, col) != GOAL_STATE
    ]

    for _sweep in range(max_iter):
        largest_change = 0
        for cell in cells:
            successor, reward = step(cell, policy[cell])
            backed_up = reward + gamma * values[successor]
            largest_change = max(largest_change, abs(values[cell] - backed_up))
            values[cell] = backed_up

        if largest_change < theta:
            return values

    return values

In [8]:
def policy_improvement(V, old_policy=None):
    """Build the greedy policy for ``V`` and report whether it matches ``old_policy``.

    For every non-goal cell, the action with the highest one-step backup
    ``reward + GAMMA * V[next_state]`` is selected (the first such action on
    ties).

    Args:
        V: Array of state values indexable by ``(row, col)`` tuples.
        old_policy: The policy to compare the greedy policy against, as a
            (GRID_SIZE, GRID_SIZE) array (or nested sequence) of action ids.
            When omitted, the comparison baseline is the all-zeros (always Up)
            policy.

    Returns:
        ``(policy, policy_stable)`` where ``policy`` is a
        (GRID_SIZE, GRID_SIZE) integer array of action ids (goal cell left at
        0) and ``policy_stable`` is True when no non-goal cell's greedy action
        differs from ``old_policy``.
    """
    if old_policy is None:
        old_policy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=int)
    else:
        old_policy = np.asarray(old_policy)

    policy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=int)
    policy_stable = True

    for i in range(GRID_SIZE):
        for j in range(GRID_SIZE):
            state = (i, j)
            if state == GOAL_STATE:
                continue

            action_values = [
                reward + GAMMA * V[next_state]
                for next_state, reward in (step(state, action) for action in ACTIONS)
            ]

            # argmax yields a position in `action_values`; it doubles as the
            # action id because the ACTIONS keys are 0..3 in order.
            best_action = np.argmax(action_values)
            policy[i, j] = best_action

            # Compare against the baseline policy, not the array being filled
            # in: the latter is freshly zeroed, so reading the "old" action
            # from it would always yield 0.
            if old_policy[i, j] != best_action:
                policy_stable = False

    return policy, policy_stable


In [9]:
def policy_iteration():
    """Alternate policy evaluation and greedy improvement until the policy stops changing.

    Starts from the all-zeros (always Up) policy. Each round evaluates the
    current policy with ``policy_evaluation`` and derives the greedy policy
    from those values with ``policy_improvement``. The loop ends when the
    greedy policy equals the policy that was just evaluated.

    Returns:
        ``(V, policy)``: the values computed for the final policy and the
        final (GRID_SIZE, GRID_SIZE) integer array of action ids.
    """
    policy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=int)

    while True:
        V = policy_evaluation(policy)
        # Stability is decided here by comparing against the policy that was
        # just evaluated. The flag returned by policy_improvement is ignored
        # because that call is not given the current policy to compare with.
        improved_policy, _ = policy_improvement(V)
        stable = np.array_equal(improved_policy, policy)
        policy = improved_policy

        if stable:
            break

    return V, policy


In [None]:
# Solve the same gridworld with policy iteration.
V_pi, policy_pi = policy_iteration()

# Policy entries are the action ids from ACTIONS (0=Up, 1=Down, 2=Left, 3=Right).
print("Policy Iteration Value Function:\n", V_pi)
print("\nPolicy Iteration Policy:\n", policy_pi)
