<a href="https://colab.research.google.com/github/Endalebob/Deep-Learning-Lab/blob/main/RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Name: Endale Yohannes**

**ID N0: UGR/7379/12**

In [12]:
%pip install gymnasium



# Value Iteration

**Steps:**
1. Create the grid world environment using FrozenLake-v1.
2. Define the value iteration algorithm: Implement the value iteration algorithm.
3. Evaluate the policy: Evaluate the policy obtained from value iteration.
4. Run tests: Test the implementation with random test cases.

In [13]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

# Value Iteration algorithm
def value_iteration(env, gamma=0.99, theta=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    V = np.zeros(num_states)
    policy = np.zeros(num_states, dtype=int)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for prob, next_state, reward, done in env.unwrapped.P[state][action]:
                    q_values[action] += prob * (reward + gamma * V[next_state] * (not done))
            V[state] = max(q_values)
            policy[state] = np.argmax(q_values)
            delta = max(delta, abs(v - V[state]))
        if delta < theta:
            break
    return policy, V

# Evaluate the policy
def evaluate_policy(env, policy, num_episodes=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            total_rewards += reward
    return total_rewards / num_episodes

# Run value iteration and evaluate the policy
policy, V = value_iteration(env)
average_reward = evaluate_policy(env, policy)

print(f"Optimal Policy: {policy}")
print(f"State Values: {V}")
print(f"Average Reward: {average_reward}")

# Test the implementation with a random test case
test_env = gym.make('FrozenLake-v1', is_slippery=False)
test_policy, test_V = value_iteration(test_env)
test_average_reward = evaluate_policy(test_env, test_policy)

print(f"Test Optimal Policy: {test_policy}")
print(f"Test State Values: {test_V}")
print(f"Test Average Reward: {test_average_reward}")


Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
State Values: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]
Average Reward: 1.0
Test Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
Test State Values: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]
Test Average Reward: 1.0


# Policy Iteration Algorithm

In [14]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

def policy_evaluation(policy, env, gamma=0.99, theta=1e-6):
    num_states = env.observation_space.n
    V = np.zeros(num_states)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            action = policy[state]
            v_new = sum([prob * (reward + gamma * V[next_state] * (not done))
                         for prob, next_state, reward, done in env.unwrapped.P[state][action]])
            V[state] = v_new
            delta = max(delta, abs(v - v_new))
        if delta < theta:
            break
    return V

def policy_improvement(V, env, gamma=0.99):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    policy = np.zeros(num_states, dtype=int)

    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            q_values[action] = sum([prob * (reward + gamma * V[next_state] * (not done))
                                    for prob, next_state, reward, done in env.unwrapped.P[state][action]])
        policy[state] = np.argmax(q_values)
    return policy

def policy_iteration(env, gamma=0.99, theta=1e-6):
    num_states = env.observation_space.n
    policy = np.zeros(num_states, dtype=int)

    while True:
        V = policy_evaluation(policy, env, gamma, theta)
        new_policy = policy_improvement(V, env, gamma)
        if np.array_equal(policy, new_policy):
            break
        policy = new_policy
    return policy, V

def evaluate_policy(env, policy, num_episodes=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            total_rewards += reward
    return total_rewards / num_episodes

# Run policy iteration and evaluate the policy
policy, V = policy_iteration(env)
average_reward = evaluate_policy(env, policy)

print(f"Optimal Policy: {policy}")
print(f"State Values: {V}")
print(f"Average Reward: {average_reward}")

# Test the implementation with a random test case
test_env = gym.make('FrozenLake-v1', is_slippery=False)
test_policy, test_V = policy_iteration(test_env)
test_average_reward = evaluate_policy(test_env, test_policy)

print(f"Test Optimal Policy: {test_policy}")
print(f"Test State Values: {test_V}")
print(f"Test Average Reward: {test_average_reward}")


Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
State Values: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]
Average Reward: 1.0
Test Optimal Policy: [1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]
Test State Values: [0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]
Test Average Reward: 1.0


# Q-Learning Algorithm

In [15]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

def q_learning(env, num_episodes=100, alpha=0.1, gamma=0.99, epsilon=0.1, max_steps_per_episode=100):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    Q = np.zeros((num_states, num_actions))

    def choose_action(state):
        if np.random.rand() < epsilon:
            return np.random.randint(num_actions)
        else:
            return np.argmax(Q[state])

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = choose_action(state)
            next_state, reward, done, _, _ = env.step(action)

            # Debug print
            print(f"Episode: {episode}, State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")

            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            steps += 1

    policy = np.argmax(Q, axis=1)
    return policy, Q

# Evaluate the policy
def evaluate_policy(env, policy, num_episodes=20, max_steps_per_episode=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            total_rewards += reward
            steps += 1
    return total_rewards / num_episodes

# Run Q-Learning and evaluate the policy
policy, Q = q_learning(env)
average_reward = evaluate_policy(env, policy)

print(f"Optimal Policy: {policy}")
print(f"Q-Values: {Q}")
print(f"Average Reward: {average_reward}")

# Test the implementation with a random test case
test_env = gym.make('FrozenLake-v1', is_slippery=False)
test_policy, test_Q = q_learning(test_env)
test_average_reward = evaluate_policy(test_env, test_policy)

print(f"Test Optimal Policy: {test_policy}")
print(f"Test Q-Values: {test_Q}")
print(f"Test Average Reward: {test_average_reward}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 34, State: 4, Action: 2, Reward: 0.0, Next State: 5, Done: True
Episode: 35, State: 0, Action: 3, Reward: 0.0, Next State: 0, Done: False
Episode: 35, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 35, State: 0, Action: 1, Reward: 0.0, Next State: 4, Done: False
Episode: 35, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 35, State: 4, Action: 0, Reward: 0.0, Next State: 4, Done: False
Episode: 35, State: 4, Action: 0, Reward: 0.0, N

# epsilon-greedy policy

In [16]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

def epsilon_greedy_policy(Q, state, epsilon, num_actions):
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    else:
        return np.argmax(Q[state])

def q_learning(env, num_episodes=2000, alpha=0.5, gamma=0.99, epsilon=0.1, max_steps_per_episode=100):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    Q = np.zeros((num_states, num_actions))

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = epsilon_greedy_policy(Q, state, epsilon, num_actions)
            next_state, reward, done, _, _ = env.step(action)

            # Debug print
            if episode % 100 == 0 and steps == 0:
                print(f"Episode: {episode}, State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")

            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            steps += 1
    policy = np.argmax(Q, axis=1)
    return policy, Q

def evaluate_policy(env, policy, num_episodes=100, max_steps_per_episode=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            total_rewards += reward
            steps += 1
    return total_rewards / num_episodes

# Run Q-Learning with epsilon-greedy policy and evaluate the policy
policy, Q = q_learning(env)
average_reward = evaluate_policy(env, policy)

print(f"Optimal Policy: {policy}")
print(f"Q-Values: {Q}")
print(f"Average Reward: {average_reward}")

# Test the implementation with a random test case
test_env = gym.make('FrozenLake-v1', is_slippery=False)
test_policy, test_Q = q_learning(test_env)
test_average_reward = evaluate_policy(test_env, test_policy)

print(f"Test Optimal Policy: {test_policy}")
print(f"Test Q-Values: {test_Q}")
print(f"Test Average Reward: {test_average_reward}")


Episode: 0, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 100, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 200, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 300, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 400, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 500, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 600, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 700, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 800, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 900, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 1000, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 1100, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 1200, State: 0, Action: 0, Reward: 0.0, Next State: 0, Done: False
Episode: 1300, State: 0,

UCB Algorithm

In [17]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False)

def ucb_action_selection(Q, state, N, t, num_actions, c=1.0):
    ucb_values = Q[state] + c * np.sqrt(np.log(t + 1) / (N[state] + 1e-5))
    return np.argmax(ucb_values)

def q_learning_with_ucb(env, num_episodes=1000, alpha=0.1, gamma=0.99, c=1.0, max_steps_per_episode=100):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    Q = np.zeros((num_states, num_actions))
    N = np.zeros((num_states, num_actions))  # To count the number of times each action is taken in each state

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        t = 0  # Time step
        while not done and steps < max_steps_per_episode:
            action = ucb_action_selection(Q, state, N, t, num_actions, c)
            next_state, reward, done, _, _ = env.step(action)
            N[state, action] += 1
            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            steps += 1
            t += 1
    policy = np.argmax(Q, axis=1)
    return policy, Q

def evaluate_policy(env, policy, num_episodes=100, max_steps_per_episode=100):
    total_rewards = 0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = policy[state]
            state, reward, done, _, _ = env.step(action)
            total_rewards += reward
            steps += 1
    return total_rewards / num_episodes

# Run Q-Learning with UCB and evaluate the policy
policy, Q = q_learning_with_ucb(env)
average_reward = evaluate_policy(env, policy)

print(f"Optimal Policy: {policy}")
print(f"Q-Values: {Q}")
print(f"Average Reward: {average_reward}")

# Test the implementation with a random test case
test_env = gym.make('FrozenLake-v1', is_slippery=False)
test_policy, test_Q = q_learning_with_ucb(test_env)
test_average_reward = evaluate_policy(test_env, test_policy)

print(f"Test Optimal Policy: {test_policy}")
print(f"Test Q-Values: {test_Q}")
print(f"Test Average Reward: {test_average_reward}")


Optimal Policy: [2 2 1 0 3 0 1 0 2 2 1 0 0 2 2 0]
Q-Values: [[5.17361278e-03 1.93112637e-03 9.50990050e-01 5.53987019e-03]
 [3.92433728e-03 0.00000000e+00 9.60596010e-01 7.44749681e-03]
 [5.97079399e-03 9.70299000e-01 1.57074080e-03 1.01947647e-02]
 [5.11168239e-03 0.00000000e+00 9.46721440e-04 9.46721440e-04]
 [1.08071466e-03 1.99145467e-03 0.00000000e+00 3.01123600e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.80100000e-01 0.00000000e+00 4.31888742e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.11137719e-03 0.00000000e+00 3.94278685e-03 5.86742442e-04]
 [4.12854790e-04 6.00801300e-03 1.11682395e-02 0.00000000e+00]
 [1.86103348e-03 9.90000000e-01 0.00000000e+00 2.83300819e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.77200000e-02 1.07509129e-03]
 [9.80100000e-04 1.88100000e-02 1.00000000e+00 3.62637000e