<a href="https://colab.research.google.com/github/FevDer/Reinforcement_Learning/blob/main/ReinforcementLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install gymnasium numpy matplotlib tqdm

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
# Value Iteration
def value_iteration(env, gamma=0.99, theta=0.0001):
    def one_step_lookahead(state, V):
        A = np.zeros(env.action_space.n)
        for a in range(env.action_space.n):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + gamma * V[next_state])
        return A

    V = np.zeros(env.observation_space.n)
    while True:
        delta = 0
        for s in range(env.observation_space.n):
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break

    policy = np.zeros([env.observation_space.n, env.action_space.n])
    for s in range(env.observation_space.n):
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        policy[s, best_action] = 1.0

    return policy, V

In [3]:
# Policy Iteration
def policy_iteration(env, gamma=0.99, theta=0.0001):
    def policy_eval(policy, env, gamma, theta):
        V = np.zeros(env.observation_space.n)
        while True:
            delta = 0
            for s in range(env.observation_space.n):
                v = 0
                for a, action_prob in enumerate(policy[s]):
                    for prob, next_state, reward, done in env.P[s][a]:
                        v += action_prob * prob * (reward + gamma * V[next_state])
                delta = max(delta, np.abs(v - V[s]))
                V[s] = v
            if delta < theta:
                break
        return V

    def policy_improvement(V, env, gamma):
        policy = np.zeros([env.observation_space.n, env.action_space.n])
        for s in range(env.observation_space.n):
            q = np.zeros(env.action_space.n)
            for a in range(env.action_space.n):
                for prob, next_state, reward, done in env.P[s][a]:
                    q[a] += prob * (reward + gamma * V[next_state])
            best_action = np.argmax(q)
            policy[s, best_action] = 1.0
        return policy

    policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n
    while True:
        V = policy_eval(policy, env, gamma, theta)
        new_policy = policy_improvement(V, env, gamma)
        if np.array_equal(new_policy, policy):
            break
        policy = new_policy
    return policy, V


In [4]:
# Q-Learning
def q_learning(env, num_episodes=500, gamma=0.99, alpha=0.1, epsilon=0.1):
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for i in tqdm(range(num_episodes)):
        state, _ = env.reset()
        done = False

        while not done:
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            next_state, reward, done, _, _ = env.step(action)

            q_table[state, action] = q_table[state, action] + alpha * (
                reward + gamma * np.max(q_table[next_state]) - q_table[state, action]
            )

            state = next_state

    policy = np.zeros([env.observation_space.n, env.action_space.n])
    for s in range(env.observation_space.n):
        best_action = np.argmax(q_table[s])
        policy[s, best_action] = 1.0

    return policy, q_table

In [5]:
# Epsilon-Greedy Policy
def epsilon_greedy_policy(Q, state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.randint(0, Q.shape[1])
    else:
        return np.argmax(Q[state])

In [16]:
# UCB Algorithm
def ucb_action(Q, N, state, t, c=2):
    total_counts = np.sum(N[state])
    if total_counts == 0:
        return np.random.randint(0, Q.shape[1])
    ucb_values = Q[state] + c * np.sqrt(np.log(t + 1) / (N[state] + 1))
    return np.argmax(ucb_values)

In [7]:
# Initialize environment
env = gym.make('FrozenLake-v1', is_slippery=False)


In [8]:
# Value Iteration
policy_vi, v_vi = value_iteration(env)
print("Value Iteration Policy:")
print(policy_vi)
print("")

Value Iteration Policy:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]



  logger.warn(


In [9]:
# Policy Iteration
policy_pi, v_pi = policy_iteration(env)
print("Policy Iteration Policy:")
print(policy_pi)
print("")

Policy Iteration Policy:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]



In [10]:
# Q-Learning Execution
policy_ql, q_table = q_learning(env)
print("Q-Learning Policy:")
print(policy_ql)
print("")

100%|██████████| 500/500 [00:01<00:00, 338.18it/s]

Q-Learning Policy:
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]






In [11]:
# Epsilon-Greedy Policy Demonstration
print("Epsilon-Greedy Policy Demonstration:")
state, _ = env.reset()
done = False
while not done:
    action = epsilon_greedy_policy(q_table, state, epsilon=0.1)
    state, reward, done, _, _ = env.step(action)
    env.render()
    if done:
        break

Epsilon-Greedy Policy Demonstration:


  gym.logger.warn(


In [None]:
# UCB Algorithm Demonstration
print("UCB Algorithm Demonstration:")
num_episodes = 500  # Reduce the number of episodes
Q = np.zeros((env.observation_space.n, env.action_space.n))
N = np.zeros((env.observation_space.n, env.action_space.n))
gamma = 0.99  # Define gamma for UCB Algorithm

for t in range(num_episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = ucb_action(Q, N, state, t)
        next_state, reward, done, _, _ = env.step(action)
        N[state, action] += 1
        Q[state, action] += (reward + gamma * np.max(Q[next_state]) - Q[state, action]) / N[state, action]
        state = next_state

print("UCB Algorithm Policy:")
policy_ucb = np.zeros([env.observation_space.n, env.action_space.n])
for s in range(env.observation_space.n):
    best_action = np.argmax(Q[s])
    policy_ucb[s, best_action] = 1.0
print(policy_ucb)

UCB Algorithm Demonstration:
