<a href="https://colab.research.google.com/github/Abhishek3102/Reinforcement-Learning/blob/main/Reinforcement_Learning_Algos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
from itertools import count

# QLearning Agent :

This class implements the Q-learning algorithm, a model-free reinforcement learning algorithm.
It uses a Q-table where states are mapped to action-value pairs.
The agent selects actions based on an epsilon-greedy policy (balance between exploration and exploitation).
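The Q-value is updated with the temporal-difference rule derived from the Bellman optimality equation: Q(s, a) ← Q(s, a) + α * (r + γ * max_a' Q(s', a') - Q(s, a)), where the maximum is taken over the actions a' available in the next state s'.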

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent that acts epsilon-greedily.

    The action-value table holds one row per discrete state and one column
    per discrete action; both dimensions are read from ``env``.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01):
        """Keep the hyper-parameters and allocate an all-zero Q-table.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            learning_rate: Step size applied to each temporal-difference error.
            discount_factor: Weight given to the bootstrapped next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor applied to ``epsilon`` by ``decay_epsilon``.
            min_epsilon: Lower bound that ``epsilon`` never drops below.
        """
        self.env = env
        self.lr, self.gamma = learning_rate, discount_factor
        self.epsilon, self.epsilon_decay = epsilon, epsilon_decay
        self.min_epsilon = min_epsilon
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.q_table = np.zeros([n_states, n_actions])

    def select_action(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            return self.env.action_space.sample()
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, done):
        """Move ``Q[state][action]`` towards the one-step off-policy target.

        The target bootstraps from the greedy action in ``next_state`` unless
        ``done`` is true, in which case only ``reward`` is used.
        """
        greedy_next = np.argmax(self.q_table[next_state])
        if done:
            bootstrap = 0
        else:
            bootstrap = self.gamma * self.q_table[next_state][greedy_next]
        row = self.q_table[state]
        row[action] += self.lr * ((reward + bootstrap) - row[action])

    def decay_epsilon(self):
        """Multiply ``epsilon`` by the decay factor, clamped at ``min_epsilon``."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.min_epsilon, decayed)

# SARSAAgent :
SARSA (State-Action-Reward-State-Action) is an on-policy algorithm similar to Q-learning, but it updates the Q-value based on the action actually taken, rather than the greedy action.
The agent follows an epsilon-greedy policy and updates its Q-values using the equation: Q(s, a) ← Q(s, a) + α * (r + γ * Q(s', a') - Q(s, a)).

In [None]:
class SARSAAgent:
    """On-policy tabular SARSA learner driven by an epsilon-greedy policy.

    Unlike Q-learning, the update bootstraps from the action that the policy
    actually chose for the next state, which the caller passes in.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01):
        """Record the hyper-parameters and create a zero-filled Q-table.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            learning_rate: Step size for each Q-value correction.
            discount_factor: Discount applied to the next state-action value.
            epsilon: Starting exploration probability.
            epsilon_decay: Multiplier used by ``decay_epsilon``.
            min_epsilon: Floor for the exploration probability.
        """
        self.env = env
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        table_shape = [env.observation_space.n, env.action_space.n]
        self.q_table = np.zeros(table_shape)

    def select_action(self, state):
        """Sample a random action with probability ``epsilon``; otherwise act greedily."""
        roll = random.uniform(0, 1)
        return self.env.action_space.sample() if roll < self.epsilon else np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, next_action, done):
        """Apply the SARSA update for one ``(s, a, r, s', a')`` transition.

        When ``done`` is true the next state-action value is not used and the
        target is just ``reward``.
        """
        future = 0 if done else self.gamma * self.q_table[next_state][next_action]
        current_row = self.q_table[state]
        current_row[action] += self.lr * ((reward + future) - current_row[action])

    def decay_epsilon(self):
        """Shrink ``epsilon`` by one decay step without going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon_decay * self.epsilon)

# DQN (Deep Q-Network):

DQN is an extension of Q-learning where a neural network is used to approximate the Q-value function instead of a table.
The agent interacts with the environment and stores experiences in a replay buffer.
During training, a batch of experiences is sampled from the buffer, and the network is trained to reduce the error between its predicted Q-values and the Bellman targets r + γ * max_a' Q_target(s', a').
A target network is used to stabilize the learning by periodically copying the weights of the policy network.

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: two 128-unit ReLU hidden layers and a linear head."""

    def __init__(self, input_dim, output_dim):
        """Build the three linear layers.

        Args:
            input_dim: Number of features in each input row.
            output_dim: Number of output values produced per input row.
        """
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        return self.fc3(activations)

In [None]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    Observations, actions and rewards may be passed in as plain Python
    values, NumPy arrays or tensors; they are converted to tensors internally
    so the agent can be driven directly from an environment loop.
    """

    def __init__(self, env, learning_rate=1e-4, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01, batch_size=64, device=None):
        """Create the policy/target networks, the optimizer and the replay buffer.

        Args:
            env: Environment exposing ``observation_space.shape``,
                ``action_space.n`` and ``action_space.sample()``.
            learning_rate: Adam learning rate for the policy network.
            discount_factor: Discount applied to bootstrapped next-state values.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Factor applied to ``epsilon`` by ``decay_epsilon``.
            min_epsilon: Lower bound for ``epsilon``.
            batch_size: Number of transitions sampled per ``update`` call.
            device: Torch device (or device string) for networks and batches.
                Defaults to CUDA when available, otherwise CPU.
        """
        self.env = env
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.batch_size = batch_size
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.memory = deque(maxlen=10000)
        n_observations = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.policy_net = DQN(n_observations, n_actions).to(self.device)
        self.target_net = DQN(n_observations, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

    def _as_state_tensor(self, state):
        """Return ``state`` as a flat float32 tensor on the agent's device."""
        return torch.as_tensor(state, dtype=torch.float32, device=self.device).reshape(-1)

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        Returns:
            A random sample from the action space with probability
            ``epsilon``; otherwise the greedy action as a plain ``int`` so it
            can be passed straight to ``env.step``.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        with torch.no_grad():
            # The network expects a batch dimension, so add one for the single state.
            q_values = self.policy_net(self._as_state_tensor(state).unsqueeze(0))
        return int(q_values.argmax(dim=1).item())

    def push_to_memory(self, state, action, next_state, reward, done):
        """Store one transition in the replay buffer.

        A transition is treated as terminal when ``done`` is true or when
        ``next_state`` is ``None``; terminal transitions never bootstrap.
        """
        terminal = bool(done) or next_state is None
        # clone() so stored states never alias a buffer the caller may reuse.
        state_t = self._as_state_tensor(state).clone()
        if next_state is None:
            # Placeholder only: the terminal flag masks it out in update().
            next_state_t = torch.zeros_like(state_t)
        else:
            next_state_t = self._as_state_tensor(next_state).clone()
        self.memory.append((state_t, int(action), next_state_t, float(reward), terminal))

    def sample_from_memory(self):
        """Return ``batch_size`` transitions drawn uniformly without replacement."""
        return random.sample(self.memory, self.batch_size)

    def update(self):
        """Run one gradient step on a sampled mini-batch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        states, actions, next_states, rewards, terminals = zip(*self.sample_from_memory())

        state_batch = torch.stack(states)
        next_state_batch = torch.stack(next_states)
        action_batch = torch.tensor(actions, dtype=torch.int64, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        # 1.0 for transitions that continue, 0.0 for terminal ones.
        continue_mask = 1.0 - torch.tensor(terminals, dtype=torch.float32, device=self.device)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Targets are constants for the loss: no gradient through the target net.
        with torch.no_grad():
            next_state_values = self.target_net(next_state_batch).max(1)[0]

        expected_state_action_values = reward_batch + self.gamma * next_state_values * continue_mask

        loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def decay_epsilon(self):
        """Multiply ``epsilon`` by the decay factor, clamped at ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def update_target(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

In [None]:
# Number of training episodes used by every training loop in this notebook.
n_episodes = 1000
# Environment instance shared by the agent cells that follow.
env = gym.make("CartPole-v1")

In [None]:
# Train the tabular Q-learning agent.
# The agent sizes its table from observation_space.n, which only discrete
# observation spaces provide. CartPole observations are continuous, so fall
# back to a discrete environment when the shared one cannot be tabulated.
q_env = env if hasattr(env.observation_space, "n") else gym.make("Taxi-v3")
q_agent = QLearningAgent(q_env)
for episode in range(n_episodes):
    reset_result = q_env.reset()
    # Newer gym returns (observation, info); older gym returns the observation only.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        action = q_agent.select_action(state)
        step_result = q_env.step(action)
        if len(step_result) == 5:
            # Newer gym: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Older gym: (obs, reward, done, info).
            next_state, reward, terminated, _ = step_result
            truncated = False
        done = terminated or truncated
        # Only a true terminal state stops bootstrapping; a time-limit cut-off does not.
        q_agent.update_q_table(state, action, reward, next_state, terminated)
        state = next_state
    # Anneal exploration once per episode.
    q_agent.decay_epsilon()

In [None]:
# Train the tabular SARSA agent.
# The agent sizes its table from observation_space.n, which only discrete
# observation spaces provide. CartPole observations are continuous, so fall
# back to a discrete environment when the shared one cannot be tabulated.
sarsa_env = env if hasattr(env.observation_space, "n") else gym.make("Taxi-v3")
sarsa_agent = SARSAAgent(sarsa_env)
for episode in range(n_episodes):
    reset_result = sarsa_env.reset()
    # Newer gym returns (observation, info); older gym returns the observation only.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    action = sarsa_agent.select_action(state)
    done = False
    while not done:
        step_result = sarsa_env.step(action)
        if len(step_result) == 5:
            # Newer gym: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Older gym: (obs, reward, done, info).
            next_state, reward, terminated, _ = step_result
            truncated = False
        done = terminated or truncated
        # SARSA is on-policy: choose the next action first, then learn from it.
        next_action = sarsa_agent.select_action(next_state)
        # Only a true terminal state stops bootstrapping; a time-limit cut-off does not.
        sarsa_agent.update_q_table(state, action, reward, next_state, next_action, terminated)
        state, action = next_state, next_action
    # Anneal exploration once per episode.
    sarsa_agent.decay_epsilon()

In [None]:
# Train the DQN agent on the shared environment.
dqn_agent = DQNAgent(env)
for episode in range(n_episodes):
    reset_result = env.reset()
    # Newer gym returns (observation, info); older gym returns the observation only.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    while not done:
        action = dqn_agent.select_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer gym: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Older gym: (obs, reward, done, info).
            next_state, reward, terminated, _ = step_result
            truncated = False
        done = terminated or truncated
        # Store the true terminal flag so a time-limit cut-off is not recorded as terminal.
        dqn_agent.push_to_memory(state, action, next_state, reward, terminated)
        dqn_agent.update()
        state = next_state
    # Anneal exploration once per episode.
    dqn_agent.decay_epsilon()
    # Refresh the target network every 10 episodes.
    if episode % 10 == 0:
        dqn_agent.update_target()

Interaction with the environment: The agent selects actions and receives rewards and next states from the environment.

Updating Q-values: The Q-values (or approximations via neural networks in DQN) are updated after every step using the appropriate algorithm.

Exploration and Exploitation: The epsilon-greedy strategy ensures that the agent explores the environment and gradually shifts to exploitation as learning progresses.