# Deep SARSA (State-Action-Reward-State-Action)

Deep SARSA is an algorithm used in reinforcement learning for decision-making in dynamic environments. It extends the SARSA algorithm by employing deep neural networks to approximate the Q-values. The algorithm learns to estimate the value of taking actions in states, aiming to optimize long-term rewards. By interacting with the environment, collecting experiences, and updating the neural network parameters, Deep SARSA improves its decision-making capabilities over time.

In [1]:
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ReplayBuffer:
    """Fixed-capacity ring buffer of SARSA transitions.

    Each stored item is the tuple
    ``(state, action, reward, next_state, next_action, done)``. Once
    ``capacity`` transitions have been stored, every further push overwrites
    the oldest remaining one.

    Args:
        capacity: Maximum number of transitions kept; must be at least 1.

    Raises:
        ValueError: If ``capacity`` is smaller than 1.
    """

    def __init__(self, capacity):
        # A non-positive capacity would otherwise only fail later, inside
        # push(), with an unrelated IndexError / ZeroDivisionError.
        if capacity < 1:
            raise ValueError(f"capacity must be >= 1, got {capacity!r}")
        self.capacity = capacity
        self.memory = []
        self.position = 0  # index of the slot the next push writes to

    def push(self, state, action, reward, next_state, next_action, done):
        """Store one transition, evicting the oldest one when full."""
        transition = (state, action, reward, next_state, next_action, done)
        if len(self.memory) < self.capacity:
            # Still filling up: ``position`` equals ``len(self.memory)`` here.
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions, chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

class DeepSARSA(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear head.

    Args:
        obs_size: Number of input features of ``fc1``.
        hidden_size: Number of units in each of the two hidden layers.
        n_actions: Number of outputs of ``fc3`` (one value per action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layers are created in the order fc1, fc2, fc3 so that parameter
        # names and the sequence of random initialisations stay the same.
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Map input ``x`` to the raw (unactivated) outputs of ``fc3``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # No activation on the output layer.
        return self.fc3(hidden)

class DeepSARSA_Agent:
    """Epsilon-greedy agent that trains a Q-network with one-step SARSA targets.

    Args:
        action_space: Action space object; must expose ``n`` (number of
            actions) and ``sample()`` (draw a random action).
        observation_space: Observation space object; ``shape[0]`` is used as
            the Q-network's input size.
        hidden_size: Hidden-layer size passed to the Q-network.
        gamma: Discount factor applied to the next state-action value.
        epsilon_start: Exploration rate returned by ``get_epsilon`` at step 0.
        epsilon_end: Exploration rate ``get_epsilon`` decays towards.
        epsilon_decay: Step scale of the exponential epsilon decay.
        learning_rate: Learning rate of the Adam optimizer.
        batch_size: Stored on the instance; not used by this class itself.
    """

    def __init__(self,
                 action_space,
                 observation_space,
                 hidden_size,
                 gamma,
                 epsilon_start,
                 epsilon_end,
                 epsilon_decay,
                 learning_rate,
                 batch_size,
                 ):
        self.action_space = action_space
        self.observation_space = observation_space
        self.hidden_size = hidden_size
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        self.q_net = DeepSARSA(observation_space.shape[0], hidden_size, action_space.n)
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=learning_rate)
        self.loss_function = nn.MSELoss()

    def get_action(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation (no batch dimension).
            epsilon: Probability of returning ``action_space.sample()``
                instead of the greedy action.

        Returns:
            A sampled action, or the index of the largest Q-value as an int.
        """
        if np.random.rand() < epsilon:
            return self.action_space.sample()

        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Action selection is pure inference; skip building an autograd graph.
        with torch.no_grad():
            q_values = self.q_net(state)
        return torch.argmax(q_values).item()

    def update(self, state, action, reward, next_state, next_action, done):
        """Run one gradient step on a single SARSA transition.

        The regression target is
        ``reward + (1 - done) * gamma * Q(next_state, next_action)``, computed
        without gradient tracking.

        Args:
            state: Observation the action was taken in (no batch dimension).
            action: Index of the action taken (scalar or one-element sequence).
            reward: Scalar reward received.
            next_state: Observation reached after the action.
            next_action: Index of the action chosen in ``next_state``.
            done: Truthy if the transition ended the episode; disables
                bootstrapping from ``next_state``.

        Returns:
            The loss of this step as a Python float.
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
        # gather() requires an int64 index with the same number of dimensions
        # as the (1, n_actions) Q-value tensor, so force shape (1, 1). A plain
        # scalar action would otherwise become a 1-D index and raise.
        action = torch.tensor(action, dtype=torch.int64).reshape(1, 1)
        next_action = torch.tensor(next_action, dtype=torch.int64).reshape(1, 1)
        # Shape (1, 1) keeps the target aligned with q_value without relying
        # on broadcasting.
        reward = torch.tensor(reward, dtype=torch.float32).reshape(1, 1)
        done = torch.tensor(done, dtype=torch.float32).reshape(1, 1)

        self.optimizer.zero_grad()
        q_value = self.q_net(state).gather(1, action)

        # The target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            next_q_value = self.q_net(next_state).gather(1, next_action)
            target_q = reward + (1 - done) * self.gamma * next_q_value

        loss = self.loss_function(q_value, target_q)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def get_epsilon(self, current_step):
        """Return the exploration rate for ``current_step``.

        Decays exponentially from ``epsilon_start`` (at step 0) towards
        ``epsilon_end`` with step scale ``epsilon_decay``.
        """
        span = self.epsilon_start - self.epsilon_end
        return self.epsilon_end + span * np.exp(-1.0 * current_step / self.epsilon_decay)
