## My favourite reinforcement learning algorithms

## Part A : Implementation

## Part B : Theory

# Part A

### 1. Q-Learning Implementation

```python
import numpy as np

def q_learning(env, num_episodes, learning_rate, discount_factor, epsilon):
    """Learn a tabular action-value function with off-policy Q-learning.

    Both Gym calling conventions are accepted: the classic one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    Gym >= 0.26 / Gymnasium one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and ``step(action)``.
            Observations are used directly as row indices of the Q-table.
        num_episodes: Number of episodes to run.
        learning_rate: Step size applied to each temporal-difference error.
        discount_factor: Weight given to the bootstrapped next-state value.
        epsilon: Probability of taking a random (exploratory) action.

    Returns:
        NumPy array of shape ``(n_states, n_actions)`` holding the Q-values.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer APIs return (observation, info); older ones return the
        # observation on its own.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        while not done:
            # Epsilon-greedy action selection
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                action = np.argmax(q_table[state, :])  # Exploit

            step_result = env.step(action)
            if len(step_result) == 5:
                # (obs, reward, terminated, truncated, info): the episode is
                # over as soon as either flag is set.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            # Off-policy target: bootstrap from the greedy value of the next
            # state, regardless of which action will actually be taken there.
            td_target = reward + discount_factor * np.max(q_table[next_state, :])
            q_table[state, action] += learning_rate * (td_target - q_table[state, action])

            state = next_state
    return q_table

# Example usage (requires an environment like Gym's FrozenLake)
# env = gym.make("FrozenLake-v1")
# q_table = q_learning(env, num_episodes=10000, learning_rate=0.1, discount_factor=0.99, epsilon=0.1)
# print(q_table)
```

### 2. SARSA Implementation

```python
import numpy as np

def sarsa(env, num_episodes, learning_rate, discount_factor, epsilon):
    """Learn a tabular action-value function with on-policy SARSA.

    Both Gym calling conventions are accepted: the classic one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    Gym >= 0.26 / Gymnasium one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and ``step(action)``.
            Observations are used directly as row indices of the Q-table.
        num_episodes: Number of episodes to run.
        learning_rate: Step size applied to each temporal-difference error.
        discount_factor: Weight given to the bootstrapped next-step value.
        epsilon: Probability of taking a random (exploratory) action.

    Returns:
        NumPy array of shape ``(n_states, n_actions)`` holding the Q-values.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    def select_action(state):
        """Pick an epsilon-greedy action for ``state`` from the current table."""
        if np.random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        return np.argmax(q_table[state, :])

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer APIs return (observation, info); older ones return the
        # observation on its own.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        action = select_action(state)
        done = False

        while not done:
            step_result = env.step(action)
            if len(step_result) == 5:
                # (obs, reward, terminated, truncated, info): the episode is
                # over as soon as either flag is set.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            # On-policy target: bootstrap from the action the behaviour policy
            # actually selects in the next state.
            next_action = select_action(next_state)
            td_target = reward + discount_factor * q_table[next_state, next_action]
            q_table[state, action] += learning_rate * (td_target - q_table[state, action])

            state, action = next_state, next_action
    return q_table

# Example usage (requires an environment like Gym's FrozenLake)
# env = gym.make("FrozenLake-v1")
# q_table = sarsa(env, num_episodes=10000, learning_rate=0.1, discount_factor=0.99, epsilon=0.1)
# print(q_table)
```

### 3. Deep Q-Networks (DQN) Implementation

```python
import random
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    States handed to :meth:`choose_action` or stored via :meth:`remember` may
    be shaped ``(state_size,)`` or ``(1, state_size)``; both are reshaped
    before being fed to the networks.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, replay_buffer_size=2000, target_update_freq=10):
        """Store hyperparameters and build the online and target networks.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
            learning_rate: Adam learning rate for the online network.
            discount_factor: Weight of the bootstrapped next-state value.
            epsilon: Initial exploration probability.
            epsilon_decay: Factor applied to ``epsilon`` after each training call.
            epsilon_min: Lower bound for ``epsilon``.
            replay_buffer_size: Maximum number of stored transitions.
            target_update_freq: Number of training calls between target-network syncs.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.target_update_freq = target_update_freq
        self.train_step_counter = 0

        self.q_network = self._build_model()
        self.target_q_network = self._build_model()
        # Start both networks from identical weights.
        self.target_q_network.set_weights(self.q_network.get_weights())

    def _build_model(self):
        """Build and compile a small MLP mapping a state to one Q-value per action."""
        model = models.Sequential([
            layers.Input(shape=(self.state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        """Return an epsilon-greedy action index for ``state``."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = np.reshape(state, (1, self.state_size))
        q_values = self.q_network.predict(state, verbose=0)
        return int(np.argmax(q_values[0]))

    def train(self, batch_size):
        """Run one minibatch update of the online network.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions. Afterwards it decays ``epsilon`` (never below
        ``epsilon_min``) and, every ``target_update_freq`` calls, copies the
        online weights into the target network.
        """
        if len(self.replay_buffer) < batch_size:
            return

        minibatch = random.sample(self.replay_buffer, batch_size)

        # Stack the sampled transitions so each network is called once per
        # batch rather than once per transition.
        states = np.vstack([np.reshape(t[0], (1, self.state_size)) for t in minibatch])
        actions = np.array([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
        next_states = np.vstack([np.reshape(t[3], (1, self.state_size)) for t in minibatch])
        not_done = 1.0 - np.array([t[4] for t in minibatch], dtype=np.float32)

        # Start from the current predictions so that only the taken action's
        # output contributes a non-zero error.
        targets = np.array(self.q_network.predict(states, verbose=0))
        next_q = self.target_q_network.predict(next_states, verbose=0)
        # Terminal transitions (not_done == 0) use the reward alone.
        td_targets = rewards + self.discount_factor * np.max(next_q, axis=1) * not_done
        targets[np.arange(batch_size), actions] = td_targets

        self.q_network.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

        # Clamp so the decay can never undershoot the configured floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        self.train_step_counter += 1
        if self.train_step_counter % self.target_update_freq == 0:
            self.target_q_network.set_weights(self.q_network.get_weights())

# Example usage (requires an environment like Gym's CartPole)
# env = gym.make('CartPole-v1')
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n
# agent = DQNAgent(state_size, action_size)

# for e in range(num_episodes):
#    state = env.reset()
#    state = np.reshape(state, [1, state_size])
#    done = False
#    while not done:
#        action = agent.choose_action(state)
#        next_state, reward, done, _ = env.step(action)
#        next_state = np.reshape(next_state, [1, state_size])
#        agent.remember(state, action, reward, next_state, done)
#        state = next_state
#        agent.train(batch_size=32)
```

### 4. REINFORCE (Monte Carlo Policy Gradient) Implementation

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

class REINFORCEAgent:
    """Monte Carlo policy-gradient (REINFORCE) agent with a softmax policy network."""

    def __init__(self, state_size, action_size, learning_rate=0.001, discount_factor=0.99):
        """Store hyperparameters and build the policy network.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
            learning_rate: Adam learning rate for the policy network.
            discount_factor: Discount applied when accumulating returns.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        self.policy_network = self._build_model()

    def _build_model(self):
        """Build and compile an MLP that outputs one probability per action."""
        model = models.Sequential([
            layers.Input(shape=(self.state_size,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='softmax'),  # Output probabilities for actions
        ])
        model.compile(optimizer=optimizers.Adam(learning_rate=self.learning_rate), loss='categorical_crossentropy')
        return model

    def choose_action(self, state):
        """Sample an action from the policy.

        Returns:
            Tuple ``(action, probability)`` where ``probability`` is the
            policy's probability for the sampled action.
        """
        state = np.reshape(state, [1, self.state_size])
        action_probabilities = self.policy_network.predict(state, verbose=0)[0]
        action = np.random.choice(self.action_size, p=action_probabilities)
        return action, action_probabilities[action]

    def _discounted_returns(self, rewards):
        """Return ``G_t = r_t + discount_factor * G_{t+1}`` for every step of an episode."""
        returns = []
        running = 0.0
        # Accumulate backwards with append + reverse; inserting at the front
        # of a list would cost O(n) per step.
        for reward in reversed(rewards):
            running = reward + self.discount_factor * running
            returns.append(running)
        returns.reverse()
        return np.array(returns, dtype=np.float64)

    def train(self, states, actions, rewards):
        """Update the policy from one complete episode.

        Args:
            states: Sequence of visited states, one per step.
            actions: Sequence of action indices taken, one per step.
            rewards: Sequence of rewards received, one per step.

        An empty episode is ignored.
        """
        if len(rewards) == 0:
            return

        # Standardize returns for more stable training (optional but common)
        discounted_returns = self._discounted_returns(rewards)
        discounted_returns = (discounted_returns - np.mean(discounted_returns)) / (np.std(discounted_returns) + 1e-8)

        states = np.reshape(np.asarray(states, dtype=np.float32), (-1, self.state_size))
        # One-hot targets: categorical cross-entropy against them equals
        # -log pi(a_t | s_t) for the action actually taken.
        target_actions = np.eye(self.action_size, dtype=np.float32)[np.asarray(actions, dtype=np.int64)]

        # Weighting each sample's cross-entropy by its return gives the
        # REINFORCE loss. The whole episode is used as a single batch so the
        # update is one gradient step under the policy that generated the data,
        # rather than several shuffled minibatch steps.
        self.policy_network.fit(states, target_actions,
                                sample_weight=discounted_returns.astype(np.float32),
                                batch_size=len(states),
                                epochs=1, verbose=0)

# Example usage (requires an environment like Gym's CartPole)
# env = gym.make('CartPole-v1')
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n
# agent = REINFORCEAgent(state_size, action_size)

# for e in range(num_episodes):
#     states, actions, rewards = [], [], []
#     state = env.reset()
#     done = False
#     while not done:
#         action, prob = agent.choose_action(state)
#         next_state, reward, done, _ = env.step(action)
#
#         states.append(state)
#         actions.append(action)
#         rewards.append(reward)
#
#         state = next_state
#
#     agent.train(states, actions, rewards)
```

### 5. Actor-Critic Methods (A2C Conceptual Implementation)

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

class ActorCriticAgent:
    """One-step actor-critic agent with separate policy and value networks."""

    def __init__(self, state_size, action_size, actor_lr=0.001, critic_lr=0.005, discount_factor=0.99):
        """Store hyperparameters and build both networks.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
            actor_lr: Adam learning rate for the policy network.
            critic_lr: Adam learning rate for the value network.
            discount_factor: Weight of the bootstrapped next-state value.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = discount_factor

        # Policy network (actor) first, then the state-value network (critic).
        self.actor = self._build_actor_model(actor_lr)
        self.critic = self._build_critic_model(critic_lr)

    def _build_actor_model(self, lr):
        """Build and compile the softmax policy network."""
        network = models.Sequential()
        for layer in (
            layers.Dense(24, input_dim=self.state_size, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='softmax'),
        ):
            network.add(layer)
        network.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(learning_rate=lr))
        return network

    def _build_critic_model(self, lr):
        """Build and compile the value network, whose single linear output is V(s)."""
        network = models.Sequential()
        for layer in (
            layers.Dense(24, input_dim=self.state_size, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(1, activation='linear'),
        ):
            network.add(layer)
        network.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=lr))
        return network

    def choose_action(self, state):
        """Sample an action from the actor.

        Returns:
            Tuple ``(action, probability)`` where ``probability`` is the
            actor's probability for the sampled action.
        """
        batch = np.reshape(state, [1, self.state_size])
        probabilities = self.actor.predict(batch, verbose=0)[0]
        sampled = np.random.choice(self.action_size, p=probabilities)
        return sampled, probabilities[sampled]

    def train(self, state, action, reward, next_state, done):
        """Update critic and actor from a single transition.

        The critic is fitted towards the one-step TD target; the actor is
        fitted on the one-hot action taken, weighted by the TD error.
        """
        batch_shape = [1, self.state_size]
        state = np.reshape(state, batch_shape)
        next_state = np.reshape(next_state, batch_shape)

        # Value estimates for the current and the successor state.
        state_value = self.critic.predict(state, verbose=0)[0]
        successor_value = self.critic.predict(next_state, verbose=0)[0]

        # A terminal transition has no future value to bootstrap from.
        td_target = reward if done else reward + self.discount_factor * successor_value[0]
        advantage = td_target - state_value[0]

        # Critic step: regress V(s) towards the TD target.
        self.critic.fit(state, np.array([[td_target]]), epochs=1, verbose=0)

        # Actor step: cross-entropy on the taken action, scaled by the
        # advantage through sample_weight (a conceptual stand-in for a custom
        # policy-gradient loss).
        taken = np.zeros(self.action_size)
        taken[action] = 1

        self.actor.fit(state, np.array([taken]),
                       sample_weight=np.array([advantage]),
                       epochs=1, verbose=0)

# Example usage (requires an environment like Gym's CartPole)
# env = gym.make('CartPole-v1')
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n
# agent = ActorCriticAgent(state_size, action_size)

# for e in range(num_episodes):
#    state = env.reset()
#    done = False
#    while not done:
#        action, _ = agent.choose_action(state)
#        next_state, reward, done, _ = env.step(action)
#        agent.train(state, action, reward, next_state, done)
#        state = next_state
```

### 6. Proximal Policy Optimization (PPO) Implementation

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

class PPOAgent:
    """Proximal Policy Optimization agent using the clipped surrogate objective."""

    def __init__(self, state_size, action_size, actor_lr=0.0003, critic_lr=0.001, discount_factor=0.99, clip_ratio=0.2):
        """Store hyperparameters and build actor, critic and old-actor networks.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
            actor_lr: Adam learning rate for the policy network.
            critic_lr: Adam learning rate for the value network.
            discount_factor: Weight of the bootstrapped next-state value.
            clip_ratio: Half-width of the interval the probability ratio is clipped to.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = discount_factor
        self.clip_ratio = clip_ratio

        self.actor = self._build_actor_model(actor_lr)
        self.critic = self._build_critic_model(critic_lr)
        # Snapshot of the actor, re-synced at the start of every train() call.
        # The ratio itself is computed from the probabilities the caller
        # recorded while acting, not from this network.
        self.old_actor = self._build_actor_model(actor_lr)
        self.old_actor.set_weights(self.actor.get_weights())

    def _build_actor_model(self, lr):
        """Build the softmax policy network; only an optimizer is attached."""
        model = models.Sequential([
            layers.Input(shape=(self.state_size,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(self.action_size, activation='softmax'),
        ])
        # The PPO loss is computed manually in train(), so no loss is compiled in.
        model.compile(optimizer=optimizers.Adam(learning_rate=lr))
        return model

    def _build_critic_model(self, lr):
        """Build and compile the value network, whose single linear output is V(s)."""
        model = models.Sequential([
            layers.Input(shape=(self.state_size,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(1, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=lr))
        return model

    def choose_action(self, state):
        """Sample an action from the actor.

        Returns:
            Tuple ``(action, probability)`` where ``probability`` is the
            actor's probability for the sampled action.
        """
        state = np.reshape(state, [1, self.state_size])
        action_probabilities = self.actor.predict(state, verbose=0)[0]
        action = np.random.choice(self.action_size, p=action_probabilities)
        return action, action_probabilities[action]

    def compute_advantages(self, rewards, values, next_values, dones):
        """Return one-step TD advantages ``r + gamma * V(s') - V(s)`` per step.

        This is a simplified stand-in for GAE. Where ``dones`` is true the
        bootstrap term is dropped. Inputs are flattened, so ``(N,)`` and
        ``(N, 1)`` value arrays are both accepted.

        Returns:
            float32 NumPy array of shape ``(N,)``.
        """
        rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
        values = np.asarray(values, dtype=np.float32).reshape(-1)
        next_values = np.asarray(next_values, dtype=np.float32).reshape(-1)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32).reshape(-1)

        td_targets = rewards + np.float32(self.discount_factor) * next_values * not_done
        return td_targets - values

    def train(self, states, actions, old_action_probs, advantages, returns, num_epochs=3, batch_size=64):
        """Fit the critic to ``returns`` and take ``num_epochs`` clipped PPO steps on the actor.

        Args:
            states: Visited states, one per step.
            actions: Action indices taken, one per step.
            old_action_probs: Probability the acting policy assigned to each taken action.
            advantages: Advantage estimate per step.
            returns: Regression target for the critic per step.
            num_epochs: Number of critic epochs and of actor gradient steps.
            batch_size: Minibatch size for the critic fit; each actor step
                uses the full batch.

        An empty trajectory is ignored.
        """
        # Cast explicitly: the actor outputs float32 and TensorFlow does not
        # implicitly promote between float32/float64 or int32/int64.
        states = np.reshape(np.asarray(states, dtype=np.float32), (-1, self.state_size))
        actions = np.asarray(actions, dtype=np.int32).reshape(-1)
        old_action_probs = np.asarray(old_action_probs, dtype=np.float32).reshape(-1)
        advantages = np.asarray(advantages, dtype=np.float32).reshape(-1)
        returns = np.asarray(returns, dtype=np.float32).reshape(-1)

        if states.shape[0] == 0:
            return

        # Keep the snapshot network in step with the policy that produced this data.
        self.old_actor.set_weights(self.actor.get_weights())

        # Critic update (value function)
        self.critic.fit(states, returns, epochs=num_epochs, verbose=0, batch_size=batch_size)

        # (row, action) index pairs used to pick each taken action's probability.
        action_indices = tf.stack(
            [tf.range(actions.shape[0], dtype=tf.int32), tf.constant(actions)], axis=1
        )

        # Actor update (policy function)
        for _ in range(num_epochs):
            with tf.GradientTape() as tape:
                current_action_probs = self.actor(states)
                taken_probs = tf.gather_nd(current_action_probs, action_indices)

                # Small constant guards against division by zero.
                ratio = taken_probs / (old_action_probs + 1e-10)
                clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)

                # Negated because the optimizer minimizes while the clipped
                # surrogate objective is to be maximized.
                actor_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))

            actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor.optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

# Example usage (requires an environment like Gym's CartPole)
# env = gym.make('CartPole-v1')
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n
# agent = PPOAgent(state_size, action_size)

# for e in range(num_episodes):
#    states, actions, rewards, old_action_probs_list = [], [], [], []
#    state = env.reset()
#    done = False
#    episode_rewards = 0
#    while not done:
#        action, old_prob = agent.choose_action(state)
#        next_state, reward, done, _ = env.step(action)
#
#        states.append(state)
#        actions.append(action)
#        rewards.append(reward)
#        old_action_probs_list.append(old_prob)
#
#        state = next_state
#        episode_rewards += reward
#    
#    # Prepare data for training
#    # Calculate value estimates for states in the trajectory
#    values = agent.critic.predict(np.array(states), verbose=0).flatten()
#    next_values = np.append(values[1:], agent.critic.predict(np.reshape(next_state, [1, state_size]), verbose=0)[0]) # Simplified
#    
#    advantages = agent.compute_advantages(rewards, values, next_values, [False]*len(rewards[:-1]) + [True])
#
#    # Calculate returns (sum of discounted rewards)
#    returns = []
#    G = 0
#    for r in reversed(rewards):
#        G = r + agent.discount_factor * G
#        returns.insert(0, G)
#    returns = np.array(returns)
#
#    agent.train(states, actions, old_action_probs_list, advantages, returns)
```

# Part B

### 1. Q-Learning

**Theory:** Q-Learning is an **off-policy, model-free, value-based** reinforcement learning algorithm. It learns an action-value function, $Q(s, a)$, which estimates the expected discounted return of taking action $a$ in state $s$ and acting optimally thereafter. 'Off-policy' means it learns the optimal policy's Q-values while following a different (e.g., exploratory) behavior policy.

**Equation (Q-Value Update):**
$$Q(s, a) \leftarrow Q(s, a) + \alpha [R + \gamma \max_{a'} Q(s', a') - Q(s, a)]$$
Where:
*   $Q(s, a)$: Current Q-value for state $s$ and action $a$.
*   $\alpha$: Learning rate ($0 \le \alpha \le 1$).
*   $R$: Immediate reward received after taking action $a$ in state $s$.
*   $\gamma$: Discount factor ($0 \le \gamma \le 1$) for future rewards.
*   $s'$: The new state after taking action $a$.
*   $\max_{a'} Q(s', a')$: The maximum Q-value for the next state $s'$ over all possible actions $a'$.

### 2. SARSA (State-Action-Reward-State-Action)

**Theory:** SARSA is an **on-policy, model-free, value-based** reinforcement learning algorithm. Similar to Q-Learning, it learns an action-value function $Q(s, a)$. However, 'on-policy' means that it updates the Q-values based on the *next action actually taken* by the current policy, not the maximum possible next action. This makes it more sensitive to the agent's exploration strategy.

**Equation (Q-Value Update):**
$$Q(s, a) \leftarrow Q(s, a) + \alpha [R + \gamma Q(s', a') - Q(s, a)]$$
Where:
*   $Q(s, a)$: Current Q-value for state $s$ and action $a$.
*   $\alpha$: Learning rate ($0 \le \alpha \le 1$).
*   $R$: Immediate reward received.
*   $\gamma$: Discount factor ($0 \le \gamma \le 1$).
*   $s'$: The new state.
*   $a'$: The action chosen in state $s'$ by the *current policy*.

### 3. Deep Q-Networks (DQN)

**Theory:** DQN extends Q-Learning by using **deep neural networks** to approximate the Q-function, enabling it to handle high-dimensional state spaces. Key innovations include **experience replay** (storing and sampling past transitions to break correlations) and a **separate target network** (a copy of the Q-network updated less frequently) to stabilize training.

**Equation (Loss Function for Network Training):**
$$L(\theta) = E_{(s, a, R, s') \sim U(D)} \left[ (R + \gamma \max_{a'} Q(s', a'; \theta_{target}) - Q(s, a; \theta))^2 \right]$$
Where:
*   $\theta$: Parameters of the current Q-network.
*   $\theta_{target}$: Parameters of the target Q-network.
*   $D$: Experience replay buffer.
*   $U(D)$: Uniform sampling from the experience replay buffer.
*   $R + \gamma \max_{a'} Q(s', a'; \theta_{target})$: The 'target' Q-value, calculated using the target network.
*   $Q(s, a; \theta)$: The predicted Q-value from the current network.

### 4. REINFORCE (Monte Carlo Policy Gradient)

**Theory:** REINFORCE is an **on-policy, policy-based** algorithm. Instead of learning value functions, it directly learns a parameterized policy $\pi_{\theta}(a|s)$ that maps states to actions. It estimates the gradient of the expected return using Monte Carlo rollouts (full episodes) and updates the policy parameters $\theta$ in the direction that increases the probability of actions that lead to higher returns.

**Equation (Policy Parameter Update):**
$$\theta \leftarrow \theta + \alpha \nabla_{\theta} J(\theta)$$
$$J(\theta) = E_{\pi_{\theta}} \left[ \sum_{t=0}^T \gamma^t R_t \right]$$
$$\nabla_{\theta} J(\theta) \approx \sum_{t=0}^T \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) G_t, \qquad G_t = \sum_{k=t}^T \gamma^{k-t} R_k$$
Where:
*   $\theta$: Policy parameters.
*   $\alpha$: Learning rate.
*   $\gamma$: Discount factor ($0 \le \gamma \le 1$).
*   $J(\theta)$: Objective function (expected total discounted reward).
*   $\nabla_{\theta} J(\theta)$: Gradient of the objective function, estimated here from a single sampled episode.
*   $G_t$: The return (total discounted future reward) from time step $t$.
*   $\nabla_{\theta} \log \pi_{\theta}(a_t|s_t)$: Gradient of the log-probability of the action taken.

### 5. Actor-Critic Methods (Conceptual basis for A2C/A3C)

**Theory:** Actor-Critic methods combine aspects of both **value-based (critic)** and **policy-based (actor)** approaches. The **Actor** learns a policy $\pi_{\theta}(a|s)$ to select actions, and the **Critic** learns a value function $V_{\phi}(s)$ or $Q_{\phi}(s, a)$ to estimate the expected return. The critic's value estimates are used to update the actor's policy, often via an **advantage function**, which reduces variance in policy gradient estimates.

**Key Concepts & Updates (simplified):**

*   **Advantage Function ($A(s, a)$):** Measures how much better an action $a$ is than the average action in state $s$.
    *   $A(s, a) = Q(s, a) - V(s)$
    *   Or, commonly used in A2C/A3C, the one-step estimate: $A(s, a) \approx R + \gamma V(s') - V(s)$ (the Temporal Difference error)

*   **Actor Update:** Policy parameters are updated in the direction of the advantage.
    $$\theta \leftarrow \theta + \alpha_{actor} \nabla_{\theta} \log \pi_{\theta}(a|s) A(s, a)$$

*   **Critic Update:** Value function parameters are updated to minimize the squared error between the estimate $V_{\phi}(s)$ and the TD target $R + \gamma V_{\phi}(s')$, which is treated as a constant when differentiating (a semi-gradient update).
    $$\phi \leftarrow \phi - \alpha_{critic} \nabla_{\phi} (R + \gamma V_{\phi}(s') - V_{\phi}(s))^2$$

### 6. Proximal Policy Optimization (PPO)

**Theory:** PPO is a popular and robust **on-policy, policy-based** algorithm. It's an improvement over traditional policy gradient methods, aiming to achieve the data efficiency and reliable performance of Trust Region Policy Optimization (TRPO) but with a simpler implementation. PPO uses a **clipped surrogate objective function** to constrain policy updates, preventing them from becoming too large and destabilizing training.

**Equation (Clipped Surrogate Objective, to be maximized):**
$$L^{CLIP}(\theta) = E_t \left[ \min(r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t) \right]$$
Where:
*   $\theta$: Current policy parameters.
*   $E_t$: Expectation over time steps (trajectories).
*   $A_t$: Estimated advantage at time step $t$.
*   $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\theta_{old}}(a_t|s_t)}$: Ratio of the new policy's probability to the old policy's probability for the chosen action.
*   $\text{clip}(x, l, u)$: Clips the value $x$ to be within the range $[l, u]$.
*   $\epsilon$: A small hyperparameter (e.g., 0.1 or 0.2) that defines the clipping range.