# RL Final Project

Now it's finally time to put into use what we have learned so far in this course!

The aim of this project is to assess your practical knowledge in Reinforcement Learning.

Your project consists of 2 parts. You will get the chance to work with 2 different environments.


## 2. Atari Game Pong


<img src="zzzzzzzzzzzzzzzzzc"/>

**[Pong](https://www.gymlibrary.dev/environments/atari/pong/)** is a famous Atari game that almost all of us have played at least once!
The goal of this task is to engage with the **gym** library and use Deep Reinforcement Learning to train an agent that can actually play this game!

In [None]:
# !pip install ALE
# !pip install gym
# !pip install opencv-python
#
# !pip install "tensorflow==2.10"
# !pip install "tensorflow-gpu==2.10"
#
# !pip install tqdm
# !pip install jdc
#
# !pip list

In [None]:
import gym
import cv2
import random
import warnings

import numpy as np
import tensorflow as tf

from collections import deque
from IPython.utils import io
from tqdm.notebook import tqdm

In [None]:
# Suppress every Python warning for the rest of the session so the notebook output stays quiet.
warnings.filterwarnings('ignore')

In [None]:
# Ask TensorFlow which GPU devices it can see; an empty list means none were detected.
tf.config.list_physical_devices('GPU')

In [None]:
# Training on/off switch (meant to gate the train() call).
TRAIN = True
# Gym environment id passed to gym.make().
GAME_NAME = 'ALE/Pong-v5'
# Weights file written by train() whenever an episode sets a new best total reward.
MODEL_PATH = './pong-dqn.h5'
# File written by the ModelCheckpoint callback and loaded by play_with_model().
BEST_MODEL_PATH = './best_model.h5'
# Activation used by the convolutional layers and the hidden dense layer.
MODEL_ACTIVATION = 'relu'
# Shape of one preprocessed frame, and therefore of the network input.
INPUT_SHAPE = (84, 84, 1)

In [None]:
# Number of transitions drawn from the replay buffer for each update.
BATCH_SIZE = 32
# Capacity of the replay buffer; the oldest transitions are evicted first.
MEMORY_SIZE = 10000

# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.95
# Initial exploration rate of a freshly created agent.
EPSILON = 1.0
# Epsilon stops decaying once it is no longer above this value.
MIN_EPSILON = 0.1
# Multiplicative decay applied to epsilon on every DQNAgent.run_episode() call.
EPSILON_DECAY = 0.995
# Learning rate of the Adam optimizer.
LEARNING_RATE = 0.001

In [None]:
class PrioritizedReplayBuffer:
    """Fixed-capacity replay memory with proportional prioritized sampling.

    Transitions are drawn with probability proportional to
    ``priority ** alpha``. Every draw comes with an importance-sampling
    weight ``(N * p) ** -beta``, normalised by the largest weight in the
    batch. ``beta`` is annealed linearly from ``beta_start`` and is capped
    at 1.0 once ``beta_frames`` calls to :meth:`sample` have been made.

    Args:
        size: Maximum number of stored transitions; the oldest are evicted
            first.
        alpha: Exponent applied to priorities (0 gives uniform sampling).
        beta_start: Initial importance-sampling exponent.
        beta_frames: Number of ``sample`` calls over which beta reaches 1.0.
    """

    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        self.alpha = alpha
        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1  # number of the upcoming sample() call, drives beta
        # Two parallel deques: priorities[i] belongs to buffer[i].
        self.buffer = deque(maxlen=size)
        self.priorities = deque(maxlen=size)
        self.max_priority = 1.0

    def append(self, experience):
        """Store a transition with the highest priority seen so far.

        New transitions get ``max_priority`` so that each one is likely to be
        replayed at least once before its priority is refined.
        """
        self.buffer.append(experience)
        self.priorities.append(self.max_priority)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions (with replacement) by priority.

        Returns:
            A tuple ``(experiences, indices, weights)`` where ``indices`` are
            the buffer positions of the drawn transitions and ``weights`` is
            a float32 array of importance-sampling weights in ``(0, 1]``.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        # float64 keeps the probabilities summing to 1 within the tolerance
        # np.random.choice enforces, even if priorities came from float32 data.
        scaled = np.asarray(self.priorities, dtype=np.float64) ** self.alpha
        probs = scaled / scaled.sum()

        # Anneal beta linearly but never past 1.0: beyond that point the
        # weights would over-correct instead of fully compensating the bias.
        progress = self.frame / self.beta_frames
        beta = min(1.0, self.beta_start + (1.0 - self.beta_start) * progress)
        self.frame += 1

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        experiences = [self.buffer[i] for i in indices]
        weights = (len(self.buffer) * probs[indices]) ** (-beta)
        weights /= weights.max()

        return experiences, indices, np.array(weights, dtype=np.float32)

    def update_priorities(self, indices, errors, absolute_error=1e-5):
        """Overwrite the priorities of previously sampled transitions.

        Args:
            indices: Buffer positions as returned by :meth:`sample`. They are
                only valid as long as nothing was appended in between.
            errors: One error per index; scalars or arrays are accepted and
                the largest absolute entry is used.
            absolute_error: Small constant added so no priority is ever zero.
        """
        for i, error in zip(indices, errors):
            priority = float(np.max(np.abs(error))) + absolute_error
            self.priorities[i] = priority
            self.max_priority = max(self.max_priority, priority)

    def __len__(self):
        """Return the number of stored transitions."""
        return len(self.buffer)

In [None]:
class DQNAgent:
    """Dueling double-DQN agent trained from a prioritized replay buffer.

    The agent owns an online network (trained) and a target network that is
    synchronised with the online weights once, at construction time; call
    :meth:`update_target_model` to synchronise it again.

    Args:
        state_size: Shape of a single network input (without batch axis).
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = PrioritizedReplayBuffer(MEMORY_SIZE)
        self.epsilon = EPSILON
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def build_model(self):
        """Build and compile the dueling Q-network.

        Three 3x3 convolutions feed a 256-unit dense layer, which splits into
        a scalar state-value head and a per-action advantage head. The heads
        are recombined as ``V + (A - mean(A))``.

        Returns:
            A compiled Keras model mapping states to one Q-value per action.
        """
        inputs = tf.keras.layers.Input(shape=self.state_size)
        x = tf.keras.layers.Conv2D(32, kernel_size=3, activation=MODEL_ACTIVATION)(inputs)
        x = tf.keras.layers.Conv2D(64, kernel_size=3, activation=MODEL_ACTIVATION)(x)
        x = tf.keras.layers.Conv2D(128, kernel_size=3, activation=MODEL_ACTIVATION)(x)
        x = tf.keras.layers.Flatten()(x)
        fc1 = tf.keras.layers.Dense(256, activation=MODEL_ACTIVATION)(x)
        value = tf.keras.layers.Dense(1)(fc1)
        advantage = tf.keras.layers.Dense(self.action_size)(fc1)
        # Subtracting the mean advantage makes the value/advantage split identifiable.
        q_values = value + (advantage - tf.math.reduce_mean(advantage, axis=1, keepdims=True))
        model = tf.keras.models.Model(inputs=inputs, outputs=q_values)
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE))
        return model

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: A batch containing a single state (leading axis of size 1).

        Returns:
            The index of the chosen action as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)[0]
        return int(np.argmax(q_values))

    def run_episode(self):
        """Perform one training update on a prioritized mini-batch.

        Despite its name this does not play an episode: it samples
        ``BATCH_SIZE`` transitions, builds double-DQN targets (the online
        network picks the next action, the target network evaluates it),
        refreshes the sampled priorities with the absolute TD errors, decays
        epsilon, and fits the online network for a single gradient step
        weighted by the importance-sampling weights.

        The module-level ``best_checkpointer`` callback must exist before
        this method is first called.
        """
        experiences, indices, weights = self.memory.sample(BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*experiences)

        # Each stored state carries its own batch axis of size 1; drop it so
        # the stack has shape (batch, *state_size).
        states = np.squeeze(states, axis=1)
        next_states = np.squeeze(next_states, axis=1)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards, dtype=np.float32)
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)
        rows = np.arange(len(experiences))

        target_q_values = self.target_model.predict(next_states, verbose=0)
        online_q_values = self.model.predict(next_states, verbose=0)
        best_actions = np.argmax(online_q_values, axis=1)
        targets = rewards + not_done * GAMMA * target_q_values[rows, best_actions]

        # Only the taken action's Q-value gets a new target; the other
        # outputs keep their current prediction and contribute zero error.
        current_q = self.model.predict(states, verbose=0)
        target_f = current_q.copy()
        target_f[rows, actions] = targets

        errors = np.abs(target_f - current_q)
        self.memory.update_priorities(indices, errors)

        if self.epsilon > MIN_EPSILON:
            self.epsilon *= EPSILON_DECAY

        # sample_weight applies the importance-sampling correction that
        # prioritized replay needs; batch_size keeps this to one gradient step.
        self.model.fit(
            states,
            target_f,
            sample_weight=weights,
            batch_size=len(experiences),
            verbose=0,
            callbacks=[best_checkpointer],
        )

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def load(self, name):
        """Load online-network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save online-network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
def preprocess_frame(frame):
    """Turn an environment observation into a network-ready frame.

    Accepts either a bare image array (as returned in the first slot of
    ``env.step``) or the ``(observation, info)`` pair returned by
    ``env.reset``. RGB images are converted to grayscale, then the image is
    resized to the spatial size given by ``INPUT_SHAPE``.

    Args:
        frame: Image array, or a tuple/list whose first element is the image.

    Returns:
        An array of shape ``INPUT_SHAPE``.
    """
    # Unwrap only real (observation, info) pairs. Indexing a bare image with
    # [0] would silently keep just its first pixel row.
    if isinstance(frame, (tuple, list)):
        frame = frame[0]
    frame = np.asarray(frame)

    if frame.ndim == 3 and frame.shape[2] == 3:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

    # cv2.resize expects (width, height), i.e. (columns, rows).
    target_size = (INPUT_SHAPE[1], INPUT_SHAPE[0])
    resized = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
    return np.reshape(resized, INPUT_SHAPE)

In [None]:
def train(episodes):
    """Play ``episodes`` games of ``GAME_NAME`` and train a DQN agent on them.

    Every transition is stored in the agent's replay buffer. After each
    episode the online weights are saved to ``MODEL_PATH`` if the episode
    reward is a new best, and one training update is run once the buffer
    holds more than ``BATCH_SIZE`` transitions.

    Args:
        episodes: Number of episodes to play.
    """
    env = gym.make(GAME_NAME)
    agent = DQNAgent(INPUT_SHAPE, env.action_space.n)

    bar_format = 'Training: {percentage:3.0f}% |{bar}| Elapsed: {elapsed} Remaining: {remaining}{postfix}'
    training_pbar = tqdm(total=episodes, bar_format=bar_format, unit='episode')

    best_total_reward = -np.inf

    try:
        for _ in range(episodes):
            state = np.expand_dims(preprocess_frame(env.reset()), axis=0)
            total_reward = 0
            done = False
            while not done:
                # Swallow anything the prediction call prints.
                with io.capture_output():
                    action = agent.choose_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # An episode is over when the game ends OR the env cuts it short.
                done = terminated or truncated
                total_reward += reward

                next_state = np.expand_dims(preprocess_frame(next_state), axis=0)
                # Store `terminated`, not `done`: a truncated episode did not
                # reach a terminal state, so its target may still bootstrap.
                agent.remember(state, action, reward, next_state, terminated)
                state = next_state

            if total_reward > best_total_reward:
                print(f"New best total reward {total_reward}, saving model weights.")
                best_total_reward = total_reward
                agent.model.save_weights(MODEL_PATH)

            # NOTE(review): the network is updated once per episode, not once
            # per environment step - confirm this cadence is intended.
            if len(agent.memory) > BATCH_SIZE:
                with io.capture_output():
                    agent.run_episode()

            training_pbar.set_postfix_str(f'Reward: {total_reward}')
            training_pbar.update(1)
    finally:
        # Release the progress bar and the emulator even if training is interrupted.
        training_pbar.close()
        env.close()


# Keras callback passed to model.fit() inside DQNAgent.run_episode: it writes
# the model to BEST_MODEL_PATH whenever the fit loss is lower than any loss it
# has seen before. It must be defined before train() runs.
best_checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    monitor='loss',
    verbose=1,
    save_best_only=True
)

# Honour the TRAIN switch so the notebook can be re-run without retraining.
if TRAIN:
    train(episodes=200)

In [None]:
def play_with_model():
    """Play one rendered episode using the weights stored at ``BEST_MODEL_PATH``.

    A new agent is created, the saved weights are loaded, and exploration is
    switched off so every action comes from the network. The environment is
    closed when the episode ends or an error occurs.
    """
    env = gym.make(GAME_NAME, render_mode='human')
    agent = DQNAgent(INPUT_SHAPE, env.action_space.n)
    agent.load(BEST_MODEL_PATH)
    # A fresh agent starts with epsilon = EPSILON, which would make every
    # action random; act greedily so the loaded weights actually play.
    agent.epsilon = 0.0

    try:
        state = np.expand_dims(preprocess_frame(env.reset()), axis=0)
        done = False
        while not done:
            env.render()
            # Swallow anything the prediction call prints.
            with io.capture_output():
                action = agent.choose_action(state)
            next_state, _, terminated, truncated, _ = env.step(action)
            # Stop on a real game end as well as on an env-imposed cut-off.
            done = terminated or truncated
            state = np.expand_dims(preprocess_frame(next_state), axis=0)
    finally:
        env.close()


# Watch the agent play a single episode in a rendered window.
play_with_model()

**Note**: Keep in mind that the observations for this environment are frames from the game. Each observation is an image of size (210, 160, 3), so you will need to implement an agent that can process images (a CNN-based agent).

Make sure to perform preprocessing on the frames. For example, you can convert the RGB image to grayscale. You can use the [OpenCV](https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html) library to perform resizing, blurring, or any other applicable filtering on the frames.

## Grading criteria
Project: 35 points

* Final Viva: 10 points
* Implementation: 10 points
* Final Report: 15 points

For the viva you will need to explicitly mention each team member's contribution.

You can write your report in this notebook. The report must include visualizations of your results. Train your model with at least 2 different sets of hyperparameters and compare their outputs in the visualization section.


### Good Luck!