# RL Final Project

Now it's finally time to put into use what we have learned so far in this course!

The aim of this project is to assess your practical knowledge in Reinforcement Learning.

Your project consists of 2 parts. You will get the chance to work with 2 different environments.


## 2. Atari Game Pong


<img src="zzzzzzzzzzzzzzzzzc"/>

**[Pong](https://www.gymlibrary.dev/environments/atari/pong/)** is a famous Atari game that almost all of us have played at least once!
The goal of this task is to get engaged with the **gym** library and use Deep Reinforcement Learning to train an agent which can actually play this game!

In [27]:
# !pip install ALE
# !pip install gym
# !pip install opencv-python
#
# !pip install "tensorflow==2.10"
# !pip install "tensorflow-gpu==2.10"
#
# !pip install tqdm
# !pip install jdc
#
# !pip list

In [28]:
import gym
import cv2
import jdc
import random
import warnings

import numpy as np
import tensorflow as tf

from collections import deque
from IPython.utils import io
from tqdm.notebook import tqdm

In [29]:
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [30]:
# Shape (height, width, channels) of one preprocessed frame fed to the network.
INPUT_SHAPE = (84, 84, 1)
# Intended switch between training a new agent and replaying a saved model.
TRAIN = True
# Environment id passed to gym.make().
GAME_NAME = 'ALE/Pong-v5'
# Number of transitions sampled from replay memory for each training step.
BATCH_SIZE = 32
# Maximum number of transitions kept in the replay memory (oldest dropped first).
MEMORY_SIZE = 2000

In [31]:
# Show which GPUs TensorFlow can see (displayed as the cell's output).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [32]:
class DQNAgent:
    """Deep Q-Network agent that learns from preprocessed image frames.

    Only construction and network building live here; the remaining methods
    are attached to the class in later notebook cells via ``%%add_to DQNAgent``.
    """

    def __init__(
            self,
            model_name,
            action_space,
            gamma=0.95,
            epsilon=1.0, epsilon_min=0.01,
            epsilon_decay=0.995,
            learning_rate=0.001,
            episodes=100
    ):
        """Store the hyperparameters and build the Q-network.

        Args:
            model_name: Path the model is saved to.
            action_space: Number of discrete actions (size of the output layer).
            gamma: Discount factor applied to future rewards.
            epsilon: Initial exploration probability.
            epsilon_min: Lower bound below which epsilon is not decayed.
            epsilon_decay: Multiplicative factor applied to epsilon.
            learning_rate: Learning rate of the Adam optimizer.
            episodes: Number of training episodes.
        """
        self.model_name = model_name
        self.observation_space = INPUT_SHAPE
        self.action_space = action_space

        # Bounded replay memory: once full, the oldest transitions are dropped.
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        # build_model() reads observation_space, action_space and learning_rate,
        # so it must run after those attributes are set.
        self.model = self.build_model()

        self.episodes = episodes
        self.batch_size = BATCH_SIZE

    def build_model(self, pad='same'):
        """Build and compile the convolutional Q-network.

        Args:
            pad: Padding mode used by every convolution layer.

        Returns:
            A compiled ``tf.keras`` Sequential model with one linear output
            per action.
        """
        layers = tf.keras.layers
        model = tf.keras.models.Sequential()
        # Each convolution needs a non-linearity: without one, the three
        # stacked convolutions collapse into a single linear map.
        model.add(layers.Conv2D(32, (8, 8), strides=4, padding=pad, activation='relu',
                                input_shape=self.observation_space))
        model.add(layers.Conv2D(64, (4, 4), strides=2, padding=pad, activation='relu'))
        model.add(layers.Conv2D(64, (3, 3), strides=1, padding=pad, activation='relu'))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation='relu'))
        # Linear head: the outputs are unbounded Q-value estimates.
        model.add(layers.Dense(self.action_space, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

In [33]:
%%add_to DQNAgent

def preprocess_frame(self, frame):
    """Convert a raw observation into a single-channel frame for the network.

    Args:
        frame: Either the observation array itself (as unpacked from
            ``env.step()``) or the ``(observation, info)`` tuple returned by
            ``env.reset()``.

    Returns:
        Array of shape ``self.observation_space``: the frame converted to
        grayscale, resized, and given a trailing channel axis.
    """
    # Only reset() hands back a tuple; indexing a bare observation with [0]
    # would silently keep just its first pixel row.
    if isinstance(frame, tuple):
        frame = frame[0]
    if frame.ndim == 3 and frame.shape[2] == 3:
        # Gym delivers frames in RGB channel order, not OpenCV's BGR.
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    height, width = self.observation_space[:2]
    # cv2.resize expects the target size as (width, height).
    resized = cv2.resize(frame, (width, height))
    return np.expand_dims(resized, axis=2)

In [34]:
%%add_to DQNAgent

def remember(self, state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) transition to memory."""
    transition = (state, action, reward, next_state, done)
    self.memory.append(transition)

In [35]:
%%add_to DQNAgent

def act(self, state):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: A preprocessed frame, either unbatched (same rank as
            ``self.observation_space``) or already carrying a leading batch
            axis of size 1.

    Returns:
        Index of the chosen action as an ``int``.
    """
    # Explore: with probability epsilon pick a uniformly random action.
    if np.random.rand() <= self.epsilon:
        return random.randrange(self.action_space)
    # Add the batch axis only when it is missing; adding it unconditionally
    # turns an already batched state into a 5-D tensor the network does not
    # expect.
    if state.ndim == len(self.observation_space):
        state = state.reshape(1, *state.shape)
    # verbose=0 silences Keras' per-call progress output.
    act_values = self.model.predict(state, verbose=0)
    return int(np.argmax(act_values[0]))

In [36]:
%%add_to DQNAgent

def replay(self, batch_size):
    """Fit the network on one random minibatch drawn from replay memory.

    Args:
        batch_size: Number of stored transitions to sample.
    """
    batch = random.sample(self.memory, batch_size)
    # Transpose the list of transitions into one array per field.
    states, actions, rewards, next_states, dones = (
        np.array(column) for column in zip(*batch)
    )

    # Collapse whatever per-sample shape was stored to the network input shape.
    frame_shape = tuple(self.observation_space)
    states = states.reshape(states.shape[0], *frame_shape)
    next_states = next_states.reshape(next_states.shape[0], *frame_shape)

    next_q = self.model.predict_on_batch(next_states)
    q_values = self.model.predict_on_batch(states)

    # Bellman target; the (1 - dones) factor removes the bootstrap term for
    # transitions flagged as done.
    bootstrap = self.gamma * np.amax(next_q, axis=1) * (1 - dones)
    # Overwrite only the Q-value of the action actually taken in each row.
    q_values[np.arange(len(actions)), actions] = rewards + bootstrap

    self.model.fit(states, q_values, epochs=1, verbose=0)

    # Decay exploration after every replay step until the floor is reached.
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

In [37]:
%%add_to DQNAgent

def train(self, max_episode_length=500):
    """Run the training loop for ``self.episodes`` episodes.

    The environment is read from the module-level name ``env``, which must
    exist before this method is called; it is closed when training finishes.
    The model is saved to ``self.model_name`` every 100 episodes and once more
    after the last episode.

    Args:
        max_episode_length: Upper bound on the number of environment steps
            taken in a single episode.
    """
    bar_format = 'Training: {percentage:3.0f}% |{bar}| Elapsed: {elapsed} Remaining: {remaining}{postfix}'
    training_pbar = tqdm(total=self.episodes, bar_format=bar_format, unit='episode')
    episode_bar_format = 'Episode: {percentage:3.0f}% |{bar}| Speed: {rate_fmt}{postfix}'
    for e in range(self.episodes):
        # States are kept unbatched; act() adds the batch axis for prediction
        # and replay() reshapes sampled states to the network input shape.
        state = self.preprocess_frame(env.reset())
        total_reward = 0
        episode_pbar = tqdm(total=max_episode_length, bar_format=episode_bar_format, unit='step')
        for step in range(max_episode_length):
            action = self.act(state)
            next_frame, reward, terminated, truncated, _ = env.step(action)
            next_state = self.preprocess_frame(next_frame)
            # Store only true termination as `done`: a time-limit truncation
            # is not a terminal state, so its target should still bootstrap.
            self.remember(state, action, reward, next_state, terminated)
            state = next_state
            # Count the reward before any early exit so the final step's
            # reward is part of the episode total.
            total_reward += reward
            if len(self.memory) > self.batch_size:
                self.replay(self.batch_size)
            episode_pbar.set_postfix_str(f'Reward: {total_reward}')
            episode_pbar.update(1)
            if terminated or truncated:
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.episodes, total_reward, self.epsilon))
                break
        if e % 100 == 0:
            self.model.save(self.model_name)
        episode_pbar.close()
        training_pbar.set_postfix_str(f'Reward: {total_reward}')
        training_pbar.update(1)

    # The periodic checkpoint above never fires after the last episode, so
    # persist the final weights explicitly.
    self.model.save(self.model_name)
    training_pbar.close()
    print('Training completed')
    env.close()

In [38]:
def play_with_model(model_path):
    """Play one rendered episode greedily with a previously saved model.

    Args:
        model_path: Path of the saved Keras model to load.

    Returns:
        The total reward collected during the episode.
    """
    # With render_mode='human' the environment draws itself on every step.
    env = gym.make(GAME_NAME, render_mode='human')
    try:
        # epsilon=0 disables exploration so the loaded network picks every
        # action; the default epsilon of 1.0 would play purely at random.
        agent = DQNAgent(
            model_name=model_path,
            action_space=env.action_space.n,
            epsilon=0.0,
            epsilon_min=0.0
        )
        agent.model = tf.keras.models.load_model(model_path)

        # The state is kept unbatched; act() adds the batch axis itself.
        state = agent.preprocess_frame(env.reset())
        total_reward = 0

        done = False
        while not done:
            action = agent.act(state)
            # step() returns five values here, matching the unpacking in train().
            frame, reward, terminated, truncated, _ = env.step(action)
            state = agent.preprocess_frame(frame)
            total_reward += reward
            done = terminated or truncated
    finally:
        # Release the render window even if loading or stepping fails.
        env.close()
    return total_reward

In [39]:
# Train a fresh agent or replay the saved one, depending on the TRAIN flag.
if TRAIN:
    # `env` must be a module-level name: DQNAgent.train() reads it as a global.
    env = gym.make(GAME_NAME)
    agent = DQNAgent(
        model_name='trained_model.h5',
        action_space=env.action_space.n,
        episodes=1000
    )
    agent.train(
        max_episode_length=100
    )
else:
    play_with_model('trained_model.h5')

Training:   0% |          | Elapsed: 00:00 Remaining: ?

AttributeError: 'DQNAgent' object has no attribute 'env'

**Note**: Keep in mind that the observations for this environment are frames from the game. Each observation is an image of size (210, 160, 3), so you will need to implement an agent which can process images (a CNN-based agent).

Make sure to perform preprocessing on the frames. For example, you can convert the RGB image to grayscale. You can use the [OpenCV](https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html) library to perform resizing, blurring, or any other applicable filtering on the frames.

## Grading criteria
Project: 35 points

* Final Viva: 10 points
* Implementation: 10 points
* Final Report: 15 points

For the viva you will need to explicitly mention each team member's contribution.

You can write your report in this notebook. The report must include visualizations of your results. Train your model with at least 2 different sets of hyperparameters and compare their outputs in the visualization section.


### Good Luck!