In [None]:
import os
import time
import random
from IPython.display import display, clear_output
import PIL.Image
import torch
from torch import nn, optim
import torch.nn.functional as F
from collections import deque
import imageio
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import gym
import retro
import time
import json
import cv2

# Prefer the GPU when PyTorch can see one; every model and tensor below is
# created on this device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'using device: {device}')

using device: cuda


In [None]:
class DuelCNNPlayerModel(nn.Module):
    """Dueling convolutional Q-network.

    Three Conv2d + BatchNorm2d + ReLU stages feed two fully connected heads:
    an advantage head with ``output_size`` outputs and a state-value head with
    a single output. The heads are combined as ``Q = V + (A - mean_a A)``.

    Args:
        height: Height in pixels of each input frame.
        width: Width in pixels of each input frame.
        output_size: Number of discrete actions (width of the Q-value output).

    The first convolution expects 4 input channels, so ``forward`` takes a
    tensor shaped ``(batch, 4, height, width)``.
    """

    def __init__(self, height, width, output_size):
        super(DuelCNNPlayerModel, self).__init__()

        # Convolutional feature extractor. The spatial size is tracked after
        # every layer so the linear heads can be sized without a dummy pass.
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        conv_height, conv_width = self.calculate_updated_size(height, width, 8, 4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        conv_height, conv_width = self.calculate_updated_size(conv_height, conv_width, 4, 2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        conv_height, conv_width = self.calculate_updated_size(conv_height, conv_width, 3, 1)

        # Flattened feature count: spatial positions times conv3's 64 channels.
        input_size = conv_height * conv_width * 64

        # Dueling advantage head: one output per action.
        self.action_linear_1 = nn.Linear(in_features=input_size, out_features=128)
        self.action_relu = nn.LeakyReLU()
        self.action_linear_2 = nn.Linear(in_features=128, out_features=output_size)

        # Dueling state-value head: a single scalar per sample.
        self.state_linear_1 = nn.Linear(in_features=input_size, out_features=128)
        self.state_relu = nn.LeakyReLU()
        self.state_linear_2 = nn.Linear(in_features=128, out_features=1)

    def calculate_updated_size(self, height, width, kernel_size, stride):
        """Return the (height, width) left after an unpadded convolution."""
        return (
            ((height - kernel_size) // stride + 1),
            ((width - kernel_size) // stride + 1)
        )

    def forward(self, x):
        """Compute Q-values of shape ``(batch, output_size)`` for ``x``."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)

        action_x = self.action_relu(self.action_linear_1(x))
        action_x = self.action_linear_2(action_x)

        state_x = self.state_relu(self.state_linear_1(x))
        state_x = self.state_linear_2(state_x)

        # Centre the advantages per sample (over the action dimension only).
        # A bare .mean() would average over the whole batch and make each
        # sample's Q-values depend on the other samples in the minibatch.
        # For a batch of one (action selection) the result is unchanged.
        q = state_x + (action_x - action_x.mean(dim=1, keepdim=True))

        return q

# DQN agent: epsilon-greedy policy, replay memory and Double-DQN training step
class DQNPlayerAgent:
    """Double-DQN agent with an online/target network pair and replay memory.

    The agent picks among ``action_space`` discrete actions epsilon-greedily,
    stores transitions in a bounded deque, and trains the online network on
    uniformly sampled minibatches.

    Args:
        env: Environment whose ``observation_space.shape`` is read to record
            the raw frame dimensions.
    """

    def __init__(self, env):

        # Raw observation dimensions as reported by the environment. They are
        # recorded only; the networks are built from height/width below.
        self.state_size_h = env.observation_space.shape[0]
        self.state_size_w = env.observation_space.shape[1]
        self.state_size_c = env.observation_space.shape[2]

        # Frame size the networks are built for.
        self.height = 80
        self.width = 64

        self.action_space = 4

        self.targetNN = DuelCNNPlayerModel(self.height, self.width, self.action_space).to(device)
        self.onlineNN = DuelCNNPlayerModel(self.height, self.width, self.action_space).to(device)
        # The target network starts as a frozen copy of the online network.
        self.targetNN.load_state_dict(self.onlineNN.state_dict())
        self.targetNN.eval()

        # Note: if we seem to have a lot of extra RAM, we might want to increase this
        self.memory = deque(maxlen=50000)
        self.optimizer = optim.Adam(self.onlineNN.parameters(), lr=0.00025)

        self.epsilon_min = 0.1
        self.epsilon_decay = 0.92
        self.epsilon = 1

    def predict_action(self, state):
        """Return an action index in ``range(self.action_space)``.

        With probability ``self.epsilon`` the action is uniformly random;
        otherwise it is the argmax of the online network's Q-values.
        """
        # Epsilon-greedy exploration.
        if random.random() < self.epsilon:
            action = random.choice(range(self.action_space))
        else:
            # NOTE(review): onlineNN is never switched to eval() here, so its
            # BatchNorm layers normalise with single-sample statistics and
            # update their running averages during action selection. Confirm
            # whether that is intended before changing it.
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float, device=device).unsqueeze(0)
                q_values = self.onlineNN.forward(state)  # (1, action_space)
                action = torch.argmax(q_values).item()

        return action

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)

    def train(self, batch_size=64, gamma=0.99, min_memory=40000):
        """Run one Double-DQN optimisation step on a sampled minibatch.

        Args:
            batch_size: Number of transitions sampled from replay memory.
            gamma: Discount factor applied to the bootstrapped next-state value.
            min_memory: Training is skipped until the memory holds at least
                this many transitions.
                Note: if we seem to have a lot of extra RAM, we might want to
                increase this (together with the deque's ``maxlen``).

        Returns:
            ``(0, 0)`` while the memory is still warming up; otherwise
            ``(loss, max_q)`` where ``loss`` is the scalar loss tensor and
            ``max_q`` is the largest online Q-value in the batch as a float.
        """
        # Also require at least batch_size items so random.sample cannot fail.
        if len(self.memory) < max(min_memory, batch_size):
            return 0, 0
        # Sample a minibatch and convert to tensors to pass into pytorch
        state, action, reward, next_state, done = zip(*random.sample(self.memory, batch_size))

        # Stored states carry a leading axis of size 1 (see store_results), so
        # concatenating along axis 0 yields the batch dimension.
        state = np.concatenate(state)
        next_state = np.concatenate(next_state)

        state = torch.tensor(state, dtype=torch.float, device=device)
        action = torch.tensor(action, dtype=torch.long, device=device)
        reward = torch.tensor(reward, dtype=torch.float, device=device)
        next_state = torch.tensor(next_state, dtype=torch.float, device=device)
        done = torch.tensor(done, dtype=torch.float, device=device)

        state_q_vals = self.onlineNN(state)

        # The bootstrap target is a constant for the gradient step, so no
        # autograd graph is needed for the next-state passes.
        with torch.no_grad():
            next_state_q_vals = self.onlineNN(next_state)
            target_q_vals = self.targetNN(next_state)
            # Double DQN: the online net picks the next action, the target
            # net evaluates it.
            best_next_action = next_state_q_vals.max(1)[1].unsqueeze(1)
            target_qvals = target_q_vals.gather(1, best_next_action).squeeze(1)
            # (1 - done) removes the bootstrap term on terminal transitions.
            expected_qvals = reward + (gamma * target_qvals * (1 - done))

        selected_qvals = state_q_vals.gather(1, action.unsqueeze(1)).squeeze(1)

        loss = (selected_qvals - expected_qvals).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss, torch.max(state_q_vals).item()

    def update_target_to_online(self):
        """Copy the online network's weights into the target network."""
        self.targetNN.load_state_dict(self.onlineNN.state_dict())

    def store_results(self, data):
        """Append a ``(state, action, reward, next_state, done)`` transition.

        A leading axis of size 1 is added to both states so that ``train``
        can build a batch with ``np.concatenate``.
        """
        self.memory.append([data[0][None, :], data[1], data[2], data[3][None, :], data[4]])


In [None]:
def preprocess_frame(frame):
    """Crop, downsample, grey-scale and normalise a colour frame.

    Rows 35..194 are kept and every second row and column is taken. The first
    three channels are combined with weights 0.299/0.587/0.114 and divided by
    255. The result gains a leading axis of size 1.
    """
    luma_weights = [0.299, 0.587, 0.114]
    downsampled = frame[35:195:2, ::2, :]
    grey = np.dot(downsampled[..., :3], luma_weights)
    scaled = grey / 255.0
    return scaled[np.newaxis, ...]

def preProcess(image):
    """Convert a colour frame into a normalised 80x64 greyscale array.

    Steps: grey-scale conversion, drop the top 20 pixel rows, resize to
    80 rows by 64 columns, divide by 255.

    Args:
        image: Colour frame as an array; the crop keeps rows 20..209 and
            columns 0..159.

    Returns:
        Array of shape ``(80, 64)``.

    Note:
        ``cv2.resize`` takes its target size as ``(width, height)``. The
        previous code passed ``(80, 64)``, which produced a 64x80 image, and
        then reshaped it to ``(80, 64)``; that reshape re-wraps pixel rows
        rather than resizing, so the image was scrambled. Checkpoints trained
        on the scrambled frames will see a different input distribution.
    """
    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order; if the
    # environment returns RGB frames the red/blue weights are swapped. Confirm.
    frame = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # To grayscale
    crop_dim = [20, 210, 0, 160]
    frame = frame[crop_dim[0]:crop_dim[1], crop_dim[2]:crop_dim[3]]  # Cut 20 px from top
    target_height, target_width = 80, 64
    frame = cv2.resize(frame, (target_width, target_height))  # dsize is (width, height)
    frame = frame / 255  # Normalize

    return frame

def train_dqn(model, minibatch, gamma):
    """Fit ``model`` on Bellman targets computed from ``minibatch``.

    Each sample is indexed as (state, action, reward, next_state, done).
    ``model`` must provide ``predict(x, verbose=0)`` and
    ``fit(x, y, verbose=0, batch_size=...)``. For every sample the predicted
    Q-value of the taken action is overwritten with
    ``reward + gamma * max(Q(next_state))`` (reward only when done), and the
    model is then fitted on the edited Q-value table.
    """
    # Split the minibatch column by column.
    states = np.array([transition[0] for transition in minibatch])
    actions = np.array([transition[1] for transition in minibatch])
    rewards = np.array([transition[2] for transition in minibatch])
    next_states = np.array([transition[3] for transition in minibatch])
    dones = np.array([transition[4] for transition in minibatch])

    # Q-value estimates for both ends of every transition.
    q_values = model.predict(states, verbose=0)
    next_q_values = model.predict(next_states, verbose=0)

    sample_count = len(minibatch)
    for row in range(sample_count):
        # Bellman target: bootstrap only for non-terminal transitions.
        target = rewards[row]
        if not dones[row]:
            target += gamma * np.max(next_q_values[row])

        taken = actions[row]
        # A list/array action is treated as one-hot; anything else as an index.
        one_hot = isinstance(taken, (list, np.ndarray))
        action_index = np.argmax(taken) if one_hot else int(taken)

        q_values[row][action_index] = target

    # One fitting pass over the whole minibatch.
    model.fit(states, q_values, verbose=0, batch_size=sample_count)


def player1_action_sample():
    """Return a random 16-slot action list with one of slots 0, 4 or 5 set to 1."""
    chosen_slot = random.choice([0, 4, 5])
    return [1 if slot == chosen_slot else 0.0 for slot in range(16)]

def player2_action_sample():
    """Return a random 16-slot action list with one of slots 6, 7 or 15 set to 1."""
    chosen_slot = random.choice([6, 7, 15])
    return [1 if slot == chosen_slot else 0.0 for slot in range(16)]

def plot_paddle_bounces(episode_numbers, paddle_bounces):
    """Show a line chart of ``paddle_bounces`` against ``episode_numbers``."""
    axis_font = {"fontsize": 14}
    plt.figure(figsize=(12, 6))
    plt.plot(episode_numbers, paddle_bounces, linewidth=2)
    plt.title("Number of Paddle Bounces per Episode", fontsize=16)
    plt.ylabel("Paddle Bounces per Point", **axis_font)
    plt.xlabel("Episode Number", **axis_font)
    plt.grid(True)
    plt.show()

def plot_average_serving_times(episode_numbers, average_serving_times):
    """Show a line chart of ``average_serving_times`` against ``episode_numbers``."""
    axis_font = {"fontsize": 14}
    plt.figure(figsize=(12, 6))
    plt.plot(episode_numbers, average_serving_times, linewidth=2)
    plt.title("Average serving time per episode", fontsize=16)
    plt.ylabel("Average time between ball serve and hit", **axis_font)
    plt.xlabel("Episode number", **axis_font)
    plt.grid(True)
    plt.show()

def plot_rewards(episode_numbers, rewards_p1, rewards_p2):
    """Show both players' cumulative rewards per episode on one chart."""
    plt.figure(figsize=(12, 6))
    # One line per player, drawn in player order.
    series = (
        (rewards_p1, "Player 1 Cumulative Rewards"),
        (rewards_p2, "Player 2 Cumulative Rewards"),
    )
    for rewards, series_label in series:
        plt.plot(episode_numbers, rewards, label=series_label, linewidth=2)
    plt.title("Cumulative Rewards over Episodes", fontsize=16)
    plt.ylabel("Cumulative Rewards", fontsize=14)
    plt.xlabel("Episode Number", fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

def plot_step_rewards(step_rewards_p1, step_rewards_p2, steps_to_display=100):
    """Plot the most recent per-step rewards of both players.

    Both series are first trimmed to the length of the shorter one, keeping
    the newest entries; then at most ``steps_to_display`` trailing entries
    are drawn.

    Args:
        step_rewards_p1: Sequence of player 1 step rewards.
        step_rewards_p2: Sequence of player 2 step rewards.
        steps_to_display: Maximum number of trailing steps to draw.
    """
    min_length = min(len(step_rewards_p1), len(step_rewards_p2))
    # Trim from the front with an explicit offset: seq[-0:] would return the
    # whole sequence, leaving x and y lengths mismatched when one series is
    # empty.
    step_rewards_p1 = step_rewards_p1[len(step_rewards_p1) - min_length:]
    step_rewards_p2 = step_rewards_p2[len(step_rewards_p2) - min_length:]
    start_index = max(0, min_length - steps_to_display)
    steps = range(start_index, min_length)
    step_rewards_p1 = step_rewards_p1[start_index:]
    step_rewards_p2 = step_rewards_p2[start_index:]
    plt.figure(figsize=(12, 6))
    plt.plot(steps, step_rewards_p1, label="Player 1 Step Rewards", marker='o', linestyle='-', linewidth=2)
    plt.plot(steps, step_rewards_p2, label="Player 2 Step Rewards", marker='s', linestyle='-', linewidth=2)
    plt.xlabel("Step Number", fontsize=14)
    plt.ylabel("Reward", fontsize=14)
    plt.title(f"Rewards Throughout the Last {steps_to_display} Steps", fontsize=16)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

def plot_ball_pos(ball_pos):
    """Show the recorded ball x-positions against their step index."""
    step_axis = range(len(ball_pos))
    line_style = {"marker": 'o', "linestyle": '-', "linewidth": 2}
    plt.figure(figsize=(12, 6))
    plt.plot(step_axis, ball_pos, label="Ball Position in X axis", **line_style)
    plt.title("Ball Position in X-axis Throughout an Episode", fontsize=16)
    plt.ylabel("Ball Position X-axis", fontsize=14)
    plt.xlabel("Step Number", fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()


In [None]:
# Run this cell to redraw the summary graphs from previously autosaved metrics.
with open('./models/autosave-data.json', 'r') as f:
  json_data = json.loads(f.read())

# Pull out the metric series used by the plots below.
episode_numbers = json_data['episode_numbers']
cumulative_rewards_p1 = json_data['cumulative_rewards_p1']
cumulative_rewards_p2 = json_data['cumulative_rewards_p2']
paddle_bounces_per_point = json_data['paddle_bounces_per_point']
average_serving_times = json_data['average_serving_times']

plot_rewards(episode_numbers, cumulative_rewards_p1, cumulative_rewards_p2)
plot_paddle_bounces(episode_numbers, paddle_bounces_per_point)
plot_average_serving_times(episode_numbers, average_serving_times)

In [None]:
# Create the two-player Pong environment used by the training loop.
# NOTE(review): render_mode='rgb_array' presumably makes env.render() return
# the frame as an array instead of opening a window — confirm for this retro version.
env = retro.make(game='Pong-Atari2600', players=2, render_mode='rgb_array')
# Presumably makes env.step() return one reward per player; the training loop
# indexes rewards[0] and rewards[1] — TODO confirm against the retro docs.
env.multi_rewards = True

In [None]:
# Set this to a non-zero value to load an existing episode number from
# the filesystem.
load_episode_number = 820

# NOTE(review): input_shape, action_space, batch_size and gamma are not read
# by the training loop below (the agents use their own settings), and the
# agents are built for 80x64 frames rather than 80x80.
input_shape = (4, 80, 80)
action_space = env.action_space.n

player1_agent = DQNPlayerAgent(env)
player2_agent = DQNPlayerAgent(env)

start_episode = load_episode_number
episode_save_interval = 20
max_episodes = 99999
batch_size = 32
gamma = 0.99
preview_frames = []
frame_count = 0
# Every frame_sampling_rate-th frame is kept for the output gif
frame_sampling_rate = 8

# These are kept for graphing purposes
cumulative_rewards_p1 = []
cumulative_rewards_p2 = []
episode_numbers = []
paddle_bounces_per_point = []
average_serving_times = []
p1_losses = []
p1_avg_max_q_val = []
p2_losses = []
p2_avg_max_q_val = []

if load_episode_number != 0:
  # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
  player1_agent.onlineNN.load_state_dict(
    torch.load(f'./models/p1-online-{load_episode_number}.pkl', map_location=device))
  player1_agent.targetNN.load_state_dict(
    torch.load(f'./models/p1-target-{load_episode_number}.pkl', map_location=device))
  player2_agent.onlineNN.load_state_dict(
    torch.load(f'./models/p2-online-{load_episode_number}.pkl', map_location=device))
  # Player 2's target network comes from its own target checkpoint (this used
  # to load the p2-online file a second time).
  player2_agent.targetNN.load_state_dict(
    torch.load(f'./models/p2-target-{load_episode_number}.pkl', map_location=device))
  with open(f'./models/autosave-data-{load_episode_number}.json', 'r') as f:
    json_data = json.loads(f.read())
    # Keep only the entries recorded before the episode we resume from.
    cumulative_rewards_p1 = json_data['cumulative_rewards_p1'][:start_episode]
    cumulative_rewards_p2 = json_data['cumulative_rewards_p2'][:start_episode]
    episode_numbers = json_data['episode_numbers'][:start_episode]
    paddle_bounces_per_point = json_data['paddle_bounces_per_point'][:start_episode]
    average_serving_times = json_data['average_serving_times'][:start_episode]
    p1_losses = json_data['p1_losses'][:start_episode]
    p1_avg_max_q_val = json_data['p1_avg_max_q_val'][:start_episode]
    p2_losses = json_data['p2_losses'][:start_episode]
    p2_avg_max_q_val = json_data['p2_avg_max_q_val'][:start_episode]
  # Recreate the exploration rate as if it had decayed once per past episode.
  player1_agent.epsilon = max(player1_agent.epsilon_decay ** start_episode, player1_agent.epsilon_min)
  player2_agent.epsilon = max(player2_agent.epsilon_decay ** start_episode, player2_agent.epsilon_min)

# Map of the actions able to be performed by each player to the button
# index for the input in the order [SERVE, UP, DOWN, NOOP]
PLAYER_ONE_ACTIONS = [0, 4, 5, -1]
PLAYER_TWO_ACTIONS = [15, 6, 7, -1]

# When True, the loop prints debug values and renders frames/plots every step.
show_output = False


def show_output_print(*args):
  """Forward ``args`` to ``print`` only while the global ``show_output`` flag is truthy."""
  if not show_output:
    return
  print(*args)


# Main self-play loop: both agents act on the same stacked-frame state, get
# shaped rewards derived from the ball's x-position, and train periodically.
for episode in range(start_episode, max_episodes):
  print(f"Starting episode {episode}/{max_episodes}")
  episode_start_time = time.time()
  state, _ = env.reset()
  state = preProcess(state)
  # Fill state with four of itself for the initial buffer
  state = np.stack((state, state, state, state))

  done = False
  score_reward_p1 = 0
  score_reward_p2 = 0

  step_count = 0
  # Both agents attempt one optimisation step every train_interval steps.
  train_interval = 50

  # Paddle positions used before the first info dict arrives.
  p1_pos = 181
  p2_pos = 181
  ball_pos = []

  step_rewards_p1 = []
  step_rewards_p2 = []

  penalized = False

  paddle_bounces = 0
  total_points = 0
  serving_times = []
  last_serve_frame_count = 0

  p1_total_loss = 0
  p1_total_max_q_val = 0
  p2_total_loss = 0
  p2_total_max_q_val = 0

  while not done:
    player1_action = player1_agent.predict_action(state)
    player2_action = player2_agent.predict_action(state)

    player1_action_index = PLAYER_ONE_ACTIONS[player1_action]
    player2_action_index = PLAYER_TWO_ACTIONS[player2_action]

    # Build the 16-slot button vector. A -1 index presses nothing; near the
    # position limits the chosen button is replaced by a fixed one (4/5 for
    # player 1, 6/7 for player 2).
    actions = [0.0] * 16
    if player1_action_index != -1:
      if p1_pos >= 200:
        actions[4] = 1.0
      elif p1_pos < 40:
        actions[5] = 1.0
      else:
        actions[player1_action_index] = 1.0
    if player2_action_index != -1:
      if p2_pos >= 200:
        actions[6] = 1.0
      elif p2_pos < 40:
        actions[7] = 1.0
      else:
        actions[player2_action_index] = 1.0

    next_state, rewards, done, _, info = env.step(actions)
    next_state = preProcess(next_state)
    # Newest frame first, followed by the three most recent previous frames.
    next_state = np.stack((next_state, state[0], state[1], state[2]))

    ball_pos.append(info['ball_x'])

    if len(ball_pos) >= 3:

      # If the ball was reset there is a big jump from the border back to the
      # serve position, so a change of less than 5 is treated as normal motion.
      if abs(ball_pos[-1] - ball_pos[-2]) < 5:

        # The ball was not reset, so look for a change in its direction.
        # 1st condition: ball_pos[-3] > ball_pos[-2] means the ball was moving left.
        # 2nd condition: ball_pos[-1] > ball_pos[-2] means it is now moving right.
        if ball_pos[-3] > ball_pos[-2] and ball_pos[-1] > ball_pos[-2]:
          rewards[1] = 1.0
          rewards[0] = 0.0
          paddle_bounces += 1
        elif ball_pos[-3] < ball_pos[-2] and ball_pos[-1] < ball_pos[-2]:
          rewards[1] = 0.0
          rewards[0] = 1.0
          paddle_bounces += 1
        else:
          rewards[0] = 0.0
          rewards[1] = 0.0
      else:
        # Ball jumped: treat it as a new serve and restart position tracking.
        rewards[0] = 0.0
        rewards[1] = 0.0
        ball_pos = []
        serving_times.append(frame_count - last_serve_frame_count)
        last_serve_frame_count = frame_count
    else:
      rewards[0] = 0.0
      rewards[1] = 0.0

    if len(ball_pos) > 10:  # Keep only the last 10 positions
      ball_pos.pop(0)

    if len(step_rewards_p1) > 10:
      step_rewards_p1.pop(0)

    if len(step_rewards_p2) > 10:
      step_rewards_p2.pop(0)

    # Let's check if either player1 or player2 got scored on
    show_output_print("penalized: ", penalized)

    # ball_x below 64: player 2 is penalised (once per rally).
    if not penalized and info['ball_x'] < 64:
      penalized = True
      rewards[0] = 0
      rewards[1] = -1

    # ball_x above 192: player 1 is penalised (once per rally).
    elif not penalized and info['ball_x'] > 192:
      penalized = True
      rewards[0] = -1
      rewards[1] = 0

    # If the ball got reset, then we can penalize them again
    elif info['ball_x'] > 120 and info['ball_x'] < 130:
      penalized = False

    show_output_print("This is rewards: ", rewards)

    player1_agent.store_results((state, player1_action, rewards[0], next_state, done))
    player2_agent.store_results((state, player2_action, rewards[1], next_state, done))

    p1_pos = info['p1_pos']
    p2_pos = info['p2_pos']
    score_reward_p1 += rewards[0]
    score_reward_p2 += rewards[1]

    step_rewards_p1.append(rewards[0])
    step_rewards_p2.append(rewards[1])

    if step_count % train_interval == 0:
      p1_loss, p1_max_q_val = player1_agent.train()
      p2_loss, p2_max_q_val = player2_agent.train()

      # train() returns a plain 0 while the replay memory is warming up and
      # a loss tensor afterwards.
      p1_total_loss += p1_loss.item() if isinstance(p1_loss, torch.Tensor) else p1_loss
      p2_total_loss += p2_loss.item() if isinstance(p2_loss, torch.Tensor) else p2_loss

      p1_total_max_q_val += p1_max_q_val
      p2_total_max_q_val += p2_max_q_val

    state = next_state

    step_count += 1

    if show_output:
      frame = env.render()
      frame_image = PIL.Image.fromarray(frame)
      display(frame_image)
      plot_step_rewards(step_rewards_p1, step_rewards_p2)
      plot_rewards(episode_numbers, cumulative_rewards_p1, cumulative_rewards_p2)
      plot_ball_pos(ball_pos)

    # On save episodes keep every frame_sampling_rate-th frame for the gif.
    if (episode % episode_save_interval) == 0 and frame_count % frame_sampling_rate == 0:
      frame = env.render()
      preview_frames.append(frame)
    frame_count += 1

    if done:
      # score 1 and score 2 are flipped around for some reason
      score_p1 = info['score2']
      score_p2 = info['score1']
      total_points = score_p1 + score_p2
      print(f"Episode {episode} ended. Rewards - Player 1: {score_reward_p1}, " +
          f"Player 2: {score_reward_p2}, Scores - P1: {score_p1}, P2: {score_p2}")
      state, _ = env.reset()
      # Target networks are synchronised once per episode.
      player1_agent.update_target_to_online()
      player2_agent.update_target_to_online()

  cumulative_rewards_p1.append(score_reward_p1)
  cumulative_rewards_p2.append(score_reward_p2)
  # Always append one value per episode so every metric list stays aligned
  # with episode_numbers (the plots require equal lengths).
  paddle_bounces_per_point.append(paddle_bounces / total_points if total_points != 0 else 0.0)
  episode_numbers.append(episode)
  # Guard against an episode in which no serve was detected.
  average_serving_times.append(sum(serving_times) / len(serving_times) if serving_times else 0.0)
  p1_losses.append(p1_total_loss)
  p2_losses.append(p2_total_loss)
  # frame_count is reset each episode, so this approximates the number of
  # train() calls made during the episode.
  p1_avg_max_q_val.append(p1_total_max_q_val / (1 + (frame_count // train_interval)))
  p2_avg_max_q_val.append(p2_total_max_q_val / (1 + (frame_count // train_interval)))

  player1_agent.decay_epsilon()
  player2_agent.decay_epsilon()

  if (episode) % episode_save_interval == 0:
    imageio.mimsave(f'./episodes/episode-{episode}.gif', preview_frames, fps=60)
    torch.save(player1_agent.onlineNN.state_dict(), f'./models/p1-online-{episode}.pkl')
    torch.save(player1_agent.targetNN.state_dict(), f'./models/p1-target-{episode}.pkl')
    torch.save(player2_agent.onlineNN.state_dict(), f'./models/p2-online-{episode}.pkl')
    torch.save(player2_agent.targetNN.state_dict(), f'./models/p2-target-{episode}.pkl')
    with open(f'./models/autosave-data-{episode}.json', 'w+') as f:
      f.write(json.dumps({
        'cumulative_rewards_p1': cumulative_rewards_p1,
        'cumulative_rewards_p2': cumulative_rewards_p2,
        'episode_numbers': episode_numbers,
        'paddle_bounces_per_point': paddle_bounces_per_point,
        'average_serving_times': average_serving_times,
        'p1_losses': p1_losses,
        'p2_losses': p2_losses,
        'p1_avg_max_q_val': p1_avg_max_q_val,
        'p2_avg_max_q_val': p2_avg_max_q_val
      }))
  preview_frames = []

  episode_elapsed_time = int(time.time() - episode_start_time)
  print(f'Took {episode_elapsed_time}s, New epsilon: {player1_agent.epsilon}, Frames: {frame_count}')
  frame_count = 0

In [None]:
env.close()
# The agents in this notebook are PyTorch modules; `player1_model` and
# `player2_model` are never defined, so the old Keras-style `.save(...)` calls
# raised NameError. Persist the online networks' weights instead, using the
# same torch.save / .pkl convention as the periodic checkpoints.
torch.save(player1_agent.onlineNN.state_dict(), './models/player1.pkl')
torch.save(player2_agent.onlineNN.state_dict(), './models/player2.pkl')