# PIP

In [None]:
!pip install gymnasium[atari,accept-rom-license]



# CONNECT TO FS

In [None]:
from pathlib import Path

# DQN.save() writes its checkpoints under ./models. Create the directory in
# pure Python so the cell is idempotent (re-running it on an existing
# directory is not an error) and works the same on every platform.
Path('models').mkdir(parents=True, exist_ok=True)


# CONFIG

In [None]:
# Hyperparameters consumed by the TRAINING and TESTING cells.
# BATCH_SIZE is the number of transitions sampled from the replay buffer per gradient step
# GAMMA is the discount factor applied to the bootstrapped next-state value
# EPS_START is the starting value of epsilon (probability of taking a random action)
# EPS_END is the final (floor) value of epsilon
# EPS_DECAY is handed to DQAgent as eps_dec, which DQAgent.dec_eps *subtracts* from
#   epsilon once per learn() call, i.e. the decay is linear, not exponential.
#   NOTE(review): 1000 looks like a value intended for an exponential schedule; with
#   linear subtraction a single call already takes epsilon past EPS_END - confirm.
# TAU is not referenced by any cell in this notebook (the target network is hard-copied
#   every replace_target_cnt learn steps instead) - presumably a leftover; verify.
# LR is the learning rate of the optimizer (DQAgent builds a torch.optim.Adam)
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4

# Environment id passed to gym.make() in the TRAINING and TESTING cells
ENV_NAME = "BreakoutDeterministic"

# PLOTTING

In [None]:
import matplotlib.pyplot as plt
import torch

def plot_durations(episode_durations,show_result=False):
	"""Plot one value per episode plus, once available, its 100-episode running mean.

	Args:
		episode_durations: sequence of per-episode numbers (anything
			``torch.tensor`` can convert to a float tensor).
		show_result: when True the figure is titled 'Result' and drawn on top of
			whatever figure 1 already holds; when False figure 1 is cleared first
			and titled 'Training...'.

	Side effects:
		Draws on matplotlib figure 1 and, when at least 100 values are present,
		writes the figure to 'training.png'.
	"""
	plt.figure(1)
	durations_t = torch.tensor(episode_durations, dtype=torch.float)
	if show_result:
		plt.title('Result')
	else:
		plt.clf()
		plt.title('Training...')
	plt.xlabel('Episode')
	plt.ylabel('Duration')
	plt.plot(durations_t.numpy())
	# Take 100 episode averages and plot them too
	if len(durations_t) >= 100:
		means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
		# Pad with zeros so the mean curve lines up with the episode axis
		means = torch.cat((torch.zeros(99), means))
		plt.plot(means.numpy())
		# Save only after the mean curve has been drawn, otherwise the image
		# on disk is missing it.
		plt.savefig('training.png')
	plt.pause(0.001)  # pause a bit so that plots are updated

# TRANSFORMS

In [None]:
import torchvision.transforms as transforms

# Class to convert images to grayscale and crop
class Transforms:
    """Namespace for the frame preprocessing used by the agent."""

    # Built once and reused for every frame: the pipeline is stateless, so there
    # is no need to reconstruct it on each call.
    _gray_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Grayscale(),
        transforms.CenterCrop((175,150)),
        transforms.Resize((84, 84)),
        transforms.ToTensor()
    ])

    @staticmethod
    def to_gray(frame1, frame2=None):
        """Convert a raw frame (or a pair of consecutive frames) to a grayscale array.

        Each frame is converted to a PIL image, made grayscale, centre-cropped to
        175x150, resized to 84x84 and turned into a tensor.

        Args:
            frame1: raw frame accepted by ``transforms.ToPILImage``.
            frame2: optional later frame. When given, the result is
                ``gray(frame2) - 0.4 * gray(frame1)`` so that motion of the ball
                and paddle is visible in a single image.

        Returns:
            numpy array of the processed frame (presumably shaped (1, 84, 84)
            for an RGB input, per torchvision's ``ToTensor`` - confirm).
        """
        pipeline = Transforms._gray_transform

        # Subtract one frame from the other to get sense of ball and paddle direction
        if frame2 is not None:
            new_frame = pipeline(frame2) - 0.4*pipeline(frame1)
        else:
            new_frame = pipeline(frame1)

        return new_frame.numpy()

# REPLAY_MEMORY

In [None]:

from collections import namedtuple, deque
import random

# One stored environment step. The typename matches the variable name so that
# instances can be pickled (pickle looks the class up by its name in the module).
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'state_', 'done', 'raw_state'))

# Memory which allows for storing and sampling batches of transitions
class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` tuples.

    Attributes:
        buffer: list holding the stored transitions (at most ``max_size``).
        max_size: capacity as an int.
        pointer: index the next transition will be written to; wraps back to 0
            once ``max_size`` transitions have been written.
    """

    def __init__(self, size=1e6):
        """Create an empty memory.

        Args:
            size: capacity; floats such as ``1e6`` are accepted and truncated to int.

        Raises:
            ValueError: if the capacity is not positive.
        """
        size = int(size)
        if size <= 0:
            raise ValueError(f'ReplayMemory size must be positive, got {size}')
        self.buffer = []
        self.max_size = size
        self.pointer = 0

    # Adds a single transition to the memory buffer
    def add_transition(self, *args):
        """Store one transition, overwriting the oldest one when the buffer is full.

        Args:
            *args: the six ``Transition`` fields, in order.
        """
        # Build the tuple first so a bad call cannot leave a placeholder behind
        transition = Transition(*args)
        if len(self.buffer) < self.max_size:
            # Still growing: pointer == len(buffer), so appending writes at pointer
            self.buffer.append(transition)
        else:
            self.buffer[self.pointer] = transition
        self.pointer = (self.pointer + 1) % self.max_size

    # Samples a batch of transitions
    def sample_batch(self, batch_size=64):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A single ``Transition`` whose fields are tuples of length
            ``batch_size`` (a batch of transitions turned into a transition of
            batches).

        Raises:
            ValueError: from ``random.sample`` if fewer than ``batch_size``
                transitions are stored.
        """
        batch = random.sample(self.buffer, batch_size)

        # Converts batch of transitions to transitions of batches
        return Transition(*zip(*batch))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# DQN

In [None]:
import torch
import torch.nn as nn
import numpy as np

class DQN(nn.Module):
	"""Convolutional Q-network: three conv layers followed by two linear layers.

	Args:
		input_dim: (channels, height, width) of one input sample.
		output_dim: number of action values produced per sample.
		model_name: base name of the checkpoint file under ``models/``.
		env_name: stored on the instance; not otherwise used by this class.
	"""

	def __init__(self, input_dim, output_dim, model_name='model', env_name='BreakoutDeterministic'):
		super(DQN, self).__init__()
		self.input_dim = input_dim
		channels, _, _ = input_dim

		# 3 conv layers, each followed by a relu activation (no pooling)
		self.l1 = nn.Sequential(
			nn.Conv2d(channels, 32, kernel_size=8, stride=4, padding=2),
			nn.ReLU(),
			nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
			nn.ReLU(),
			nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
			nn.ReLU(),
		)

		# Calculate output dimensions for linear layer
		conv_output_size = self.conv_output_dim()
		lin1_output_size = 512

		# Two fully connected layers with one relu activation
		self.l2 = nn.Sequential(
			nn.Linear(conv_output_size, lin1_output_size),
			nn.ReLU(),
			nn.Linear(lin1_output_size, output_dim)
		)

		# Save filename for saving model
		self.model_name = model_name
		self.env_name = env_name

	# Calculates output dimension of conv layers
	def conv_output_dim(self):
		"""Return the flattened size of the conv stack's output for one sample."""
		# Probe on whatever device the conv layers live on, and without
		# recording an autograd graph for this throw-away forward pass.
		device = next(self.l1.parameters()).device
		with torch.no_grad():
			x = self.l1(torch.zeros(1, *self.input_dim, device=device))
		return int(x.numel())

	# Performs forward pass through the network, returns action values
	def forward(self, x):
		"""Map a batch shaped (batch, *input_dim) to action values (batch, output_dim)."""
		x = self.l1(x)
		x = x.view(x.shape[0], -1)
		actions = self.l2(x)

		return actions

	def save(self,additional_info=''):
		"""Write the weights to ``models/{model_name}{additional_info}.pth``.

		Also (re)creates the empty file ``models/tap.tap``; its consumer is not
		visible in this notebook - presumably a marker for an external sync.
		"""
		import os
		# Do not depend on an earlier cell having created the directory
		os.makedirs('models', exist_ok=True)
		torch.save(self.state_dict(), f'models/{self.model_name}{additional_info}.pth')
		with open('models/tap.tap','w'):
			pass

	def load(self):
		"""Load weights from ``models/{model_name}.pth`` and switch to eval mode.

		Raises:
			FileNotFoundError: if the checkpoint does not exist.
		"""
		# Remap stored tensors onto this module's device so a checkpoint written
		# on a GPU machine can still be loaded on a CPU-only one.
		device = next(self.parameters()).device
		self.load_state_dict(torch.load(f'models/{self.model_name}.pth', map_location=device))
		self.eval()

# DQN_AGENT

In [None]:
import torch
import numpy as np
import random
import math
#from replay_memory import ReplayMemory
#from dqn import DQN
#from transforms import Transforms

# This class trains and plays on the actual game
class DQAgent(object):
	"""Deep Q-learning agent that owns the environment, replay memory and networks.

	Actions are chosen epsilon-greedily from ``policy_net``; transitions are kept
	in a ``ReplayMemory``; ``target_net`` is overwritten with a copy of
	``policy_net`` every ``replace_target_cnt`` learn iterations.
	"""

	def __init__(self, replace_target_cnt, env, state_space, action_space,
				model_name='breakout_model', gamma=0.99, eps_strt=0.1,
				eps_end=0.001, eps_dec=5e-6, batch_size=32, lr=0.001):
		"""Build the networks, replay memory, optimizer and loss.

		Args:
			replace_target_cnt: learn iterations between target-network copies.
			env: environment instance providing reset(), step(), render(), close().
			state_space: shape handed to ``DQN`` as its ``input_dim``.
			action_space: number of discrete actions (``DQN`` output size).
			model_name: checkpoint base name for the policy network.
			gamma: discount factor.
			eps_strt: initial epsilon.
			eps_end: lower bound for epsilon.
			eps_dec: amount subtracted from epsilon on every ``dec_eps`` call.
			batch_size: transitions sampled per gradient step.
			lr: learning rate of the Adam optimizer.
		"""
		# Store hyperparameters on the instance
		self.env = env
		self.state_space = state_space
		self.action_space = action_space
		self.batch_size = batch_size
		self.GAMMA = gamma
		self.LR = lr
		self.eps = eps_strt
		self.eps_dec = eps_dec
		self.eps_end = eps_end

		# Use GPU if available
		self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
		print(self.device)

		# Initialise Replay Memory
		self.memory = ReplayMemory(size=100_000)

		# After how many training iterations the target network should update
		self.replace_target_cnt = replace_target_cnt
		self.learn_counter = 0

		# Initialise policy and target networks, set target network to eval mode
		self.policy_net = DQN(self.state_space, self.action_space, model_name=model_name).to(self.device)
		self.target_net = DQN(self.state_space, self.action_space, model_name=model_name+'target').to(self.device)
		self.target_net.eval()

		# If pretrained model of the modelname already exists, load it.
		# Best effort: a missing or unreadable checkpoint is reported and the
		# freshly initialised weights are kept.
		try:
			self.policy_net.load()
			print('loaded pretrained model')
		except Exception as e:
			print(e)

		# Set target net to be the same as policy net (learn_counter is 0 here,
		# so the modulo check inside replace_target_net passes)
		self.replace_target_net()

		# Set optimizer & loss function
		self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=self.LR)
		self.loss = torch.nn.SmoothL1Loss()

	def sample_batch(self):
		"""Sample ``batch_size`` transitions and return them as tensors on ``self.device``.

		Returns:
			(state, action, reward, state_, done); the two state tensors are
			viewed as (batch, -1, H, W) with H, W taken from the first sampled
			state, the other three are column vectors of shape (batch, 1).
		"""
		batch = self.memory.sample_batch(self.batch_size)
		state_shape = batch.state[0].shape

		def to_frames(frames):
			# Stack into one ndarray first: building a tensor directly from a
			# tuple of ndarrays is extremely slow and triggers a UserWarning.
			stacked = torch.as_tensor(np.array(frames))
			return stacked.view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)

		# Convert to tensors with correct dimensions
		state = to_frames(batch.state)
		action = torch.tensor(batch.action).unsqueeze(1).to(self.device)
		reward = torch.tensor(batch.reward).float().unsqueeze(1).to(self.device)
		state_ = to_frames(batch.state_)
		done = torch.tensor(batch.done).float().unsqueeze(1).to(self.device)

		return state, action, reward, state_, done

	# Returns the greedy action according to the policy net
	def greedy_action(self, obs):
		"""Return the index of the highest-valued action for one observation."""
		obs = torch.as_tensor(obs).float().to(self.device).unsqueeze(0)
		# Pure inference: no autograd graph needed
		with torch.no_grad():
			return self.policy_net(obs).argmax().item()

	# Returns an action based on epsilon greedy method
	def choose_action(self, obs):
		"""Return a random action with probability ``eps``, else the greedy one."""
		if random.random() > self.eps:
			return self.greedy_action(obs)
		return random.randrange(self.action_space)

	# Stores a transition into memory
	def store_transition(self, *args):
		"""Forward the transition fields to the replay memory."""
		self.memory.add_transition(*args)

	# Updates the target net to have same weights as policy net
	def replace_target_net(self):
		"""Copy policy weights into the target net when ``learn_counter`` is a multiple of ``replace_target_cnt``."""
		if self.learn_counter % self.replace_target_cnt == 0:
			self.target_net.load_state_dict(self.policy_net.state_dict())
			print('Target network replaced')

	# Decrement epsilon
	def dec_eps(self):
		"""Subtract ``eps_dec`` from epsilon, never going below ``eps_end``."""
		# Clamp the result itself; checking only the old value lets a large
		# eps_dec push epsilon below the floor (even negative) for one step.
		self.eps = max(self.eps - self.eps_dec, self.eps_end)

	# Samples batches according to batchsize and updates the policy net
	def learn(self, num_iters=1,episode=''):
		"""Run ``num_iters`` gradient steps on freshly sampled batches.

		Does nothing at all while the memory holds fewer than ``batch_size``
		transitions. Otherwise, after the gradient steps, the policy net is
		saved when ``episode`` is a multiple of 100 and epsilon is decremented
		once.

		Args:
			num_iters: number of batches to train on.
			episode: episode index used for the periodic checkpoint name; the
				default ``''`` means "no episode given" and skips the checkpoint.
		"""
		# Use the number of stored transitions, not the write pointer: the
		# pointer wraps back to 0 once the ring buffer is full.
		if len(self.memory) < self.batch_size:
			return

		for _ in range(num_iters):

			# Sample batch
			state, action, reward, state_, done = self.sample_batch()

			# Calculate the value of the action taken
			q_eval = self.policy_net(state).gather(1, action)

			# Best next action value from the target net, kept out of the graph
			with torch.no_grad():
				q_next = self.target_net(state_).max(1)[0].unsqueeze(1)
			# (1-done) removes the bootstrapped term for terminating transitions,
			# leaving only the reward
			q_target = reward + (1-done) * self.GAMMA * q_next

			# Compute the loss
			loss = self.loss(q_eval, q_target)

			# Perform backward propagation and optimization step
			self.optim.zero_grad()
			loss.backward()
			self.optim.step()

			# Increment learn_counter (drives replace_target_net)
			self.learn_counter += 1

			# Check replace target net
			self.replace_target_net()

		# Save model & decrement epsilon
		if episode != '' and episode%100==0:
			self.policy_net.save(f'_{episode}')
		self.dec_eps()

	# Plays one episode, storing every transition; shared by train and play_games
	def _run_episode(self, select_action, render):
		"""Play a single episode and return ``(score, steps)``.

		Args:
			select_action: callable mapping a preprocessed state to an action.
			render: when True, ``env.render()`` is called after every step.
		"""
		# Reset environment and preprocess state
		obs, _ = self.env.reset()
		state = Transforms.to_gray(obs)

		score = 0
		cnt = 0
		done = False
		new_game = True
		while not done:
			action = select_action(state)

			# Force the first action to be a start
			if new_game:
				action = 1
				new_game = False
			obs_, reward, terminated, truncated, _ = self.env.step(action)
			done = terminated or truncated
			if render:
				self.env.render()

			# Preprocess next state and store transition
			state_ = Transforms.to_gray(obs, obs_)
			self.store_transition(state, action, reward, state_, int(done), obs)

			# Calculate score, set next state and obs and increment counter
			score += reward
			obs = obs_
			state = state_
			cnt += 1

		return score, cnt

	# Plays num_eps amount of games, while optimizing the model after each episode
	def train(self, num_eps=100, render=False):
		"""Play ``num_eps`` epsilon-greedy episodes, learning after each one.

		Returns:
			list with the score of every episode. The environment is closed
			before returning.
		"""
		scores = []

		for i in range(num_eps):
			score, cnt = self._run_episode(self.choose_action, render)

			scores.append(score)
			print(f'Episode {i}/{num_eps}: \n\tScore: {score}\n\tAvg score (past 100): {np.mean(scores[-100:])}\
				\n\tEpsilon: {self.eps}\n\tTransitions added: {cnt}')

			# Train on as many transitions as there have been added in the episode
			print(f'Learning x{math.ceil(cnt/self.batch_size)}')
			self.learn(math.ceil(cnt/self.batch_size),i)
			print('\n----------------------------------------\n')

		self.env.close()
		return scores

	# This function simply lets a pretrained model be evaluated to play a game
	# No learning will be done
	def play_games(self, num_eps, render=True):
		"""Play ``num_eps`` greedy episodes without any gradient steps.

		Transitions are still written to the replay memory, as in ``train``.

		Returns:
			list with the score of every episode. The environment is closed
			before returning.
		"""
		# Set network to eval mode
		self.policy_net.eval()

		scores = []

		for i in range(num_eps):
			score, cnt = self._run_episode(self.greedy_action, render)

			scores.append(score)
			print(f'Episode {i}/{num_eps}: \tScore: {score}\tAvg score (past 100): {np.mean(scores[-100:])}\tEpsilon: {self.eps}\tSteps made: {cnt}')
		self.env.close()
		return scores





# TRAINING

In [None]:
import gymnasium as gym
import numpy as np

env = gym.make(ENV_NAME)

plt.ion()

# Get number of actions from gym action space
n_actions = env.action_space.n
# The network consumes preprocessed frames, so its input shape is obtained by
# pushing a blank frame of the raw observation shape through the same transform.
state = env.reset()[0].shape
state_raw = np.zeros(state, dtype=np.uint8)
processed_state = Transforms.to_gray(state_raw)
state_space = processed_state.shape

# NOTE(review): DQAgent.dec_eps subtracts eps_dec from epsilon once per learn()
# call, so EPS_DECAY is applied as a linear step - confirm the configured value.
agent = DQAgent(replace_target_cnt=5000, env=env, state_space=state_space, action_space=n_actions, model_name='breakout_model', gamma=GAMMA,
                eps_strt=EPS_START, eps_end=EPS_END, eps_dec=EPS_DECAY, batch_size=BATCH_SIZE, lr=LR)

# train() returns the per-episode scores
episode_durations = agent.train(num_eps=100_000)

print('Complete')
plot_durations(episode_durations,show_result=True)
plt.ioff()
plt.show()


cpu
[Errno 2] No such file or directory: 'models/breakout_model.pth'
Target network replaced
Episode 0/100000: 
	Score: 1.0
	Avg score (past 100): 1.0				
	Epsilon: 0.9
	Transitions added: 162
Learning x2


  state = torch.tensor(batch.state).view(self.batch_size, -1, state_shape[1], state_shape[2]).float().to(self.device)


KeyboardInterrupt: ignored

# TESTING

In [None]:
import gymnasium as gym
import numpy as np

from pathlib import Path

env = gym.make(ENV_NAME)

plt.ion()

# Get number of actions from gym action space
n_actions = env.action_space.n
# The network consumes preprocessed frames, so its input shape is obtained by
# pushing a blank frame of the raw observation shape through the same transform.
state = env.reset()[0].shape
state_raw = np.zeros(state, dtype=np.uint8)
processed_state = Transforms.to_gray(state_raw)
state_space = processed_state.shape
# This env was only needed to read the shapes; each agent below gets its own
env.close()

test_results = []

for i in range(0,100_000,100):
    model_name = f'breakout_model_{i}'
    # DQAgent only prints a message when its checkpoint cannot be loaded and
    # then keeps random weights, so skip checkpoints that were never written
    # instead of scoring an untrained network.
    if not Path(f'models/{model_name}.pth').is_file():
        continue
    # play_games() closes the env it was given, so do not share one across agents
    env = gym.make(ENV_NAME)
    agent = DQAgent(replace_target_cnt=5, env=env, state_space=state_space, action_space=n_actions, model_name=model_name, gamma=GAMMA,
                    eps_strt=EPS_START, eps_end=EPS_END, eps_dec=EPS_DECAY, batch_size=BATCH_SIZE, lr=LR)
    scores = agent.play_games(num_eps=5, render=False)
    test_results.append(np.mean(scores))
    print(f'\n\tModel {model_name} avg. score: {np.mean(scores)}\n')

# Plot the evaluation results gathered above (one point per checkpoint found),
# not a variable left over from the training cell.
plot_durations(test_results,show_result=True)



cpu
loaded pretrained model
Target network replaced
Episode 0/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 1/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 2/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 3/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 4/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 5/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 6/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 7/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 8/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000
Episode 9/10: 	Score: 1.0	Avg score (past 100): 1.0	Epsilon: 0.9	Steps made: 27000

	Model breakout_model_0 avg. score: 1.0

cpu
loaded pretrained model
Target network replaced
Episode 0/10: 	Score: 0.