In [3]:
from tank_kills_v3 import TankKills
from collections import deque
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import numpy as np
import json
import os
import pygame

from torch.utils.tensorboard import SummaryWriter
# TensorBoard summary writer; 'runs/dqn_tank_kills' is passed as its log directory.
writer = SummaryWriter('runs/dqn_tank_kills')

In [4]:
# Names of the four discrete actions.
# NOTE(review): presumably list index i corresponds to output i of the
# 4-output network defined below -- confirm against TankKills.
all_actions = ["up","right","down","left"]

In [5]:
# Number of episodes; presumably consumed by a training loop that is not part
# of this section -- TODO confirm.
num_episodes = 2

learning_rate = 0.1 # alpha: optimizer step size
discount_factor = 0.90 # gamma: weight given to future reward

# Epsilon-greedy exploration settings: current value, upper/lower bounds and
# decay rate. The decay schedule itself is not defined in this section.
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.007

# Bounded buffer: once 500 items are stored, appending discards the oldest one.
# NOTE(review): Agent.__init__ also creates its own replay_memory deque --
# confirm whether this module-level buffer is still needed.
replay_memory = deque(maxlen=500)
# Number of transitions sampled per learning step (see Agent.replay).
batch_size = 32

In [None]:
class VfApproxModel(nn.Module):
    """Neural network for action-value (Q) function approximation.

    Three fully connected layers, 4 -> 10 -> 10 -> 4, with ReLU after the two
    hidden layers. The output layer is linear: each of the 4 outputs is the
    estimated Q-value of one action, so it must be free to take any real
    value (negative, or larger than 1). Passing the outputs through
    ReLU/softmax would confine them to a probability simplex and make it
    impossible to regress onto TD targets of the form
    ``reward + gamma * max Q(next_state)``.

    Action selection (greedy / epsilon-greedy) is the caller's job; argmax
    over the raw Q-values gives the greedy action.
    """
    def __init__(self):
        super().__init__()
        # Input will be [player_x,player_y,enemy_x,enemy_y]
        # NOTE: Later on we will provide s,a,r,s,a
        self.layer_1 = nn.Linear(in_features=4,out_features=10)
        self.layer_2 = nn.Linear(in_features=10,out_features=10)
        self.layer_3 = nn.Linear(in_features=10,out_features=4) # 4 actions as output
        self.relu = nn.ReLU()

    def forward(self,features):
        """Return the Q-value of every action for the given state(s).

        Args:
            features: float tensor whose last dimension has size 4.

        Returns:
            Tensor with the same leading dimensions as ``features`` and a
            last dimension of size 4 holding unnormalised Q-values.
        """
        hidden = self.relu(self.layer_1(features))
        hidden = self.relu(self.layer_2(hidden))
        # No activation on the head: Q-values are unbounded regression outputs.
        return self.layer_3(hidden)

value_function = VfApproxModel()

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/Q-target.jpg" style="height:400px;width:50%;float:left;">

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/sampling-training.jpg" style="height:400px;width:50%;float:right;">

In [None]:
class Agent:
    """Main agent class containing the replay memory and all learning params.

    The agent wraps a Q-network (``value_function``), selects actions with an
    epsilon-greedy policy and learns from randomly sampled past transitions.
    """
    def __init__(self,replay_length,learning_rate,epsilon,max_epsilon,min_epsilon,epsilon_decay,gamma,action_size,value_function):
        """
            replay_length  : maximum number of transitions kept in memory
            learning_rate  : step size handed to the AdamW optimizer
            epsilon        : current probability of taking a random action
            max_epsilon, min_epsilon, epsilon_decay : stored for the caller's
                             exploration schedule (not applied by this class)
            gamma          : discount factor used in the TD target
            action_size    : number of discrete actions
            value_function : torch module mapping a state to one Q-value per action
        """
        self.replay_memory = deque(maxlen=replay_length)
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.action_size = action_size
        self.value_function = value_function
        self.loss_fn = nn.SmoothL1Loss()
        self.opt  = torch.optim.AdamW(value_function.parameters(), lr=learning_rate, amsgrad=True)


    def add_experience(self,new_state,reward,running,state,action):
        """
            Adds one transition to replay_memory.
            new_state = [new_player_x,new_player_y,new_enemy_x,new_enemy_y]
            state = [player_x,player_y,enemy_x,enemy_y]
            new_state and state are both torch tensors.
            running is False when the transition ended the episode.
        """
        self.replay_memory.append((new_state,reward,running,state,action))


    def action(self,state):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action in
        ``[0, action_size)`` is returned (exploration); otherwise the action
        with the highest predicted Q-value is returned (exploitation).
        """
        if np.random.rand() < self.epsilon:
            # randint's upper bound is exclusive, so every action is reachable.
            return int(np.random.randint(0, self.action_size))
        return self.greedy_action(state)


    def greedy_action(self,state):
        """Return the index of the action with the highest predicted Q-value."""
        # Pure inference: no autograd graph is needed here.
        with torch.no_grad():
            q_values = self.value_function(state)
        return int(np.argmax(q_values.cpu().numpy()))


    def replay(self,batch_size):
        """Learn from experience replay.

        Samples ``batch_size`` stored transitions and performs one optimizer
        step per transition, moving Q(state, action) towards the TD target

            reward                                   if the episode ended
            reward + gamma * max_a Q(new_state, a)   otherwise

        Does nothing while fewer than ``batch_size`` transitions are stored.
        """
        if len(self.replay_memory) < batch_size:
            # Not enough experience yet; random.sample would raise ValueError.
            return

        batch = random.sample(self.replay_memory,batch_size)
        # NOTE: new_state and state should be torch tensors
        for new_state,reward,running,state,action in batch:
            # The target is a constant for this update, so build it without
            # tracking gradients.
            with torch.no_grad():
                target_value = float(reward)
                if running:
                    next_q_values = self.value_function(new_state)
                    target_value += self.gamma * float(next_q_values.max())

            # The prediction must be computed with gradients enabled, and only
            # the Q-value of the action actually taken is trained.
            predicted = self.value_function(state)[..., int(action)]
            # Same shape/dtype/device as the prediction, no gradient attached.
            target = torch.full_like(predicted, target_value)

            loss = self.loss_fn(predicted, target)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()


