In [1]:
from tank_kills_v3 import TankKills
from collections import deque
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import numpy as np
import json
import os
import pygame

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/dqn_tank_kills')

  from pkg_resources import resource_stream, resource_exists


pygame 2.6.1 (SDL 2.28.4, Python 3.13.3)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
class VfApproxModel(nn.Module):
    """Neural Network for Value Function Approximation\n
    Contains Three layers 4->10->10->4
    """
    def __init__(self):
        super().__init__()
        # Input will be [player_x,player_y,enemy_x,enemy_y]
        # NOTE: Later on we will provide s,a,r,s,a
        self.layer_1 = nn.Linear(in_features=4,out_features=10)
        self.layer_2 = nn.Linear(in_features=10,out_features=10)
        self.layer_3 = nn.Linear(in_features=10,out_features=4) # 4 actions as output
        self.relu = nn.ReLU()
    
    def forward(self,features):
        out = self.relu(self.layer_1(features))
        out = self.relu(self.layer_2(out))
        out = self.layer_3(out)
        # using softmax as action-selection policy
        # out = torch.softmax(out,-1)
        return out

value_function = VfApproxModel()
value_function.to(device)


VfApproxModel(
  (layer_1): Linear(in_features=4, out_features=10, bias=True)
  (layer_2): Linear(in_features=10, out_features=10, bias=True)
  (layer_3): Linear(in_features=10, out_features=4, bias=True)
  (relu): ReLU()
)

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/Q-target.jpg" style="height:400px;width:50%;float:left;">

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/sampling-training.jpg" style="height:400px;width:50%;float:right;">

In [4]:
class Agent:
    """Main Agent Class Contiaining replay Memory and all learning params"""
    def __init__(self,replay_length,learning_rate,epsilon,max_epsilon,min_epsilon,epsilon_decay,gamma,action_size,value_function):
        self.replay_memory = deque(maxlen=replay_length)
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.action_size = action_size
        self.value_function = value_function
        self.loss_fn = nn.SmoothL1Loss().to(device)
        self.opt  = torch.optim.AdamW(value_function.parameters(), lr=learning_rate, amsgrad=True)
        self.epsilon_list = []
        self.losses =  []

    def add_experience(self,new_state,reward,running,state,action):
        """
            Adds Experience into replay_memory
            new_state = [new_player_x,new_player_y,new_enemy_x,new_enemy_y]
            state = [player_x,player_y,enemy_x,enemy_y]
            new_state ans state both are torch tensors
        """
        self.replay_memory.append((new_state,reward,running,state,action))
    
    
    def action(self,state):
        """For Taking action using e-greedy"""

        if np.random.rand() < self.epsilon:
            return np.random.randint(0,self.action_size)
        out = self.value_function(state)
        out = out.cpu().detach().numpy()
        return np.argmax(out)
    
    
    def greedy_action(self,state):
        """Predicts a action greedily only using value function"""

        out = self.value_function(state)
        out = out.cpu().detach().numpy()
        return np.argmax(out)
    
    
    def replay(self, batch_size, episode):
        batch = random.sample(self.replay_memory, batch_size)
        losses = []
        for new_state, reward, running, state, action in batch:
            # Predict Q(s, a) for current state
            q_values = self.value_function(state)
            q_value = q_values[action]

            # Predict Q(s', a') for next state
            with torch.no_grad():
                next_q_values = self.value_function(new_state)
                max_next_q = torch.max(next_q_values)
                target = reward + self.gamma * max_next_q * float(running)

            loss = self.loss_fn(q_value, target)
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
            losses.append(loss.item())
        return np.mean(losses) if losses else 0.0

    def save_weights(self,path):
        """Save Current State weights"""
        torch.save(self.value_function.state_dict(),path)


In [5]:
value_function.load_state_dict(torch.load("saved_weights/last_episode_weights.pth"))

<All keys matched successfully>

In [6]:
all_actions = ["up","right","down","left"]
num_episodes = 20

learning_rate = 0.1 # alpha
discount_factor = 0.80 # gamma

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.04
exploration_decay_rate = 0.005

replay_length = 5000# Replay memory (D) or length (N)
batch_size = 300 # Batch size to train on replay buffer

In [7]:
agent = Agent(
    replay_length=replay_length,
    learning_rate=learning_rate,
    epsilon=exploration_rate,
    max_epsilon=max_exploration_rate,
    min_epsilon=min_exploration_rate,
    epsilon_decay=exploration_decay_rate,
    gamma=discount_factor,
    action_size=4,
    value_function=value_function
)

In [8]:
losses_per_episode = []
for episode in range(num_episodes):
    old_score = 0
    print(f"-----Episode: [{episode+1}/{num_episodes}]-----")
    env = TankKills(600,600)
    state = [300/600,400/600,0/600,0/600]
    state = torch.tensor(state,dtype=torch.float32,device=device,requires_grad=True)
    running = True
    losses_per_action = []
    actions_taken_per_episode = 0
    rewards_per_episode = 0
    steps = 0
    while running:
        
        torch.cuda.empty_cache()
        
        action = agent.action(state)

        running,reward,score,pp,ep = env.play(action=all_actions[action])

        new_state = [pp[0]/600,pp[1]/600,ep[0]/600,ep[1]/600]
        new_state = torch.tensor(new_state,dtype=torch.float32,device=device,requires_grad=True)

        target = agent.value_function(new_state)
        
        # target = reward + torch.mul((agent.gamma*target),1-bool( not running))
        target = target.cpu().detach().numpy()
        target = reward + (agent.gamma * np.amax(target))*(1-bool( not running))
        target = torch.tensor(target,dtype=torch.float32,device=device,requires_grad=True)
        
        former_target = agent.value_function(state)

        former_target = former_target.cpu().detach().numpy()
        former_target = np.amax(former_target)
        former_target = torch.tensor(former_target,dtype=torch.float32,device=device,requires_grad=True)
        
        loss = agent.loss_fn(target,former_target)
        
        loss.backward()
        # print(target.grad,former_target.grad)
        agent.opt.step()
        agent.opt.zero_grad()
        # torch.nn.utils.clip_grad_value_(agent.value_function.parameters(), 100)
        agent.add_experience(new_state,reward,running,state,action)

        state = new_state

        rewards_per_episode+=reward
        actions_taken_per_episode += 1
        losses_per_action.append(loss.item())
        writer.add_scalar("Loss per action",loss.item(),actions_taken_per_episode)
        
    agent.epsilon = (agent.max_epsilon-agent.min_epsilon) * np.exp(-agent.epsilon_decay*episode) + agent.min_epsilon
    pygame.display.quit()

    if len(agent.replay_memory) > batch_size:
        replay_loss = agent.replay(batch_size, episode)
        writer.add_scalar("Replay Loss", replay_loss, episode)
    
    if score > old_score:
        agent.save_weights(f"saved_weights/agent_{score}.pth")
    old_score = score
    
    writer.add_scalar("Reward per Episode",rewards_per_episode,episode)
    writer.add_scalar("Actions Taken per Episode",actions_taken_per_episode,episode)
    writer.add_scalar("Score",score,episode)
    writer.add_scalar("Loss per Episode",loss.item(),episode)
    writer.add_scalar("Epsilon per episode",agent.epsilon,episode)
    

    print(f"Reward:{rewards_per_episode}")
    print(f"Score:{score}")
    print(f"Actions Taken:{actions_taken_per_episode}")
    print(f"Loss:{loss}")

    
    writer.close()

agent.save_weights("saved_weights/last_episode_weights.pth")

-----Episode: [1/20]-----
Reward:-1991
Score:0
Actions Taken:2071
Loss:25.883682250976562
-----Episode: [2/20]-----
Reward:-1010
Score:0
Actions Taken:2071
Loss:25.458993911743164
-----Episode: [3/20]-----
Reward:-1661
Score:0
Actions Taken:2016
Loss:26.99167251586914
-----Episode: [4/20]-----
Reward:-2391
Score:3
Actions Taken:5144
Loss:27.546947479248047
-----Episode: [5/20]-----
Reward:-1051
Score:1
Actions Taken:2608
Loss:28.0220947265625
-----Episode: [6/20]-----
Reward:-3156
Score:8
Actions Taken:9097
Loss:28.385765075683594
-----Episode: [7/20]-----
Reward:-2232
Score:1
Actions Taken:3658
Loss:28.892274856567383
-----Episode: [8/20]-----
Reward:-1608
Score:1
Actions Taken:3869
Loss:28.4791259765625
-----Episode: [9/20]-----
Reward:-1685
Score:0
Actions Taken:2046
Loss:28.572784423828125
-----Episode: [10/20]-----
Reward:-1626
Score:1
Actions Taken:3573
Loss:27.686756134033203
-----Episode: [11/20]-----
Reward:-2319
Score:3
Actions Taken:5337
Loss:28.359821319580078
-----Episode:

In [9]:
for i in agent.replay_memory:
    if i[1] == 50:
        print(i)

In [10]:
new_state = agent.replay_memory[20][0]
state = agent.replay_memory[20][3]
reward = agent.replay_memory[20][1]

target = agent.value_function(state)
target = reward + torch.mul((agent.gamma*target),1)
# target = target.cpu().detach().numpy()
# target = reward + (agent.gamma * target)
# target = torch.tensor(target,dtype=torch.float32,device=device,requires_grad=True)
# with torch.no_grad():
former_target = agent.value_function(state)
# former_target = former_target.cpu().detach().numpy()
# former_target = np.amax(former_target)
# former_target = torch.tensor(former_target,dtype=torch.float32,device=device,requires_grad=True)
# print(target.grad,former_target.grad)


print(target,former_target)

loss = agent.loss_fn(target,former_target)

loss.backward()

print(target.grad,former_target.grad)
print(loss)


tensor([-4.0044, -4.0493, -4.0410, -4.0646], device='cuda:0',
       grad_fn=<AddBackward0>) tensor([-3.7555, -3.8116, -3.8013, -3.8307], device='cuda:0',
       grad_fn=<ViewBackward0>)
None None
tensor(0.0288, device='cuda:0', grad_fn=<SmoothL1LossBackward0>)


  print(target.grad,former_target.grad)


In [11]:
pygame.display.quit() 