In [1]:
from tank_kills_v3 import TankKills
from collections import deque
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import numpy as np
import json
import os
import pygame

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/dqn_tank_kills')

pygame 2.2.0 (SDL 2.0.22, Python 3.10.8)
Hello from the pygame community. https://www.pygame.org/contribute.html


2023-03-10 19:35:07.114010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-10 19:35:08.321076: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-10 19:35:08.321170: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
class VfApproxModel(nn.Module):
    """Neural Network for Value Function Approximation\n
    Contains Three layers 4->10->10->4
    """
    def __init__(self):
        super().__init__()
        # Input will be [player_x,player_y,enemy_x,enemy_y]
        # NOTE: Later on we will provide s,a,r,s,a
        self.layer_1 = nn.Linear(in_features=4,out_features=10)
        self.layer_2 = nn.Linear(in_features=10,out_features=10)
        self.layer_3 = nn.Linear(in_features=10,out_features=4) # 4 actions as output
        self.relu = nn.ReLU()
    
    def forward(self,features):
        out = self.relu(self.layer_1(features))
        out = self.relu(self.layer_2(out))
        out = self.layer_3(out)
        # using softmax as action-selection policy
        # out = torch.softmax(out,-1)
        return out

value_function = VfApproxModel()
value_function.to(device)


VfApproxModel(
  (layer_1): Linear(in_features=4, out_features=10, bias=True)
  (layer_2): Linear(in_features=10, out_features=10, bias=True)
  (layer_3): Linear(in_features=10, out_features=4, bias=True)
  (relu): ReLU()
)

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/Q-target.jpg" style="height:400px;width:50%;float:left;">

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit4/sampling-training.jpg" style="height:400px;width:50%;float:right;">

In [5]:
class Agent:
    """Main Agent Class Contiaining replay Memory and all learning params"""
    def __init__(self,replay_length,learning_rate,epsilon,max_epsilon,min_epsilon,epsilon_decay,gamma,action_size,value_function):
        self.replay_memory = deque(maxlen=replay_length)
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.action_size = action_size
        self.value_function = value_function
        self.loss_fn = nn.SmoothL1Loss().to(device)
        self.opt  = torch.optim.AdamW(value_function.parameters(), lr=learning_rate, amsgrad=True)
        self.epsilon_list = []
        self.losses =  []

    def add_experience(self,new_state,reward,running,state,action):
        """
            Adds Experience into replay_memory
            new_state = [new_player_x,new_player_y,new_enemy_x,new_enemy_y]
            state = [player_x,player_y,enemy_x,enemy_y]
            new_state ans state both are torch tensors
        """
        self.replay_memory.append((new_state,reward,running,state,action))
    
    
    def action(self,state):
        """For Taking action using e-greedy"""

        if np.random.rand() > self.epsilon:
            return np.random.randint(0,3)
        out = self.value_function(state)
        out = out.cpu().detach().numpy()
        return np.argmax(out)
    
    
    def greedy_action(self,state):
        """Predicts a action greedily only using value function"""

        out = self.value_function(state)
        out = out.cpu().detach().numpy()
        return np.argmax(out)
    
    
    def replay(self,batch_size,episode):
        """Learning From Experience Replay"""
        
        # batch size is how many batches you want to iterate to learn over
        batch = random.sample(self.replay_memory,batch_size)
        # NOTE: new_state and state should be torch tensor 
        for new_state,reward,running,state,action in batch:
            target = reward
            
            if running:
                
                target = self.value_function(new_state)
                # target = target.cpu().detach().numpy()
                target = reward + torch.mul((self.gamma*target),1-bool( not running))
                # target = torch.tensor(target,dtype=torch.float32,device=device,requires_grad=True)

                with torch.no_grad():
                    former_target = self.value_function(state)
                    former_target = former_target.cpu().detach().numpy()
                    former_target = np.amax(former_target)
                    former_target = torch.tensor(former_target,dtype=torch.float32,device=device,requires_grad=True)

                loss = self.loss_fn(target,former_target)
                
                self.opt.zero_grad()
                loss.backward()
                self.opt.step()
            
            self.epsilon_list.append(self.epsilon)
        return loss.item()

    def save_weights(self,path):
        """Save Current State weights"""
        torch.save(self.value_function.state_dict(),path)


In [None]:
value_function.load_state_dict(torch.load("tank_kills/saved_weights/last_episode_weights.pth"))

In [14]:
all_actions = ["up","right","down","left"]
num_episodes = 50

learning_rate = 0.0005 # alpha
discount_factor = 0.80 # gamma

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.04
exploration_decay_rate = 0.005

replay_length = 5000# Replay memory (D) or length (N)
batch_size = 60 # Batch size to train on replay buffer

In [15]:
agent = Agent(
    replay_length=replay_length,
    learning_rate=learning_rate,
    epsilon=exploration_rate,
    max_epsilon=max_exploration_rate,
    min_epsilon=min_exploration_rate,
    epsilon_decay=exploration_decay_rate,
    gamma=discount_factor,
    action_size=4,
    value_function=value_function
)

In [16]:
losses_per_episode = []
for episode in range(num_episodes):
    old_score = 0
    print(f"-----Episode: [{episode+1}/{num_episodes}]-----")
    env = TankKills(600,600)
    state = [300,400,0,0]
    state = torch.tensor(state,dtype=torch.float32,device=device,requires_grad=True)
    running = True
    losses_per_action = []
    actions_taken_per_episode = 0
    rewards_per_episode = 0
    steps = 0
    while running:
        
        torch.cuda.empty_cache()
        exploration_rate_threshold = random.uniform(0,1)
        if exploration_rate_threshold > agent.epsilon:
            action = agent.greedy_action(state)
        else:
            action = random.randint(0,3)
        
        running,reward,score,pp,ep = env.play(action=all_actions[action])

        new_state = [pp[0],pp[1],ep[0],ep[1]]
        new_state = torch.tensor(new_state,dtype=torch.float32,device=device,requires_grad=True)

        target = agent.value_function(new_state)
        # target = target.cpu().detach().numpy()
        # target = reward + agent.gamma * np.amax(target)
        # target = reward + (agent.gamma * target)
        target = reward + torch.mul((agent.gamma*target),1-bool( not running))
        # target = torch.tensor(target,dtype=torch.float32,device=device,requires_grad=True)

        
        former_target = agent.value_function(state)
            # former_target = former_target.cpu().detach().numpy()
            # former_target = np.amax(former_target)
            # former_target = torch.tensor(former_target,dtype=torch.float32,device=device,requires_grad=True)
        
        
        # print(target.shape,former_target.shape)
        loss = agent.loss_fn(target,former_target)
        
        
        loss.backward()
        # print(target.grad,former_target.grad)
        agent.opt.step()
        agent.opt.zero_grad()
        torch.nn.utils.clip_grad_value_(agent.value_function.parameters(), 100)
        agent.add_experience(new_state,reward,running,state,action)

        state = new_state

        rewards_per_episode+=reward
        actions_taken_per_episode += 1
        losses_per_action.append(loss.item())
        writer.add_scalar("Loss per action",loss.item(),actions_taken_per_episode)
    
    agent.epsilon = (agent.max_epsilon-agent.min_epsilon) * np.exp(-agent.epsilon_decay*episode) + agent.min_epsilon
    pygame.display.quit()
    
    if score > old_score:
        agent.save_weights(f"saved_weights/agent_{score}.pth")
    old_score = score
    
    writer.add_scalar("Reward per Episode",rewards_per_episode,episode)
    writer.add_scalar("Actions Taken per Episode",actions_taken_per_episode,episode)
    writer.add_scalar("Score",score,episode)
    writer.add_scalar("Loss per Episode",loss.item(),episode)
    writer.add_scalar("Epsilon per episode",agent.epsilon,episode)
    

    print(f"Reward:{rewards_per_episode}")
    print(f"Score:{score}")
    print(f"Actions Taken:{actions_taken_per_episode}")
    print(f"Loss:{loss}")


    # if len(agent.replay_memory) > batch_size:
    #     replay_loss = agent.replay(batch_size,episode)
    #     writer.add_scalar("Replay Loss",replay_loss,episode)
    #     print("[Trained on replay]\n")
    
    writer.close()

agent.save_weights("saved_weights/last_episode_weights.pth")

-----Episode: [1/50]-----
Reward:-2095
Score:0
Actions Taken:2046
Loss:44.51579284667969
-----Episode: [2/50]-----
Reward:-2674
Score:1
Actions Taken:2636
Loss:44.430511474609375
-----Episode: [3/50]-----
Reward:-3519
Score:2
Actions Taken:3492
Loss:44.520591735839844
-----Episode: [4/50]-----
Reward:-2095
Score:0
Actions Taken:2046
Loss:44.50000762939453
-----Episode: [5/50]-----
Reward:-2050
Score:0
Actions Taken:2001
Loss:44.49191665649414
-----Episode: [6/50]-----
Reward:-4034
Score:1
Actions Taken:3996
Loss:44.498191833496094
-----Episode: [7/50]-----
Reward:-3079
Score:1
Actions Taken:3041
Loss:44.51842498779297
-----Episode: [8/50]-----
Reward:-4479
Score:2
Actions Taken:4452
Loss:44.48652648925781
-----Episode: [9/50]-----
Reward:-2050
Score:0
Actions Taken:2001
Loss:44.47859191894531
-----Episode: [10/50]-----
Reward:-2075
Score:0
Actions Taken:2026
Loss:44.499290466308594
-----Episode: [11/50]-----
Reward:-2135
Score:0
Actions Taken:2086
Loss:44.53248977661133
-----Episode: [

In [None]:
agent.replay_memory[2]

In [None]:
new_state = agent.replay_memory[20][0]
state = agent.replay_memory[20][3]
reward = agent.replay_memory[20][1]

target = agent.value_function(state)
target = reward + torch.mul((agent.gamma*target),1)
# target = target.cpu().detach().numpy()
# target = reward + (agent.gamma * target)
# target = torch.tensor(target,dtype=torch.float32,device=device,requires_grad=True)
# with torch.no_grad():
former_target = agent.value_function(state)
# former_target = former_target.cpu().detach().numpy()
# former_target = np.amax(former_target)
# former_target = torch.tensor(former_target,dtype=torch.float32,device=device,requires_grad=True)
# print(target.grad,former_target.grad)


print(target,former_target)

loss = agent.loss_fn(target,former_target)

loss.backward()

print(target.grad,former_target.grad)
print(loss)


In [None]:
pygame.display.quit() 