In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


In [None]:
from typing import List, Tuple


class RecurrentIQN(nn.Module):
    def __init__(self, input_size: int, output_size: int, hidden_size: int, n_quantiles=32):
        """ Initialize the Recurrent IQN model

        Args:
            input_size (int): The size of the input matrix
            output_size (int): The size of the actions
            hidden_size (int): The size of the hidden layer
            n_quantiles (int, optional): The number of quantiles. Defaults to 32.
        """
        
        super(RecurrentIQN, self).__init__()
        self.n_quantiles = n_quantiles
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.quantile_embed = nn.Linear(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.FloatTensor, quantiles: torch.Tensor, hidden: tuple[torch.Tensor, torch.Tensor]
):
        """ Forward pass of the Recurrent IQN model

        Args:
            x (torch.FloatTensor): The input tensor
            quantiles (torch.Tensor): The quantiles
            hidden (tuple[torch.Tensor, torch.Tensor]): The hidden state

        Returns:
            torch.FloatTensor: The output tensor
            tuple[torch.Tensor, torch.Tensor]: The hidden state
        """
        
        lstm_out, hidden = self.lstm(x, hidden)
        
        quantiles = quantiles.unsqueeze(-1)  
        pi = torch.acos(torch.zeros(1)).item() * 2
        quantile_feats = torch.cos(pi * quantiles * torch.arange(1, self.hidden_size + 1).to(x.device))
        quantile_feats = F.relu(self.quantile_embed(quantile_feats)) 

        lstm_out = lstm_out[:, -1, :].unsqueeze(1) 
        x = lstm_out * quantile_feats 

        x = self.fc(x) 
        return x, hidden

    def act(self, state: List, hidden: tuple[torch.Tensor, torch.Tensor], epsilon: float):
        """ Acting function of the Recurrent IQN model

        Args:
            state (List): The state of the environment
            hidden (tuple[torch.Tensor, torch.Tensor]): The hidden state of the model
            epsilon (float): The epsilon value

        Returns:
            int: The action to take
            Tuple: The hidden state of the model
        """
        
        if random.random() > epsilon:
            print("Model acting")
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).to(next(self.parameters()).device)
                quantiles = torch.rand(1, self.n_quantiles).to(state.device)
                q_values, hidden = self.forward(state, quantiles, hidden)
                q_values = q_values.mean(dim=1)
                action = q_values.argmax(dim=1).item()
        else:
            action = random.randrange(self.fc.out_features)
        return action, hidden


In [None]:
class ReplayBuffer:
    def __init__(self, capacity: int):
        """ Initialize the ReplayBuffer

        Args:
            capacity (int): The capacity of the buffer
        """
        
        self.capacity = capacity
        self.buffer = []
        self.position = 0
        
    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity
        
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done
    
    def pop(self, count=1):
        for _ in range(count):
            self.buffer.pop(0)
            self.position -= 1
            if self.position < 0:
                self.position = 0
     
    def __len__(self):
        return len(self.buffer)
    

class IQNAgent:
    def __init__(self, input_size, output_size, hidden_size, replay_buffer_capacity, batch_size, gamma, epsilon_start, epsilon_end, epsilon_decay, n_quantiles=32):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.replay_buffer_capacity = replay_buffer_capacity
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.n_quantiles = n_quantiles

        self.model = RecurrentIQN(input_size, output_size, hidden_size, n_quantiles).to(device)
        self.model_target = RecurrentIQN(input_size, output_size, hidden_size, n_quantiles).to(device)
        self.model_target.load_state_dict(self.model.state_dict())

        self.hidden = (torch.zeros(1, 1, hidden_size).to(device),
                       torch.zeros(1, 1, hidden_size).to(device))

        self.replay_buffer = ReplayBuffer(replay_buffer_capacity)
        self.optimizer = optim.Adam(self.model.parameters())

        self.steps_done = 0

        self.update_counter = 0
        self.target_update_freq = 1000

    def select_action(self, state):
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * np.exp(-1. * self.steps_done / self.epsilon_decay)
        self.steps_done += 1
        action, self.hidden = self.model.act(state, self.hidden, epsilon)
        return action

    def optimize_model(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        
        self.model.train()
        print("Optimizing model...")
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        states = torch.FloatTensor(states).unsqueeze(1).to(device)
        actions = torch.LongTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).unsqueeze(1).to(device)
        dones = torch.FloatTensor(dones).to(device)
        
        quantiles = torch.rand(self.batch_size, self.n_quantiles).to(device)

        hidden = (torch.zeros(1, self.batch_size, self.hidden_size).to(device),
                  torch.zeros(1, self.batch_size, self.hidden_size).to(device))

        current_q, _ = self.model(states, quantiles, hidden)
        current_q = current_q.gather(2, actions.unsqueeze(-1).unsqueeze(-1).expand(-1, self.n_quantiles, -1)).squeeze(-1)

        with torch.no_grad():
            next_hidden = (torch.zeros(1, self.batch_size, self.hidden_size).to(device),
                           torch.zeros(1, self.batch_size, self.hidden_size).to(device))
            next_quantiles = torch.rand(self.batch_size, self.n_quantiles).to(device)
            next_q, _ = self.model_target(next_states, next_quantiles, next_hidden)
            next_q = next_q.max(2)[0]
            target_q = rewards.unsqueeze(1) + self.gamma * next_q * (1 - dones.unsqueeze(1))

        td_errors = target_q.unsqueeze(1) - current_q
        huber_loss = F.smooth_l1_loss(current_q, target_q.unsqueeze(1), reduction='none')
        quantile_loss = (torch.abs(quantiles.unsqueeze(-1) - (td_errors.detach() < 0).float()) * huber_loss).mean()

        self.optimizer.zero_grad()
        quantile_loss.backward()
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter % self.target_update_freq == 0:
            self.update_target_network()

        return quantile_loss.item()

    def update_target_network(self):
        self.model_target.load_state_dict(self.model.state_dict())

    def push(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def save(self, path):
        torch.save(self.model.state_dict(), path)

    def load(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model.eval()

    def reset(self):
        self.steps_done = 0
    
    def pop(self, count=1):
        self.replay_buffer.pop(count)


In [None]:
def simplify_state(state):
    agent_x, agent_y = get_agent_position(state)
    obstacles = get_nearby_obstacles(state, agent_x, agent_y)
    timbers = get_nearby_timbers(state, agent_x, agent_y)
    
    simplified_state = ((agent_x, agent_y), tuple(set(obstacles)), tuple(set(timbers)))
    
    return simplified_state

def get_agent_position(state):
    for i, row in enumerate(state):
        for j, cell in enumerate(row):
            if cell == 1:
                return j, i
    return state.shape[1] // 2, state.shape[0] - 1
    
            
def get_nearby_obstacles(state, agent_x, agent_y):
    obstacles = []
    for i, row in enumerate(state):
        for j, cell in enumerate(row):
            if cell == 2:
                obstacles.append((j, i))
    return obstacles

def get_nearby_timbers(state, agent_x, agent_y):
    timbers = []
    for i, row in enumerate(state):
        for j, cell in enumerate(row):
            if cell == 3:
                timbers.append((j, i))
    return timbers


In [None]:
import pyautogui
import cv2
import time
import keyboard
import torchvision
from ultralytics import YOLO


RES_X = 1920
RES_Y = 1080

GAME_REGION = (405, 210, 850, 480)
restart_button = cv2.imread('restart_button.png', cv2.IMREAD_GRAYSCALE)


def get_screen(region):
    screen = pyautogui.screenshot(region=(region[0], region[1], region[2], region[3]))
    
    non_crop = screen.copy()

    transforms = torchvision.transforms.Compose([
        torchvision.transforms.RandomRotation((14, 14)),
        torchvision.transforms.CenterCrop((320, 566)),
        torchvision.transforms.Resize((240, 425)),
    ])
    
    screen = transforms(screen)   
    
    screen = cv2.cvtColor(np.array(screen), cv2.COLOR_RGB2BGR)
    
    non_crop = cv2.cvtColor(np.array(non_crop), cv2.COLOR_RGB2BGR)
    non_crop = cv2.resize(non_crop, (425, 240))
    
    return screen, non_crop

import numpy as np

def map_to_grid(image_size, grid_size, boxes, class_labels):
    """
    Map detected bounding boxes to a grid representation.

    Args:
        image_size: Tuple (width, height) of the image.
        grid_size: Tuple (N, M) of the grid dimensions.
        boxes: List of bounding boxes [(x_min, y_min, x_max, y_max)].
        class_labels: List of class labels corresponding to the boxes.

    Returns:
        grid: 2D numpy array of shape (N, M) with object class labels.
    """
    width, height = image_size
    grid_width, grid_height = grid_size
    grid = np.zeros((grid_height, grid_width), dtype=int)

    cell_width = width / grid_width
    cell_height = height / grid_height

    for (x_min, y_min, x_max, y_max), label in zip(boxes, class_labels):
        x_start = int(x_min // cell_width)
        y_start = int(y_min // cell_height)
        x_end = int(np.ceil(x_max / cell_width))
        y_end = int(np.ceil(y_max / cell_height))

        for y in range(y_start, y_end):
            for x in range(x_start, x_end):
                grid[y, x] = label + 1

    return grid


def get_state(screen):
    results = cv_model(screen, verbose=False)

    image_size = (425, 240)
    grid_size = (36, 32)

    boxes = []
    labels = []
    
    boxes_ = results[0].boxes
    for box in boxes_:
        x_min, y_min, x_max, y_max = box.xyxy[0].tolist()
        
        class_id = int(box.cls[0].item())
        
        boxes.append((x_min, y_min, x_max, y_max))
        labels.append(class_id)

    boxes, labels = zip(*sorted(zip(boxes, labels), key=lambda x: -x[1]))    
    
    grid = map_to_grid(image_size, grid_size, boxes, labels)

    return grid

def is_game_over(image, score_threshold=0.5, scale=0.5):
    grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized_template = cv2.resize(restart_button, (0, 0), fx=scale, fy=scale)
    h, w = grey_image.shape

    cropped_search_box = grey_image[int(h * 0.87):, int(w * 0.43):int(w * 0.57)]
    
    result = cv2.matchTemplate(cropped_search_box, resized_template, cv2.TM_CCOEFF_NORMED)
    result = np.sort(result.flatten())[::-1]
    
    return result.max() > score_threshold

def process_state(state, max_obstacles=1152, max_timbers=1152):
    agent_pos, obstacles, timbers = state[0], state[1], state[2]
    
    state_vector = list(agent_pos)
    
    obstacles = list(obstacles)
    for i in range(max_obstacles):
        if i < len(obstacles):
            state_vector.extend(obstacles[i])
        else:
            state_vector.extend([0, 0])
   
    timbers = list(timbers)
    for i in range(max_timbers):
        if i < len(timbers):
            state_vector.extend(timbers[i])
        else:
            state_vector.extend([0, 0])
    
    return state_vector

def compute_reward(reward_state):
    time = reward_state['time']
    state = reward_state['state']
    action = reward_state['action']
    prev_action = reward_state['prev_action']
    next_state = reward_state['next_state']
    total_reward = reward_state['total_reward']
    non_crop_state = reward_state['non_crop_state']
    reward = 0
    
    if is_game_over(non_crop_state):
        reward = -100
        return reward
    
    if action == 0:
        reward += 2
    # elif action == 1:
    #     reward -= 0
    elif action == 2:
        reward += 0.25
    elif action == 3:
        reward += 0.25
    elif action == 4:
        reward += 0.2
    
    hen_count = 0
    for row in next_state:
        for cell in row:
            if cell == 1:
                hen_count += 1

    if action == prev_action:
        if state[0][0] // 5 == next_state[0][0] // 5 \
        and state[0][1] // 5 == next_state[0][1] // 5:
            reward -= 10
            
    reward = round(reward, 2)
    
    return reward
    
    
actions = [0, 1, 2, 3, 4]

input_size = 4610
agent = IQNAgent(input_size, 5, 128, 10000, 32, 0.99, 1.0, 0.1, 1000)
cv_model = YOLO('best_cv.pt')
episodes = 1000
episode_length = 1000
losses = []
rewards = []

print("Model is ready to train")

keyboard.wait('q')

for episode in range(episodes):
    screenshot, non_crop_state = get_screen(GAME_REGION)
    state_raw = get_state(screenshot)
    
    total_reward = 0
    total_loss = 0
    action = 0

    state = simplify_state(state_raw)
    total_reward = 0
    done = False
    
    
    start_time = time.time()
    for step in range(episode_length):
        prev_action = action
        time.sleep(0.025)
        
        state_vector = process_state(state)
        
        action = agent.select_action(state_vector)
        
        if action < 4:
            pyautogui.press(['up', 'down', 'left', 'right'][action])
        
        
        next_screenshot, non_crop_state = get_screen(GAME_REGION)
        next_state_raw = get_state(next_screenshot)
        
        next_state = simplify_state(next_state_raw)
        next_state_vector = process_state(next_state)
        
        reward_state = {
            'state': state,
            'action': action,
            'prev_action': prev_action,
            'next_state': next_state,
            'time': time.time() - start_time,
            'total_reward': total_reward,
            'non_crop_state': non_crop_state
        }
        
        done = 0
        reward = compute_reward(reward_state)

        agent.push(state_vector, action, reward, next_state_vector, done)

        state = next_state
        total_reward += reward
        
        if is_game_over(non_crop_state):
            done = 1
            break
        
        loss = agent.optimize_model()

        print(f"Step: {step}, Action: {action}, Reward: {reward}, Total Reward: {total_reward}, Loss: {loss}")
 
    keyboard.press_and_release('space')
    losses.append(total_loss)
    rewards.append(total_reward)
    print('\nepisode: {}, reward: {}\n'.format(episode, total_reward))
    
    agent.pop(4)    

    if episode % 10 == 0:
        agent.save('dqn.pth')
        print("Model saved")
    
    time.sleep(3.25)
    keyboard.press_and_release('space')



Model is ready to train
Step: 0, Action: 0, Reward: 2.0, Total Reward: 2.0, Loss: None
Step: 1, Action: 2, Reward: 0.25, Total Reward: 2.25, Loss: None
Step: 2, Action: 2, Reward: 0.25, Total Reward: 2.5, Loss: None
Step: 3, Action: 2, Reward: 0.25, Total Reward: 2.75, Loss: None
Step: 4, Action: 0, Reward: 2.0, Total Reward: 4.75, Loss: None
Step: 5, Action: 0, Reward: -8.0, Total Reward: -3.25, Loss: None
Step: 6, Action: 3, Reward: 0.25, Total Reward: -3.0, Loss: None
Step: 7, Action: 3, Reward: 0.25, Total Reward: -2.75, Loss: None
Step: 8, Action: 3, Reward: -9.75, Total Reward: -12.5, Loss: None
Step: 9, Action: 1, Reward: 0.0, Total Reward: -12.5, Loss: None
Step: 10, Action: 3, Reward: 0.25, Total Reward: -12.25, Loss: None
Step: 11, Action: 4, Reward: 0.2, Total Reward: -12.05, Loss: None
Step: 12, Action: 1, Reward: 0.0, Total Reward: -12.05, Loss: None
Step: 13, Action: 1, Reward: -10.0, Total Reward: -22.05, Loss: None
Step: 14, Action: 2, Reward: 0.25, Total Reward: -21.8,

  huber_loss = F.smooth_l1_loss(current_q, target_q.unsqueeze(1), reduction='none')


Step: 6, Action: 4, Reward: 0.2, Total Reward: 4.95, Loss: 2.746652364730835
Optimizing model...
Step: 7, Action: 0, Reward: 2.0, Total Reward: 6.95, Loss: 2.687303304672241
Optimizing model...
Step: 8, Action: 2, Reward: 0.25, Total Reward: 7.2, Loss: 2.721449851989746
Optimizing model...
Step: 9, Action: 3, Reward: 0.25, Total Reward: 7.45, Loss: 2.374948501586914

episode: 1, reward: -92.55

Optimizing model...
Step: 0, Action: 0, Reward: 2.0, Total Reward: 2.0, Loss: 4.115664005279541
Optimizing model...
Step: 1, Action: 1, Reward: 0.0, Total Reward: 2.0, Loss: 4.0185089111328125
Optimizing model...
Step: 2, Action: 0, Reward: 2.0, Total Reward: 4.0, Loss: 3.8132288455963135
Optimizing model...
Step: 3, Action: 2, Reward: 0.25, Total Reward: 4.25, Loss: 3.7864651679992676
Optimizing model...
Step: 4, Action: 1, Reward: 0.0, Total Reward: 4.25, Loss: 4.2097978591918945
Optimizing model...
Step: 5, Action: 4, Reward: 0.2, Total Reward: 4.45, Loss: 2.673335552215576
Optimizing model..

KeyboardInterrupt: 