In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random


In [64]:
# DQN model for Crossy Road game
# input is the 2D grid of the game state (160 rows x 180 columns as produced by get_state)
# output is one Q-value per action: 0-3 for (up, down, left, right), 4 for "press nothing"

class DQN(nn.Module):
    """Fully connected Q-network mapping a flattened game grid to action values.

    Args:
        input_size: Scale of the input layer; the first layer expects
            ``input_size * 160`` features (e.g. 180 for a 160 x 180 grid).
        output_size: Number of discrete actions (one Q-value each).
        hidden_size: Base width of the hidden layers; the layers are
            ``hidden_size * 16``, ``hidden_size`` and ``hidden_size // 4`` wide.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size * 160, hidden_size * 16)
        self.fc2 = nn.Linear(hidden_size * 16, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size // 4)
        self.fc4 = nn.Linear(hidden_size // 4, output_size)

    def forward(self, x):
        """Return the Q-values for ``x``.

        Tensors with more than two dimensions are treated as a batch and
        flattened per sample; anything else is flattened into ONE sample
        (so a 2-D tensor is a single grid, not a batch of flat vectors).
        """
        if len(x.shape) > 2:
            batch_size = x.shape[0]
            x = x.view(batch_size, -1)
        else:
            x = torch.flatten(x)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)

        return x

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: Single game grid (array-like or tensor).
            epsilon: Probability in [0, 1] of taking a uniformly random action.

        Returns:
            int: Index of the chosen action in ``range(output_size)``.
        """
        if random.random() < epsilon:
            # Explore: sample from the network's real action count rather
            # than a hard-coded list that can drift out of sync.
            return random.randrange(self.fc4.out_features)

        # Exploit: inference only, so skip building an autograd graph.
        with torch.no_grad():
            q_value = self.forward(torch.as_tensor(state, dtype=torch.float32))
        return torch.argmax(input=q_value).item()


In [65]:
# Replay buffer for DQN
# stores the transitions (state, action, reward, next_state, done)
# and samples a batch of transitions for training

class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done).

    Once ``capacity`` transitions are stored, each new push overwrites the
    oldest slot.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition at the current write position."""
        transition = (state, action, reward, next_state, done)
        has_room = len(self.buffer) < self.capacity
        if has_room:
            # Grow by one placeholder slot; it is filled right below.
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and stack them field-wise.

        Returns:
            Five numpy arrays: states, actions, rewards, next states, dones.
        """
        picked = random.sample(self.buffer, batch_size)
        columns = zip(*picked)
        state, action, reward, next_state, done = (np.stack(col) for col in columns)
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
    
    
# DQN agent for Crossy Road game
# uses DQN model and replay buffer for training

class DQNAgent:
    """DQN agent bundling the Q-network, replay buffer and optimizer.

    Args:
        input_size, output_size, hidden_size: Forwarded to ``DQN``.
        replay_buffer_capacity: Maximum number of stored transitions.
        batch_size: Transitions sampled per optimisation step.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon_start, epsilon_end, epsilon_decay: Exploration schedule
            ``eps = end + (start - end) * exp(-steps_done / decay)``.
    """

    def __init__(self, input_size, output_size, hidden_size, replay_buffer_capacity, batch_size, gamma, epsilon_start, epsilon_end, epsilon_decay):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.replay_buffer_capacity = replay_buffer_capacity
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.model = DQN(input_size, output_size, hidden_size)
        self.replay_buffer = ReplayBuffer(replay_buffer_capacity)
        self.optimizer = optim.Adam(self.model.parameters())

        self.steps_done = 0

    def select_action(self, state):
        """Choose an action for ``state`` and advance the epsilon schedule."""
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * np.exp(-1. * self.steps_done / self.epsilon_decay)
        self.steps_done += 1
        return self.model.act(state, epsilon)

    def optimize_model(self):
        """Run one gradient step on a sampled mini-batch.

        Returns:
            float | None: The MSE loss, or ``None`` while the buffer holds
            fewer than ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        states, action, reward, next_states, done = self.replay_buffer.sample(self.batch_size)

        # as_tensor with an explicit dtype accepts int/bool/float64 arrays alike.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        action = torch.as_tensor(action, dtype=torch.long)
        reward = torch.as_tensor(reward, dtype=torch.float32)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        done = torch.as_tensor(done, dtype=torch.float32)

        q_values = self.model(states)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # The TD target is a constant for this step: no gradient through it,
        # and no bootstrapping past a terminal transition (done == 1).
        with torch.no_grad():
            next_q_value = self.model(next_states).max(1)[0]
            expected_q_value = reward + self.gamma * next_q_value * (1.0 - done)

        loss = F.mse_loss(q_value, expected_q_value)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def push(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.replay_buffer.push(state, action, reward, next_state, done)

    def save(self, path):
        """Write the network weights (state dict) to ``path``."""
        torch.save(self.model.state_dict(), path)

    def load(self, path):
        """Load network weights from ``path`` and switch the model to eval mode."""
        self.model.load_state_dict(torch.load(path))
        self.model.eval()

    def reset(self):
        """Restart the epsilon schedule from ``epsilon_start``."""
        self.steps_done = 0
        

In [66]:
import collections
import random

class QLearningAgent:
    """Tabular epsilon-greedy Q-learning over hashable states.

    Q-values live in a ``defaultdict(float)`` keyed by ``(state, action)``,
    so unseen pairs start at 0.0.
    """

    def __init__(self, actions, learning_rate, gamma, epsilon):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = collections.defaultdict(float)

    def choose_action(self, state):
        """Return a random action with probability epsilon, else a greedy one."""
        exploring = random.random() < self.epsilon
        if exploring:
            return random.choice(self.actions)

        # Score every action, then break ties between the best ones at random.
        scored = [(candidate, self.q_table[(state, candidate)]) for candidate in self.actions]
        top_value = max(value for _, value in scored)
        tied_best = [candidate for candidate, value in scored if value == top_value]
        return random.choice(tied_best)

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition."""
        key = (state, action)
        previous_q = self.q_table[key]
        if done:
            # Terminal transition: nothing to bootstrap from.
            best_future = 0
        else:
            best_future = max([self.q_table[(next_state, a)] for a in self.actions])
        td_target = reward + self.gamma * best_future
        self.q_table[key] += self.lr * (td_target - previous_q)


In [67]:
def simplify_state(state):
    """Reduce a label grid to a hashable key usable in a Q-table.

    Args:
        state: 2-D label grid passed on to the position helpers.

    Returns:
        tuple: ``((agent_x, agent_y), obstacles, timbers)`` where the last two
        are tuples of unique ``(x, y)`` cells in ascending order. Positions
        are raw grid coordinates (no discretisation is applied).
    """
    agent_x, agent_y = get_agent_position(state)
    obstacles = get_nearby_obstacles(state, agent_x, agent_y)
    timbers = get_nearby_timbers(state, agent_x, agent_y)

    # Sort after de-duplicating: bare set iteration order is not canonical,
    # so equal frames could otherwise yield different (unequal) tuple keys.
    return (
        (agent_x, agent_y),
        tuple(sorted(set(obstacles))),
        tuple(sorted(set(timbers))),
    )

def get_agent_position(state):
    """Locate the agent (cell value 1) in a 2-D label grid.

    Args:
        state: 2-D grid (numpy array or nested lists) of integer labels.

    Returns:
        tuple[int, int]: ``(x, y)`` = ``(column, row)`` of the first matching
        cell in row-major order. If no cell equals 1, falls back to the
        bottom row, middle column.
    """
    grid = np.asarray(state)  # lets the shape-based fallback work for lists too
    matches = np.argwhere(grid == 1)  # rows of [row, col], row-major order
    if len(matches):
        row, col = matches[0].tolist()
        return col, row
    return grid.shape[1] // 2, grid.shape[0] - 1
    
            
def get_nearby_obstacles(state, agent_x, agent_y):
    """List every obstacle cell (value 2) in the grid.

    Args:
        state: 2-D grid (numpy array or nested lists) of integer labels.
        agent_x, agent_y: Accepted for interface compatibility; currently
            unused, so ALL obstacle cells are returned, not only nearby ones.

    Returns:
        list[tuple[int, int]]: ``(x, y)`` = ``(column, row)`` pairs in
        row-major scan order.
    """
    # One vectorised scan instead of a Python loop over every cell.
    hits = np.argwhere(np.asarray(state) == 2).tolist()
    return [(col, row) for row, col in hits]

def get_nearby_timbers(state, agent_x, agent_y):
    """List every timber cell (value 3) in the grid.

    Args:
        state: 2-D grid (numpy array or nested lists) of integer labels.
        agent_x, agent_y: Accepted for interface compatibility; currently
            unused, so ALL timber cells are returned, not only nearby ones.

    Returns:
        list[tuple[int, int]]: ``(x, y)`` = ``(column, row)`` pairs in
        row-major scan order.
    """
    # One vectorised scan instead of a Python loop over every cell.
    hits = np.argwhere(np.asarray(state) == 3).tolist()
    return [(col, row) for row, col in hits]


In [68]:
# run DQN agent on Crossy Road game to train and play the game
import pyautogui
import cv2
import time
import keyboard
from pygame import ver
from torch import res
import torchvision
from ultralytics import YOLO


# NOTE(review): presumably the resolution of the monitor the capture region
# was measured on; neither value is read elsewhere in this notebook.
RES_X = 1920
RES_Y = 1080

# Screen rectangle passed to pyautogui.screenshot(region=...).
GAME_REGION = (405, 210, 850, 480)

# Grayscale template matched by is_game_over(). cv2.imread signals failure by
# returning None instead of raising, so check here rather than crashing with
# an opaque OpenCV error in the middle of an episode.
restart_button = cv2.imread('restart_button.png', cv2.IMREAD_GRAYSCALE)
if restart_button is None:
    raise FileNotFoundError(
        "Could not read 'restart_button.png'; it must be a readable image in the working directory."
    )


def get_screen(region):
    """Capture the game area and return two BGR views of it.

    Args:
        region: Indexable of four values forwarded to
            ``pyautogui.screenshot(region=...)``.

    Returns:
        tuple: ``(screen, non_crop)`` where ``screen`` is the capture rotated
        by a fixed 14 degrees, centre-cropped to 320 x 566 and resized to
        240 x 425 (height x width), and ``non_crop`` is the untouched capture
        resized to 425 x 240 (width x height). Both are BGR numpy arrays.
    """
    capture = pyautogui.screenshot(region=(region[0], region[1], region[2], region[3]))
    untouched = capture.copy()

    # min == max degrees makes RandomRotation a deterministic 14 degree turn.
    preprocess = torchvision.transforms.Compose([
        torchvision.transforms.RandomRotation((14, 14)),
        torchvision.transforms.CenterCrop((320, 566)),
        torchvision.transforms.Resize((240, 425)),
    ])
    rotated_rgb = np.array(preprocess(capture))
    screen = cv2.cvtColor(rotated_rgb, cv2.COLOR_RGB2BGR)

    untouched_bgr = cv2.cvtColor(np.array(untouched), cv2.COLOR_RGB2BGR)
    non_crop = cv2.resize(untouched_bgr, (425, 240))

    return screen, non_crop

import numpy as np

def map_to_grid(image_size, grid_size, boxes, class_labels):
    """
    Rasterise detected bounding boxes onto a coarse label grid.

    Args:
        image_size: Tuple (width, height) of the image in pixels.
        grid_size: Tuple (grid_width, grid_height), i.e. (columns, rows).
        boxes: Iterable of bounding boxes (x_min, y_min, x_max, y_max) in pixels.
        class_labels: Class ids matching ``boxes`` one-to-one.

    Returns:
        grid: 2D numpy int array of shape (grid_height, grid_width). A cell
        holds ``label + 1`` of the last box covering it, or 0 if empty.
        Parts of a box outside the image are ignored.
    """
    width, height = image_size
    grid_width, grid_height = grid_size
    grid = np.zeros((grid_height, grid_width), dtype=int)

    cell_width = width / grid_width
    cell_height = height / grid_height

    def _clamp(index, upper):
        # Keep indices inside [0, upper]: negative values would wrap around to
        # the far edge, and values above ``upper`` would overrun the array.
        return max(0, min(upper, index))

    for (x_min, y_min, x_max, y_max), label in zip(boxes, class_labels):
        x_start = _clamp(int(x_min // cell_width), grid_width)
        y_start = _clamp(int(y_min // cell_height), grid_height)
        # ceil() can land one past the last cell for boxes touching the border
        # (float rounding), hence the clamp on the end indices as well.
        x_end = _clamp(int(np.ceil(x_max / cell_width)), grid_width)
        y_end = _clamp(int(np.ceil(y_max / cell_height)), grid_height)

        # Empty slices (start >= end) are a harmless no-op.
        grid[y_start:y_end, x_start:x_end] = label + 1

    return grid


def get_state(screen):
    """Run the detector on a frame and rasterise its boxes to a label grid.

    Args:
        screen: Image handed to the module-level ``cv_model``.

    Returns:
        The grid produced by ``map_to_grid`` for a 425 x 240 (width x height)
        image and a 180 x 160 (columns x rows) grid.
    """
    image_size = (425, 240)   # (width, height) of the frame
    grid_size = (180, 160)    # (grid_width, grid_height)

    detections = cv_model(screen, verbose=False)[0].boxes

    # Collect (box, class id) pairs first, then split them into parallel lists.
    found = []
    for detection in detections:
        corners = tuple(detection.xyxy[0].tolist())
        class_id = int(detection.cls[0].item())
        found.append((corners, class_id))

    boxes = [corners for corners, _ in found]
    labels = [class_id for _, class_id in found]

    return map_to_grid(image_size, grid_size, boxes, labels)

def is_game_over(image, score_threshold=0.5, scale=0.5):
    """Detect the restart button near the bottom centre of a frame.

    Args:
        image: BGR frame to inspect.
        score_threshold: Minimum normalised correlation score counted as a match.
        scale: Factor applied to the ``restart_button`` template before matching.

    Returns:
        bool: True when the best template-match score exceeds ``score_threshold``.
    """
    grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized_template = cv2.resize(restart_button, (0, 0), fx=scale, fy=scale)
    h, w = grey_image.shape

    # Only search the bottom 13% of rows and the central 14% of columns.
    cropped_search_box = grey_image[int(h * 0.87):, int(w * 0.43):int(w * 0.57)]

    result = cv2.matchTemplate(cropped_search_box, resized_template, cv2.TM_CCOEFF_NORMED)

    # Only the peak score matters, so take the max directly instead of
    # sorting the whole correlation map first.
    return bool(result.max() > score_threshold)

# Reward shaping constants used by compute_reward().
_GAME_OVER_REWARD = -100
_ACTION_REWARDS = {0: 3, 1: -2, 2: -0.25, 3: -0.25, 4: -0.5}
_REPEAT_PENALTY = 0.25
_BLOCKED_FORWARD_PENALTY = 10


def compute_reward(reward_state):
    """Score one step of play.

    Args:
        reward_state: Dict with the keys read here:
            'state' / 'next_state' -- simplified states whose first element
                is the agent's (x, y) position,
            'action' / 'prev_action' -- action indices,
            'non_crop_state' -- frame passed to ``is_game_over``.
            Extra keys (e.g. 'time', 'total_reward') are accepted and ignored.

    Returns:
        -100 on game over; otherwise the per-action reward, minus 0.25 if the
        action repeats the previous one, minus 10 if action 0 left the agent
        in the same 10-cell bucket on both axes.
    """
    state = reward_state['state']
    action = reward_state['action']
    prev_action = reward_state['prev_action']
    next_state = reward_state['next_state']
    non_crop_state = reward_state['non_crop_state']

    if is_game_over(non_crop_state):
        return _GAME_OVER_REWARD

    # Actions outside the table earn nothing, as before.
    reward = 0
    if action in _ACTION_REWARDS:
        reward += _ACTION_REWARDS[action]

    if action == prev_action:
        reward -= _REPEAT_PENALTY

    if action == 0:
        # Action 0 without changing the coarse position means the move had
        # no effect.
        (x_before, y_before), (x_after, y_after) = state[0], next_state[0]
        if x_before // 10 == x_after // 10 and y_before // 10 == y_after // 10:
            reward -= _BLOCKED_FORWARD_PENALTY

    return reward
    

# Train the tabular Q-learning agent by playing the live game.
# DQN alternative: agent = DQNAgent(180, 5, 128, 1000, 32, 0.99, 1.0, 0.1, 10000)
actions = [0, 1, 2, 3, 4]  # 0-3 press an arrow key, 4 presses nothing
agent = QLearningAgent(actions=actions, learning_rate=0.1, gamma=0.99, epsilon=0.1)
cv_model = YOLO('best_cv.pt')
episodes = 1000
episode_length = 1000
losses = []
rewards = []

# agent.load('dqn.pth')

print("Model is ready to train")

# Block until 'q' is pressed so the game window can be focused first.
keyboard.wait('q')

for episode in range(episodes):
    screenshot, non_crop_state = get_screen(GAME_REGION)
    state = simplify_state(get_state(screenshot))
    total_reward = 0
    total_loss = 0  # stays 0 for tabular Q-learning; kept for the DQN variant
    action = 0

    start_time = time.time()
    for step in range(episode_length):
        prev_action = action
        time.sleep(0.1)

        action = agent.choose_action(state)

        if action < 4:
            pyautogui.press(['up', 'down', 'left', 'right'][action])

        next_screenshot, non_crop_state = get_screen(GAME_REGION)
        next_state = simplify_state(get_state(next_screenshot))

        reward_state = {
            'state': state,
            'action': action,
            'prev_action': prev_action,
            'next_state': next_state,
            'time': time.time() - start_time,
            'total_reward': total_reward,
            'non_crop_state': non_crop_state
        }

        reward = compute_reward(reward_state)
        done = is_game_over(non_crop_state)
        total_reward += reward

        # Learn from the transition BEFORE advancing the state; updating
        # `state` first would feed learn() state == next_state and credit the
        # reward to the wrong Q-table entry.
        # agent.push(state, action, reward, next_state, done)
        agent.learn(state, action, reward, next_state, done)
        state = next_state

        if done:
            break

        print(f"Step: {step}, Action: {action}, Reward: {reward}, Total Reward: {total_reward}, Hen Position: {state[0]}")

    # tap space key to restart the game
    keyboard.press_and_release('space')
    losses.append(total_loss)
    rewards.append(total_reward)
    print('\nepisode: {}, loss: {}, reward: {}'.format(episode, total_loss, total_reward))
    # agent.save('dqn.pth')
    # agent.reset()

    time.sleep(3.25)
    keyboard.press_and_release('space')



Model is ready to train
Step: 0, Action: 2, Reward: -0.25, Total Reward: -0.25, Hen Position: (77, 110)
Step: 1, Action: 1, Reward: -2, Total Reward: -2.25, Hen Position: (73, 127)
Step: 2, Action: 0, Reward: -7, Total Reward: -9.25, Hen Position: (77, 122)
Step: 3, Action: 1, Reward: -2, Total Reward: -11.25, Hen Position: (79, 133)
Step: 4, Action: 2, Reward: -0.25, Total Reward: -11.5, Hen Position: (65, 147)
Step: 5, Action: 1, Reward: -2, Total Reward: -13.5, Hen Position: (90, 159)
Step: 6, Action: 4, Reward: -0.5, Total Reward: -14.0, Hen Position: (90, 159)
Step: 7, Action: 3, Reward: -0.25, Total Reward: -14.25, Hen Position: (90, 159)
Step: 8, Action: 2, Reward: -0.25, Total Reward: -14.5, Hen Position: (90, 159)
Step: 9, Action: 0, Reward: -7, Total Reward: -21.5, Hen Position: (90, 159)
Step: 10, Action: 1, Reward: -2, Total Reward: -23.5, Hen Position: (90, 159)
Step: 11, Action: 4, Reward: -0.5, Total Reward: -24.0, Hen Position: (90, 159)
Step: 12, Action: 2, Reward: -0.

KeyboardInterrupt: 