In [1]:
from model import BotDemineur
from utils import Transition, ReplayMemory
from env import DemineurInterface
import torch
import torch.optim as optim
import torch.nn as nn
import random
import math

import pyautogui

import time
import datetime

  warn(f"Failed to load image Python extension: {e}")


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
rows = 9
cols = 9
env = DemineurInterface(rows, cols)

cpu


In [3]:
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.60


EPS_END = 0.05
EPS_DECAY = 30000
TARGET_UPDATE = 30

In [4]:
# Get number of actions from gym action space

n_actions = env.action_space_nb

policy_net = BotDemineur(env.grid.rows, env.grid.cols, n_actions).to(device)
target_net = BotDemineur(env.grid.rows, env.grid.cols, n_actions).to(device)

policy_net.load_state_dict(torch.load("weights/9-9-4400.pt"))
target_net.load_state_dict(policy_net.state_dict())

target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

env.reset()
steps_done = 0

In [5]:
def select_action(state):
    """ 
        Select action to use according to the state

        state: np.array, grid of the game
    """

    
    global steps_done
    
    sample = random.random()
     
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    
    if sample > eps_threshold:
        #print("policy action")
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)

            
    
    else:
        #print("random action")
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    

In [6]:
def optimize_model():
    
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber lossqaaq
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    

In [10]:
import keyboard

num_episodes = 20000

nb_victories = []

pyautogui.PAUSE = 0.01

print("start")

ref = time.time()

for i_episode in range(0, num_episodes):
    
    env.reset()
    state = env.get_state()
    previous_action = None
    
    done = False
    
    while not done:

        action = select_action(state)
        reward, done = env.step(action.item(), steps_done)
        
        #print("Reward: ", reward, 'Done: ', done)
        
        reward = torch.tensor([reward], device=device)

        if not done:
            next_state = env.get_state()
        else:
            next_state = None

        if action != previous_action:
            memory.push(state, action, next_state, reward)
        
        state = next_state
        previous_action = action
        
        optimize_model()
        
        if keyboard.is_pressed('q'):
            done = True

    if (i_episode + 1) % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

        nb_victories.append(env.victories)

    if (i_episode + 1) % 200 == 0:
        torch.save(policy_net.state_dict(), "weights/{}-{}-{}.pt".format(rows, cols, i_episode+1 + 7200))

    if (i_episode + 1) % 20 == 0:


        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
        print(datetime.timedelta(seconds = time.time() - ref), i_episode, env.victories, eps_threshold)


start
0:02:21.307018 19 0 0.08579774651414526
0:04:38.009496 39 0 0.08448823592568565
0:07:44.912497 59 0 0.08276688095289735
0:10:38.461503 79 0 0.08128540239368424
0:14:06.374999 99 0 0.07957072775707692
0:16:38.662997 119 0 0.07838663052675422
0:19:16.723496 139 0 0.07720819672874352
0:22:43.395000 159 0 0.0756938486615121
0:26:05.780499 179 0 0.07432452121138566
0:30:00.050501 199 0 0.07281500145667279
0:33:22.710496 219 0 0.07162070913355029
0:36:49.138496 239 0 0.07044595257675125
0:39:46.667497 259 0 0.06948253217519876
0:42:37.710524 279 0 0.06859981457456119
0:46:49.219995 299 0 0.06737591314005456
0:51:32.122499 319 0 0.06607478044008583
0:54:24.802018 339 0 0.06535976781754223
0:57:48.740519 359 0 0.06453488484763101


FailSafeException: PyAutoGUI fail-safe triggered from mouse moving to a corner of the screen. To disable this fail-safe, set pyautogui.FAILSAFE to False. DISABLING FAIL-SAFE IS NOT RECOMMENDED.

In [None]:
eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
print(eps_threshold)

''' policy_net = policy_net.to(torch.float64)
policy_net(state) '''

In [8]:
print(len(memory))
steps_done



10000


78333

In [None]:
img = env.grab_image()
env.grid.is_done(img)

env.step(0)


env.victories

In [None]:
i,j = 0, 0

index = i* env.grid.cols + j
env.step(index)
env.step(index)

In [None]:
env.victories



img = env.grab_image()
env.grid.extract_from_image(img)
print(env.grid.grid)
print(env.grid.is_done(img))

In [None]:
time.sleep(2)
env.reset()