In [1]:
import os
import random
from pathlib import Path

import numpy as np
from tqdm.notebook import trange
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
%matplotlib inline
from dpipe.io import load, choose_existing
from dpipe.torch import load_model_state

from ttt_lib.policy_player import PolicyPlayer
from ttt_lib.self_games import play_self_game, play_duel
from ttt_lib.torch.module.policy_net import PolicyNetworkRandom, PolicyNetworkQ10Light, PolicyNetworkQ10
from ttt_lib.field import Field
from ttt_lib.utils import choose_model
from ttt_lib.monte_carlo_tree_search import run_search, mcts_action

pygame 2.0.2 (SDL 2.0.16, Python 3.8.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Root directory of the q_10x10 experiments. Only one candidate is handed to
# choose_existing, so this is that path whenever the call succeeds
# (NOTE(review): presumably it raises if the path is missing — confirm in dpipe).
Q_EXP_PATH = choose_existing(Path('/nmnt/x4-hdd/experiments/rl/q_10x10'))

# Display the experiment folders found under the root.
os.listdir(Q_EXP_PATH)

['TB8_continue_eps04',
 'q8_test',
 'TB8_continue_eps04_random_start',
 'tb8_45_tb8_32_tb8_50_eps05',
 'tb8_45_tb8_32_tb8_50_eps04',
 'q4_test',
 'tb8_load45_load32',
 'ql8_load45_load32',
 'q2_test',
 'q8_load45']

In [3]:
# Experiment whose checkpoint is loaded into the evaluation model below.
exp_path = Q_EXP_PATH.joinpath('tb8_45_tb8_32_tb8_50_eps05')

In [4]:
# Shared settings reused by every Field / network / player built in this notebook.
device = 'cpu'
n, kernel_len = 10, 5      # passed to Field; `n` is also passed to the networks
cnn_features = (128, 64)   # forwarded to the network as `structure`

field = Field(n=n, kernel_len=kernel_len, device=device, check_device=device)

# model = PolicyNetworkQ10(n=n, structure=cnn_features)
model = PolicyNetworkQ10Light(n=n, structure=cnn_features)
# choose_model picks the checkpoint file name inside the experiment folder.
checkpoint_name = choose_model(exp_path)
load_model_state(model, exp_path / checkpoint_name)

# eps=0 for this player; `eps` is rebound later for the duel players.
eps = 0
player = PolicyPlayer(model=model, field=field, eps=eps, device=device)
player.eval()

# Duels

In [5]:
# Separate board instance shared by every duel player created below.
field_duel = Field(n=n, kernel_len=kernel_len, device=device, check_device=device)

# Rebinds `eps`: duel players constructed with `eps=eps` from here on get 0.2
# (presumably the exploration rate of an eps-greedy policy — TODO confirm).
eps = 0.2

# The duels are stochastic, so pin the Python, NumPy and PyTorch RNGs up front;
# otherwise the win rates printed below change on every Restart & Run All.
# (MCTS duels are bounded by wall-clock search time and may still vary.)
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# Baseline opponent: the random policy network wrapped in a player with eps=1.
model_random = PolicyNetworkRandom(n=n)
player_random = PolicyPlayer(
    model=model_random,
    field=field_duel,
    eps=1.,
    device=device,
)

## TB8 (127ep) vs TB8 (227ep)

In [7]:
# Both contenders use the same light Q-network architecture and share the duel
# board; they differ only in which experiment's checkpoint is loaded.
# Experiment folders are resolved against Q_EXP_PATH rather than repeating the
# absolute root, so changing the root in one place is enough.
player_model_tb8_127 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'tb8_load45_load32'
load_model_state(player_model_tb8_127.model, path / choose_model(path))

player_model_tb8_227 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'tb8_45_tb8_32_tb8_50_eps05'
load_model_state(player_model_tb8_227.model, path / choose_model(path))

In [8]:
# Play 1000 games per seating order: res_1 collects results with p1 passed
# first, res_2 with the two players swapped.
res_1, res_2 = [], []
p1, p2 = player_model_tb8_127, player_model_tb8_227

for _ in trange(1000):
    # Only the last element of the full return is kept; the other six values
    # were unpacked but never used, so they are discarded here.
    *_rest, w = play_duel(p1, p2, return_result_only=False)
    res_1.append(w)

    *_rest, w = play_duel(p2, p1, return_result_only=False)
    res_2.append(w)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  argmax_avail_action_idx = random.choice(torch.where(avail_p == avail_p.max(), 1., 0.).nonzero()).item()





In [9]:
# Share of res_1 games (p1 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_1 = np.array(res_1)
for value in (1, 0, -1):
    print(np.mean(outcomes_1 == value))

0.313
0.0
0.687


In [10]:
# Share of res_2 games (p2 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_2 = np.array(res_2)
for value in (1, 0, -1):
    print(np.mean(outcomes_2 == value))

0.691
0.0
0.309


## TB8 (227ep) eps=0.4 vs TB8 (227ep) eps=0.5

In [11]:
# Two players with identical architecture and the same play-time `eps`; the
# checkpoints come from the `..._eps04` and `..._eps05` experiments respectively.
# Experiment folders are resolved against Q_EXP_PATH rather than repeating the
# absolute root.
player_model_tb8_227_04 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'tb8_45_tb8_32_tb8_50_eps04'
load_model_state(player_model_tb8_227_04.model, path / choose_model(path))

player_model_tb8_227_05 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'tb8_45_tb8_32_tb8_50_eps05'
load_model_state(player_model_tb8_227_05.model, path / choose_model(path))

In [12]:
# Play 1000 games per seating order: res_1 collects results with p1 passed
# first, res_2 with the two players swapped.
res_1, res_2 = [], []
p1, p2 = player_model_tb8_227_04, player_model_tb8_227_05

for _ in trange(1000):
    # Only the last element of the full return is kept; the other six values
    # were unpacked but never used, so they are discarded here.
    *_rest, w = play_duel(p1, p2, return_result_only=False)
    res_1.append(w)

    *_rest, w = play_duel(p2, p1, return_result_only=False)
    res_2.append(w)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [13]:
# Share of res_1 games (p1 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_1 = np.array(res_1)
for value in (1, 0, -1):
    print(np.mean(outcomes_1 == value))

0.681
0.0
0.319


In [14]:
# Share of res_2 games (p2 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_2 = np.array(res_2)
for value in (1, 0, -1):
    print(np.mean(outcomes_2 == value))

0.629
0.0
0.371


## MCTS TB8 (10 sec) vs TB8

In [8]:
# Both players are greedy (eps=0) and load the checkpoint of the same
# experiment; the duels below differ only in the MCTS options given to play_duel.
# The experiment folder is resolved against Q_EXP_PATH rather than repeating
# the absolute root.
player_model_tb8_1 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=0, device=device,
)
path = Q_EXP_PATH / 'tb8_45_tb8_32_tb8_50_eps05'
load_model_state(player_model_tb8_1.model, path / choose_model(path))

player_model_tb8_2 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=0, device=device,
)
path = Q_EXP_PATH / 'tb8_45_tb8_32_tb8_50_eps05'
load_model_state(player_model_tb8_2.model, path / choose_model(path))

In [8]:
# One game with MCTS enabled only for the X side (search_time_x=10; the section
# heading gives the unit as seconds). Presumably the first player argument
# plays X — TODO confirm against play_duel.
wx = play_duel(player_model_tb8_1, player_model_tb8_2, return_result_only=True,
               mcts_x=True, search_time_x=10)
# Same pairing, but now MCTS is enabled only for the O side.
wo = play_duel(player_model_tb8_1, player_model_tb8_2, return_result_only=True,
               mcts_o=True, search_time_o=10)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  argmax_avail_action_idx = random.choice(torch.where(avail_p == avail_p.max(), 1., 0.).nonzero()).item()


In [9]:
# Results of the MCTS-as-X game and the MCTS-as-O game, in that order.
print(wx, wo)

1 -1


## MCTS TB8 (20 sec) vs MCTS TB8 (10 sec)

In [10]:
# Both sides use MCTS; the X side gets the longer search budget (20 vs 10).
wx = play_duel(player_model_tb8_1, player_model_tb8_2, return_result_only=True,
               mcts_x=True, mcts_o=True, search_time_x=20, search_time_o=10)
# Budgets swapped: the O side gets the longer search budget (20 vs 10).
wo = play_duel(player_model_tb8_1, player_model_tb8_2, return_result_only=True,
               mcts_x=True, mcts_o=True, search_time_x=10, search_time_o=20)

In [11]:
# Results with the longer budget on X, then with the longer budget on O.
print(wx, wo)

1 1


## MCTS TB8 (30 sec) vs MCTS TB8 (10 sec)

In [9]:
# Both sides use MCTS; the O side gets the longer search budget (30 vs 10).
# Only this seating is played here — there is no matching game with the longer
# budget on X.
wo = play_duel(player_model_tb8_1, player_model_tb8_2, return_result_only=True,
               mcts_x=True, mcts_o=True, search_time_x=10, search_time_o=30)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  center = to_np(by_diag1.nonzero()[0][2:])


In [10]:
# Result of the single game with the 30-unit search budget on the O side.
print(wo)

1


## TB8 (200+ ep) eps=0.4 vs TB8 (200+ ep) eps=0.4 + random starts

In [21]:
# Same architecture and play-time `eps` for both players; the checkpoints come
# from the `TB8_continue_eps04` experiment and its `_random_start` variant.
# Experiment folders are resolved against Q_EXP_PATH rather than repeating the
# absolute root.
player_model_tb8 = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'TB8_continue_eps04'
load_model_state(player_model_tb8.model, path / choose_model(path))

player_model_tb8_rs = PolicyPlayer(
    model=PolicyNetworkQ10Light(n=n, structure=cnn_features),
    field=field_duel, eps=eps, device=device,
)
path = Q_EXP_PATH / 'TB8_continue_eps04_random_start'
load_model_state(player_model_tb8_rs.model, path / choose_model(path))

In [22]:
# Result lists for the two seating orders: res_1 has p1 passed first to
# play_duel, res_2 has the players swapped.
res_1, res_2 = [], []
p1, p2 = player_model_tb8, player_model_tb8_rs


# 1000 iterations, each playing one game per seating order; with
# return_result_only=True the call's return value is appended directly.
for _ in trange(1000):
    res_1.append(play_duel(p1, p2, return_result_only=True))
    res_2.append(play_duel(p2, p1, return_result_only=True))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [23]:
# Share of res_1 games (p1 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_1 = np.array(res_1)
for value in (1, 0, -1):
    print(np.mean(outcomes_1 == value))

0.675
0.0
0.325


In [24]:
# Share of res_2 games (p2 passed first) ending with result 1, 0 and -1, in
# that order — presumably first-player win / draw / second-player win (TODO confirm).
outcomes_2 = np.array(res_2)
for value in (1, 0, -1):
    print(np.mean(outcomes_2 == value))

0.588
0.0
0.412
