In [None]:
from src import games
from src import actor
from src import interactive 
from src import tournament
import tensorflow as tf
import numpy as np
import pandas as pd
from random import choice
import time
import matplotlib
import matplotlib.pyplot as plt
import math

from random import choices
from tqdm.notebook import tqdm
from copy import deepcopy
from ipywidgets import interact

import self_play

In [None]:
def convolutional_block(x, filters, kernel_size=(3, 3), stride=1, activation=tf.nn.swish):
    """Run `x` through Conv2D -> BatchNormalization -> `activation`.

    Args:
        x: Input tensor handed to the convolution.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        stride: Stride used along both spatial axes.
        activation: Callable applied to the normalised output.

    Returns:
        The activated tensor.
    """
    # No `padding` argument is passed, so the layer's default padding applies.
    conv = tf.keras.layers.Conv2D(filters, kernel_size=kernel_size, strides=(stride, stride))
    convolved = conv(x)
    normalised = tf.keras.layers.BatchNormalization()(convolved)
    return activation(normalised)

def residual_block(x, filters = (256, 256), kernel_size=(3, 3), stride=1, activation = tf.nn.swish):
    """Two-convolution residual block with a skip connection.

    Main path: Conv2D(f1) -> BatchNormalization -> activation ->
    Conv2D(f2) -> BatchNormalization. The skip path is added to the main
    path and the sum is passed through `activation`.

    When the main path changes the tensor shape (``f2`` differs from the
    number of input channels, or ``stride != 1``) the identity shortcut cannot
    be added directly. In that case the shortcut is projected with a 1x1
    convolution followed by batch normalisation so both branches agree. When
    the shapes already match, the block is exactly the plain identity-shortcut
    block.

    Args:
        x: Input tensor.
        filters: Pair ``(f1, f2)`` with the filter counts of the two convolutions.
        kernel_size: Kernel size of both convolutions.
        stride: Stride applied by *each* of the two convolutions.
        activation: Callable used after the first convolution and after the add.

    Returns:
        The output tensor of the block.
    """
    skip = x

    f1, f2 = filters

    x = tf.keras.layers.Conv2D(f1, kernel_size=kernel_size, strides=(stride, stride), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = activation(x)

    x = tf.keras.layers.Conv2D(f2, kernel_size=kernel_size, strides=(stride, stride), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)

    if skip.shape[-1] != f2 or stride != 1:
        # The stride is applied twice on the main path, so the shortcut has to
        # downsample by stride**2 to end up with the same spatial size.
        projection_stride = stride * stride
        skip = tf.keras.layers.Conv2D(
            f2,
            kernel_size=(1, 1),
            strides=(projection_stride, projection_stride),
            padding='same',
        )(skip)
        skip = tf.keras.layers.BatchNormalization()(skip)

    x = tf.keras.layers.Add()([x, skip])
    x = activation(x)

    return x

def body(x, residual_blocks = 1, convolutional_filter = 256, residual_filters = (256, 256)):
    """Build the shared trunk: one convolutional block, then `residual_blocks` residual blocks.

    Args:
        x: Input tensor.
        residual_blocks: How many residual blocks to stack after the first convolution.
        convolutional_filter: Filter count passed to `convolutional_block`.
        residual_filters: Filter pair passed to every `residual_block`.

    Returns:
        The output tensor of the last block.
    """
    features = convolutional_block(x, convolutional_filter)

    for _block_index in range(residual_blocks):
        features = residual_block(features, residual_filters)

    return features
    

def policy_head(x, size, activation=tf.nn.swish, softmax_policy=False):
    """Build the policy head on top of the trunk output `x`.

    A 1x1 convolution with two filters, batch normalisation and `activation`
    are followed by a flatten and a dense layer with ``size * size`` units.

    Args:
        x: Trunk output tensor.
        size: Board side length; the head has ``size * size`` outputs.
        activation: Callable applied after batch normalisation.
        softmax_policy: If true, a Softmax layer named 'policy' is appended to
            the dense layer; otherwise the dense layer itself is named
            'policy' and its raw outputs are returned.

    Returns:
        The output tensor of the layer named 'policy'.
    """
    layers = tf.keras.layers
    outputs = size * size

    hidden = layers.Conv2D(2, kernel_size=(1, 1), strides=(1, 1))(x)
    hidden = layers.BatchNormalization()(hidden)
    hidden = activation(hidden)
    hidden = layers.Flatten()(hidden)

    if not softmax_policy:
        return layers.Dense(outputs, name='policy')(hidden)

    logits = layers.Dense(outputs)(hidden)
    return layers.Softmax(name='policy')(logits)

def value_head(x, activation=tf.nn.swish):
    """Build the value head on top of the trunk output `x`.

    A 1x1 convolution with a single filter, batch normalisation and
    `activation` are followed by a flatten, a 256-unit dense layer with ReLU,
    and a single-unit dense layer squashed by a tanh layer named 'value'.

    Args:
        x: Trunk output tensor.
        activation: Callable applied after batch normalisation.

    Returns:
        The output tensor of the layer named 'value'.
    """
    layers = tf.keras.layers

    hidden = layers.Conv2D(1, kernel_size=(1, 1), strides=(1, 1))(x)
    hidden = layers.BatchNormalization()(hidden)
    hidden = activation(hidden)
    hidden = layers.Flatten()(hidden)

    hidden = tf.nn.relu(layers.Dense(256)(hidden))
    scalar = layers.Dense(1)(hidden)

    return layers.Activation(tf.nn.tanh, name='value')(scalar)

def oht_model(size, residual_blocks=1, residual_filters = (256, 256), convolutional_filter=256, softmax_policy=False):
    """Assemble the trunk plus policy and value heads into a Keras model.

    Args:
        size: Board side length; the input has shape ``(size, size, 2)``.
        residual_blocks: Number of residual blocks in the trunk.
        residual_filters: Filter pair used by every residual block.
        convolutional_filter: Filter count of the first convolutional block.
        softmax_policy: Forwarded to `policy_head`.

    Returns:
        A model with one input named 'input' and the outputs ``[policy, value]``.
    """
    board = tf.keras.layers.Input(shape=(size, size, 2), name='input')

    trunk = body(
        board,
        residual_blocks=residual_blocks,
        convolutional_filter=convolutional_filter,
        residual_filters=residual_filters,
    )

    policy_output = policy_head(trunk, size, softmax_policy=softmax_policy)
    value_output = value_head(trunk)

    return tf.keras.models.Model(inputs=[board], outputs=[policy_output, value_output])

def new_model(size, body, policy=None, value=None):
    """Build a two-headed model from caller-supplied sub-networks.

    Args:
        size: Board side length; the input has shape ``(size, size, 2)``.
        body: Callable applied to the input tensor to produce shared features.
        policy: Callable mapping the features to the policy output. When
            ``None`` a plain pass-through ``Layer`` is used.
        value: Callable mapping the features to the value output. When
            ``None`` a plain pass-through ``Layer`` is used.

    Returns:
        A Keras model with one input named 'input' and two outputs produced by
        layers named 'policy' and 'value'.
    """
    board = tf.keras.layers.Input(shape=(size, size, 2), name='input')

    features = body(board)

    # Each head falls back to a pass-through layer independently. The default
    # for `value` used to be guarded by a check on `policy`, so `value=None`
    # was never replaced and calling it raised a TypeError.
    if policy is None:
        policy = tf.keras.layers.Layer()
    if value is None:
        value = tf.keras.layers.Layer()

    # Wrap both heads in named pass-through layers so the outputs are always
    # called 'policy' and 'value', whatever the supplied heads are named.
    policy_output = tf.keras.layers.Layer(name='policy')(policy(features))
    value_output = tf.keras.layers.Layer(name='value')(value(features))

    return tf.keras.models.Model(inputs=[board], outputs=[policy_output, value_output])

In [None]:
def decode(bitboard, player, max_size=11):
    """Yield the board coordinates of every set bit in `bitboard`.

    Bit ``i`` corresponds to ``x = i % max_size`` and ``y = i // max_size``.
    Coordinates are yielded in increasing bit order as ``(x, y)`` when
    `player` equals 1 and as ``(y, x)`` for any other player.

    Args:
        bitboard: Integer whose set bits mark occupied cells.
        player: Player the bitboard belongs to.
        max_size: Row width used when mapping a bit index to coordinates.
    """
    keep_order = player == 1

    for index in range(max_size * max_size):
        if (bitboard & (0b1 << index)) == 0:
            continue

        y, x = divmod(index, max_size)
        yield (x, y) if keep_order else (y, x)
            
def denormalize_noflip(encoded, size):
    """Rebuild a `games.Hex` board from a two-plane encoding.

    Plane 0 of `encoded` is read as the current player's stones and plane 1 as
    the opponent's. A cell equal to 1.0 in plane 0 becomes 1 on the grid; a
    cell equal to 1.0 in plane 1 becomes 2. The plane is indexed ``[x, y]``
    while the grid is indexed ``[y][x]``.

    The returned game has its `next_state` attribute overwritten with the
    string "Disabled" (presumably so it is only used for inspection).

    Args:
        encoded: Array indexable as ``encoded[:, :, plane]``.
        size: Board side length.

    Returns:
        The populated `games.Hex` instance.
    """
    current, opponent = encoded[:, :, 0], encoded[:, :, 1]

    game = games.Hex(size = size)
    game.next_state = "Disabled"

    cells = ((x, y) for x in range(size) for y in range(size))
    for x, y in cells:
        if current[x, y] == 1.0:
            stone = 1
        elif opponent[x, y] == 1.0:
            stone = 2
        else:
            continue
        game.grid[y][x] = stone

    return game

            
        
def interleave(it0, it1):
    """Alternate items from `it0` and `it1`, starting with `it0`.

    As soon as either iterable runs out, whatever is left of `it0` and then of
    `it1` is yielded and the generator finishes.
    """
    first, second = iter(it0), iter(it1)
    exhausted = object()  # unique marker, cannot collide with a real item

    while True:
        for source in (first, second):
            item = next(source, exhausted)
            if item is exhausted:
                yield from first
                yield from second
                return
            yield item
            
            
def decode_bitboard(player1, player2, size, max_size=11):
    """Replay two bitboards onto a fresh `games.Hex` board.

    The cells decoded from `player1` and `player2` are interleaved (player 1
    first) and applied one at a time through `next_state`.

    Args:
        player1: Bitboard of player 1's stones.
        player2: Bitboard of player 2's stones.
        size: Side length of the game that is created.
        max_size: Row width used when decoding the bitboards.

    Returns:
        The game state reached after applying every decoded action.
    """
    moves = interleave(
        decode(player1, 1, max_size=max_size),
        decode(player2, 2, max_size=max_size),
    )

    state = games.Hex(size=size)
    for move in moves:
        state = state.next_state(move)

    return state

def from_oht_state(state):
    """Convert an OHT state into our own `games.Hex` representation.

    The OHT representation allows player 2 to be the starting player; ours
    does NOT, so the starting player is always treated as player 1.

    Args:
        state: Sequence ``(player, *board)`` where `board` is a flat, square
            board of cell values (0 is counted as empty).

    Returns:
        A tuple ``(not flip, game)``: `game` is the converted board and the
        first element is False exactly when the board was transposed and the
        stones were relabelled.
    """
    player, *cells = state
    size = int(len(cells) ** 0.5)

    assert len(cells) == size * size

    # An even number of stones means the player to move also made the first
    # move; an odd number means that player moved second.
    stones = len([cell for cell in cells if cell != 0])
    current_started = stones % 2 == 0
    flip = (player == 1 and not current_started) or (player == 2 and current_started)

    grid = np.array(cells).reshape((size, size))

    # Player 1 who also started needs no change. Player 1 who did not start is
    # effectively player 2 in our representation, and player 2 who started is
    # effectively our player 1: in both cases transpose the board and swap the
    # stone labels.
    if flip:
        grid = grid.T
        # Both masks are taken before either write so the swap cannot
        # overwrite cells it has yet to relabel.
        was_one, was_two = grid == 1, grid == 2
        grid[was_one] = 2
        grid[was_two] = 1

    game = games.Hex(size=size)
    game.grid = [list(row) for row in grid]
    game.current_player = 1 if current_started else 2

    return (not flip, game)

# START OF ACTUAL USEFUL STUFF

# TOPP

In [None]:
size = 5
model_name = f'{size}x{size}x2'

# This is the main body
trunk_net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(15),
    #tf.keras.layers.Dense(30)
])

# The policy head: one output per board cell, normalised by a softmax
policy_net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(size * size, activation='relu'),
    tf.keras.layers.Softmax(),
])

# The value head: a single tanh-squashed scalar
value_net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='tanh'),
])

model = new_model(size=size, body=trunk_net, policy=policy_net, value=value_net)

# Version 0 is the untrained starting point for the training loop.
model.save(f'{model_name}-v0')

model.summary()

In [None]:
import os
import sys
from train import train

def delete_samples_preceeding(number):
    """Delete every entry in SAMPLES_DIR whose numeric name is below `number`.

    The part of each filename before the first '.' is parsed as an integer
    (``'3.json'`` -> 3). Entries whose prefix is not an integer are not sample
    sets and are left alone. Matching files and directories are removed with
    the standard library rather than by shelling out to ``rm``, so filenames
    containing spaces or shell metacharacters are handled safely and the
    helper also works on platforms without ``rm``.

    Args:
        number: Entries numbered strictly below this value are deleted.
    """
    import shutil  # local import: only this helper needs it

    for filename in os.listdir(SAMPLES_DIR):
        filenumber, *_ = filename.split('.')
        try:
            index = int(filenumber)
        except ValueError:
            # Not a numbered sample set; nothing to delete.
            continue

        if index >= number:
            continue

        path = os.path.join(SAMPLES_DIR, filename)
        try:
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        except FileNotFoundError:
            # Already gone: same outcome as `rm -f`.
            pass
        except OSError as e:
            print(f'could not delete file {filename}: {e}')

BASE_DIR = '.'
SAMPLES_DIR = './samples'

# The name of the model. All saved models will be named {model_name}-v{version_number}
model_name = model_name
# The number of simulations to perform
simulations = 100
# How many games to run concurrently when generating samples
concurrents = 64
# What kind of leaf evaluation to utilize. Either 'value_fn' or 'rollout'
leaf_evaluation = 'value_fn'
# What kind of encoder to use
encoder = 'normalized'
# We will fit the model to the last `max_sample_sets` sample sets generated
max_sample_sets = 5
# How many samples to generate.
samples = concurrents * 14
# How many 'epochs' to run. I.e. how many loops of (1) generate samples, (2) fit to rbuf to do.
M = 5

# Parameters related to training the model.
optimizer = tf.keras.optimizers.SGD(lr=0.02)
epochs_per_step = 10

# The number of sample sets generated
sample_sets_generated = 0

for current_version in range(0, M):
    current_model_path = f'{BASE_DIR}/{model_name}-v{current_version}'
    sample_set_path = f'{SAMPLES_DIR}/{sample_sets_generated}.json'
    print(f'using model {current_model_path}')
    ! {sys.executable} generate_samples.py --samples {samples}  --simulations {simulations} --model {current_model_path} --out {sample_set_path} --size {size} --concurrents {concurrents} --evaluation {leaf_evaluation} --encoder {encoder}
    sample_sets_generated += 1

    new_model_path = f'{BASE_DIR}/{model_name}-v{current_version + 1}'
    samplesets = [f'{SAMPLES_DIR}/{i}.json' for i in range(max(0, sample_sets_generated - max_sample_sets), sample_sets_generated)]
    train(model_path=current_model_path, sample_paths=samplesets, save_path=new_model_path, size=size, lr=None, epochs=epochs_per_step, optimizer=optimizer)
    
    # TODO: only replace incumbent if a new version wins >= 55%.
    delete_samples_preceeding(sample_sets_generated - max_sample_sets)

In [None]:
# One prediction-only agent per saved checkpoint v0 .. v{M-1}.
# NOTE(review): the training loop also saves v{M}, which range(M) leaves out of
# the tournament - confirm that this is intentional.
agents = [
    actor.SFPredictionAgent(
        name=f'v{version}',
        path=f'{model_name}-v{version}',
        size=size,
        encoder='normalized',
        policy_kind='proportional',
    )
    for version in range(M)
]

agents

In [None]:
from src import tournament

# Accumulated wins per unordered pair of agents:
# {(smaller_name, larger_name): (wins_of_smaller, wins_of_larger)}
statistics = {}

print('PLAYER 1 | PLAYER 2')
matches = tournament.tournament(agents, games.Hex(size=size), 250, verbose=False)
for (p1, w1), (p2, w2) in matches:
    # Always store the lexicographically smallest name first so that both
    # seatings of the same pairing end up under one key. The sort is stable,
    # so equal names keep their original order.
    (low_name, low_wins), (high_name, high_wins) = sorted(
        [(p1.name, w1), (p2.name, w2)], key=lambda entry: entry[0]
    )

    pair = (low_name, high_name)
    low_total, high_total = statistics.get(pair, (0, 0))
    statistics[pair] = (low_total + low_wins, high_total + high_wins)

    print(f'{p1.name}: {w1:>3}  |  {p2.name}: {w2:>3}')

statistics

In [None]:
%matplotlib inline
import os
from itertools import chain
from train import load_samples

# Load every sample file and concatenate the files column by column.
# The first column is discarded; B holds one (player1, player2) bitboard pair per sample.
sample_sets = [
    load_samples(f'{SAMPLES_DIR}/{file}') for file in os.listdir(SAMPLES_DIR)
]
_, B, X, P, Z = (list(chain.from_iterable(column)) for column in zip(*sample_sets))

# An empty board, (0, 0), is counted as the start of an episode.
episode_starts = sum(1 for first, second in B if (first, second) == (0, 0))
print(f'Episodes = {episode_starts}')

@interact
def show(i = (0, len(B) - 1)):
    """Decode and return the board stored at index `i` of the loaded samples."""
    player1_bits, player2_bits = B[i]
    return decode_bitboard(player1_bits, player2_bits, size)

# Interactive

In [None]:
%matplotlib qt
# Search agent backed by the most recently trained checkpoint (v{M}).
agent = actor.SFAgent(
    leaf_evaluation='rollout', 
    encoder='normalized', 
    path=f'{model_name}-v{M}',
    c=3,
    size=size,
    simulations=1000,
)

# Play against the agent built above, on the board size it was built for.
# This used to pass `pretrained`, which is only defined in a later cell
# (NameError on a fresh kernel), on a hard-coded 6x6 board, leaving `agent` unused.
interactive.play(games.Hex(size=size), agent)

In [None]:
%matplotlib qt
# Greedy prediction agent loaded from a pretrained 6x6 checkpoint.
# NOTE(review): the checkpoint path is absolute and machine-specific; it has to be
# adjusted before this cell can run elsewhere.
pretrained = actor.SFPredictionAgent(
    name='pretrained',
    path='/Users/akselborgen/Downloads/oht6x6-v34',
    size=6,
    encoder='normalized',
    policy_kind='greedy',
)

interactive.play(games.Hex(size=6), pretrained)

# OHT testing

In [None]:
# Unconditionally raises, so a top-to-bottom "Run All" halts at this cell.
# NOTE(review): presumably a deliberate guard so that the OHT experiments below
# are only run by hand - confirm before removing.
raise ValueError('stop')

In [None]:
# This is the 'oht6x6-resnet128' model used in the OHT (untrained):
# a 6x6 board, ten residual blocks with 128 filters each, and a softmax policy output.
oht = oht_model(
    6,
    residual_blocks=10,
    residual_filters=(128, 128),
    convolutional_filter=128,
    softmax_policy=True,
)
oht.summary()

In [None]:
# Board size for the OHT experiments below. This rebinds the global `size`
# that the earlier cells set to 5, so re-running those cells afterwards
# will pick up 6 unless their own `size = 5` cell is run again first.
size = 6

In [None]:
# Prediction agent for the v34 checkpoint.
# NOTE(review): the checkpoint path is absolute and machine-specific.
v34 = actor.SFPredictionAgent(
    name='v34',
    path='/Users/akselborgen/Downloads/oht6x6-v34',
    size=size,
    encoder='normalized',
    leaf_evaluation='value_fn',
    policy_kind='proportional',
    simulations=1000,
    c=3,
)

In [None]:
def resnet_v(x):
    """Create a prediction agent for version `x` of the 'oht6x6resnet128' checkpoints.

    The agent is named ``r`` followed by the zero-padded, two-digit version.
    The board size is taken from the global `size`.
    """
    # NOTE(review): the checkpoint directory is absolute and machine-specific.
    settings = dict(
        leaf_evaluation='value_fn',
        encoder='normalized',
        path=f'/Users/akselborgen/Downloads/oht6x6resnet128-v{x}',
        c=3,
        size=size,
        simulations=500,
        policy_kind='proportional',
        name=f'r{x:02}',
    )
    return actor.SFPredictionAgent(**settings)

def resnet_full_v(x):
    """Create an `SFAgent` for version `x` of the 'oht6x6resnet128' checkpoints.

    The agent is named ``f`` followed by the zero-padded, two-digit version.
    The board size is taken from the global `size`.
    """
    # NOTE(review): the checkpoint directory is absolute and machine-specific.
    settings = dict(
        leaf_evaluation='value_fn',
        encoder='normalized',
        path=f'/Users/akselborgen/Downloads/oht6x6resnet128-v{x}',
        c=3,
        size=size,
        simulations=500,
        name=f'f{x:02}',
    )
    return actor.SFAgent(**settings)

In [None]:
# Checkpoint versions entered in this tournament.
agents = [resnet_v(version) for version in (0, 6, 26, 41, 44)]

# Accumulated wins per unordered pair of agents:
# {(smaller_name, larger_name): (wins_of_smaller, wins_of_larger)}
statistics = {}

print('PLAYER 1  |  PLAYER 2')
matches = tournament.tournament(agents, games.Hex(size=size), 500, verbose=False)
for (p1, w1), (p2, w2) in matches:
    # Always store the lexicographically smallest name first so that both
    # seatings of the same pairing end up under one key. The sort is stable,
    # so equal names keep their original order.
    (low_name, low_wins), (high_name, high_wins) = sorted(
        [(p1.name, w1), (p2.name, w2)], key=lambda entry: entry[0]
    )

    pair = (low_name, high_name)
    low_total, high_total = statistics.get(pair, (0, 0))
    statistics[pair] = (low_total + low_wins, high_total + high_wins)

    print(f'{p1.name}: {w1:>3}  |  {p2.name}: {w2:>3}')

statistics

In [None]:
%matplotlib qt
# Play interactively against the search agent built from checkpoint version 41,
# on a board of the current global `size`.
interactive.play(games.Hex(size=size), agent=resnet_full_v(41))

In [None]:
def encoder(x, add_batch_axis=False):
    """Encode a game state with `games.hex.normalized_encoder`.

    Args:
        x: The state handed to the encoder.
        add_batch_axis: When true, the encoded tensor is reshaped to
            ``(1, *tensor.shape)`` so it carries a leading axis of length one.

    Returns:
        The encoded tensor, with the extra leading axis if requested.
    """
    # Alternative encoding: games.hex.current_player_encoder(x)
    tensor = games.hex.normalized_encoder(x)

    if not add_batch_axis:
        return tensor

    batched_shape = (1, *tensor.shape)
    return tf.reshape(tensor, batched_shape)

def time_limit(seconds):
    """Create a predicate that reports when `seconds` of wall-clock time have elapsed.

    The returned function takes an iteration counter `i`. The clock is
    (re)started whenever it is called with ``i == 0`` - or on the very first
    call, whatever `i` is - and the function returns True once at least
    `seconds` seconds have passed since the clock was started.

    Every predicate keeps its own start time, so several limits can be used
    side by side without resetting each other.

    Args:
        seconds: Time budget in seconds.

    Returns:
        A callable ``inner(i) -> bool``.
    """
    start = None

    def inner(i):
        # `nonlocal` binds to the `start` of this particular time_limit() call.
        # A `global` declaration here would make all predicates share a single
        # module-level variable and leave the enclosing `start` unused.
        nonlocal start
        if i == 0 or start is None:
            start = time.time()

        return (time.time() - start) >= seconds

    return inner