In [None]:
from src import games
from src import actor
from src import interactive 
from src import tournament
import tensorflow as tf
import numpy as np
import pandas as pd
from random import choice
import time
import matplotlib
import matplotlib.pyplot as plt

from random import choices
from tqdm.notebook import tqdm
from copy import deepcopy
from ipywidgets import interact

In [None]:
def decode(bitboard, player):
    """Yield the board coordinates of every set bit in a 64-bit bitboard.

    Bit ``i`` maps to column ``i % 8`` and row ``i // 8``. Bits are visited
    from least to most significant. For ``player == 1`` a coordinate is
    yielded as ``(column, row)``; for any other player the two components
    are swapped.
    """
    swap_axes = player != 1
    for index in range(64):
        if not (bitboard >> index) & 1:
            continue
        row, column = divmod(index, 8)
        yield (row, column) if swap_axes else (column, row)
            
def denormalize_noflip(encoded, size):
    """Rebuild a Hex game object from a two-plane encoded board, without flipping.

    ``encoded`` is indexed as ``encoded[x, y, plane]``: plane 0 marks the
    current player's stones and plane 1 the opponent's. A cell equal to 1.0
    in plane 0 becomes ``1`` in the grid, otherwise a cell equal to 1.0 in
    plane 1 becomes ``2``. The grid is written as ``grid[y][x]``.

    The returned game has its ``next_state`` attribute replaced by the string
    "Disabled" (presumably so the reconstructed position is display-only).
    """
    own_plane = encoded[:, :, 0]
    other_plane = encoded[:, :, 1]

    game = games.Hex(size = size)
    game.next_state = "Disabled"

    for x in range(size):
        for y in range(size):
            if own_plane[x, y] == 1.0:
                stone = 1
            elif other_plane[x, y] == 1.0:
                stone = 2
            else:
                continue
            game.grid[y][x] = stone

    return game

            
        
def interleave(it0, it1):
    """Alternate items from two iterables, starting with ``it0``.

    Items are taken one at a time, ``it0`` first. As soon as the iterable
    whose turn it is has nothing left, whatever remains of ``it0`` and then
    of ``it1`` is yielded in order.
    """
    first, second = iter(it0), iter(it1)
    sources = (first, second)
    exhausted = object()
    turn = 0

    while True:
        item = next(sources[turn], exhausted)
        if item is exhausted:
            break
        yield item
        turn ^= 1

    # At most one of these still holds items; drain both in the fixed order.
    yield from first
    yield from second

In [None]:
%matplotlib inline
game = games.Hex(size=6)

#for y in range(5):
#    game = game.next_state((0, y)).next_state((1, y))
game

In [None]:
# Load a previously saved Keras model.
# NOTE(review): absolute, user-specific path - this cell only works on a machine
# that has this exact directory; consider a configurable model directory.
model = tf.keras.models.load_model('/Users/akselborgen/Downloads/oht6x6-v34')

In [None]:
# Hand-written positions: the first number is unpacked into `player`, the
# remaining 36 numbers into `board` (reshaped to 6x6 in a later cell).
# Every assignment overwrites the previous one, so only the LAST uncommented
# line takes effect; the others are kept as alternatives to toggle by hand.
player, *board = (1, 2, 1, 2, 0, 2, 2, 0, 1, 1, 1, 2, 0, 0, 0, 2, 1, 2, 1, 2, 0, 1, 2, 2, 1, 1, 1, 2, 1, 1, 0, 2, 2, 1, 0, 1, 2)
player, *board = (1, 2, 2, 2, 0, 0, 2, 1, 1, 1, 2, 0, 1, 0, 2, 1, 1, 0, 0, 2, 1, 0, 0, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 0, 1)
#player, *board = (1, 2, 2, 2, 0, 0, 2, 1, 1, 1, 2, 2, 1, 0, 2, 1, 1, 1, 0, 2, 1, 0, 0, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 0, 1)
player, *board = (1, 0, 1, 1, 2, 0, 1, 0, 0, 2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 2, 2, 0, 1, 1, 0, 2, 0, 2)
player, *board = (1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
#player, *board = (1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

In [None]:
import tensorflow as tf
def convolutional_block(x, filters, kernel_size = (3, 3), stride=1, activation=tf.nn.swish):
    """Apply Conv2D -> BatchNormalization -> activation to ``x``.

    Args:
        x: input tensor fed to the convolution.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        stride: stride used for both spatial dimensions.
        activation: callable applied after batch normalization.

    Returns:
        The activated output tensor.
    """
    # NOTE(review): no `padding` argument is passed, so the Keras default
    # applies, whereas residual_block passes padding='same' explicitly.
    # Confirm the resulting spatial-size change is intended.
    x = tf.keras.layers.Conv2D(filters, kernel_size=kernel_size, strides=(stride, stride))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = activation(x)
    return x

def residual_block(x, filters = (256, 256), kernel_size=(3, 3), stride=1, activation = tf.nn.swish):
    """Two-convolution residual block with an identity skip connection.

    Computes ``activation(BN(Conv(activation(BN(Conv(x))))) + x)``; both
    convolutions use ``padding='same'``.

    Args:
        x: input tensor; it is also added back unchanged as the skip path.
        filters: pair ``(f1, f2)`` of filter counts for the two convolutions.
        kernel_size: kernel size used by both convolutions.
        stride: stride used by both convolutions.
        activation: callable applied after the first batch normalization and
            after the addition.

    Returns:
        The output tensor of the block.
    """
    # Keep the untouched input for the skip connection.
    skip = x
    
    f1, f2 = filters
    
    x = tf.keras.layers.Conv2D(f1, kernel_size=kernel_size, strides=(stride, stride), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = activation(x)
    
    x = tf.keras.layers.Conv2D(f2, kernel_size=kernel_size, strides=(stride, stride), padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    # NOTE(review): the skip is added without any projection, so this presumably
    # requires f2 to equal the input channel count and stride == 1 - confirm.
    x = tf.keras.layers.Add()([x, skip])
    x = activation(x)
    
    return x

def body(x, residual_blocks = 1, filters = 256):
    """Shared network trunk: one convolutional block followed by residual blocks.

    Args:
        x: input tensor.
        residual_blocks: number of residual blocks stacked after the initial
            convolutional block.
        filters: channel width of the trunk. It is used for the initial
            convolution and for both convolutions of every residual block, so
            the identity skip connections keep matching channel counts.
            Defaults to 256, the width that was previously hard-coded.

    Returns:
        The output tensor of the last block.
    """
    x = convolutional_block(x, filters)

    for _ in range(residual_blocks):
        x = residual_block(x, filters=(filters, filters))

    return x
    

def policy_head(x, size, activation=tf.nn.swish):
    """Policy head: 1x1 convolution (2 filters) -> BN -> activation -> Dense.

    Args:
        x: trunk output tensor.
        size: board side length; the final Dense layer has ``size * size`` units.
        activation: callable applied after batch normalization.

    Returns:
        The output of the Dense layer named 'policy'. No softmax is applied
        here, so the values are unnormalized scores (logits).
    """
    x = tf.keras.layers.Conv2D(2, kernel_size=(1, 1), strides=(1, 1))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = activation(x)
    
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(size * size, name='policy')(x)
    return x

def value_head(x, activation=tf.nn.swish):
    """Value head: 1x1 convolution (1 filter) -> BN -> activation -> Dense(256) -> relu -> Dense(1) -> tanh.

    Args:
        x: trunk output tensor.
        activation: callable applied after batch normalization.

    Returns:
        The output of the tanh Activation layer named 'value'.
    """
    x = tf.keras.layers.Conv2D(1, kernel_size=(1, 1), strides=(1, 1))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = activation(x)
    
    x = tf.keras.layers.Flatten()(x)
    
    x = tf.keras.layers.Dense(256)(x)
    # NOTE(review): relu is hard-coded here instead of the `activation`
    # argument used above - confirm this is deliberate.
    x = tf.nn.relu(x)
    x = tf.keras.layers.Dense(1)(x)
    x = tf.keras.layers.Activation(tf.nn.tanh, name='value')(x)
    
    return x

def oht_model(size, residual_blocks=1):
    """Build the two-headed Keras model (policy and value) for a ``size`` x ``size`` board.

    Args:
        size: board side length; the input layer has shape ``(size, size, 2)``.
        residual_blocks: number of residual blocks in the shared trunk.

    Returns:
        A ``tf.keras.models.Model`` with one input and the outputs
        ``[policy, value]``.
    """
    # NOTE: the local name `input` shadows the Python builtin inside this function.
    input = tf.keras.layers.Input(shape=(size, size, 2), name='input')
    
    b = body(input, residual_blocks=residual_blocks)
    
    policy = policy_head(b, size)
    value = value_head(b)
    
    return tf.keras.models.Model(inputs=[input], outputs=[policy, value])

In [None]:
model = oht_model(6, residual_blocks=5)

In [None]:
model.summary()

In [None]:
model.save('oht6x6-resnet-v0')

In [None]:
def policy_loss(y_true, y_pred):
    """Debugging probe: print both arguments, then abort.

    This is not a usable loss. It prints ``y_true`` and ``y_pred`` and always
    raises ``ValueError`` so that whatever is passed in can be inspected.
    """
    for tensor in (y_true, y_pred):
        print(tensor)
    raise ValueError()
    
def new_policy_loss(y_true, y_pred_before_softmax):
    """Softmax cross-entropy between target distributions and raw policy outputs.

    Args:
        y_true: target values, passed as ``labels``.
        y_pred_before_softmax: unnormalized policy outputs, passed as ``logits``
            (the softmax is applied inside the TensorFlow op).

    Returns:
        The result of ``tf.nn.softmax_cross_entropy_with_logits``.
    """
    return tf.nn.softmax_cross_entropy_with_logits(
        labels=y_true,
        logits=y_pred_before_softmax,
    )

# Compile the two-headed model. The dictionary keys refer to the output
# layers by name ('policy' and 'value'); the policy output is trained with
# new_policy_loss and the value output with mean squared error, equally weighted.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    metrics={'policy': tf.keras.metrics.CategoricalAccuracy(), 'value': 'mse'},
    loss_weights={'value': 1.0, 'policy': 1.0},
    loss={'policy': new_policy_loss, 'value': 'mse'},
)

In [None]:
# Fit on the first 1000 samples.
# NOTE(review): X, P and Z are not assigned anywhere above this cell; they are
# first assigned by the sample-loading cells further down, so this cell fails
# on a fresh kernel run top to bottom.
history = model.fit(tf.stack(X[:1000]), y={'policy': tf.stack(P[:1000]), 'value': tf.stack(Z[:1000])}, epochs=1000)

In [None]:
model.get_weights()

In [None]:
tf.reduce_sum(-pi * tf.math.log(tf.math.maximum(pi, 0.00001)))

In [None]:
p, z = model(tf.stack(X)[26:26+1])
tf.nn.softmax(p), z

In [None]:
# NOTE(review): in the preceding cell `p` is computed from sample 26, but it
# is compared against the target of sample 14 here - confirm the index.
np.argmax(p), np.argmax(P[14])

In [None]:
P[26]

In [None]:
P[0]

In [None]:
P[0][21], P[0][9]

In [None]:
np.argmax(P[1])

In [None]:
max([(x, y) for x in range(6) for y in range(6)], key=lambda t: np.reshape(p, (6, 6))[t])

In [None]:
np.reshape(P[0], (6, 6))

In [None]:
tf.nn.conv2d

In [None]:
# Turn the flat `board` list into a 6x6 array and transpose it.
grid = np.reshape(board, (6, 6)).T
# Disabled branch (never runs): it would transpose the grid back and swap the
# 1 and 2 labels. Flip the condition by hand to enable it.
if False:
    grid = grid.T
    ones = grid == 1
    twos = grid == 2
    grid[ones] = 2
    grid[twos] = 1
    
# Show the position by assigning the array as the grid of a fresh 6x6 game.
h = games.Hex(size=6)
h.grid = grid
h

In [None]:
g = games.Hex(size = 6)

for y in range(5):
    g = g.next_state((0, y)).next_state((1, y))
g = g.next_state((3, 3))
g

In [None]:
encoder(g, add_batch_axis=True)[0, :, :, 0]

In [None]:
# Policy output of the model for position `g`, shown as a 6x6 grid.
# The model has to be called with an input batch; this previously read
# `model()`, which has no input. `g` is used because the preceding cell
# encodes `g` - adjust if another state was intended.
np.reshape(model(encoder(g, add_batch_axis=True))[0], (6, 6))

In [None]:
# Build a search agent backed by a saved model and ask it for a move in `g`.
# NOTE(review): model_path is an absolute, user-specific path.
agent = actor.SFAgent(
    leaf_evaluation='value_fn',
    encoder='normalized',
    policy='greedy',
    model_path='/Users/akselborgen/Downloads/oht6x6-v34',
    size=6,
    simulations=10000,
    c=3,
)

agent.policy(g)

In [None]:
np.reshape(model(encoder(game, add_batch_axis=True))[0], (6, 6))

In [None]:
model(encoder(game, add_batch_axis=True))

In [None]:
p, z = model(tf.expand_dims(X[2], axis=0))
np.reshape(p, (6, 6)), z

In [None]:
np.reshape(P[2], (6, 6))

In [None]:
%%time
import self_play
# Query the search for a policy over the single state `game.grid`; the first
# element of the result is kept and the rest discarded.
# NOTE(review): a later cell calls `self_play.policy(...)` with the same kind
# of arguments - confirm which of the two names the module actually exposes.
policy, *_ = self_play.policy_distribution(leaf_evaluation='value_fn', encoder='normalized', model_path='/Users/akselborgen/Downloads/oht6x6-v7', size=6, states=[game.grid], simulations=1000, c=3)
# Show the policy and its entry with the largest second component.
policy, max(policy, key=lambda t: t[1])

In [None]:
simulations = 500
policies = [
    actor.SFAgent(policy='greedy', leaf_evaluation='value_fn', encoder='normalized', model_path='/Users/akselborgen/Downloads/oht6x6-v34', size=6, simulations=simulations, c=3),
    actor.SFAgent(policy='greedy', leaf_evaluation='value_fn', encoder='normalized', model_path='/Users/akselborgen/Downloads/oht6x6-v40', size=6, simulations=simulations, c=3),
]
policies

In [None]:
statistics = []
for stats in tournament.tournament(policies, game, 100):
    print(stats)
    statistics.append(stats)
statistics

In [None]:
%matplotlib qt
interactive.play(game, agent=actor.SFAgent(leaf_evaluation='value_fn', encoder='normalized', model_path='6x6x2', size=6, simulations=10000, c=3))

In [None]:
import tensorflow as tf
print(tf.__version__)

# Set CPU as available physical device
my_devices = tf.config.experimental.list_physical_devices(device_type='CPU')
tf.config.experimental.set_visible_devices(devices= my_devices, device_type='CPU')

# To find out which devices your operations and tensors are assigned to
tf.debugging.set_log_device_placement(True)

# Create some tensors and perform an operation
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

In [None]:
def compete(model1, model2):
    """Placeholder for a model-versus-model match.

    The cell originally held only the signature line, without a colon or a
    body, which is a SyntaxError. It is kept as an explicit stub so the cell
    is valid Python. A later cell defines
    ``compete(agent1, agent2, times, state)``, which replaces this name.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("compete(model1, model2) was never implemented")

In [None]:
import self_play
dir(self_play)

In [None]:
from src import games
# 5x5 position reached by playing nine moves in sequence from an empty board;
# each next_state call returns the state that the following call is made on.
game = games.Hex(size=5).next_state((0, 0)).next_state((0, 1)).next_state((1, 0)).next_state((1, 1)).next_state((2, 0)).next_state((2, 1)).next_state((3, 0)).next_state((3, 1)).next_state((0, 2))
game

In [None]:
from src import games
# 6x6 position reached by playing eighteen moves in sequence from an empty
# board. This rebinds `game`, replacing the 5x5 position from the cell above.
game = games.Hex(size=6).next_state((3, 1)).next_state((4, 0)).next_state((3, 0)).next_state((0, 4)).next_state((3, 3)).next_state((3, 2)).next_state((2, 2)).next_state((2, 3)).next_state((4, 2)).next_state((2, 4)).next_state((4, 1)).next_state((3, 4)).next_state((4, 3)).next_state((4, 4)).next_state((1, 3)).next_state((1, 4)).next_state((5, 4)).next_state((5, 3))
game

In [None]:
game.grid

In [None]:
%%time
# Same query as the earlier `self_play.policy_distribution(...)` cell, but
# with rollout leaf evaluation and the '6x6x2' model.
# NOTE(review): this calls `self_play.policy`, the earlier cell calls
# `self_play.policy_distribution` - confirm which name is current.
policy, *_ = self_play.policy(leaf_evaluation='rollout', encoder='normalized', model_path='6x6x2', size=6, states=[game.grid], simulations=1000, c=3)
policy, max(policy, key=lambda t: t[1])

In [None]:
import train

# Load training samples through the project's `train` module.
# NOTE(review): five values are unpacked here, while the `load_samples`
# defined later in this notebook returns four (no `B`) - the two loaders are
# not interchangeable. The path is absolute and user-specific.
shape, B, X, P, Z = train.load_samples('/Users/akselborgen/Downloads/21.json')

In [None]:
P[10]

In [None]:
X[0]

In [None]:
P[2]

In [None]:
with open('out2.txt') as f:
    x = f.read()
x

In [None]:
import sys
! {sys.executable} generate_samples.py --size 5 --model test5x5x2 --out samples/1.json --concurrents 16 --samples 250
#! {sys.executable} train.py --size 5 --model test5x5x2 --out test5x5x2v1 --data samples/1.json --lr 0.2 --epochs 10000

In [None]:
# Construct a 5x5 network wrapper.
# NOTE(review): `encoder` is defined in a cell further down, and the input
# shape is taken from whatever `game` currently is - confirm it is a 5x5 state.
policy = actor.BetaHex(
    size = 5,
    encoder = encoder,
    shape = encoder(game).shape,
    optimizer = tf.keras.optimizers.Adam()
)

#@tf.function
#def policy_loss(y_true, y_pred):
#    return tf.reduce_mean(tf.reduce_sum(- y_true * tf.math.log(tf.math.maximum(y_pred, 0.0001)), axis=[1]))

#tf.keras.losses.policy_loss = policy_loss

#policy.compile(
#    optimizer=tf.keras.optimizers.Adam(), 
#    metrics=['accuracy'],
#    loss_weights={'output_1': 1.0, 'output_2': 1.0},
#    loss = {'policy': policy_loss, 'value': 'mse'},
#)
# Copy the weights of the saved 'test5x5x2' model into the new network.
policy.model.set_weights(tf.keras.models.load_model('test5x5x2').get_weights())

In [None]:
p, z = policy.model(tf.expand_dims(X[180], axis=0))
np.array(p).reshape((5, 5)), z

In [None]:
np.array(P[180]).reshape((5, 5))

In [None]:
policy.model.get_weights()

In [None]:
N = 250
policy.model.fit(tf.stack(X[:N]), y={'policy': tf.stack(P[:N]), 'value': tf.stack(Z[:N])}, epochs=1000)

In [None]:
%%time
self_play.run_save(model_path='test5x5x2', size=5, concurrents=1, simulations=100, samples=100, leaf_evaluation='value_fn', encoder='normalized', out_path='samples/1.json')

In [None]:
import json

In [None]:
%%time
with open('samples/1.json', 'r') as f:
    js = json.load(f)

In [None]:
import json
def load_samples(path):
    """Read self-play samples from a JSON file.

    The file must contain a ``'stateShape'`` entry and a ``'samples'`` list
    whose items have ``'state'``, ``'policy'`` and ``'value'`` entries.

    Args:
        path: location of the JSON file.

    Returns:
        ``(shape, X, P, Z)``: the state shape as stored in the file, the list
        of states as arrays reshaped to ``shape``, the list of policy arrays,
        and the list of values as stored.
    """
    with open(path, 'r') as handle:
        payload = json.load(handle)

    shape = payload['stateShape']

    # Convert each record completely before moving on to the next one.
    rows = [
        (
            np.array(record['state']).reshape(shape),
            np.array(record['policy']),
            record['value'],
        )
        for record in payload['samples']
    ]

    X = [state for state, _, _ in rows]
    P = [policy for _, policy, _ in rows]
    Z = [value for _, _, value in rows]

    return shape, X, P, Z

In [None]:

# Reload the samples with the notebook-local loader (this rebinds X, P, Z).
shape, X, P, Z = load_samples('samples/1.json')

# Flatten every state into a tuple so states can be compared and put in a set.
x = [tuple(x.reshape((-1,))) for x in X]
# Indices of all samples whose state equals the state of sample 9.
I = [i for i, a in enumerate(x) if a == x[9]]
# Number of samples, number of distinct states, and the matching indices.
len(x), len(set(x)), I

In [None]:
P[0]

In [None]:
P[I[1]]

In [None]:
# Distinct flattened states that occur more than once in `x`.
# Each state is counted by equality; the earlier version counted every
# element of `x` regardless of value, so it returned all states instead of
# only the duplicated ones.
{a for a in x if x.count(a) > 1}

In [None]:
%%time
self_play.run(path='test5x5x2', size=5, concurrents=1, simulations=100, samples=100, leaf_evaluation='rollout', encoder='normalized')

In [None]:
! rm "self-play.log"

In [None]:
self_play.init_logging('debug')

In [None]:
import time
# Throughput benchmark: for each level of concurrency, generate 24 samples per
# concurrent game and report wall-clock time and samples per second.
# Each iteration prints four lines followed by a blank line.
for c in [128, 256, 512, 1024, 2048]:
    N = 24 * c
    print(f'{c} concurrents')
    print(f'{N} samples to generate')
    start = time.time()
    self_play.run(path='test5x5x2', size=5, concurrents=c, simulations=100, samples=N)
    end = time.time()
    throughput = N / (end - start)
    print(f'{end - start} s')
    print(f'{throughput} samples/s')
    print(flush=True)

In [None]:


def parse(xs):
    """Parse benchmark output into ``(concurrents, samples, seconds, throughput)`` tuples.

    The text is read in groups of five lines. Only the first space-separated
    token of each of the first four lines in a group is used; the fifth line
    is ignored. A trailing group with fewer than four lines is dropped.

    Args:
        xs: the complete log text.

    Yields:
        ``(int, int, float, float)`` per complete group.
    """
    first_tokens = [row.split(' ')[0] for row in xs.splitlines()]

    # A group needs its first four lines to be present.
    for offset in range(0, len(first_tokens) - 3, 5):
        c, s, t, thr = first_tokens[offset:offset + 4]
        yield (int(c), int(s), float(t), float(thr))

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `log` is not assigned in any visible cell; presumably it holds
# the text printed by the throughput benchmark cell - confirm where it comes from.
# This also rebinds `samples`.
samples = list(parse(log))

# Concurrency levels and the matching throughput values.
C = [t[0] for t in samples]
T = np.array([t[3] for t in samples])

# Throughput relative to the first measured concurrency level.
plt.plot(C, T / T[0])

In [None]:
sum(xs) / len(xs)

In [None]:
%%time
def f():
    """Run one self-play batch, print its result, and report whether every entry is at least 5."""
    xs = self_play.run(path='test5x5x2', size=5, concurrents=1, simulations=100, samples=110)
    print(xs, flush=True)
    return all(x >= 5 for x in xs)
    


# Keep running batches until one of them contains an entry below 5.
# NOTE: this loop has no other exit condition.
while f():
    continue

In [None]:
%matplotlib inline

# Two bitboards, one per player, to be replayed on a 5x5 board.
p1 = 112189247232
p2 = 30133257493

# Number of stones each bitboard contains.
print(len(list(decode(p1, 1))), len(list(decode(p2, 2))))

game = games.Hex(size = 5)
print(game.winner())
# Play the decoded stones alternately, starting with player 1's.
for action in interleave(decode(p1, 1), decode(p2, 2)):
    game = game.next_state(action)
game

In [None]:
# NOTE(review): unconditional raise - presumably a manual stop so that
# "Run All" halts here; confirm and consider removing.
raise ValueError()

In [None]:
S = 1
N = 50000
C = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

for concurrents in C:
    %time generated = self_play.run(path='test5x5x2', size=5, concurrents=concurrents, simulations=S, samples=N)
    print(f"{concurrents}: {generated}")

In [None]:
# NOTE(review): unconditional raise - presumably a manual stop so that
# "Run All" halts here; confirm and consider removing.
raise ValueError()

In [None]:
p1 = 68920804608
p2 = 16843009

In [None]:
len(list(decode(p1, 1))), len(list(decode(p2, 2)))

In [None]:
%matplotlib inline
game = games.Hex(size = 5)
print(game.winner())
for action in interleave(decode(p1, 1), decode(p2, 2)):
    game = game.next_state(action)
game

In [None]:
# Scratch computation on a bitboard stored as 8-bit rows: `reachable[i]` holds
# the bits of row i that are set in `board` and adjacent to an already
# reachable bit (presumably a connectivity / win check - confirm).
lower_eight = 0b11111111   
max_size = 8
# NOTE: this rebinds `board`, which an earlier cell uses for a hand-written position.
board = p2
# Both boards have the same "perspective" as that of player 1, and can be treated equivalently
# One entry per row plus one extra entry (index 5) that is never written and
# stays 0, so that `reachable[i + 1]` is valid for the last row.
reachable = [0 for _ in range(5 + 1)]
# Row 0 is the seed: every set bit in it counts as reachable.
reachable[0] = board & lower_eight


# Repeat the sweep over rows 1..4 five times so reachability can spread.
for _ in range(5):
    for i in range(1, 5):
        print(f'i = {i}')
        print(f'\tingoing = {[f"{r:05b}" for r in reachable]}')
        # Extract the 8 bits that make up row i.
        row = (board >> (max_size * i)) & lower_eight
        print(f'\trow     = {row:5b}')
        # Keep the row's bits that touch a reachable bit in the previous row,
        # the next row, or the same row (values from the previous sweep).
        reachable[i] = row & (reachable[i - 1] >> 1 | reachable[i - 1] | reachable[i + 1] << 1 | reachable[i + 1] | reachable[i] >> 1 | reachable[i] << 1)
        
[f"{r:05b}" for r in reachable]

In [None]:
def encoder(x, add_batch_axis=False):
    """Encode a game state with ``games.hex.normalized_encoder``.

    Args:
        x: the state to encode.
        add_batch_axis: when true, the encoded tensor is reshaped to carry an
            additional leading axis of length 1.

    Returns:
        The encoded tensor, with or without the leading batch axis.
    """
    # Alternative encoder that was tried: games.hex.current_player_encoder(x)
    encoded = games.hex.normalized_encoder(x)

    if not add_batch_axis:
        return encoded

    return tf.reshape(encoded, (1, *encoded.shape))

def time_limit(seconds):
    """Create a termination predicate that turns true after ``seconds`` of wall-clock time.

    The returned callable takes an iteration counter ``i``. A call with
    ``i == 0`` (re)starts the clock; every call returns whether at least
    ``seconds`` have elapsed since the clock was started.

    Each limiter keeps its own start time. The start time used to be declared
    ``global``, so all limiters shared one module-level variable and the local
    ``start`` was never used.

    Args:
        seconds: time budget in seconds.

    Returns:
        A function ``inner(i) -> bool``.
    """
    start = None

    def inner(i):
        # `nonlocal` binds to the `start` of this particular limiter.
        nonlocal start
        # Also start the clock when the first call does not use i == 0,
        # instead of failing on an unset start time.
        if i == 0 or start is None:
            start = time.time()

        return (time.time() - start) >= seconds

    return inner

In [None]:
%matplotlib inline
game = games.Hex(size = 6)
print(game.winner())
game

In [None]:
# Construct a 6x6 network wrapper; the input shape is taken from the encoding
# of the current `game`.
policy = actor.BetaHex(
    size = 6,
    encoder = encoder,
    shape = encoder(game).shape,
    optimizer = tf.keras.optimizers.Adam()
)

#@tf.function
#def policy_loss(y_true, y_pred):
#    return tf.reduce_mean(tf.reduce_sum(- y_true * tf.math.log(tf.math.maximum(y_pred, 0.0001)), axis=[1]))

#tf.keras.losses.policy_loss = policy_loss

#policy.compile(
#    optimizer=tf.keras.optimizers.Adam(), 
#    metrics=['accuracy'],
#    loss_weights={'output_1': 1.0, 'output_2': 1.0},
#    loss = {'policy': policy_loss, 'value': 'mse'},
#)
# To force the tensorflow model to be built: call it once on a batch that
# contains only the encoded current state.
policy(tf.expand_dims(encoder(game), axis=0))

In [None]:
policy.model.summary()

In [None]:
policy.model.summary()

In [None]:
# NOTE(review): unconditional raise - presumably a manual stop so that
# "Run All" halts here; confirm and consider removing.
raise ValueError()

In [None]:
policy.fit()

In [None]:
policy.model.save('6x6x2/', include_optimizer=False)

In [None]:
model = tf.keras.models.load_model('test')

In [None]:
model

In [None]:
policy.summary()

In [None]:
policy.body.summary()

In [None]:
policy.policy_head.summary()

In [None]:
policy.value_head.summary()

In [None]:
# Search agent that uses `policy` both as default policy and inside the
# rollout leaf evaluator; `terminate` receives a counter and stops the search
# once it reaches 10.
mcts = actor.MCTS(
    default_policy = policy,
    leaf_evaluator = actor.Rollout(policy),
    terminate = lambda i: i >= 10,
    c = 0.4,
)

# Samples exactly as produced by the search, and their encoded counterparts
# used for training.
raw_replay_buffer = []
replay_buffer = []

In [None]:
%matplotlib qt
interactive.play(game, mcts)

In [None]:
# One Keras History object per fit call.
histories = []
#gen = mcts.generate_episodes(game, concurrents=32)
for i in tqdm(range(1)):
    for _ in tqdm(range(1)):
        # Play one episode and store its samples, raw and encoded.
        samples = mcts.episode(game)
        raw_replay_buffer.extend(samples)
        replay_buffer.extend((policy.encoder(s), (policy.numpy_distribution(D), z)) for (s, D, z) in samples)
        
        # Draw 100 training examples from the buffer, with replacement.
        # NOTE: this rebinds the notebook-level X, P and Z.
        X, Y = zip(*choices(replay_buffer, k=100))
        X = tf.stack(X)
        P = tf.stack([p for p, _ in Y])
        # Values become a column: one float per example.
        Z = tf.expand_dims(tf.stack([float(z) for _, z in Y]), axis=1)
        histories.append(policy.model.fit(X, y = {'policy': P, 'value': Z}, epochs=1))

In [None]:
X, Y = zip(*choices(replay_buffer, k=100))
X = tf.stack(X)
P = tf.stack([p for p, _ in Y])
Z = tf.expand_dims(tf.stack([float(z) for _, z in Y]), axis=1)
histories.append(policy.model.fit(X, y = {'policy': P, 'value': Z}, epochs=1000))

In [None]:
#X, Y = zip(*processed_replay_buffer[:2])
#X = tf.stack(X)
#P = tf.stack([p for p, _ in Y])
#Z = tf.expand_dims(tf.stack([float(z) for _, z in Y]), axis=1)
def expand_sample(d):
    """Turn a dict of equally indexed sequences into one dict per index.

    ``{'a': [1, 2], 'b': [3, 4]}`` yields ``{'a': 1, 'b': 3}`` and then
    ``{'a': 2, 'b': 4}``. Iteration stops at the shortest sequence.
    """
    names, columns = zip(*d.items())

    for row in zip(*columns):
        yield dict(zip(names, row))
    

#df = pd.DataFrame(policy.fit(X, y = {'output_1': P, 'output_2': Z}, epochs=1000).history)
# One row per recorded epoch across all fit calls, one column per metric.
df = pd.DataFrame([d for h in histories for d in expand_sample(h.history)])
#df = pd.DataFrame([{k: sum(v) / len(v) for k, v in h.history.items()} for h in histories])

# One separate figure per metric.
for column in df.columns:
    plt.title(column)
    plt.plot(df[column])
    plt.show()

In [None]:
def checkpoint(i):
    """Build a search agent from the weights stored as checkpoint ``i``.

    A ``NeuralNetworkActorCritic`` with ``epsilon=0.0`` is created, the
    weights in ``weights{i}`` are loaded into its model, and it is wrapped in
    an ``actor.MCTS`` whose termination predicate is ``time_limit(2)``.

    Args:
        i: checkpoint number, used to build the weights file name.

    Returns:
        The ``actor.MCTS`` agent.
    """
    network = actor.NeuralNetworkActorCritic(encoder = encoder, epsilon=0.0)

    weights_path = f'weights{i}'
    network.model.load_weights(weights_path)

    search_settings = dict(
        default_policy = network,
        terminate = time_limit(2),
    )
    return actor.MCTS(**search_settings)

agents = [(i, checkpoint(i)) for i in range(4)]

In [None]:
from itertools import cycle

def compete(agent1, agent2, times, state):
    """Play two agents against each other and count their wins.

    In each of the ``times`` rounds two games are played from ``state``: one
    in which ``agent1`` moves first and one in which ``agent2`` moves first.

    Args:
        agent1, agent2: objects with a ``policy(state)`` method returning an action.
        times: number of rounds (two games each).
        state: starting state; needs ``winner()`` (``None`` while undecided,
            otherwise the 1-based number of the winning seat) and
            ``next_state(action)``.

    Returns:
        ``[wins of agent1, wins of agent2]``.
    """
    contenders = (agent1, agent2)
    wins = [0, 0]
    # seating[k] is the index of the agent that occupies seat k + 1.
    seatings = ((0, 1), (1, 0))

    for _ in tqdm(range(times)):
        for seating in seatings:
            current = state
            mover = cycle(seating)
            while True:
                victor = current.winner()
                if victor is not None:
                    wins[seating[victor - 1]] += 1
                    break

                action = contenders[next(mover)].policy(current)
                current = current.next_state(action)

    return wins

In [None]:
from itertools import combinations
# Round robin: every pair of checkpoint agents plays 10 rounds from `game`.
for (i0, a0), (i1, a1) in combinations(agents, r=2):
    (w0, w1) = compete(a0, a1, 10, game)
    print(f'{i0} wins {w0}; {i1} wins {w1}')

In [None]:
# Let the search agent play a whole game against itself and record every state.
# NOTE: this rebinds `history`, which an earlier cell uses for a Keras fit result.
state = game
history = [state]
while not state.is_final():
    action = mcts.policy(state)
    state = state.next_state(action)
    history.append(state)
    
# Slider over the recorded states; the range is fixed from the length of
# `history` at the moment this cell runs.
@interact
def show(i = (0, len(history) - 1, 1)):
    """Return the i-th recorded state so the widget displays it."""
    return history[i]