In [1]:
import numpy as np
from numba import jit
from scipy.ndimage import convolve
from env import Game

import numpy as np
import tensorflow as tf

# Number of games played (via `for pl in range(PLAYS)`) to collect samples
# before each round of training updates.
PLAYS = 10000

In [3]:
# Maps the symbol found in a board cell to its one-hot slot index.
# Slot 1 is deliberately unassigned: it was reserved for the empty symbol '',
# which prepare_view() skips, so nothing is ever written to it.
types = {'*':0,
#          '':1,
         '0':2,
         '1':3,
         '2':4,
         '3':5,
         '4':6,
         '5':7,
         '6':8,
         '7':9,
         '8':10}

def prepare_view(state):
    """One-hot encode a 2-D array of cell symbols.

    Parameters
    ----------
    state : np.ndarray
        2-D array whose elements are keys of ``types`` or a falsy value
        (e.g. ``''``). Falsy cells are left as an all-zero vector.

    Returns
    -------
    np.ndarray
        Float array of shape ``(*state.shape, 10)``. Channel 0 marks ``'*'``
        and channels 1..9 mark the digits ``'0'``..``'8'``.

    Raises
    ------
    KeyError
        If a truthy cell holds a symbol that is not a key of ``types``.

    Notes
    -----
    The previous implementation returned ``n[:, :, :-1]``, which threw away
    slot 10 -- the slot used by ``'8'`` -- while keeping the never-populated
    slot 1. An ``'8'`` cell was therefore encoded as all zeros and could not
    be told apart from an empty cell. The unused slot is removed instead, so
    the output still has 10 channels but every symbol keeps its own channel.
    """
    n_slots = max(types.values()) + 1
    encoded = np.zeros((*state.shape, n_slots))
    for x, row in enumerate(state):
        for y, col in enumerate(row):
            if col:
                encoded[x, y, types[col]] = 1
    # Remove the unassigned slot (index 1), not the last one, so '8' survives.
    return np.delete(encoded, 1, axis=-1)

def prepare_rew(state):
    """Rescale every cell of a 2-D numeric array with ``(0.5 - v) * 2``.

    A value of 0 becomes +1 and a value of 1 becomes -1. The result gets a
    trailing singleton axis, i.e. its shape is ``(*state.shape, 1)``.
    """
    scaled = np.zeros((*state.shape, 1))
    for (r, c), cell in np.ndenumerate(state):
        scaled[r, c, 0] = (0.5 - cell) * 2
    return scaled

def allowed_points(state):
    """Return a float mask shaped like ``state``: 1.0 where a cell equals '*', else 0.0."""
    mask = np.zeros(state.shape)
    for r in range(len(state)):
        line = state[r]
        for c in range(len(line)):
            if line[c] == '*':
                mask[r, c] = True  # stored as 1.0 because the mask is a float array
    return mask

In [5]:
def agent():
    """Build the value-estimation graph.

    The input is a float32 placeholder of shape ``(None, 5, 5, 10)``. Three
    kinds of features are computed from it and concatenated:

    * the flattened input itself,
    * the output of a ReLU dense stack (1250 -> 250 -> 100 -> 25 units)
      applied to the flattened input,
    * flattened 2x2 and 3x3 max- and average-pooled versions of the input.

    The concatenation is passed through three dense layers without an
    activation (250 -> 25 -> 1 units).

    Returns
    -------
    tuple
        ``(inp, out)``: the input placeholder and the final one-unit tensor.
    """
    layers = tf.keras.layers

    inp = tf.placeholder(tf.float32, shape=(None, 5, 5, 10))
    flinp = layers.Flatten()(inp)

    # Fully connected branch on the raw flattened window.
    hidden = flinp
    for units in (1250, 250, 100, 25):
        hidden = layers.Dense(units, activation='relu')(hidden)

    # Pooled summaries of the window; layers are created in the same order
    # as before (all pooling layers first, then all flatten layers).
    pooled = [
        layers.MaxPooling2D((2, 2))(inp),
        layers.AveragePooling2D((2, 2))(inp),
        layers.MaxPooling2D((3, 3))(inp),
        layers.AveragePooling2D((3, 3))(inp),
    ]
    pooled_flat = [layers.Flatten()(branch) for branch in pooled]

    input_ready = layers.concatenate(pooled_flat + [flinp, hidden])

    # Linear head ending in a single value per sample.
    out = input_ready
    for units in (250, 25, 1):
        out = layers.Dense(units)(out)
    return inp, out

    
# Allow this process to claim up to the full GPU memory (fraction = 1).
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
# Build the network: `state_inp` is its input placeholder, `values` its
# one-unit output tensor (one estimate per input sample).
state_inp, values = agent()

# Regression targets fed at training time, one scalar per sample.
reward_phold = tf.placeholder(tf.float32, shape = (None, 1))
optim = tf.train.AdamOptimizer()
# Squared error summed (not averaged) over the whole batch.
loss_f = tf.reduce_sum(tf.pow(values - reward_phold, 2))
train_function = optim.minimize(loss_f)
# Run the initializer only after minimize() has been called, so the variables
# the optimizer creates are initialized together with the network weights.
sess.run(tf.global_variables_initializer())

# NOTE(review): presumably a 9x9 board with a 0.15 mine ratio -- confirm
# against env.Game. The literals 9 and 81 below assume a 9x9 board.
env = Game(9, 9, 0.15)
# Outer loop: collect samples from PLAYS games, then run a few training updates.
for it in range(1000):
    # Buffers are reset every iteration, so training only sees fresh samples.
    rewsum_l = []
    targets = []
    vics = 0
    states  = []
    rewards = []
    for pl in range(PLAYS):
        env.start(mode='bot')
        rewsum = 0
        target = 0
        steps  = 1
        while env.game_runned:
            # One encoded view per entry of env.show(); the reshape(9, 9)
            # below requires exactly 81 entries.
            inps = np.array([prepare_view(field) for field in env.show()])
            est_values = sess.run(values, feed_dict={state_inp:inps})
            est_values = np.squeeze(est_values).reshape(9, 9)
            allowed = allowed_points(env.pg.player_field).astype(np.uint8)
            # Row-major flat indices (row*9 + col) of every cell still marked '*'.
            allowed_flat = [n1*9+n2 for n1, a in enumerate(allowed) \
                            for n2, b in enumerate(a) if b == 1]
            # Greedy choice: the allowed cell with the highest estimated value.
            point = allowed_flat[est_values.ravel()[allowed.ravel()==1].argmax()]
            # do_step receives (column, row) of the chosen flat index.
            env.do_step(point%9, point//9)
            # `reward` aliases est_values (no copy); the += below edits it in place.
            reward = est_values
            rew = 1 if not env.lose else -1
            # Training target = current estimate for the chosen cell, +1 or -1.
            reward[point//9, point%9] += np.copy(rew)
            # Progress metric used only for logging.
            target += steps if not env.lose else (steps - 81)
            rewsum += rew
            rewards += [reward[point//9, point%9].reshape(1, 1)]
            # Store the encoded view of the chosen cell as the training input.
            states += [np.copy(inps[point]).reshape(1, 5, 5, 10)]
            steps += 1
            # Checked after every step.
            # NOTE(review): assumes env.won is only True on a game's last step -- confirm.
            vics += 1 if env.won else 0
            # Running sums are logged once per step, not once per game.
            rewsum_l += [np.copy(rewsum)]
            targets += [np.copy(target)]
    states = np.asarray(states)
    rewards = np.asarray(rewards)
    targets = np.array(targets)
    rewsum_l = np.array(rewsum_l)
    # The reported reward averages only the non-negative running sums.
    print(f'Iteration #{it}', '| aver. reward:', round(np.mean(rewsum_l[rewsum_l>=0]), 2), '| Victory count:', f'{vics} | Target vals:', 
         round(np.mean(targets), 2))

    # Ten optimizer updates, each fed every collected sample at once.
    for ep in range(10):
        # Random permutation of all sample indices.
        # NOTE(review): the whole set is fed in one batch and the loss is a sum,
        # so the shuffle should not change the update -- confirm intent.
        pos = np.random.choice(range(states.shape[0]), size=(states.shape[0],), replace=False)
        inp_st = states[pos].reshape(-1, 5, 5, 10)
        inp_rew = rewards[pos].reshape(-1, 1)
        sess.run(train_function, feed_dict = {state_inp: inp_st,
                                              reward_phold: inp_rew.reshape(-1, 1)})

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Iteration #0 | aver. reward: 3.72 | Victory count: 0 | Target vals: -2.8
Iteration #1 | aver. reward: 5.33 | Victory count: 0 | Target vals: 14.13
Iteration #2 | aver. reward: 6.17 | Victory count: 0 | Target vals: 23.82
Iteration #3 | aver. reward: 6.57 | Victory count: 0 | Target vals: 28.75
Iteration #4 | aver. reward: 6.91 | Victory count: 0 | Target vals: 32.8
Iteration #5 | aver. reward: 7.24 | Victory count: 1 | Target vals: 37.55
Iteration #6 | aver. reward: 7.46 | Victory count: 3 | Target vals: 40.23
Iteration #7 | aver. reward: 7.67 | Victory count: 4 | Target vals: 42.81
Iteration #8 | aver. reward: 7.96 | Victory count: 6 | Target vals: 47.01
Iteration #9 | aver. reward: 8.17 | Victory count: 14 | Target vals: 50.2
Iteration #10 | aver. reward: 8.46 | Victory count

Iteration #97 | aver. reward: 13.15 | Victory count: 2452 | Target vals: 132.39
Iteration #98 | aver. reward: 13.17 | Victory count: 2534 | Target vals: 132.66
Iteration #99 | aver. reward: 13.26 | Victory count: 2479 | Target vals: 134.61
Iteration #100 | aver. reward: 13.29 | Victory count: 2523 | Target vals: 135.26
Iteration #101 | aver. reward: 13.29 | Victory count: 2545 | Target vals: 134.87
Iteration #102 | aver. reward: 13.34 | Victory count: 2603 | Target vals: 136.13
Iteration #103 | aver. reward: 13.23 | Victory count: 2463 | Target vals: 133.19
Iteration #104 | aver. reward: 13.39 | Victory count: 2537 | Target vals: 136.49
Iteration #105 | aver. reward: 13.39 | Victory count: 2597 | Target vals: 136.33
Iteration #106 | aver. reward: 13.24 | Victory count: 2513 | Target vals: 133.75
Iteration #107 | aver. reward: 13.1 | Victory count: 2493 | Target vals: 131.1
Iteration #108 | aver. reward: 13.08 | Victory count: 2513 | Target vals: 130.75
Iteration #109 | aver. reward: 13

KeyboardInterrupt: 