In [1]:
import numpy as np
from numba import jit
from scipy.ndimage import convolve
from env import Game

import numpy as np
import tensorflow as tf

# Number of self-play games run per training iteration (bound of the
# `for pl in range(PLAYS)` loop in the training cell).
PLAYS = 10000

In [2]:
# Raw channel index for every cell symbol used by the one-hot view encoding.
# Index 1 is deliberately left unassigned: it is reserved by the commented-out
# entry for the empty symbol '', which prepare_view encodes as all zeros.
types = {'*':0,
#          '':1,
         '0':2,
         '1':3,
         '2':4,
         '3':5,
         '4':6,
         '5':7,
         '6':8,
         '7':9,
         '8':10}

def prepare_view(state):
    """One-hot encode a field of cell symbols into a 10-channel array.

    Parameters
    ----------
    state : array-like with a ``.shape`` attribute, iterated row by row
        Each element is a cell symbol. Falsy symbols (e.g. the empty
        string) are encoded as an all-zero vector; every other symbol must
        be a key of ``types``.

    Returns
    -------
    numpy.ndarray of shape ``(*state.shape, 10)``
        Channel 0 marks '*', channels 1..9 mark the symbols '0'..'8'.

    Raises
    ------
    KeyError
        If a truthy symbol is not present in ``types``.

    Notes
    -----
    The previous implementation returned ``n[:, :, :-1]``, which discarded
    raw channel 10 (the '8' symbol) while keeping raw channel 1, which is
    never written. The unused channel is removed instead so that no symbol
    is lost and the output depth stays at 10.
    """
    # Fill by raw ``types`` index first (0..10, with index 1 never used).
    n = np.zeros((*state.shape, 11))
    for x, row in enumerate(state):
        for y, col in enumerate(row):
            if col:
                n[x, y, types[col]] = 1
    # Remove the never-populated channel 1 rather than the last channel,
    # which carries the '8' symbol.
    return np.delete(n, 1, axis=2)

def prepare_rew(state):
    """Map every cell value ``v`` of a 2-D array to ``(0.5 - v) * 2``.

    With 0/1 inputs this turns 0 into +1 and 1 into -1.

    Parameters
    ----------
    state : 2-D numpy array of numeric values.

    Returns
    -------
    numpy.ndarray of shape ``(*state.shape, 1)`` holding the mapped values.
    """
    out = np.zeros((*state.shape, 1))
    # Visit every cell together with its (row, column) coordinates.
    for (r, c), cell in np.ndenumerate(state):
        out[r, c, 0] = (0.5 - cell) * 2
    return out

def allowed_points(state):
    """Return a float mask that is 1 where ``state`` holds '*', else 0.

    Parameters
    ----------
    state : 2-D numpy array of cell symbols.

    Returns
    -------
    numpy.ndarray with the same shape as ``state`` (default float dtype).
    """
    mask = np.zeros(state.shape)
    # Boolean-index assignment replaces the explicit double loop.
    mask[np.asarray(state) == '*'] = True
    return mask

In [3]:
def agent():
    """Build the value network graph and return its endpoints.

    The network takes a batch of ``(5, 5, 10)`` views and combines three
    kinds of features before a linear head:

    * the flattened raw input,
    * a ReLU MLP (1250 -> 250 -> 100 -> 25) applied to the flattened input,
    * max- and average-pooled copies of the input (pool sizes 2 and 3),
      each flattened.

    The concatenated features go through Dense(250) -> Dense(25) -> Dense(1)
    with no activation.

    Returns
    -------
    (inp, out)
        ``inp`` is the float32 input placeholder of shape
        ``(None, 5, 5, 10)``; ``out`` is the output tensor of the final
        ``Dense(1)`` layer.
    """
    layers = tf.keras.layers

    inp = tf.placeholder(tf.float32, shape=(None, 5, 5, 10))
    flinp = layers.Flatten()(inp)

    # Fully connected branch on the flattened view.
    hidden = flinp
    for units in (1250, 250, 100, 25):
        hidden = layers.Dense(units, activation='relu')(hidden)

    # Pooled branches, created in the order: max/avg with size 2, then
    # max/avg with size 3.
    pooled = [
        pool_cls((size, size))(inp)
        for size in (2, 3)
        for pool_cls in (layers.MaxPooling2D, layers.AveragePooling2D)
    ]
    pooled_flat = [layers.Flatten()(branch) for branch in pooled]

    input_ready = layers.concatenate([*pooled_flat, flinp, hidden])

    # Linear head.
    out = input_ready
    for units in (250, 25, 1):
        out = layers.Dense(units)(out)
    return inp, out

    
# --- TF1 graph / session setup --------------------------------------------
# Let the process use the full GPU memory fraction (1 == 100%).
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)

# NOTE(review): the session is never closed in this cell.
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
# state_inp: input placeholder; values: scalar value prediction per input view.
state_inp, values = agent()

# Regression target fed during training, one scalar per sample.
reward_phold = tf.placeholder(tf.float32, shape = (None, 1))
optim = tf.train.AdamOptimizer()
# Sum (not mean) of squared errors over the batch, so the loss magnitude
# scales with the number of samples fed in.
loss_f = tf.reduce_sum(tf.pow(values - reward_phold, 2))
train_function = optim.minimize(loss_f)
# Must run after the optimizer has created its own variables.
sess.run(tf.global_variables_initializer())

# --- Self-play data collection followed by supervised value fitting --------
# The literals 9 and 81 used below mirror the first two Game arguments.
# NOTE(review): presumably a 9x9 board with mine density 0.15 — confirm against env.Game.
env = Game(9, 9, 0.15)
for it in range(1000):
    # Per-iteration buffers; everything is rebuilt from scratch each iteration.
    rewsum_l = []   # running reward sum, recorded after every step
    targets = []    # running "target" score, recorded after every step
    vics = 0        # number of steps after which env.won was truthy
    states  = []    # network inputs for the chosen cells
    rewards = []    # regression targets for those inputs
    for pl in range(PLAYS):
        env.start(mode='bot')
        rewsum = 0
        target = 0
        steps  = 1
        while env.game_runned:
            # One encoded view per entry returned by env.show(); the reshape to
            # (9, 9) below requires exactly 81 of them.
            inps = np.array([prepare_view(field) for field in env.show()])
            est_values = sess.run(values, feed_dict={state_inp:inps})
            est_values = np.squeeze(est_values).reshape(9, 9)
            allowed = allowed_points(env.pg.player_field).astype(np.uint8)
            # Flat indices (row * 9 + col) of every cell marked '*'.
            allowed_flat = [n1*9+n2 for n1, a in enumerate(allowed) \
                            for n2, b in enumerate(a) if b == 1]
            # Greedy choice: the allowed cell with the highest predicted value.
            point = allowed_flat[est_values.ravel()[allowed.ravel()==1].argmax()]
            # Called with (point % 9, point // 9), i.e. (column, row) of the flat index.
            # NOTE(review): do_step's argument order is not visible here — confirm.
            env.do_step(point%9, point//9)
            # `reward` aliases `est_values` (no copy), so the += below also mutates
            # est_values; it is recomputed at the top of the next step.
            reward = est_values
            rew = 1 if not env.lose else -1
            # Training target = the network's own estimate for the chosen cell +/- 1.
            reward[point//9, point%9] += np.copy(rew)
            target += steps if not env.lose else (steps - 81)
            rewsum += rew
            rewards += [reward[point//9, point%9].reshape(1, 1)]
            states += [np.copy(inps[point]).reshape(1, 5, 5, 10)]
            steps += 1
            vics += 1 if env.won else 0
            # Statistics are appended once per step, not once per game.
            rewsum_l += [np.copy(rewsum)]
            targets += [np.copy(target)]
    states = np.asarray(states)
    rewards = np.asarray(rewards)
    targets = np.array(targets)
    rewsum_l = np.array(rewsum_l)
    # "aver. reward" is the mean over the non-negative running sums only.
    print(f'Iteration #{it}', '| aver. reward:', round(np.mean(rewsum_l[rewsum_l>=0]), 2), '| Victory count:', f'{vics} | Target vals:', 
         round(np.mean(targets), 2))

    # Ten optimizer steps, each on the full collected data set fed as a single
    # batch in a freshly shuffled order.
    for ep in range(10):
        pos = np.random.choice(range(states.shape[0]), size=(states.shape[0],), replace=False)
        inp_st = states[pos].reshape(-1, 5, 5, 10)
        inp_rew = rewards[pos].reshape(-1, 1)
        sess.run(train_function, feed_dict = {state_inp: inp_st,
                                              reward_phold: inp_rew.reshape(-1, 1)})

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Iteration #0 | aver. reward: 2.17 | Victory count: 0 | Target vals: -16.47
Iteration #1 | aver. reward: 4.82 | Victory count: 2 | Target vals: 10.68
Iteration #2 | aver. reward: 5.35 | Victory count: 6 | Target vals: 16.51
Iteration #3 | aver. reward: 5.73 | Victory count: 5 | Target vals: 20.64
Iteration #4 | aver. reward: 5.95 | Victory count: 8 | Target vals: 23.6
Iteration #5 | aver. reward: 6.11 | Victory count: 14 | Target vals: 25.43
Iteration #6 | aver. reward: 6.39 | Victory count: 21 | Target vals: 29.06
Iteration #7 | aver. reward: 6.64 | Victory count: 22 | Target vals: 32.11
Iteration #8 | aver. reward: 6.81 | Victory count: 30 | Target vals: 34.38
Iteration #9 | aver. reward: 6.93 | Victory count: 28 | Target vals: 35.69
Iteration #10 | aver. reward: 7.03 | Victor

Iteration #98 | aver. reward: 10.07 | Victory count: 2645 | Target vals: 79.15
Iteration #99 | aver. reward: 10.1 | Victory count: 2672 | Target vals: 79.36
Iteration #100 | aver. reward: 10.07 | Victory count: 2660 | Target vals: 79.19
Iteration #101 | aver. reward: 10.07 | Victory count: 2703 | Target vals: 78.94
Iteration #102 | aver. reward: 10.09 | Victory count: 2675 | Target vals: 79.5
Iteration #103 | aver. reward: 10.13 | Victory count: 2737 | Target vals: 80.07
Iteration #104 | aver. reward: 10.11 | Victory count: 2723 | Target vals: 79.46
Iteration #105 | aver. reward: 10.08 | Victory count: 2628 | Target vals: 79.44
Iteration #106 | aver. reward: 10.07 | Victory count: 2702 | Target vals: 79.18
Iteration #107 | aver. reward: 10.03 | Victory count: 2739 | Target vals: 78.35
Iteration #108 | aver. reward: 10.16 | Victory count: 2757 | Target vals: 80.62
Iteration #109 | aver. reward: 10.07 | Victory count: 2778 | Target vals: 78.99
Iteration #110 | aver. reward: 10.08 | Victo

Iteration #201 | aver. reward: 10.42 | Victory count: 3284 | Target vals: 84.86
Iteration #202 | aver. reward: 10.37 | Victory count: 3293 | Target vals: 83.91
Iteration #203 | aver. reward: 10.37 | Victory count: 3287 | Target vals: 83.91
Iteration #204 | aver. reward: 10.43 | Victory count: 3349 | Target vals: 84.83
Iteration #205 | aver. reward: 10.4 | Victory count: 3336 | Target vals: 84.72
Iteration #206 | aver. reward: 10.43 | Victory count: 3412 | Target vals: 85.06
Iteration #207 | aver. reward: 10.49 | Victory count: 3361 | Target vals: 85.86
Iteration #208 | aver. reward: 10.35 | Victory count: 3389 | Target vals: 83.24
Iteration #209 | aver. reward: 10.36 | Victory count: 3397 | Target vals: 83.59
Iteration #210 | aver. reward: 10.43 | Victory count: 3426 | Target vals: 84.86
Iteration #211 | aver. reward: 10.45 | Victory count: 3419 | Target vals: 84.97
Iteration #212 | aver. reward: 10.46 | Victory count: 3387 | Target vals: 85.03
Iteration #213 | aver. reward: 10.49 | Vi

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Daniil\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-6c841de43af2>", line 51, in <module>
    inps = np.array([prepare_view(field) for field in env.show()])
  File "<ipython-input-3-6c841de43af2>", line 51, in <listcomp>
    inps = np.array([prepare_view(field) for field in env.show()])
  File "<ipython-input-2-0087678d520e>", line 14, in prepare_view
    n = np.zeros((*state.shape, 11))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Daniil\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2040, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (m

KeyboardInterrupt: 