# Declare global variables

### Gomoku environment reference: https://github.com/rockingdingo/gym-gomoku/blob/master/gym_gomoku/envs/gomoku.py
### DQN reference: https://github.com/doublejtoh/GomokuAI/blob/master/dqn.py

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

from environment.ReplayMemory import ReplayMemory
from environment.GomokuEnvironmentRewardTen import GomokuEnvironmentRewardTen
import math
import os
import random

# Hyperparameters shared by the cells below, grouped by what they control.

# --- Network dimensions ---
num_states = 100        # width of the state input placeholder
num_actions = 100       # width of the network output layer
hidden_units = 1000     # width of every hidden layer

# --- Optimisation ---
learning_rate = 0.2     # step size handed to the gradient-descent optimizer
batch = 50              # batch size requested from replay memory; also scales the cost
gamma = 0.9             # passed to ReplayMemory (presumably the discount factor -- TODO confirm)
mem_threshold = 30000   # passed to ReplayMemory (presumably its capacity -- TODO confirm)

# --- Exploration schedule ---
epsilon_disRate = 0.999 # epsilon is multiplied by this after every move
min_epsilon = 0.1       # epsilon stops decaying once it is no longer above this value

# --- Game loop ---
num_epoch = 100         # number of games played per call to playGame
winning_reward = 1      # not referenced by the code visible in this notebook

Instructions for updating:
Colocations handled automatically by placer.


Using TensorFlow backend.


## Define the network

In [2]:
# Build the network graph: a fully connected net that maps a state vector of
# width num_states to num_actions outputs through three ReLU layers.
def _dense_params(fan_in, fan_out):
    """Create the trainable parameters of one fully connected layer.

    The weight variable is created before the bias variable, so the
    auto-generated TensorFlow variable names come out in the same order as
    when each layer is written out by hand.

    Returns:
        (weights, bias): weights has shape [fan_in, fan_out] and is drawn from
        a truncated normal with stddev 1/sqrt(fan_in); bias has shape
        [fan_out] and is drawn with stddev 0.01.
    """
    weight_init = tf.truncated_normal([fan_in, fan_out], stddev = 1.0 / math.sqrt(float(fan_in)))
    weights = tf.Variable(weight_init)
    bias_init = tf.truncated_normal([fan_out], stddev = 0.01)
    bias = tf.Variable(bias_init)
    return weights, bias


# Network input: one row per state.
X = tf.placeholder(tf.float32, [None, num_states])

# Layer 1: state -> hidden
weight1, bias1 = _dense_params(num_states, hidden_units)
input_layer = tf.nn.relu(tf.matmul(X, weight1) + bias1)

# Layer 2: hidden -> hidden
weight2, bias2 = _dense_params(hidden_units, hidden_units)
hidden_layer = tf.nn.relu(tf.matmul(input_layer, weight2) + bias2)

# Layer 3: hidden -> hidden
weight3, bias3 = _dense_params(hidden_units, hidden_units)
hidden_layer2 = tf.nn.relu(tf.matmul(hidden_layer, weight3) + bias3)

# Output layer: linear, one value per action.
weight4, bias4 = _dense_params(hidden_units, num_actions)
output_layer = tf.matmul(hidden_layer2, weight4) + bias4

# Training target and loss: summed squared error divided by twice the batch size.
Y = tf.placeholder(tf.float32, [None, num_actions])
squared_error = tf.square(Y - output_layer)
cost = tf.reduce_sum(squared_error) / (2 * batch)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)


# Stone / player identifiers used by the game loop.
empty, BlackStone, whiteStone = 0, 1, 2

## Function for training

In [3]:
def playGame(env, memory, sess, saver, epsilon, iteration):
    """Play ``num_epoch`` self-play games, training the network after every move.

    Black moves first and the two colours then alternate. Every move is chosen
    epsilon-greedily, stored in the replay memory, and followed by one
    gradient step on a batch drawn from that memory. A checkpoint is written
    after every tenth game (except the first).

    Reads the module-level names ``num_epoch``, ``min_epsilon``,
    ``epsilon_disRate``, ``batch``, ``num_actions``, ``num_states``,
    ``BlackStone``, ``whiteStone`` and the graph tensors ``X``, ``Y``,
    ``output_layer``, ``optimizer`` and ``cost``.

    Args:
        env: game environment; must provide ``reset``, ``getState``,
            ``getStateInverse``, ``getActionRandom``, ``getAction`` and
            ``makeMove``.
        memory: replay memory; must provide ``remember`` and ``getBatch``.
        sess: TensorFlow session used for action selection and training.
        saver: ``tf.train.Saver`` used to write checkpoints.
        epsilon: initial exploration probability. It is decayed locally on
            every move; the decayed value is not handed back to the caller.
        iteration: index of this call, used only to label progress output.

    Returns:
        Percentage (0-100) of the games in which a move by black earned a
        reward of exactly 1, or 0.0 when ``num_epoch`` is not positive.
    """
    if num_epoch <= 0:
        # No games are played, so there is no win rate to compute; without
        # this guard the loop index would be unbound at the return below.
        return 0.0

    checkpoint_path = os.getcwd() + "/GomokuModel.ckpt"
    num_wins = 0

    for i in range(num_epoch):
        env.reset()

        err = 0
        gameOver = False
        currentPlayer = BlackStone

        while not gameOver:
            # Black uses the plain state, white the inverse one (presumably
            # the board with colours swapped -- TODO confirm in the env).
            if currentPlayer == BlackStone:
                currentState = env.getState()
            else:
                currentState = env.getStateInverse()

            # Epsilon-greedy choice. random.random() is uniform on [0, 1), so
            # the exploration probability is exactly epsilon.
            if random.random() < epsilon:
                action = env.getActionRandom()
            else:
                action = env.getAction(sess, currentState)

            # Decay exploration once per move until the floor is reached.
            if epsilon > min_epsilon:
                epsilon = epsilon * epsilon_disRate

            next_state, reward, gameOver = env.makeMove(currentPlayer, action)

            # A reward of 1 on black's move is counted as a black win.
            if reward == 1 and currentPlayer == BlackStone:
                num_wins += 1

            # Learning: store the transition, then take one training step.
            memory.remember(currentState, action, reward, next_state, gameOver)
            inputs, targets = memory.getBatch(output_layer, batch, num_actions, num_states, sess, X)
            _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: targets})
            err += loss

            # Hand the turn to the other colour.
            currentPlayer = whiteStone if currentPlayer == BlackStone else BlackStone

        # Separate the outer iteration from the game index so that, e.g.,
        # iteration 1 / game 12 cannot be confused with iteration 11 / game 2.
        win_ratio = float(num_wins) / float(i + 1) * 100
        print("num_epoch %s-%s: err = %s: Win count = %s Win ratio = %s"
              % (iteration, i, err, num_wins, win_ratio))

        # Debug output: targets of the last training batch of this game.
        print(targets)

        if i % 10 == 0 and i != 0:
            save_path = saver.save(sess, checkpoint_path)
            print("Model saved in file: %s" % save_path)

    return float(num_wins) / float(num_epoch) * 100

## Run and Train

In [4]:
# Training driver: build the environment and replay memory, restore a
# checkpoint when one exists, then call playGame repeatedly and log the win
# rate of every call to a CSV file.
print("Training new model")

# Instantiate the environment
env = GomokuEnvironmentRewardTen(10)

# Instantiate replay memory
memory = ReplayMemory(10, mem_threshold, gamma)

# Initialize tensorflow
sess = tf.Session()
try:
    sess.run(tf.global_variables_initializer())

    # Saver
    saver = tf.train.Saver()

    # Load model: the ".index" file is used as the marker that a checkpoint exists.
    checkpoint_path = os.getcwd() + "/GomokuModel.ckpt"
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
        print('Saved model is loaded!')

    # Make sure the output directory exists up front, so that a full round of
    # training is not lost to a missing-directory error on the first CSV write.
    results_path = 'results/list2.csv'
    results_dir = os.path.dirname(results_path)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    # Playing the game
    iteration = 0
    winRateList = []
    for x in range(9000000):
        # NOTE(review): epsilon is passed as 1 on every call and playGame only
        # decays a local copy, so exploration restarts from fully random each
        # round -- confirm this is intended.
        winRate = playGame(env, memory, sess, saver, 1, iteration)
        winRateList.append(winRate)
        print(winRateList)
        iteration += 1

        # The whole history is rewritten after every round so the file is
        # always current if the run is interrupted. The column header
        # spelling is kept as-is because it is part of the file's schema.
        df = pd.DataFrame(winRateList, columns=["colummn"])
        df.to_csv(results_path, index=False)
finally:
    # close session, including when training is interrupted
    # (e.g. by KeyboardInterrupt) or fails
    sess.close()

Training new model
num_epoch 00: err = 0.687717353567: Win count = 1 Win ratio = 100.0
[[ 0.82583249  0.7722047   0.42870101 ..., -0.03959448 -0.18507947
  -0.04592571]
 [ 0.69334608  0.67293453  0.20283471 ...,  0.15863803 -0.14301051
  -0.15185651]
 [ 0.05044281  0.08557514  0.04213211 ..., -0.00171563  0.00386832
  -0.02603359]
 ..., 
 [ 0.61952847  0.73792946  0.29140508 ...,  0.1322259  -0.16090304
  -0.05082181]
 [ 0.26584998  0.3475492   0.17796841 ...,  0.03984526 -0.01827494
  -0.04266731]
 [ 0.52038342  0.52105498  0.26403543 ...,  0.05762331 -0.07130639
  -0.02805105]]


KeyboardInterrupt: 