# Declare global variables

### Gomoku environment implemented with reference to https://github.com/rockingdingo/gym-gomoku/blob/master/gym_gomoku/envs/gomoku.py
### DQN implemented with reference to https://github.com/doublejtoh/GomokuAI/blob/master/dqn.py

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

from environment.ReplayMemory import ReplayMemory
from environment.GomokuEnvironmentTen import GomokuEnvironmentTen
import math
import os
import random
# Hyper-parameters shared by the network definition, the replay memory and
# the training loop further down in this file.

# --- Network dimensions ---
# One network input and one output per board cell
# (presumably 10 x 10 = 100 cells -- confirm against the environment).
num_states = 100
num_actions = 100
hidden_units = 1000       # width of each hidden layer

# --- Replay memory / optimisation ---
mem_threshold = 30000     # second constructor argument of ReplayMemory (presumably its capacity)
batch = 50                # batch size requested from the replay memory; also scales the cost
learning_rate = 0.2       # step size of the gradient-descent optimizer
gamma = 0.99              # handed to ReplayMemory (presumably the reward discount factor)

# --- Exploration schedule ---
epsilon_disRate = 0.999   # epsilon is multiplied by this after every move
min_epsilon = 0.1         # epsilon stops decaying once it is no longer above this value

# --- Training loop ---
num_epoch = 100           # number of games played per playGame() call
winning_reward = 1        # not referenced in the visible part of this file

## Set the network

In [6]:
# Build the Q-network graph: a fully connected network with three ReLU
# layers of width hidden_units and a linear output of size num_actions.
def _make_layer_params(fan_in, fan_out):
    """Create the weight and bias variables of one fully connected layer.

    The weight matrix has shape [fan_in, fan_out] and is initialised from a
    truncated normal with stddev 1 / sqrt(fan_in); the bias has shape
    [fan_out] and is initialised from a truncated normal with stddev 0.01.
    The weight variable is created before the bias variable, which keeps
    the auto-generated variable names in the same order as a hand-written
    weight/bias pair per layer.
    """
    weight_init = tf.truncated_normal([fan_in, fan_out], stddev = 1.0 / math.sqrt(float(fan_in)))
    weight = tf.Variable(weight_init)
    bias = tf.Variable(tf.truncated_normal([fan_out], stddev = 0.01))
    return weight, bias


# Network input: one row per state, num_states features each.
X = tf.placeholder(tf.float32, [None, num_states])

# Layer 1: num_states -> hidden_units
weight1, bias1 = _make_layer_params(num_states, hidden_units)
input_layer = tf.nn.relu(tf.matmul(X, weight1) + bias1)

# Layer 2: hidden_units -> hidden_units
weight2, bias2 = _make_layer_params(hidden_units, hidden_units)
hidden_layer = tf.nn.relu(tf.matmul(input_layer, weight2) + bias2)

# Layer 3: hidden_units -> hidden_units
weight3, bias3 = _make_layer_params(hidden_units, hidden_units)
hidden_layer2 = tf.nn.relu(tf.matmul(hidden_layer, weight3) + bias3)

# Output layer: hidden_units -> num_actions, no activation.
weight4, bias4 = _make_layer_params(hidden_units, num_actions)
output_layer = tf.matmul(hidden_layer2, weight4) + bias4

# Training target: one value per action for every row of the batch.
Y = tf.placeholder(tf.float32, [None, num_actions])

# Sum of squared errors over the whole batch, divided by 2 * batch.
squared_error = tf.square(Y - output_layer)
cost = tf.reduce_sum(squared_error) / (2 * batch)

# Plain gradient descent on the cost above.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)


# Integer codes for the stones. BlackStone and whiteStone identify the
# player to move in playGame(); empty is not referenced in the visible part
# of this file (presumably it marks an unoccupied board cell).
empty, BlackStone, whiteStone = 0, 1, 2

## Function for training

In [7]:
def playGame(env, memory, sess, saver, epsilon, iteration):
    """Play num_epoch self-play games, training the network after every move.

    Black and white alternate moves, both chosen epsilon-greedily by the
    same network. After each move the transition is stored in the replay
    memory, a batch is drawn from it, and one optimizer step is run.

    Args:
        env: game environment; must provide reset(), getState(),
            getStateInverse(), getActionRandom(), getAction(sess, state)
            and makeMove(player, action) returning
            (next_state, reward, gameOver).
        memory: replay memory providing remember(...) and getBatch(...).
        sess: TensorFlow session used for inference and training.
        saver: TensorFlow saver used to write checkpoints.
        epsilon: initial exploration probability. It decays by
            epsilon_disRate after every move while it is above min_epsilon.
            The decayed value is local to this call and is not returned.
        iteration: index of this call; only used to label log output.

    Returns:
        Percentage (0-100) of the games played in this call that black won,
        or 0.0 when num_epoch is 0 and no game was played.
    """
    num_wins = 0
    games_played = 0

    for i in range(num_epoch):
        env.reset()

        err = 0
        gameOver = False
        currentPlayer = BlackStone

        while not gameOver:
            # White is given the inverse state instead of the plain one
            # (presumably the board with the colours swapped, so the network
            # always sees the position from the mover's side -- confirm in
            # the environment).
            if currentPlayer == BlackStone:
                currentState = env.getState()
            else:
                currentState = env.getStateInverse()

            # Epsilon-greedy move selection: explore with probability
            # epsilon, otherwise let the network choose.
            if random.random() < epsilon:
                action = env.getActionRandom()
            else:
                action = env.getAction(sess, currentState)

            # Exploration decays once per move, not once per game.
            if epsilon > min_epsilon:
                epsilon = epsilon * epsilon_disRate

            next_state, reward, gameOver = env.makeMove(currentPlayer, action)

            # Only black's wins count towards the reported win ratio.
            if reward == 1 and currentPlayer == BlackStone:
                num_wins += 1

            # Learning: store the transition, then train on one batch.
            memory.remember(currentState, action, reward, next_state, gameOver)
            inputs, targets = memory.getBatch(output_layer, batch, num_actions, num_states, sess, X)
            _, loss = sess.run([optimizer, cost], feed_dict = {X: inputs, Y: targets})
            err = err + loss

            # Hand the move over to the other player.
            currentPlayer = whiteStone if currentPlayer == BlackStone else BlackStone

        games_played = i + 1
        win_ratio = float(num_wins) / float(games_played) * 100

        # Iteration and game index are separated by '-' so that, for example,
        # iteration 1 / game 10 and iteration 11 / game 0 print differently.
        print("num_epoch %s-%s: err = %s: Win count = %s Win ratio = %s"
              % (iteration, i, err, num_wins, win_ratio))

        # Targets of the last training batch of this game (debug output).
        print(targets)

        # Checkpoint every 10th game, skipping the very first one.
        if i % 10 == 0 and i != 0:
            save_path = saver.save(sess, os.getcwd() + "/GomokuModel.ckpt")
            print("Model saved in file: %s" % save_path)

    # No game played (num_epoch == 0): report 0.0 instead of failing on an
    # unbound loop variable.
    if games_played == 0:
        return 0.0
    return float(num_wins) / float(games_played) * 100

## Run and Train

In [8]:
# Instantiate the environment
env = GomokuEnvironmentTen(10)

# Instantiate replay memory
memory = ReplayMemory(10, mem_threshold, gamma)

# Initialize tensorflow. Everything that uses the session runs inside
# try/finally so the session is closed even when training is stopped with
# Ctrl-C or fails with an exception.
sess = tf.Session()
try:
    sess.run(tf.global_variables_initializer())

    # Saver
    saver = tf.train.Saver()

    # Load a previously saved model, if a checkpoint index file exists.
    checkpoint_path = os.getcwd() + "/GomokuModel.ckpt"
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
        print('Saved model is loaded!')

    # Create the results directory up front, so that the first CSV write
    # cannot fail after a whole round of training has already been done.
    results_path = 'results/list2.csv'
    results_dir = os.path.dirname(results_path)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    # Playing the game
    # NOTE(review): epsilon is passed as 1 on every call and playGame() does
    # not return its decayed value, so exploration restarts from fully random
    # at the start of every round -- confirm this is intended.
    winRateList = []
    for iteration in range(9000000):
        winRate = playGame(env, memory, sess, saver, 1, iteration)
        winRateList.append(winRate)
        print(winRateList)

        # Rewrite the full win-rate history after every round. The column
        # name (sic) is kept as-is so existing readers of the CSV keep working.
        df = pd.DataFrame(winRateList, columns=["colummn"])
        df.to_csv(results_path, index=False)
finally:
    # close session
    sess.close()

num_epoch 00: err = 0.707643896632: Win count = 1 Win ratio = 100.0
[[ 0.94182771  0.7771973   0.86938298 ...,  0.99316537  0.15527529
   1.02199745]
 [ 0.97041172  0.83368385  1.08388126 ...,  1.05204642  0.18093157
   1.04096723]
 [ 0.94725144  0.96629345  0.76451641 ...,  0.58852738  0.23251589
   0.9564501 ]
 ..., 
 [ 0.93896919  1.02494001  0.75032663 ...,  0.47706613  0.1396396
   0.95614356]
 [ 0.78291178  0.70191133  0.58568442 ...,  0.50568449  0.03458634
   0.75904793]
 [ 0.86587226  0.8377161   0.89935178 ...,  0.69442379  0.13656621
   0.82907122]]
num_epoch 01: err = 0.314418418275: Win count = 2 Win ratio = 100.0
[[ 0.20672235  0.30257723  0.30051139 ...,  0.21775652  0.2574971
   0.31149003]
 [ 0.35031053  0.34938875  0.59141225 ...,  0.2328244   0.75594753
   0.60389972]
 [ 0.47385111  0.51055104  0.75135845 ...,  0.18886739  0.72994214
   0.88574338]
 ..., 
 [ 0.95029259  0.81123573  0.98498869 ...,  0.98451984  0.79267049
   1.06824064]
 [ 0.64831764  0.54226393  0.73

KeyboardInterrupt: 