In [176]:
from tictaczero.board import Board, EMPTY, CROSS, CIRCLE
from tictaczero.player import BasePlayer, RandomPlayer, SmartPlayer, BrainPlayer
from tictaczero.games import play_a_game

from tensorflow import keras
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

import time

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [186]:
def play_x_brainy_games(x=100):
    """
    Plays x games and returns both (hopefully) smarter players.
    """
    player1 = BrainPlayer(side=CROSS)
    player2 = BrainPlayer(side=CIRCLE)
    
    for i in range(x):
        result = play_a_game(player1, player2, print_result=False)
        player1.memorize(result)
        player2.memorize(result)
        
    return player1, player2

In [193]:
%%timeit
rBplayer1, rBplayer2 = play_x_brainy_games(100)

14.9 s ± 1.63 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [191]:
rBplayer1.memory.shape

(866, 10)

In [None]:
def play_brainy_games(BPlayer1, BPlayer2, x = 100):
    player1 = BPlayer1
    player2 = BPlayer2
    
    play_a_game(player1, player2):
        
    """
    Playes x games and returns both (hopefully) smarter players/
    
    """
    
    return player1, player2

In [167]:
def play_brainy_games(BPlayer1, BPlayer2, x = 100):
    states = np.zeros(shape(1, 10), dtype = np.int32)
    for i in range(x):
        board_history = play_a_game()

SyntaxError: unexpected EOF while parsing (<ipython-input-167-8757c2b9fdcb>, line 3)

In [3]:
def play_x_games(Player1, Player2, x=100):
    states = np.zeros(shape=(1, 10), dtype=np.int32)
    for i in range(x):
        board_history = play_a_game(Player1, Player2)
        
        states = np.concatenate((states, board_history), axis = 0)
        
    return states

df = play_x_games(SmartPlayer, SmartPlayer, x=1000)

In [146]:
board = Board(3)

squares, board_states = board.next_board_states(CROSS)

In [152]:
bp = BrainPlayer(side=CROSS, max_memory=10000)

In [159]:
bp.memorize(df)

In [160]:
bp.memory.shape

(10000, 10)

In [161]:
bp.max_memory

10000

In [117]:
bp.brain.predict(np.array([[0, 1, -1, 0, 0, 0, 0, 0, 0]]))

array([[0.31411847, 0.24697386, 0.43890762]], dtype=float32)

In [133]:
df1 = df.copy()

In [138]:
df1[3:,:]

array([[ 0,  0,  0, ...,  1, -1,  0],
       [ 0,  0,  0, ...,  1, -1,  0],
       [ 0,  0,  0, ...,  1, -1,  0],
       ...,
       [-1, -1,  1, ...,  0,  1,  1],
       [-1, -1,  1, ...,  0,  1,  1],
       [-1, -1,  1, ...,  0,  1,  1]])

In [163]:
df1[-3:,:]

array([[-1, -1,  1,  1,  0,  0,  0,  0,  1,  1],
       [-1, -1,  1,  1,  0,  0, -1,  0,  1,  1],
       [-1, -1,  1,  1,  0,  1, -1,  0,  1,  1]])

In [164]:
df1

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  1,  0,  0],
       ...,
       [-1, -1,  1, ...,  0,  1,  1],
       [-1, -1,  1, ...,  0,  1,  1],
       [-1, -1,  1, ...,  0,  1,  1]])

Now we need a state.

We need an agent (player)

Agent can make an action

Agent gets a reward.

In [102]:
board = Board(3)

board.move(1, CROSS)
board.move(5, CIRCLE)

np.array([board.state])

array([[ 0,  1,  0,  0,  0, -1,  0,  0,  0]])

In [108]:
bp.brain.predict(np.array([board.state]))[0]

array([0.35162076, 0.30168402, 0.34669527], dtype=float32)

Network architecture:

Input layer:
$$B + S = 9+1 = 10 \text{ Nodes}$$

Where $B$ is the board state and $S$ is the side. 

Output layer:
$$B + S = 9+1 = 10 \text{ Nodes}$$

# OPTION 1

Have the neural network predict the move.

# OPTION 2 

Have the neural network predict the outcome of the game. And let a Q-learning agent decide.

# OPTION 2 - Network predicts outcome.


## The Network
In this section we're going to build a neural network which predicts the outcome of a game of tic tac toe give a certain board state. First we'll have to generate enough board states with outcomes.

In total there are three outcomes:

1 - CROSS WINS

-1 - CIRCLE WINS

0 - DRAW

So the output layer consists of three nodes. The input layer consists of the board state. We don't give the turn because CROSS always starts.

In [5]:
model = keras.models.Sequential([
    keras.layers.Dense(28, input_shape = [9], activation = 'relu'),
    keras.layers.Dense(28, activation = 'relu'),
    keras.layers.Dense(3, activation = 'softmax')
])

In [6]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

## Generating the data.

In total there are only $3^9=19683$ different possible tic-tac-toe positions. With mirror or rotation permutation even less. Let's try to generate 10.000 games and save each result of the game and put in the following dataframe.


In [7]:
X = df[:,:-1]
y = df[:, -1]


cut_off = 0.7

X_train = X[:int(cut_off*len(X))]
X_test = X[int(cut_off*len(X)):]
y_train = y[:int(cut_off*len(X))]
y_test = y[int(cut_off*len(X)):]

In [8]:
xtrain_tf = tf.convert_to_tensor(X_train, np.float32)
ytrain_tf = tf.convert_to_tensor(y_train, np.int32)

In [9]:
model.fit(xtrain_tf, ytrain_tf, epochs=20)

Train on 4942 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x143178d90>

In [88]:
y_pred = model.predict_classes(tf.convert_to_tensor(X_test, np.float32))

In [89]:
y_pred

array([1, 1, 1, ..., 1, 1, 3])

In [10]:
model.predict(tf.convert_to_tensor(np.array([[0,0,0,0,0,0,0,0,0]]), np.float32))

array([[0.03995128, 0.7037237 , 0.25632495]], dtype=float32)

In [11]:
model.predict(np.array([[2,2,0,0,1,0,0,1,1]]))

array([[1.8495394e-04, 9.6335566e-01, 3.6459364e-02]], dtype=float32)

In [15]:
from collections import Counter
Counter(y_train)

Counter({0: 291, 1: 3315, 2: 1336})

In [100]:
model.predict_classes(np.array([[0,0,0,0,0,0,0,0,0]]))

array([1])

In [91]:
from collections import Counter
counts = Counter(y_pred)

In [92]:
counts

Counter({1: 8678, 2: 1420, 3: 250})

In [93]:
Counter(y_test)

Counter({2: 3114, 1: 6584, 3: 650})

In [94]:
results = pd.DataFrame({"y_pred": y_pred, "y_test": y_test})

In [95]:

results = results.assign(wrong = lambda d: d["y_pred"] != d["y_test"])

In [96]:
sum(results['wrong'])/len(results['wrong'])

0.2908774642442984

In [97]:
results

Unnamed: 0,y_pred,y_test,wrong
0,1,2,True
1,1,2,True
2,1,2,True
3,2,2,False
4,2,2,False
...,...,...,...
10343,1,1,False
10344,1,1,False
10345,1,1,False
10346,1,1,False


## Q-Learning

In [24]:
import pickle

In [21]:
BOARD_ROWS = 3
BOARD_COLS = 3

class Board2:
    def __init__(self, p1, p2):
            self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
            self.p1 = p1
            self.p2 = p2
            self.isEnd = False
            self.boardHash = None
            # init p1 plays first
            self.playerSymbol = 1
            
    def getHash(self):
        self.boardHash = str(self.board.reshape(BOARD_COLS * BOARD_ROWS))
        return self.boardHash
    
    def availablePositions(self):
        positions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if self.board[i, j] == 0:
                    positions.append((i, j))
        return positions
    
    def updateState(self, position):
        self.board[position] = self.playerSymbol
        
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1
        
    def winner(self):
        # row
        for i in range(BOARD_ROWS):
            if sum(self.board[i, :]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[i, :]) == -3:
                self.isEnd = True
                return -1
            
        # col
        for j in range(BOARD_COLS):
            if sum(self.board[:, j]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[:, j]) == -3:
                self.isEnd = True
                return 1
            
        # diagonal
        diag_sum1 = sum([self.board[i, i] for i in range(BOARD_COLS)])
        diag_sum2 = sum([self.board[i, BOARD_COLS - i - 1] for i in range(BOARD_COLS)])
        diag_sum = max(abs(diag_sum1), abs(diag_sum2))
        if diag_sum == 3:
            self.isEnd = True
            if diag_sum1 == 3 or diag_sum2 == 3:
                return 1
            else:
                return -1
            
        # tie
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None
    
        # not done
        self.isEnd = False
        return None
    
    def giveReward(self):
        result = self.winner()
        # backpropagate reward
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)
            
    def play(self, rounds=100):
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
                
            while not self.isEnd:
                # Player 1
                positions = self.availablePositions()
                p1_action = self.p1.chooseAction(positions,
                                                 self.board,
                                                 self.playerSymbol)
                self.updateState(p1_action)
                board_hash = self.getHash()
                self.p1.addState(board_hash)
                
                win = self.winner()
                if win is not None:
                    # self.showBoard()
                    # ended with p1 either win or draw
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    break
                    
                else:
                    # Player 2
                    positions = self.availablePositions()
                    p2_action = self.p2.chooseAction(positions,
                                                     self.board,
                                                     self.playerSymbol)
                    self.updateState(p2_action)
                    board_hash = self.getHash()
                    self.p2.addState(board_hash)
                    
                    win = self.winner()
                    if win is not None:
                        # self.showBoard()
                        # ended with p2 either win or draw
                        self.giveReward()
                        self.p1.reset()
                        self.p2.reset()
                        self.reset()
                        break
                        
    # play against human
    def play2(self):
        while not self.isEnd:
            # Player 1
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            # take action and upate board state
            self.updateState(p1_action)
            self.showBoard()
            # check board status if it is end
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break

            else:
                # Player 2
                positions = self.availablePositions()
                p2_action = self.p2.chooseAction(positions)

                self.updateState(p2_action)
                self.showBoard()
                win = self.winner()
                if win is not None:
                    if win == -1:
                        print(self.p2.name, "wins!")
                    else:
                        print("tie!")
                    self.reset()
                    break

In [31]:
class Player:
    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []
        self.lr = 0.2
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9
        self.states_value = {}
        
    def chooseAction(self, positions, current_board, symbol):
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action
            idx = np.random.choice(len(positions))
            action = positions[idx]
            
        else:
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                next_boardHash = current_board.getHash(next_board)
                value = 0 if self.states_value.get(next_boardHash) is None else self.states_value.get(next_boardHash)
                #print("value", value)
                if value >= value_max:
                    value_max = value
                    action = p
                    
        # print(f"{self.name} takes action {action}")
        return action
    
    def feedReward(self, reward):
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr * [self.decay_gamma * reward - self.states_value[st]]
            reward = self.states_value[st]
            
    # accept a state
    def addState(self, state):
        self.states.append(state)
            
    def savePolicy(self):
        fw = open('policy_' + str(self.name), 'wb')
        pickle.dump(self.states_value, fw)
        fw.close()
        
    def loadPolicy(self, file):
        fr = open(file, 'rb')
        self.state_value = pickle.load(fr)
        fr.close()
        

In [23]:
class HumanPlayer:
    def __init__(self, name):
        self.name = name
        
    def chooseAction(self, positions):
        while True:
            row = int(input("Input your action row: "))
            col = int(input("Input your action col:"))
            action = (row, col)
            if action in positions:
                return action
            
    def addState(self, state):
        pass
    
    def feedReward(self, reward):
        pass
    
    def reset(self):
        pass

In [32]:
p1 = Player('Player1')
p2 = Player('Player2')

board = Board2(p1, p2)

In [33]:
board

<__main__.Board2 at 0x122229d10>

In [75]:
model

NameError: name 'model' is not defined