In [1]:
import numpy as np
import gym
from gym import spaces


# Board dimensions shared by the environment and the agents defined below.
BOARD_ROWS, BOARD_COLS = 3, 3

ModuleNotFoundError: No module named 'numpy'

In [None]:
class TicTacToe(gym.Env):
    """Two-player tic-tac-toe environment with a gym-style ``reset``/``step`` API.

    The board is a ``(BOARD_ROWS, BOARD_COLS)`` float array in which ``1`` marks
    a cell taken by player 1 ('x'), ``-1`` a cell taken by player 2 ('o') and
    ``0`` an empty cell. Moves are ``(row, col)`` tuples as produced by
    ``available_positions``.

    Args:
        p1: Player that moves first; its ``symbol`` attribute is set to ``1``.
        p2: Player that moves second; its ``symbol`` attribute is set to ``-1``.
    """

    def __init__(self, p1, p2):
        self.observation_space = spaces.Discrete(BOARD_COLS*BOARD_ROWS)  # can also be configuration of the board - large
        self.action_space = spaces.Discrete(BOARD_COLS*BOARD_ROWS)
        self.p1 = p1
        self.p2 = p2
        # Maps a game result code to the label reported in ``info["result"]``.
        self.player_symbols = {1: "Player 1", -1: "Player 2", 0: "Tie"}
        self.p1.symbol = 1  # first player
        self.p2.symbol = -1
        # reset() creates the board and initialises ``done`` / ``current_player``.
        self.reset()

    def reset(self):
        """Clear the board, give the turn to player 1 and return the board."""
        # Rows first, then columns, so that ``board[row, col]`` (as used by
        # ``render``) stays valid even if the two constants differ.
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.done = False
        self.current_player = 1
        return self.board

    def available_positions(self):
        """Return the empty cells as a list of ``(row, col)`` tuples."""
        return list(zip(*np.where(self.board == 0)))

    def update_state(self, position):
        """Place the current player's mark on ``position`` and switch turns.

        Raises:
            ValueError: if ``position`` is not one of the currently empty
                ``(row, col)`` cells. Without this check an occupied cell
                would be silently overwritten, and a ``None`` or integer
                position would be broadcast over several cells by NumPy.
        """
        if position not in self.available_positions():
            raise ValueError(
                "Illegal move {!r}: not an empty (row, col) cell".format(position)
            )
        self.board[position] = self.current_player
        self.current_player = -self.current_player

    def check_game_status(self):
        """Evaluate the board and update ``self.done``.

        Returns:
            ``1`` if player 1 owns a complete line, ``-1`` if player 2 does,
            ``0`` if the board is full without a winner (tie), or ``None``
            while the game is still running.
        """
        n_rows, n_cols = self.board.shape
        column_sums = np.sum(self.board, 0)  # each entry adds up n_rows cells
        row_sums = np.sum(self.board, 1)     # each entry adds up n_cols cells
        diag = np.sum(np.diag(self.board))
        antidiag = np.sum(np.diag(np.fliplr(self.board)))
        diag_len = min(n_rows, n_cols)

        # A line is complete when every cell holds the same mark, i.e. its sum
        # equals +/- the line length. The lengths come from the board shape
        # rather than a literal 3 so the check follows the board constants.
        for player in (1, -1):
            if (np.any(column_sums == player * n_rows)
                    or np.any(row_sums == player * n_cols)
                    or diag == player * diag_len
                    or antidiag == player * diag_len):
                self.done = True
                return player

        if np.all(self.board):  # if board is filled
            self.done = True
            return 0  # tie

        self.done = False
        return None

    def step(self, action):
        """Apply ``action`` for the current player.

        Args:
            action: ``(row, col)`` tuple naming an empty cell.

        Returns:
            Tuple ``(board, reward, done, info)``:

            * game already over before the call: ``(board, 0, True, None)``
              and the board is left untouched;
            * move ends the game: ``reward`` is ``1``, ``-1`` or ``0`` (see
              ``check_game_status``) and ``info`` is ``{"result": label}``;
            * otherwise: ``reward`` and ``info`` are both ``None``.

        Raises:
            ValueError: if ``action`` is not a currently empty cell.
        """
        if self.done:
            return self.board, 0, True, None

        self.update_state(action)
        status = self.check_game_status()
        if status is None:
            return self.board, None, self.done, None

        info = {"result": self.player_symbols[status]}
        return self.board, status, self.done, info

    def render(self, mode="human"):
        """Print the board as an ASCII grid; ``mode`` is accepted but unused."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        # 13 dashes for the default three columns ("| x " per cell plus a closing bar).
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                out += tokens.get(self.board[i, j], '') + ' | '
            print(out)

        print(separator)
                

In [None]:
class Player:
    """Baseline agent that answers every turn with a randomly drawn legal move."""

    def __init__(self, name):
        # ``symbol`` is a placeholder here; it is modified when the player is
        # assigned to a game.
        self.name, self.symbol = name, 1

    def act(self, positions, board=None):
        """Return one entry of ``positions`` picked at random.

        ``board`` is accepted for interface compatibility and is not used.
        """
        pick = np.random.choice(len(positions))
        return positions[pick]

In [None]:
class HumanPlayer(Player):
    """Player whose moves are typed in on standard input."""

    # Row-major numbering of the 3x3 grid: 1 is cell (0, 0), 9 is cell (2, 2).
    KEY_TO_POSITION = {1: (0, 0), 2: (0, 1), 3: (0, 2),
                       4: (1, 0), 5: (1, 1), 6: (1, 2),
                       7: (2, 0), 8: (2, 1), 9: (2, 2)}

    def act(self, positions, current_board=None):
        """Prompt until the user names a cell that is in ``positions``.

        Args:
            positions: Collection of legal ``(row, col)`` moves.
            current_board: Accepted for interface compatibility; not used.

        Returns:
            The ``(row, col)`` tuple matching the number the user entered.
        """
        while True:
            raw = input("Input your action 1-9:")
            try:
                pos = self.KEY_TO_POSITION[int(raw)]
            except (ValueError, KeyError):
                # Non-numeric text or a number outside 1-9: ask again, the
                # same way an occupied cell is handled, instead of aborting
                # the game with a traceback.
                continue
            if pos in positions:
                return pos

In [None]:
def play(P1, P2, render=False):
    """Run a single game between ``P1`` (moves first) and ``P2``.

    Args:
        P1: First player; must provide ``act(positions, board)``.
        P2: Second player; same interface as ``P1``.
        render: When true, print the board before the game and after each move,
            followed by the result label once the game ends.

    Returns:
        The reward reported by the environment for the final move:
        ``1`` if ``P1`` won, ``-1`` if ``P2`` won, ``0`` for a tie.
    """
    contenders = (P1, P2)
    game = TicTacToe(P1, P2)
    game.reset()
    if render:
        game.render()

    turn = 0
    while not game.done:
        # Even turns belong to P1, odd turns to P2.
        mover = contenders[turn % 2]
        turn += 1

        move = mover.act(game.available_positions(), game.board)
        _, outcome, finished, details = game.step(move)

        if render:
            print("place ", move)
            game.render()

        if finished:
            if render:
                print(details['result'])
            return outcome


# Interactive demo: a random player opens, the human answers from the keyboard.
play(Player("P1"), HumanPlayer("P2"), render=True)

In [None]:
import matplotlib.pyplot as plt

def win_stats(P1,P2, n=10000):
    """Play ``n`` games of ``P1`` against ``P2`` and print the outcome tally.

    Each game is run through ``play`` without rendering; its return value
    (``1``, ``-1`` or ``0``) is counted as a P1 win, a P2 win or a tie.
    """
    print(f"Playing {n} games")

    outcomes = []
    for _ in range(n):
        outcomes.append(play(P1, P2))

    for label, code in (("P1 Wins: ", 1), ("P2 Wins: ", -1), ("Ties: ", 0)):
        print(label, outcomes.count(code))


# Baseline: two random players, default number of games.
win_stats(P1=Player("P1"), P2=Player("P2"))

In [None]:
import pickle

class QAgent(Player):
    """Epsilon-greedy agent that learns a value table over board positions.

    ``Q_value`` maps the string form of a board (see ``board2vec``) to a
    learned value. When exploiting, the agent simulates each legal move and
    picks the one whose resulting board has the highest stored value.

    Args:
        name: Agent name; also used as the suffix of the saved policy file.
        policy: Optional path of a pickled value table to load at start-up.
        alpha: Learning rate used by ``feed_reward`` (default ``.2``).
        gamma: Discount applied to the propagated reward (default ``.9``).
        epsilon: Probability of playing a random move instead of the greedy
            one (default ``.3``). Pass a small value to evaluate a trained
            policy without exploration noise.
    """

    def __init__(self, name, policy=None, alpha=.2, gamma=.9, epsilon=.3):
        super().__init__(name)  # sets ``name`` and the placeholder ``symbol``
        self.α = alpha
        self.γ = gamma
        self.ϵ = epsilon
        self.states = []  # save all taken positions
        self.Q_value = {} # {state-action: values}}, generated/initialized on the fly
        if policy is not None:
            self.load_policy(policy)

    def add_state(self, state):
        """Record ``state`` as visited in the current episode."""
        self.states.append(state)

    def reset(self):
        """Forget the states recorded for the current episode."""
        self.states = []

    def board2vec(self, board):
        """Return the flattened board rendered as a string (the table key)."""
        return str(board.reshape(BOARD_COLS * BOARD_ROWS))

    def max_q(self, board, positions):
        """Return the move in ``positions`` leading to the best-valued board.

        Boards missing from the table count as ``0``. On ties the last tied
        position in iteration order wins; ``None`` is returned when
        ``positions`` is empty.
        """
        action = None
        value_max = float("-inf")
        for p in positions:
            # Simulate the move on a copy so the real board is not modified.
            next_board = board.copy()
            next_board[p] = self.symbol
            action_value = self.Q_value.get(self.board2vec(next_board), 0)
            if action_value >= value_max:
                value_max = action_value
                action = p
        return action

    def act(self, positions, current_board):
        """Choose a move: random with probability ``ϵ``, otherwise greedy."""
        if np.random.uniform(0,1) <= self.ϵ:               # Explore
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            action = self.max_q(current_board, positions)  # Exploit
        return action

    # at the end of game, backpropagate and update states value
    def feed_reward(self, reward):
        """Propagate the final ``reward`` backwards through the recorded states.

        Walking from the last recorded state to the first, each value is moved
        towards ``γ * reward`` by a fraction ``α``; the updated value then
        becomes the reward seen by the preceding state.
        """
        for st in reversed(self.states):
            value = self.Q_value.get(st, 0)
            value += self.α * (self.γ * reward - value)
            self.Q_value[st] = value
            reward = value

    def save_policy(self):
        """Pickle the value table to ``policy_<name>`` in the working directory."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.Q_value, fw)

    def load_policy(self, file):
        """Replace the value table with the one pickled at ``file``."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load policy files that this notebook produced itself.
        with open(file, 'rb') as fr:
            self.Q_value = pickle.load(fr)



In [None]:
def train(n_episodes=10000):
    """Train two ``QAgent`` instances against each other and save both policies.

    Each episode is one full game. After every move the mover records the
    resulting board; when the game ends both agents receive their payoff via
    ``feed_reward`` and clear their episode history. Progress is printed every
    1000 episodes, and the two policies are written with ``save_policy`` once
    all episodes have run.
    """
    p1, p2 = QAgent("p1"), QAgent("p2")
    movers = (p1, p2)
    env = TicTacToe(p1, p2)

    # (p1 payoff, p2 payoff) for decisive results; anything else is a draw.
    decisive_payoffs = {1: (1, 0), -1: (0, 1)}
    draw_payoffs = (.1, .5)

    for episode in range(n_episodes):
        if episode % 1000 == 0:
            print("Episode {}".format(episode))

        env.reset()
        finished = False
        turn = 0
        while not finished:
            # p1 moves on even turns, p2 on odd turns.
            mover = movers[turn % 2]
            turn += 1

            move = mover.act(env.available_positions(), env.board)
            board, result, finished, _ = env.step(move)
            mover.add_state(mover.board2vec(board))

            if finished:
                p1_payoff, p2_payoff = decisive_payoffs.get(result, draw_payoffs)
                p1.feed_reward(p1_payoff)
                p2.feed_reward(p2_payoff)
                p1.reset()
                p2.reset()
        env.reset()

    p1.save_policy()
    p2.save_policy()

# Self-play training run; writes policy_p1 and policy_p2 when it finishes.
train(n_episodes=100000)

In [None]:
# Rendered demo game: trained first player against a random opponent.
play(QAgent("p1", policy='policy_p1'), Player("P2"), render=True)

In [None]:
# Evaluate the trained first-player policy against a random second player.
P1, P2 = QAgent("p1", policy='policy_p1'), Player("P2")

win_stats(P1, P2, n=1000)

In [None]:
# Evaluate the trained second-player policy against a random first player.
P1, P2 = Player("P1"), QAgent("p2", policy='policy_p2')

win_stats(P1, P2, n=5000)

In [None]:
# Interactive demo: the human opens, a random player answers.
play(HumanPlayer("P1"), Player("P2"), render=True)