In [1]:
import random
import time
import math
import csv
import copy
import numpy as np
import matplotlib.pyplot as plt
from queue import *
#final version

In [2]:
def print_board(board):
    """Print a flat, row-major 9-cell board as a labelled 3x3 grid.

    Args:
        board: indexable sequence of nine cell values; cells 0-2 form the
            first printed row, 3-5 the second and 6-8 the third.
    """
    divider = "  ------------"
    print("   0    1    2")
    print(divider)
    for row in range(3):
        first = row * 3
        cells = (board[first], board[first + 1], board[first + 2])
        # Row label and cells share the same " | " separator, including a
        # trailing one before the line break.
        print(row, *cells, sep=" | ", end=" | ")
        print("\n" + divider)

In [3]:
class Agent:
    """Tabular temporal-difference learner for 3x3 tic-tac-toe.

    Boards are flat sequences of nine cells stored row by row, where 0 is an
    empty cell and 1 / 2 are the two players' chips. Every board is encoded as
    a base-3 integer key, and learned utilities are kept in a dict keyed by
    that integer. After each update the new utility is copied to the seven
    rotated / mirrored variants of the updated board.
    """

    def __init__(self, player_number):
        """Create an untrained agent that plays chips of value ``player_number``."""
        self.utilities = {}  # state key -> estimated utility
        self.ns = {}         # state key -> number of updates applied to that state
        self.alpha = 0.3     # learning rate of the TD update
        self.epsilon = 0     # exploration probability; recomputed on every learning step
        self.gamma = 0.05    # discount applied to the successor state's utility
        self.plays = 0       # learning steps taken in the current game
        self.total_plays = 0  # learning steps accumulated over finished games
        self.game_count = 0
        self.player_number = player_number
        self.previous_state = None
        self.new_state = None
        self.state_queue = Queue()
        self.board_multiplicator = 3 ** np.arange(9)
        self.winning_lines = [
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8],
            [0, 3, 6],
            [1, 4, 7],
            [2, 5, 8],
            [0, 4, 8],
            [2, 4, 6]
        ]
        # Eight cell-index permutations (identity first, then the rotations
        # and reflections of the grid). Entry j is the base-3 exponent used
        # for cell j when encoding the board under that symmetry.
        self.perms = [
            [
                0, 1, 2,
                3, 4, 5,
                6, 7, 8
            ],
            [
                2, 5, 8,
                1, 4, 7,
                0, 3, 6
            ],
            [
                8, 7, 6,
                5, 4, 3,
                2, 1, 0
            ],
            [
                6, 3, 0,
                7, 4, 1,
                8, 5, 2
            ],
            [
                2, 1, 0,
                5, 4, 3,
                8, 7, 6
            ],
            [
                0, 3, 6,
                1, 4, 7,
                2, 5, 8
            ],
            [
                6, 7, 8,
                3, 4, 5,
                0, 1, 2
            ],
            [
                8, 5, 2,
                7, 4, 1,
                6, 3, 0
            ]
        ]
        self.permutations = [[3 ** i for i in perm] for perm in self.perms]

    def setplayer_number(self, n):
        """Change the chip value this agent plays with."""
        self.player_number = n

    def reward(self, player_number, board):
        """Return the reward of ``board`` from ``player_number``'s point of view.

        Returns:
            1 for a win, -1 for a loss, -0.04 for a draw (full board without a
            winner) and 0 while the game is still running.
        """
        winner = self.check_winner(board)
        if winner == player_number:
            return 1
        if winner == 3:
            return -0.04
        if winner != 0:
            return -1
        return 0

    def check_winner(self, board):
        """Return 1 or 2 for the winning player, 3 for a draw, 0 otherwise.

        Player 1's lines are checked before player 2's, so player 1 is
        reported if a board somehow contains a complete line for both.
        """
        for player in (1, 2):
            for a, b, c in self.winning_lines:
                if board[a] == player and board[b] == player and board[c] == player:
                    return player
        if 0 not in board:
            return 3
        return 0

    def check_possible_actions(self, state):
        """Return the indices of all empty cells of ``state`` in ascending order."""
        return [idx for idx in range(len(state)) if state[idx] == 0]

    def place_chip(self, temp_state, chip_location, player_number=None):
        """Write a chip into ``temp_state`` in place and return the same object.

        Args:
            temp_state: mutable flat board.
            chip_location: index of the cell to fill.
            player_number: chip value to write; defaults to this agent's own
                ``player_number``.
        """
        if player_number is None:
            player_number = self.player_number
        temp_state[chip_location] = player_number
        return temp_state

    def policy(self, state, utilities, epsilon, player_number):
        """Pick a cell for ``player_number`` using an epsilon-greedy rule.

        With probability ``epsilon`` a random empty cell is chosen. Otherwise
        the move whose resulting board has the highest known utility wins
        (later cells win ties); if none of the resulting boards is present in
        ``utilities`` the choice is random.

        Returns:
            The index of the chosen cell, or 0 when the board has no empty
            cell (callers are expected not to ask for a move on a full board).
        """
        actions = self.check_possible_actions(state)
        if not actions:
            return 0
        if random.random() < epsilon:
            return random.choice(actions)

        best_action = None
        best_value = float("-inf")
        for action in actions:
            # Evaluate the board as it would look after player_number moves.
            candidate = self.place_chip(np.copy(state), action, player_number)
            candidate_key = self.get_state_idx(candidate)
            if candidate_key in utilities and utilities[candidate_key] >= best_value:
                best_action = action
                best_value = utilities[candidate_key]
        if best_action is None:
            return random.choice(actions)
        return best_action

    def _encode(self, board, perm_index):
        """Encode ``board`` as a base-3 integer under symmetry ``perm_index``."""
        return int(np.sum(np.multiply(board, self.permutations[perm_index])))

    def get_state_idx(self, board):
        """Return the integer key of ``board`` (identity orientation)."""
        return self._encode(board, 0)

    def check_if_exists(self, state):
        """Make sure ``state`` has table entries and return its key.

        The visit counter and the utility are initialised independently: a
        utility may already have been copied in from a symmetric board (which
        does not touch ``ns``), and that learned value must not be reset to
        the raw reward on the state's first real visit.
        """
        key = self.get_state_idx(state)
        if key not in self.ns:
            self.ns[key] = 0
        if key not in self.utilities:
            self.utilities[key] = self.reward(self.player_number, state)
        return key

    def learning_agent(self, state):
        """Take one learning step on ``state`` and return the resulting board.

        If the game is still running from this agent's point of view, it
        chooses a move epsilon-greedily and the transition is ``state`` ->
        board after the move. If ``state`` is already terminal (the opponent
        ended the game), the transition is this agent's last resulting board
        -> ``state`` and no chip is placed. The utility of the transition's
        first board is then updated and shared with its symmetric variants.

        Returns:
            A copy of the board after this step.
        """
        self.plays += 1
        # Linear decay of the exploration rate with the steps played in
        # finished games, floored at 0.05.
        self.epsilon = max(1 - (0.0000004 * self.total_plays), 0.05)

        if self.reward(self.player_number, state) == 0:
            action = self.policy(state, self.utilities, self.epsilon, self.player_number)
            self.previous_state = np.copy(state)
            self.new_state = self.place_chip(np.copy(state), action)
        else:
            self.previous_state = np.copy(self.new_state)
            self.new_state = np.copy(state)

        current_reward = self.reward(self.player_number, self.new_state)
        new_state_key = self.check_if_exists(self.new_state)
        previous_state_key = self.check_if_exists(self.previous_state)

        self.ns[previous_state_key] += 1
        td_error = (
            current_reward
            + self.gamma * self.utilities[new_state_key]
            - self.utilities[previous_state_key]
        )
        self.utilities[previous_state_key] += self.alpha * td_error

        # Share the freshly updated value with the seven symmetric boards.
        shared_value = self.utilities[previous_state_key]
        for perm_index in range(1, len(self.permutations)):
            self.utilities[self._encode(self.previous_state, perm_index)] = shared_value

        if current_reward != 0:
            # The game is over for this agent: roll the per-game counters.
            self.game_count += 1
            self.total_plays += self.plays
            self.plays = 0

        return np.copy(self.new_state)

    def play_game(self, player_number, state):
        """Place ``player_number``'s greedy move on ``state`` in place and return it."""
        action = self.policy(state, self.utilities, 0, player_number)
        return self.place_chip(state, action, player_number)

In [4]:
def game_turn(player, board=None):
    """Ask a human for a move, place it on the board and report the winner.

    Args:
        player: chip value (1 or 2) of the human, or an object exposing
            ``player_number`` and ``check_winner`` (such as an Agent).
        board: flat, row-major 9-cell board that is modified in place. When
            omitted, the module-level ``board`` variable is used.

    Returns:
        The result of ``check_winner`` on the board after the move.

    Raises:
        NameError: if ``board`` is omitted and no module-level ``board`` exists.
    """
    chip = getattr(player, "player_number", player)
    # check_winner only inspects the board, so a throwaway Agent can act as
    # referee when a bare chip value was passed.
    referee = player if hasattr(player, "check_winner") else Agent(chip)
    if board is None:
        if "board" not in globals():
            raise NameError("game_turn needs a board argument or a module-level 'board'")
        board = globals()["board"]

    while True:
        print("Player", chip)
        try:
            row = int(input("Choose y position: (0,1,2) "))
            col = int(input("Choose x position: (0,1,2) "))
        except ValueError:
            print("Values entered are not valid, try again")
            continue
        if not (0 <= row <= 2 and 0 <= col <= 2):
            print("Out of bounds")
            continue
        cell = row * 3 + col  # the board is stored flat, row by row
        if board[cell] == 0:
            board[cell] = chip
            return referee.check_winner(board)
        print("There is already a chip in this place")
    

def game(limit=200000, report_every=10000, show_last=10):
    """Train two agents against each other through self-play.

    Player 1 always moves first. When a game ends, the agent that did not make
    the final move is given one more learning step on the terminal board so it
    can learn from the outcome.

    Args:
        limit: number of games to play.
        report_every: print progress each time this many games have been
            completed; 0 disables the reports.
        show_last: print every board of the final ``show_last`` games.

    Returns:
        The tuple ``(player1, player2)`` of trained agents.
    """
    board = np.zeros((9,), dtype=int)
    player1 = Agent(1)
    player2 = Agent(2)
    games_done = 0
    while games_done < limit:
        if report_every and games_done and games_done % report_every == 0:
            print("Games done:", games_done)
            print("epsilon player 1", player1.epsilon)
            print("epsilon player 2", player2.epsilon)
        show_boards = games_done >= limit - show_last
        while True:
            board = player1.learning_agent(board)
            if show_boards:
                print_board(board)
            if player1.check_winner(board) != 0:
                # Let player 2 learn from the final position.
                player2.learning_agent(board)
                break
            board = player2.learning_agent(board)
            if show_boards:
                print_board(board)
            if player2.check_winner(board) != 0:
                # Let player 1 learn from the final position.
                player1.learning_agent(board)
                break
        board = np.zeros((9,), dtype=int)
        games_done += 1
    return player1, player2



def play_vs_bot(player):
    """Play interactive games against a trained agent until the user quits.

    The human moves first with chip 1; ``player`` answers through
    ``play_game(2, board)``. After every finished game the board is cleared
    and the user is asked whether to quit ("y" ends the session).

    Args:
        player: agent providing ``check_winner(board)`` and
            ``play_game(player_number, board)``.
    """
    answer = "n"
    board = np.zeros((9,), dtype=int)
    while answer != "y":
        while True:
            print("Enter the coordinates where you want to place your chip")
            x = input("X: ")
            y = input("Y: ")
            try:
                x = int(x)
                y = int(y)
            except ValueError:
                print("Values entered are not valid, try again")
                continue
            # Without this check x=5, y=0 would silently land on cell 5 and
            # negative values would wrap around to the end of the board.
            if not (0 <= x <= 2 and 0 <= y <= 2):
                print("Out of bounds, try again")
                continue
            if board[y * 3 + x] == 0:
                board[y * 3 + x] = 1
            else:
                print("Theres already a chip there, try again")
                continue
            print_board(board)
            if player.check_winner(board) != 0:
                print("Game Over")
                break
            board = player.play_game(2, board)
            print_board(board)
            if player.check_winner(board) != 0:
                print("Game Over")
                break
        board = np.zeros((9,), dtype=int)
        answer = input("Quit (y/n)")

        
# Train both agents through self-play and report how long it took.
# perf_counter is a monotonic clock, so the measured interval is not
# affected by system clock adjustments during the long training run.
start = time.perf_counter()
player1, player2 = game()
end = time.perf_counter()
print("Time:", end - start)

# Play interactively against the trained second player.
play_vs_bot(player2)

Games done: 10000
epsilon player 1 0.982226
epsilon player 2 0.983378


KeyboardInterrupt: 

In [None]:
# Save player 2's utility table to CSV, one "state key, utility" row per entry.
# The with-block closes the file so all rows are flushed to disk, and
# newline="" is what the csv module expects of files it writes to.
with open("util.csv", "w", newline="") as util_file:
    w = csv.writer(util_file)
    for key, val in player2.utilities.items():
        w.writerow([key, val])
    

In [None]:
# Start an interactive session against the trained second agent. The call
# blocks on input() and only returns once "y" is entered at the quit prompt.
play_vs_bot(player2)