## DDPG VS MIN MAX SIZE 3

In [6]:
import numpy as np
import copy

from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=3)

def evaluate(board):
    """
    Evaluate the current state of the board.
    Returns:
    - 1 if the maximizing player wins
    - -1 if the minimizing player wins
    - 0 if it's a tie or the game is ongoing
    """
    n = int(np.sqrt(len(board)))

    # Check rows and columns
    for i in range(n):
        if all(board[i * n + j] == 1 for j in range(n)):
            return 1
        if all(board[j * n + i] == 1 for j in range(n)):
            return 1
        if all(board[i * n + j] == 2 for j in range(n)):
            return -1
        if all(board[j * n + i] == 2 for j in range(n)):
            return -1

    # Check diagonals
    if all(board[i * (n + 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n - 1) + (n - 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n + 1)] == 2 for i in range(n)):
        return -1
    if all(board[i * (n - 1) + (n - 1)] == 2 for i in range(n)):
        return -1

    # Check for a tie
    if all(cell != 0 for cell in board):
        return 0

    # Game still ongoing
    return None

def abminimax(board, depth, alpha, beta, maximizingPlayer):
    """
    Implementation of the alpha-beta pruning minimax algorithm.
    """
    score = evaluate(board)
    # print(score)

    if score is not None:
        return score

    if maximizingPlayer:
        maxEval = -np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 1
                eval = abminimax(new_board, depth + 1, alpha, beta, False)
                maxEval = max(maxEval, eval)
                alpha = max(alpha, eval)
                if beta <= alpha:
                    break
        return maxEval
    else:
        minEval = np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 2
                eval = abminimax(new_board, depth + 1, alpha, beta, True)
                minEval = min(minEval, eval)
                beta = min(beta, eval)
                if beta <= alpha:
                    break
        return minEval

def find_best_move(board):
    """
    Finds the best move using the alpha-beta pruning minimax algorithm.
    """
    print(board)
    best_val = -np.inf
    best_move = None
    alpha = -np.inf
    beta = np.inf

    for i in range(len(board)):
        if board[i] == 0:
            new_board = copy.deepcopy(board)
            new_board[i] = 1
            move_val = abminimax(new_board, 0, alpha, beta, False)

            if move_val > best_val:
                best_move = i
                best_val = move_val
            alpha = max(alpha, move_val)

    return best_move

In [7]:
# Example of how to use the minimax algorithm with the TicTacToe environment
from stable_baselines3 import DQN, DDPG, PPO
import time
env.reset()
done = False
model = DDPG.load("ddpgmodel_3/best_model.zip")
observation = np.asarray([0] * 9)
while not done:
    # Human player's turn
    print("DDPG's turn!")
    start = time.time()
    action, _states = model.predict(observation, deterministic=True)
    print("Time taken: ", time.time() - start)
    
    observation, reward, done, trunc, _ = env.step(np.float32(action.item()))
    env.render()

    if done:
        print("Game Over!")
        break
    
    # Agent's turn using minimax
    
    print("Alpha-Beta's turn!")
    start = time.time()
    best_move = np.asarray(find_best_move(observation))
    print("Time taken: ", time.time() - start)
    observation, reward, done, _, _ = env.step(np.float32(best_move.item()))
    env.render()
    
    if done:
        print("Game Over!")
        break

    

DDPG's turn!
Time taken:  0.001592397689819336
   | | 
  ------
   | | 
  ------
   |O| 
Alpha-Beta's turn!
[0. 0. 0. 0. 0. 0. 0. 2. 0.]
Time taken:  0.3490467071533203
   |X| 
  ------
   | | 
  ------
   |O| 
DDPG's turn!
Time taken:  0.0019240379333496094
   |X| 
  ------
   | |O
  ------
   |O| 
Alpha-Beta's turn!
[0. 1. 0. 0. 0. 2. 0. 2. 0.]
Time taken:  0.03490400314331055
   |X| 
  ------
   | |O
  ------
  X|O| 
DDPG's turn!
Time taken:  0.0021255016326904297
   |X|O
  ------
   | |O
  ------
  X|O| 
Alpha-Beta's turn!
[0. 1. 2. 0. 0. 2. 1. 2. 0.]
Time taken:  0.0027532577514648438
   |X|O
  ------
   | |O
  ------
  X|O|X
DDPG's turn!
Time taken:  0.0030155181884765625
   |X|O
  ------
   |O|O
  ------
  X|O|X
Alpha-Beta's turn!
[0. 1. 2. 0. 2. 2. 1. 2. 1.]
Time taken:  0.0004611015319824219
   |X|O
  ------
  X|O|O
  ------
  X|O|X
DDPG's turn!
Time taken:  0.0011348724365234375
  O|X|O
  ------
  X|O|O
  ------
  X|O|X
Game Over!


## DDPG VS RANDOM size 3

In [8]:
import numpy as np
import copy

from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=3)

def evaluate(board):
    """
    Evaluate the current state of the board.
    Returns:
    - 1 if the maximizing player wins
    - -1 if the minimizing player wins
    - 0 if it's a tie or the game is ongoing
    """
    n = int(np.sqrt(len(board)))

    # Check rows and columns
    for i in range(n):
        if all(board[i * n + j] == 1 for j in range(n)):
            return 1
        if all(board[j * n + i] == 1 for j in range(n)):
            return 1
        if all(board[i * n + j] == 2 for j in range(n)):
            return -1
        if all(board[j * n + i] == 2 for j in range(n)):
            return -1

    # Check diagonals
    if all(board[i * (n + 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n - 1) + (n - 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n + 1)] == 2 for i in range(n)):
        return -1
    if all(board[i * (n - 1) + (n - 1)] == 2 for i in range(n)):
        return -1

    # Check for a tie
    if all(cell != 0 for cell in board):
        return 0

    # Game still ongoing
    return None

def abminimax(board, depth, alpha, beta, maximizingPlayer):
    """
    Implementation of the alpha-beta pruning minimax algorithm.
    """
    score = evaluate(board)
    # print(score)

    if score is not None:
        return score

    if maximizingPlayer:
        maxEval = -np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 1
                eval = abminimax(new_board, depth + 1, alpha, beta, False)
                maxEval = max(maxEval, eval)
                alpha = max(alpha, eval)
                if beta <= alpha:
                    break
        return maxEval
    else:
        minEval = np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 2
                eval = abminimax(new_board, depth + 1, alpha, beta, True)
                minEval = min(minEval, eval)
                beta = min(beta, eval)
                if beta <= alpha:
                    break
        return minEval

def find_best_move(board):
    """
    Finds the best move using the alpha-beta pruning minimax algorithm.
    """
    print(board)
    best_val = -np.inf
    best_move = None
    alpha = -np.inf
    beta = np.inf

    for i in range(len(board)):
        if board[i] == 0:
            new_board = copy.deepcopy(board)
            new_board[i] = 1
            move_val = abminimax(new_board, 0, alpha, beta, False)

            if move_val > best_val:
                best_move = i
                best_val = move_val
            alpha = max(alpha, move_val)

    return best_move


from stable_baselines3 import DQN, DDPG, PPO
import time
import numpy as np
from tqdm.auto import tqdm
from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=3)

model = DDPG.load("ddpgmodel_3/best_model.zip")

ddpg_won, random_won = 0, 0 
time_taken = 0
n = 100
observation = np.asarray([0] * 9)
for iter in tqdm(range(n)):
    done = False
    env.reset()

    while not done:
        # Human player's turn
        # print("DDPG's turn!")
        
        start = time.time()
        action, _states = model.predict(observation, deterministic=True)
        # print("Time taken: ", time.time() - start)
        time_taken += time.time() - start
        observation, reward_ddpg, done, trunc, _ = env.step(np.float32(action.item()))
        # env.render()
        # print("\n\n")
    
        if done:
            # print(reward)
            ddpg_won+=1
            # print("Game Over!")
            break
        
        # Agent's turn using minimax

        # print("Random's turn!")
        random_move = np.random.randint(0, 9)
        observation, reward, done, _, _ = env.step(np.float32(random_move))

        if done:
            random_won+=1
            # print("Game Over!")
            break
        
    # print("===\n\n", done, reward, reward_ddpg, trunc)
    # env.render()

print("AVG Time taken: ", time_taken / n)
print("DDPG won: ", ddpg_won, "Lost: ", random_won, "Tie: ", n - ddpg_won - random_won)


  0%|          | 0/100 [00:00<?, ?it/s]

AVG Time taken:  0.002591557502746582
DDPG won:  83 Lost:  17 Tie:  0


## DDPG VS MIN MAX size 4

In [9]:
import numpy as np
import copy

from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=4)

def evaluate(board):
    """
    Evaluate the current state of the board.
    Returns:
    - 1 if the maximizing player wins
    - -1 if the minimizing player wins
    - 0 if it's a tie or the game is ongoing
    """
    n = int(np.sqrt(len(board)))

    # Check rows and columns
    for i in range(n):
        if all(board[i * n + j] == 1 for j in range(n)):
            return 1
        if all(board[j * n + i] == 1 for j in range(n)):
            return 1
        if all(board[i * n + j] == 2 for j in range(n)):
            return -1
        if all(board[j * n + i] == 2 for j in range(n)):
            return -1

    # Check diagonals
    if all(board[i * (n + 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n - 1) + (n - 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n + 1)] == 2 for i in range(n)):
        return -1
    if all(board[i * (n - 1) + (n - 1)] == 2 for i in range(n)):
        return -1

    # Check for a tie
    if all(cell != 0 for cell in board):
        return 0

    # Game still ongoing
    return None

def abminimax(board, depth, alpha, beta, maximizingPlayer):
    """
    Implementation of the alpha-beta pruning minimax algorithm.
    """
    score = evaluate(board)
    # print(score)

    if score is not None:
        return score

    if maximizingPlayer:
        maxEval = -np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 1
                eval = abminimax(new_board, depth + 1, alpha, beta, False)
                maxEval = max(maxEval, eval)
                alpha = max(alpha, eval)
                if beta <= alpha:
                    break
        return maxEval
    else:
        minEval = np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 2
                eval = abminimax(new_board, depth + 1, alpha, beta, True)
                minEval = min(minEval, eval)
                beta = min(beta, eval)
                if beta <= alpha:
                    break
        return minEval

def find_best_move(board):
    """
    Finds the best move using the alpha-beta pruning minimax algorithm.
    """
    print(board)
    best_val = -np.inf
    best_move = None
    alpha = -np.inf
    beta = np.inf

    for i in range(len(board)):
        if board[i] == 0:
            new_board = copy.deepcopy(board)
            new_board[i] = 1
            move_val = abminimax(new_board, 0, alpha, beta, False)

            if move_val > best_val:
                best_move = i
                best_val = move_val
            alpha = max(alpha, move_val)

    return best_move

In [5]:
# Example of how to use the minimax algorithm with the TicTacToe environment
from stable_baselines3 import DQN, DDPG, PPO
import time
env.reset()
done = False
model = DDPG.load("ddpgmodel_4/best_model.zip")
observation = np.asarray([0] * 16)
while not done:
    # Human player's turn
    print("DDPG's turn!")
    start = time.time()
    action, _states = model.predict(observation, deterministic=True)
    print("Time taken: ", time.time() - start)
    
    observation, reward, done, trunc, _ = env.step(np.float32(action.item()))
    env.render()

    if done:
        print("Game Over!")
        break
    
    # Agent's turn using minimax
    
    print("Alpha-Beta's turn!")
    start = time.time()
    best_move = np.asarray(find_best_move(observation))
    print("Time taken: ", time.time() - start)
    observation, reward, done, _, _ = env.step(np.float32(best_move.item()))
    env.render()
    
    if done:
        print("Game Over!")
        break

    

DDPG's turn!
Time taken:  0.0022487640380859375
   | | | 
  --------
   | | | 
  --------
   | |O| 
  --------
   | | | 
Alpha-Beta's turn!
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0.]


KeyboardInterrupt: 

In [10]:
import numpy as np
import copy

from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=4)

def evaluate(board):
    """
    Evaluate the current state of the board.
    Returns:
    - 1 if the maximizing player wins
    - -1 if the minimizing player wins
    - 0 if it's a tie or the game is ongoing
    """
    n = int(np.sqrt(len(board)))

    # Check rows and columns
    for i in range(n):
        if all(board[i * n + j] == 1 for j in range(n)):
            return 1
        if all(board[j * n + i] == 1 for j in range(n)):
            return 1
        if all(board[i * n + j] == 2 for j in range(n)):
            return -1
        if all(board[j * n + i] == 2 for j in range(n)):
            return -1

    # Check diagonals
    if all(board[i * (n + 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n - 1) + (n - 1)] == 1 for i in range(n)):
        return 1
    if all(board[i * (n + 1)] == 2 for i in range(n)):
        return -1
    if all(board[i * (n - 1) + (n - 1)] == 2 for i in range(n)):
        return -1

    # Check for a tie
    if all(cell != 0 for cell in board):
        return 0

    # Game still ongoing
    return None

def abminimax(board, depth, alpha, beta, maximizingPlayer):
    """
    Implementation of the alpha-beta pruning minimax algorithm.
    """
    score = evaluate(board)
    # print(score)

    if score is not None:
        return score

    if maximizingPlayer:
        maxEval = -np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 1
                eval = abminimax(new_board, depth + 1, alpha, beta, False)
                maxEval = max(maxEval, eval)
                alpha = max(alpha, eval)
                if beta <= alpha:
                    break
        return maxEval
    else:
        minEval = np.inf
        for i in range(len(board)):
            if board[i] == 0:
                new_board = copy.deepcopy(board)
                new_board[i] = 2
                eval = abminimax(new_board, depth + 1, alpha, beta, True)
                minEval = min(minEval, eval)
                beta = min(beta, eval)
                if beta <= alpha:
                    break
        return minEval

def find_best_move(board):
    """
    Finds the best move using the alpha-beta pruning minimax algorithm.
    """
    print(board)
    best_val = -np.inf
    best_move = None
    alpha = -np.inf
    beta = np.inf

    for i in range(len(board)):
        if board[i] == 0:
            new_board = copy.deepcopy(board)
            new_board[i] = 1
            move_val = abminimax(new_board, 0, alpha, beta, False)

            if move_val > best_val:
                best_move = i
                best_val = move_val
            alpha = max(alpha, move_val)

    return best_move


from stable_baselines3 import DQN, DDPG, PPO
import time
import numpy as np
from tqdm.auto import tqdm
from gym_tictactoe.env import TicTacToeEnv


env = TicTacToeEnv(size=4)

model = DDPG.load("ddpgmodel_4/best_model.zip")

ddpg_won, random_won = 0, 0 
time_taken = 0
n = 100
for iter in tqdm(range(n)):
    done = False
    env.reset()
    observation = np.asarray([0] * 16)
    while not done:
        # Human player's turn
        # print("DDPG's turn!")
        
        start = time.time()
        action, _states = model.predict(observation, deterministic=True)
        # print("Time taken: ", time.time() - start)
        time_taken += time.time() - start
        observation, reward_ddpg, done, trunc, _ = env.step(np.float32(action.item()))
        # env.render()
        # print("\n\n")
    
        if done:
            # print(reward)
            random_won+=1
            # print("Game Over!")
            break
        
        # Agent's turn using minimax

        # print("Random's turn!")
        random_move = np.random.randint(0, 9)
        observation, reward, done, _, _ = env.step(np.float32(random_move))

        if done:
            ddpg_won+=1
            # print("Game Over!")
            break
        
    # print("===\n\n", done, reward, reward_ddpg, trunc)
    # env.render()

print("AVG Time taken: ", time_taken / n)
print("DDPG won: ", ddpg_won, "Lost: ", random_won, "Tie: ", n - ddpg_won - random_won)


  0%|          | 0/100 [00:00<?, ?it/s]

AVG Time taken:  0.003969688415527344
DDPG won:  69 Lost:  31 Tie:  0
