In [1]:
# from pettingzoo.classic import tictactoe_v3
# import numpy as np

# class TicTacToeAgent:
#     def __init__(self):
#         self.name = 'TicTacToeAgent'
#         self.env = tictactoe_v3.env()
#         self.env.reset()
#         self.action_space = [0,1,2,3,4,5,6,7,8]

#     def reset(self):
#         self.env.reset()

#     def act(self, observation, player):
#         board, action_mask = observation['observation'], observation['action_mask']
#         legal_moves = [i for i in range(9) if action_mask[i] == 1]
        
#         # Check for win or block
#         for move in legal_moves:
#             if self.is_winning_move(board, move, player):
#                 return move
#             if self.is_winning_move(board, move, 1 - player):
#                 return move

#         # If no win or block move, pick a random legal move
#         return np.random.choice(legal_moves)

#     def is_winning_move(self, board, move, player):
#         temp_board = board.copy()
#         temp_board[move // 3, move % 3, player] = 1
#         return self.is_winner(temp_board, player)

#     def is_winner(self, board, player):
#         # Check rows, columns and diagonals for a win
#         for i in range(3):
#             if all(board[i,:,player] == 1) or all(board[:,i,player] == 1):
#                 return True
#         if board[0,0,player] == board[1,1,player] == board[2,2,player] == 1 or board[0,2,player] == board[1,1,player] == board[2,0,player] == 1:
#             return True
#         return False

# # Example usage
# agent = TicTacToeAgent()
# # This is how you would typically use it in a PettingZoo loop
# for agent_id in agent.env.agent_iter():
#     observation, _, _, _, _ = agent.env.last()
#     if agent_id == agent.name:
#         action = agent.act(observation, agent.env.agent_selection)
#         agent.env.step(action)
#     else:
#         agent.env.step(None)


In [2]:
# import numpy as np
# from pettingzoo.classic import tictactoe_v3

# class Agent:
#     def __init__(self):
#         self.env = tictactoe_v3.env()
#         self.env.reset()
#         self.action_space = [0, 1, 2, 3, 4, 5, 6, 7, 8]

#     def reset(self):
#         self.env.reset()
        
#     def observe(self, observation, reward, termination, truncation, info):
#         self.current_observation = observation
#         obs_message = f"Observation: {observation}"
#         return obs_message

#     def act(self, observation, player):
#         board, action_mask = observation['observation'], observation['action_mask']
#         valid_actions = np.where(action_mask == 1)[0]

#         # Check for potential wins or blocks
#         for action in valid_actions:
#             simulated_board = self.simulate_action(board, action, player)
#             if self.is_winner(simulated_board, player):
#                 return action

#         # If no immediate win or block, choose a random valid action
#         return np.random.choice(valid_actions)

#     def simulate_action(self, board, action, player):
#         simulated_board = np.copy(board)
#         row, col = divmod(action, 3)
#         simulated_board[row, col, player] = 1
#         return simulated_board

#     def is_winner(self, board, player):
#         # Check rows, columns, and diagonals for a win
#         for i in range(3):
#             if np.all(board[i, :, player] == 1) or np.all(board[:, i, player] == 1):
#                 return True
#         if board[0, 0, player] == board[1, 1, player] == board[2, 2, player] == 1:
#             return True
#         if board[0, 2, player] == board[1, 1, player] == board[2, 0, player] == 1:
#             return True
#         return False

# # Example of how to use the Agent
# def simulate(agents, env):
#     env.reset()
#     Total_reward = 0
    
#     for agent_name in env.agent_iter():
#         observation, reward, termination, truncation, info = env.last()
#         Total_reward += reward
        
#         obs_message = agents[agent_name].observe(
#             observation, reward, termination, truncation, info
#         )
#         # print(obs_message)
#         if termination or truncation:
#             action = None
#         else:
#             action = agents[agent_name].act(observation)
#         print(f"Action: {action}")
#         env.step(action)
#     env.close()
#     print("Total reward: ", Total_reward)

# # Initialize environment and agents
# env = tictactoe_v3.env()
# agents = {'player_1': Agent(), 'player_2': Agent()}

# # Simulate the game
# simulate(agents, env)


### Blocking Strategy

In [1]:
from pettingzoo.classic import tictactoe_v3
import numpy as np

class Agent:
    def __init__(self):
        self.observation_space = None
        self.action_space = None
        self.current_observation = None

    def reset(self):
        self.current_observation = None

    def observe(self, observation, reward, termination, truncation, info):
        self.current_observation = observation
        obs_message = f"Observation: {observation}"
        return obs_message

    def act(self):
        # Simple strategy: first try to win, then block, else random
        action = self.find_winning_move()
        if action is None:
            action = self.find_blocking_move()
        if action is None:
            action = self.random_move()
        return action

    def find_winning_move(self):
        return self.find_best_move(is_winning_move=True)

    def find_blocking_move(self):
        return self.find_best_move(is_winning_move=False)

    def find_best_move(self, is_winning_move):
        board, action_mask = self.current_observation['observation'], self.current_observation['action_mask']
        for action in range(9):
            if action_mask[action]:
                simulated_board = self.simulate_move(board, action, is_winning_move)
                if self.is_winner(simulated_board, is_winning_move):
                    return action
        return None

    def random_move(self):
        action_mask = self.current_observation['action_mask']
        legal_actions = [action for action in range(9) if action_mask[action]]
        return np.random.choice(legal_actions) if legal_actions else None

    def simulate_move(self, board, action, is_winning_move):
        simulated_board = np.copy(board)
        player_index = 0 if is_winning_move else 1
        row, col = action % 3, action // 3
        simulated_board[row, col, player_index] = 1
        return simulated_board

    def is_winner(self, board, is_winning_move):
        player_index = 0 if is_winning_move else 1
        # Check rows, columns, and diagonals
        for i in range(3):
            if np.all(board[:, i, player_index] == 1) or np.all(board[i, :, player_index] == 1):
                return True
        if np.all([board[i, i, player_index] == 1 for i in range(3)]) or np.all([board[i, 2-i, player_index] == 1 for i in range(3)]):
            return True
        return False

def simulate(agents, env):
    env.reset()
    Total_reward = 0
    
    for agent_name in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        Total_reward += reward
        
        obs_message = agents[agent_name].observe(
            observation, reward, termination, truncation, info
        )
        # print(obs_message)
        if termination or truncation:
            action = None
        else:
            action = agents[agent_name].act()
        print(f"Action: {action}")
        env.step(action)
    env.close()
    print("Total reward: ", Total_reward)

# Initialize environment and agents
env = tictactoe_v3.env()
agents = {'player_1': Agent(), 'player_2': Agent()}

# Simulate the game
simulate(agents, env)


Action: 6
Action: 5
Action: 4
Action: 1
Action: 7
Action: 3
Action: 8
Action: None
Action: None
Total reward:  0


### MinMax

In [7]:
from pettingzoo.classic import tictactoe_v3
import numpy as np

class Agent:
    def __init__(self, mark):
        self.mark = mark
        self.opponent_mark = 'X' if mark == 'O' else 'O'
        self.action_space = list(range(9))
        self.board = np.zeros((3, 3), dtype=str)

    def act(self):
        best_score = -float('inf')
        best_action = None

        legal_actions = [action for action in self.action_space if self.is_legal_action(action)]

        for action in legal_actions:
            self.make_move(action, self.mark)
            score = self.minimax(0, False)
            self.undo_move(action)
            if score > best_score:
                best_score = score
                best_action = action

        return best_action if best_action is not None else np.random.choice(legal_actions)

    def reset(self):
        self.board = np.zeros((3, 3), dtype=str)

    def observe(self, observation, reward, termination, truncation, info):
        self.update_board(observation)
        return f"Observation: {observation}, Reward: {reward}"

    def minimax(self, depth, is_maximizing):
        if self.check_winner(self.mark):
            return 1
        elif self.check_winner(self.opponent_mark):
            return -1
        elif np.all(self.board != ''):
            return 0

        if is_maximizing:
            best_score = -float('inf')
            for action in self.action_space:
                if self.is_legal_action(action):
                    self.make_move(action, self.mark)
                    score = self.minimax(depth + 1, False)
                    self.undo_move(action)
                    best_score = max(best_score, score)
            return best_score
        else:
            best_score = float('inf')
            for action in self.action_space:
                if self.is_legal_action(action):
                    self.make_move(action, self.opponent_mark)
                    score = self.minimax(depth + 1, True)
                    self.undo_move(action)
                    best_score = min(best_score, score)
            return best_score

    def is_legal_action(self, action):
        x, y = divmod(action, 3)
        return self.board[x, y] == ''

    def make_move(self, action, mark):
        x, y = divmod(action, 3)
        self.board[x, y] = mark

    def undo_move(self, action):
        x, y = divmod(action, 3)
        self.board[x, y] = ''

    def update_board(self, observation):
        player_plane, opponent_plane = np.array(observation['observation']).reshape((2, 3, 3))
        self.board = np.where(player_plane == 1, self.mark, '')
        self.board = np.where(opponent_plane == 1, self.opponent_mark, self.board)


    def check_winner(self, mark):
        for row in self.board:
            if np.all(row == mark):
                return True
        for col in self.board.T:
            if np.all(col == mark):
                return True
        if np.all(np.diag(self.board) == mark) or np.all(np.diag(np.fliplr(self.board)) == mark):
            return True
        return False

# Instantiate the environment and agents
env = tictactoe_v3.env()
agents = {'player_1': Agent('X'), 'player_2': Agent('O')}

def simulate(agents, env):
    env.reset()
    
    for agent_name in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        obs_message = agents[agent_name].observe(
            observation, reward, termination, truncation, info
        )
        print(obs_message)
        if termination or truncation:
            action = None
        else:
            action = agents[agent_name].act()
        print(f"Action: {action}")
        env.step(action)
    env.close()

simulate(agents, env)


Observation: {'observation': array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8), 'action_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int8)}, Reward: 0
Action: 0
Observation: {'observation': array([[[0, 1],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8), 'action_mask': array([0, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int8)}, Reward: 0
Action: 0
obs['action_mask'] contains a mask of all legal moves that can be chosen.
Observation: {'observation': array([[[1, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8), 'action_mask': array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)}, Reward: 0
Action: None
Observation: {'observation': array([[[0, 1],
        [0, 0],
        [0, 0]],

       [[0, 0],


### Only act on the next available action

In [2]:
from pettingzoo.classic import tictactoe_v3
import numpy as np

class Agent:
    def __init__(self):
        self.action_space = list(range(9))  # Possible actions from 0 to 8
        self.current_observation = None

    def reset(self):
        self.current_observation = None

    def observe(self, observation, reward, termination, truncation, info):
        self.current_observation = observation
        return f"Observation: {observation}, Reward: {reward}"

    def act(self):
        # Implement a simple strategy: Choose the first available legal move
        action_mask = self.current_observation['action_mask']
        for action, legal in enumerate(action_mask):
            if legal:
                return action
        return None  # In case no legal moves are available

def simulate(agents, env):
    env.reset()
    for agent in agents.values():
        agent.reset()

    total_reward = 0
    while True:
        agent_name = env.agent_selection
        observation, reward, termination, truncation, info = env.last()
        obs_message = agents[agent_name].observe(
            observation, reward, termination, truncation, info
        )
        # print(obs_message)
        total_reward += reward
        if termination or truncation:
            action = None
        else:
            action = agents[agent_name].act()
        print(f"Action: {action}")
        env.step(action)
        if termination or truncation:
            break
    env.close()
    print("Total reward: ", total_reward)

# Initialize the environment and agents
env = tictactoe_v3.env()
agents = {'player_1': Agent(), 'player_2': Agent()}

# Simulate a game
simulate(agents, env)


Action: 0
Action: 1
Action: 2
Action: 3
Action: 4
Action: 5
Action: 6
Action: None
Total reward:  -1


In [3]:
from pettingzoo.classic import tictactoe_v3
import numpy as np

class Agent:
    def __init__(self):
        self.action_space = list(range(9))  # Possible actions from 0 to 8
        self.current_observation = None
        self.player_id = 1  # Assuming the agent is player 1 (X)

    def reset(self):
        self.current_observation = None

    def observe(self, observation, reward, termination, truncation, info):
        self.current_observation = observation
        return f"Observation: {observation}, Reward: {reward}"

    def act(self):
        board, action_mask = self.get_board_and_mask(self.current_observation)
        best_val = -np.inf
        best_move = None

        for action in self.action_space:
            if action_mask[action] == 1:
                board[action // 3, action % 3] = self.player_id
                move_val = self.minimax(board, 0, False)
                board[action // 3, action % 3] = 0  # Undo the move

                if move_val > best_val:
                    best_val = move_val
                    best_move = action

        return best_move if best_move is not None else 0  # Default to 0 if no valid move found

    def evaluate(self, b):
        # Implement the evaluation logic here
        # Return a score for the board state
        pass

    def is_moves_left(self, b):
        return np.any(b == 0)

    def minimax(self, board, depth, is_max):
        score = self.evaluate(board)

        if score == 10:
            return score
        if score == -10:
            return score
        if not self.is_moves_left(board):
            return 0

        if is_max:
            best = -np.inf
            for i in range(3):
                for j in range(3):
                    if board[i][j] == 0:
                        board[i][j] = self.player_id
                        best = max(best, self.minimax(board, depth + 1, not is_max))
                        board[i][j] = 0
            return best
        else:
            best = np.inf
            for i in range(3):
                for j in range(3):
                    if board[i][j] == 0:
                        board[i][j] = 1 - self.player_id
                        best = min(best, self.minimax(board, depth + 1, not is_max))
                        board[i][j] = 0
            return best

    def get_board_and_mask(self, observation):
        board = np.zeros((3, 3), dtype=int)
        action_mask = observation['action_mask']
        for i in range(3):
            for j in range(3):
                if observation['observation'][i][j][0] == 1:
                    board[i, j] = 1  # Player 1's mark
                elif observation['observation'][i][j][1] == 1:
                    board[i, j] = -1  # Player 2's mark
        return board, action_mask

# def simulate(agents, env):
#     env.reset()
#     for agent in agents.values():
#         agent.reset()

#     while True:
#         agent_name = env.agent_selection
#         observation, reward, termination, truncation, info = env.last()
#         obs_message = agents[agent_name].observe(observation, reward, termination, truncation, info)
#         print(obs_message)
#         if termination or truncation:
#             action = None
#         else:
#             action = agents[agent_name].act()
#         print(f"Action: {action}")
#         env.step(action)
#         if termination or truncation:
#             break
#     env.close()

# # Initialize the environment and agents
# env = tictactoe_v3.env()
# agents = {'player_1': Agent(), 'player_2': Agent()}

# # Simulate a game
# simulate(agents, env)

def simulate(agents, env):
    env.reset()
    Total_reward = 0
    
    for agent_name in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        Total_reward += reward
        
        obs_message = agents[agent_name].observe(
            observation, reward, termination, truncation, info
        )
        print(obs_message)
        if termination or truncation:
            action = None
        else:
            action = agents[agent_name].act()
        print(f"Action: {action}")
        env.step(action)
    env.close()
    print("Total reward: ", Total_reward)

# Initialize environment and agents
env = tictactoe_v3.env()
agents = {'player_1': Agent(), 'player_2': Agent()}

# Simulate the game
simulate(agents, env)

Observation: {'observation': array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8), 'action_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int8)}, Reward: 0


KeyboardInterrupt: 