In [1]:
import agent
import environment

def play_round(player1: agent.Player, player2: agent.Player, learn: bool = False) -> tuple:
    """Play one game between two agents and optionally train them on the outcome.

    A fresh ``environment.Board`` is created and the agents alternate moves
    until the board reports a status other than ``GameStatus.ONGOING``.

    Args:
        player1: Agent that moves whenever ``board.current_player()`` equals 1.
        player2: Agent that moves on every other turn.
        learn: When true, ``train`` is called once on each agent after the
            final move, with that agent's end-of-game reward.

    Returns:
        A ``(board.state, board.winner)`` tuple taken from the finished board.
    """
    board = environment.Board()
    # Rewards are keyed by role everywhere below. The earlier revision
    # initialised the dict by role but then wrote/read it by Player object,
    # which left the initial entries dead and required Player to be hashable.
    rewards = {player1.role: 0, player2.role: 0}

    while board.check_winner() == environment.GameStatus.ONGOING:
        current_player_agent = player1 if board.current_player() == 1 else player2
        # Captured before the move is applied.
        # NOTE(review): if ``board.state`` returns the board's internal mutable
        # object rather than a copy, this name will reflect the post-move
        # position by the time ``train`` is called — confirm in environment.Board.
        current_state = board.state

        chosen_move = current_player_agent.make_move(
            current_state,
            board.available_moves(),
            current_player_agent.exploration_rate,
        )
        game_status = board.make_move(chosen_move)

        if game_status != environment.GameStatus.ONGOING:
            rewards[player1.role] = board.reward(player1.role)
            rewards[player2.role] = board.reward(player2.role)

            if learn:
                # Both agents are trained on the same state (the one seen by
                # the player who made the final move), each with its own reward.
                player1.train(current_state, rewards[player1.role])
                player2.train(current_state, rewards[player2.role])

            # Leave explicitly instead of relying on check_winner() agreeing
            # with the status returned by make_move().
            break

    return board.state, board.winner

In [3]:
# Training configuration, hoisted out of the loop so a reader can tune a run
# without hunting for literals.
NUM_EPISODES = 100000
# Value handed to update_exploration_rate() after a win.
# NOTE(review): presumably a multiplicative decay factor — confirm in agent.Player.
EXPLORATION_DECAY = 0.99
# Print every LOG_EVERY-th episode. 1 reproduces the original per-episode log;
# raise it (e.g. 1000) to avoid flooding the cell output with NUM_EPISODES lines.
LOG_EVERY = 1

player1 = agent.Player(role=1)
player2 = agent.Player(role=-1)

for episode in range(NUM_EPISODES):
    board, winner = play_round(player1, player2, learn=True)
    if (episode + 1) % LOG_EVERY == 0:
        print(f"Episode {episode+1}, Board: {board}, Winner: {winner}")

    # Only the winner's exploration rate is updated; when `winner` matches
    # neither role (e.g. a draw) both agents are left unchanged.
    if winner == player1.role:
        player1.update_exploration_rate(EXPLORATION_DECAY)
    elif winner == player2.role:
        player2.update_exploration_rate(EXPLORATION_DECAY)

Episode 1, Board: [1, 0, 1, 1, -1, -1, 1, 0, -1], Winner: 1
Episode 2, Board: [1, 1, 1, 1, -1, -1, -1, 1, -1], Winner: 1
Episode 3, Board: [-1, -1, -1, 1, 1, 0, 1, -1, 1], Winner: -1
Episode 4, Board: [1, -1, -1, -1, 1, 1, 1, -1, 1], Winner: 1
Episode 5, Board: [1, 0, 1, -1, 1, -1, 1, 0, -1], Winner: 1
Episode 6, Board: [0, -1, 1, 1, 1, 0, 1, -1, -1], Winner: 1
Episode 7, Board: [0, 0, -1, -1, 1, -1, 1, 1, 1], Winner: 1
Episode 8, Board: [1, 0, 1, 1, -1, 0, 1, -1, -1], Winner: 1
Episode 9, Board: [1, 1, -1, 1, 0, 1, -1, -1, -1], Winner: -1
Episode 10, Board: [0, 0, 1, 0, 1, -1, 1, 0, -1], Winner: 1
Episode 11, Board: [1, 1, -1, 1, -1, -1, -1, 0, 1], Winner: -1
Episode 12, Board: [1, 0, -1, 1, 0, 0, 1, 0, -1], Winner: 1
Episode 13, Board: [1, -1, 0, -1, 0, -1, 1, 1, 1], Winner: 1
Episode 14, Board: [0, -1, 1, -1, 1, 1, 1, 0, -1], Winner: 1
Episode 15, Board: [-1, -1, -1, 1, 0, 1, 0, 1, 0], Winner: -1
Episode 16, Board: [-1, 0, 1, -1, 1, 0, 1, -1, 1], Winner: 1
Episode 17, Board: [-1, 0,