In [None]:
import numpy as np
import random

# Board dimensions.
ROWS = 3
COLS = 3

# Cell markers. The two players use opposite signs, so a line filled by a
# single player sums to +/- the line length (see check_winner).
EMPTY = 0
PLAYER_X = 1
PLAYER_O = -1

# Reward values.
WIN_REWARD = 1
DRAW_REWARD = 0
LOSE_REWARD = -1

# Learning hyper-parameters.
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.9
EXPLORATION_PROB = 0.1

# Q-table: a plain dict keyed by (board hash, player); keys that have never
# been written are read back as 0 by the lookups in this file.
q_table = {}

def initialize_board():
    """Return a fresh ROWS x COLS integer array filled with zeros."""
    shape = (ROWS, COLS)
    return np.zeros(shape, dtype=int)

def get_board_hash(board):
    """Return the raw bytes of ``board``'s data, usable as a dictionary key.

    Args:
        board: NumPy array holding the current position.

    Returns:
        A ``bytes`` object produced by ``board.tobytes()``.
    """
    raw_cells = board.tobytes()
    return raw_cells

def get_legal_moves(board):
    """List the index tuple of every cell of ``board`` that equals EMPTY.

    Args:
        board: NumPy array holding the current position.

    Returns:
        A list of index tuples (``(row, col)`` for a 2-D board); the list is
        empty when no cell is EMPTY.
    """
    is_empty = board == EMPTY
    # One index array per axis; zipping them pairs the axes up per cell.
    per_axis_indices = np.nonzero(is_empty)
    return list(zip(*per_axis_indices))

def make_move(board, player, row, col):
    """Write ``player``'s marker into ``board`` at (``row``, ``col``) in place.

    The cell is overwritten unconditionally; no legality check is made here.
    """
    cell = (row, col)
    board[cell] = player

def check_winner(board):
    """Report whether any row, column or diagonal is filled by one player.

    Because the two player markers have opposite signs, a line owned by a
    single player has an absolute sum equal to the line length.

    Args:
        board: NumPy array holding the current position.

    Returns:
        True if some line is complete, otherwise False. The result does not
        say which player completed it.
    """
    column_hit = np.any(np.abs(np.sum(board, axis=0)) == ROWS)
    line_hit = column_hit or np.any(np.abs(np.sum(board, axis=1)) == COLS)

    main_diagonal = np.abs(np.trace(board))
    anti_diagonal = np.abs(np.trace(np.fliplr(board)))
    diagonal_hit = main_diagonal == ROWS or anti_diagonal == ROWS

    return bool(line_hit or diagonal_hit)

def get_reward(board):
    """Map a board to one of the reward constants.

    Returns:
        WIN_REWARD when check_winner(board) is true (regardless of which
        player completed the line), DRAW_REWARD when there is no winner and
        no EMPTY cell remains, and LOSE_REWARD for every other board,
        including positions where the game is still in progress.
    """
    if check_winner(board):
        return WIN_REWARD

    board_is_full = np.all(board != EMPTY)
    return DRAW_REWARD if board_is_full else LOSE_REWARD

def get_best_move(board, player):
    """Choose a move for ``player`` with an epsilon-greedy policy over q_table.

    With probability EXPLORATION_PROB a uniformly random legal move is
    returned. Otherwise every legal move is tried on a copy of the board and
    the resulting position is looked up in ``q_table`` under
    ``(board hash after the move, player)``; missing entries count as 0.
    PLAYER_X takes the move with the highest value, any other player the move
    with the lowest value. Ties go to the first such move in
    get_legal_moves() order.

    Args:
        board: NumPy array holding the current position (not modified).
        player: Marker of the side to move.

    Returns:
        The chosen ``(row, col)`` tuple, or None when the board has no empty
        cell.
    """
    legal_moves = get_legal_moves(board)
    if not legal_moves:
        # Previously the exploration branch raised IndexError here while the
        # greedy branch returned None; always return None for a full board.
        return None

    if random.random() < EXPLORATION_PROB:
        return random.choice(legal_moves)

    # Flip the sign for the minimising side so one comparison serves both.
    sign = 1 if player == PLAYER_X else -1

    best_move = None
    best_score = -float('inf')
    for move in legal_moves:
        next_board = board.copy()
        make_move(next_board, player, *move)
        q_value = q_table.get((get_board_hash(next_board), player), 0)

        score = sign * q_value
        if score > best_score:  # strict: the first of equal moves is kept
            best_score = score
            best_move = move

    return best_move

def train_q_learning(episodes=50000):
    """Fill the module-level ``q_table`` by epsilon-greedy self-play.

    Values are stored per afterstate under the key
    ``(hash of the board after a move, player who made that move)``, which is
    exactly the key get_best_move() looks up. Every value is expressed from
    PLAYER_X's point of view, matching get_best_move(), where PLAYER_X
    maximises and PLAYER_O minimises:

    * PLAYER_X completes a line -> WIN_REWARD
    * PLAYER_O completes a line -> LOSE_REWARD
    * full board, no line       -> DRAW_REWARD

    The earlier version wrote to ``(hash before the move, player)`` keys that
    get_best_move() never reads, updated only the final move of each game,
    and rewarded a win by either side with WIN_REWARD.

    Args:
        episodes: Number of self-play games to run.

    Returns:
        None. ``q_table`` is updated in place.
    """
    for _ in range(episodes):
        current_board = initialize_board()
        current_player = PLAYER_X
        visited = []  # (afterstate hash, mover) keys, in playing order

        while True:
            # get_best_move() already explores with EXPLORATION_PROB, so no
            # additional random choice is layered on top of it here.
            action = get_best_move(current_board, current_player)
            make_move(current_board, current_player, *action)
            visited.append((get_board_hash(current_board), current_player))

            if check_winner(current_board):
                # Only the side that just moved can have completed a line.
                if current_player == PLAYER_X:
                    outcome = WIN_REWARD
                else:
                    outcome = LOSE_REWARD
                break
            if np.all(current_board != EMPTY):
                outcome = DRAW_REWARD
                break

            # Switch player
            current_player = PLAYER_X if current_player == PLAYER_O else PLAYER_O

        # Back the result up through the game: the last afterstate moves
        # toward the outcome, each earlier one toward the discounted value of
        # the afterstate that followed it.
        target = outcome
        for key in reversed(visited):
            old_value = q_table.get(key, 0)
            new_value = (1 - LEARNING_RATE) * old_value + LEARNING_RATE * target
            q_table[key] = new_value
            target = DISCOUNT_FACTOR * new_value

# Human vs. Trained Agent
def play_game():
    """Play one interactive console game: the human is X, the agent is O.

    Prints the welcome banner and the empty board, trains the agent with
    train_q_learning(), then alternates turns starting with the human until
    a line is completed or the board is full. Illegal or non-numeric input
    is rejected with a message and the human is asked again.
    """
    current_board = initialize_board()
    human_player = PLAYER_X
    # Side to move; tracked separately so human_player keeps its meaning.
    current_player = PLAYER_X

    print("Welcome to Tic Tac Toe!")
    print(current_board)

    train_q_learning()

    while True:
        if current_player == human_player:
            try:
                row = int(input("Enter row (0, 1, or 2): "))
                col = int(input("Enter column (0, 1, or 2): "))
            except ValueError:
                # Non-numeric input used to crash the game.
                print("Invalid move. Try again.")
                continue
            action = (row, col)
        else:
            # NOTE: get_best_move() is epsilon-greedy, so the agent still
            # plays a random legal move with probability EXPLORATION_PROB.
            action = get_best_move(current_board, current_player)

        if action not in get_legal_moves(current_board):
            print("Invalid move. Try again.")
            continue

        make_move(current_board, current_player, *action)
        print(current_board)

        if check_winner(current_board):
            # The side that just moved completed the line; previously
            # "You win!" was printed even when that side was the agent.
            if current_player == human_player:
                print("You win!")
            else:
                print("You lose!")
            break
        if np.all(current_board != EMPTY):
            print("It's a draw!")
            break

        # Switch player
        current_player = PLAYER_O if current_player == PLAYER_X else PLAYER_X

# Start an interactive game only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    play_game()


Welcome to Tic Tac Toe!
[[0 0 0]
 [0 0 0]
 [0 0 0]]
Enter row (0, 1, or 2): 0
Enter column (0, 1, or 2): 1
[[0 1 0]
 [0 0 0]
 [0 0 0]]
[[-1  1  0]
 [ 0  0  0]
 [ 0  0  0]]
Enter row (0, 1, or 2): 3
Enter column (0, 1, or 2): 2
Invalid move. Try again.
Enter row (0, 1, or 2): 1
Enter column (0, 1, or 2): 2
[[-1  1  0]
 [ 0  0  1]
 [ 0  0  0]]
[[-1  1 -1]
 [ 0  0  1]
 [ 0  0  0]]
Enter row (0, 1, or 2): 2
Enter column (0, 1, or 2): 1
[[-1  1 -1]
 [ 0  0  1]
 [ 0  1  0]]
[[-1  1 -1]
 [-1  0  1]
 [ 0  1  0]]
Enter row (0, 1, or 2): 1
Enter column (0, 1, or 2): 1
[[-1  1 -1]
 [-1  1  1]
 [ 0  1  0]]
You win!
