In [6]:
import numpy as np
import random
import pickle

class TicTacToeQLearning:
    """Tabular Q-learning agent that learns tic-tac-toe through self-play.

    Boards are 3x3 NumPy arrays holding 1 and -1 for the two players and 0 for
    an empty cell. Q-values live in ``self.q_table``, a dict keyed by
    ``(state, action, player)`` where ``state`` is the flattened board as a
    tuple and ``action`` is a flat cell index in ``range(9)``. Missing entries
    are treated as 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            alpha: Learning rate applied to each temporal-difference update.
            gamma: Discount factor applied to the bootstrapped future value.
            epsilon: Probability of picking a random move while exploring.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}

    def get_state(self, board):
        """Return the board flattened into a hashable tuple (row-major order)."""
        return tuple(board.flatten())

    def get_available_actions(self, board):
        """Return the flat indices (0-8) of every empty cell on ``board``."""
        return [i for i in range(9) if board.flat[i] == 0]

    def choose_action(self, board, player, explore=True):
        """Pick a move for ``player`` using an epsilon-greedy policy.

        Args:
            board: Current 3x3 board.
            player: Side to move (1 or -1); selects which Q-values are read.
            explore: When True, a random legal move is taken with probability
                ``self.epsilon``; when False the choice is purely greedy.

        Returns:
            The flat index of the chosen empty cell. Ties between equally
            valued greedy moves are broken uniformly at random.

        Raises:
            ValueError: If the board has no empty cell left.
        """
        state = self.get_state(board)
        available_actions = self.get_available_actions(board)
        if not available_actions:
            # Previously surfaced as IndexError or ValueError depending on the
            # exploration draw; fail the same way on both paths instead.
            raise ValueError("No available actions: the board is full.")

        if explore and random.random() < self.epsilon:
            return random.choice(available_actions)

        q_values = [self.q_table.get((state, a, player), 0) for a in available_actions]
        max_q = max(q_values)
        best_actions = [a for a, q in zip(available_actions, q_values) if q == max_q]
        return random.choice(best_actions)

    def update_q_table(self, state, action, reward, next_state, done, player):
        """Apply one Q-learning update for the move ``player`` just made.

        ``next_state`` is the position right after ``player`` moved, so it is
        the opponent's turn there. Because the game is zero-sum, the value of
        that position for ``player`` is the negation of the opponent's best
        Q-value in it (a negamax bootstrap). Looking up ``player``'s own
        Q-values in ``next_state`` would always yield 0, since ``player`` never
        acts from that position and no such entry is ever written.

        Args:
            state: Board tuple before the move.
            action: Flat index of the cell that was played.
            reward: Immediate reward received by ``player`` for the move.
            next_state: Board tuple after the move.
            done: True when the move ended the game; no bootstrapping is done.
            player: The side (1 or -1) that made the move.
        """
        key = (state, action, player)
        old_value = self.q_table.get(key, 0)
        future_value = 0
        if not done:
            opponent = -player
            opponent_best = max(
                (self.q_table.get((next_state, a, opponent), 0)
                 for a, cell in enumerate(next_state) if cell == 0),
                default=0,
            )
            # What is good for the opponent is equally bad for ``player``.
            future_value = -opponent_best
        self.q_table[key] = old_value + self.alpha * (reward + self.gamma * future_value - old_value)

    def check_winner(self, board, player):
        """Return True if ``player`` owns a full row, column or diagonal."""
        owned = board == player
        return bool(
            owned.all(axis=1).any()                 # any complete row
            or owned.all(axis=0).any()              # any complete column
            or owned.diagonal().all()               # main diagonal
            or np.fliplr(owned).diagonal().all()    # anti-diagonal
        )

    def train(self, episodes=10000):
        """Run ``episodes`` self-play games, updating the Q-table after every move.

        Both sides share the same Q-table and the same epsilon-greedy policy.
        The side that completes a line receives reward 1, the move that fills
        the board without a winner receives 0.5, and every other move
        receives 0. Losses are not rewarded directly; they reach the losing
        side through the negated bootstrap in ``update_q_table``.
        """
        for _ in range(episodes):
            board = np.zeros((3, 3), dtype=int)
            done = False
            player = 1

            while not done:
                state = self.get_state(board)
                action = self.choose_action(board, player)
                board.flat[action] = player

                if self.check_winner(board, player):
                    reward = 1
                    done = True
                elif len(self.get_available_actions(board)) == 0:
                    reward = 0.5
                    done = True
                else:
                    reward = 0

                next_state = self.get_state(board)
                self.update_q_table(state, action, reward, next_state, done, player)
                player *= -1

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table dict to ``filename`` (overwriting any existing file)."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)

# Script entry point: train a fresh agent by self-play and persist its Q-table.
if __name__ == "__main__":
    # Uses the constructor defaults (alpha=0.1, gamma=0.9, epsilon=0.2).
    agent = TicTacToeQLearning()
    agent.train(episodes=10000)
    # No filename is passed, so the table goes to the default "q_table.pkl".
    agent.save_q_table()
    print("Training complete. Q-table saved.")


Training complete. Q-table saved.


In [7]:
import numpy as np
import random
import pickle

class TicTacToeQLearning:
    """Play-time agent: loads a pickled Q-table and plays greedily against a human.

    Boards are 3x3 NumPy integer arrays with 1 for the human ('X'), -1 for the
    AI ('O') and 0 for an empty cell. Q-table keys are
    ``(state, action, player)`` tuples; unknown keys count as 0.
    """

    def __init__(self):
        # Filled by load_q_table(); an empty table makes every move look equal.
        self.q_table = {}

    def get_state(self, board):
        """Flatten ``board`` into a hashable tuple usable as a Q-table key part."""
        flat_cells = board.flatten()
        return tuple(flat_cells)

    def get_available_actions(self, board):
        """List the flat indices (0-8) of the cells that are still empty."""
        cells = board.flat
        return [idx for idx in range(9) if cells[idx] == 0]

    def choose_action(self, board, player):
        """Return a highest-valued legal move for ``player`` (random among ties)."""
        state = self.get_state(board)
        candidates = self.get_available_actions(board)

        # Score each legal move once, then keep those that reach the maximum.
        scores = {move: self.q_table.get((state, move, player), 0) for move in candidates}
        top_score = max(scores.values())
        greedy_moves = [move for move in candidates if scores[move] == top_score]
        return random.choice(greedy_moves)

    def visualize_board(self, board):
        """Print the board as three rows of X / O / - followed by a separator line."""
        glyphs = {1: 'X', -1: 'O', 0: '-'}
        for row in board:
            rendered = " ".join(glyphs[int(cell)] for cell in row)
            print(rendered)
        print("-" * 10)

    def check_winner(self, board, player):
        """Return True when ``player`` fills any row, column or either diagonal."""
        for row in board:
            if all(row == player):
                return True
        for col in board.T:
            if all(col == player):
                return True
        if all(board[k, k] == player for k in range(3)):
            return True
        return all(board[k, 2 - k] == player for k in range(3))

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the in-memory Q-table with the one pickled at ``filename``."""
        # NOTE: pickle can execute arbitrary code on load; only open trusted files.
        with open(filename, "rb") as handle:
            loaded = pickle.load(handle)
        self.q_table = loaded

    def play_human_vs_ai(self):
        """Run one interactive game on stdin/stdout; the human (X) moves first.

        The Q-table is loaded from the default file before play starts. Bad
        input or an illegal move leaves the turn unchanged and re-prompts.
        """
        board = np.zeros((3, 3), dtype=int)
        self.load_q_table()
        human, ai = 1, -1
        turn = human

        while True:
            self.visualize_board(board)

            if turn != human:
                action = self.choose_action(board, ai)
                print(f"AI chooses: {action}")
            else:
                try:
                    action = int(input("Enter your move (0-8): "))
                except ValueError:
                    print("Invalid input.")
                    continue

            legal_moves = self.get_available_actions(board)
            if action not in legal_moves:
                print("Invalid move! Try again.")
                continue

            board.flat[action] = turn

            # Decide the outcome message first; no message means the game goes on.
            if self.check_winner(board, turn):
                outcome = "You win!" if turn == human else "AI wins!"
            elif not self.get_available_actions(board):
                outcome = "It's a draw!"
            else:
                turn = -turn
                continue

            self.visualize_board(board)
            print(outcome)
            break

# Script entry point: start one interactive human-vs-AI game.
if __name__ == "__main__":
    agent = TicTacToeQLearning()
    # play_human_vs_ai() loads "q_table.pkl" itself, so that file must already exist.
    agent.play_human_vs_ai()


- - -
- - -
- - -
----------
X - -
- - -
- - -
----------
AI chooses: 4
X - -
- O -
- - -
----------
X X -
- O -
- - -
----------
AI chooses: 8
X X -
- O -
- - O
----------
X X X
- O -
- - O
----------
You win!
