In [None]:
import numpy as np
import random
import pickle
import time

class TicTacToeQLearning:
    """Tabular Q-learning agent that learns tic-tac-toe through self-play.

    The board is a 3x3 integer array in which 1 marks X, -1 marks O and 0 an
    empty cell; cells are addressed by their flattened index 0-8. X always
    moves first. One Q-table, keyed by ``(state, action)``, is shared by both
    players and holds values from X's point of view (+1 when X wins, -1 when
    O wins), so X greedily picks the highest-valued move and O the lowest.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = {}  # Q-value table: (state, action) -> value

    @staticmethod
    def _player_to_move(state):
        """Return 1 if X moves next in ``state``, otherwise -1.

        X opens every game, so the marks cancel out (sum 0) exactly when it
        is X's turn.
        """
        return 1 if int(np.sum(state)) == 0 else -1

    @staticmethod
    def _best_value(values, player):
        """Return the value ``player`` aims for: the max for X, the min for O."""
        return max(values) if player == 1 else min(values)

    def get_state(self, board):
        """Return the board as a hashable 9-tuple of plain Python ints."""
        # tolist() yields Python ints rather than np.int64; both hash and
        # compare equal, so tables keyed on either remain interchangeable.
        return tuple(np.asarray(board).flatten().tolist())

    def get_available_actions(self, board):
        """Return the flattened indices (0-8) of all empty cells."""
        cells = np.asarray(board).flatten()
        return [index for index, cell in enumerate(cells) if cell == 0]

    def choose_action(self, board, explore=True):
        """Pick a move for whichever player is to move on ``board``.

        With probability ``epsilon`` (only when ``explore`` is true) a random
        empty cell is returned. Otherwise the move with the best Q-value for
        the player to move is returned, ties broken at random.
        """
        state = self.get_state(board)
        available_actions = self.get_available_actions(board)

        if explore and random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)

        q_values = [self.q_table.get((state, a), 0) for a in available_actions]
        # Values are from X's perspective, so O must minimise, not maximise.
        target = self._best_value(q_values, self._player_to_move(state))
        return random.choice(
            [a for a, q in zip(available_actions, q_values) if q == target]
        )

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-learning update to ``(state, action)``.

        The bootstrap value is taken over the empty cells of ``next_state``
        only, using the max when X moves next and the min when O moves next.
        It is 0 when ``done`` is true.
        """
        old_value = self.q_table.get((state, action), 0)

        future_value = 0
        if not done:
            # Occupied cells are excluded: their implicit 0 would otherwise
            # mask every negative (or positive, for O) estimate.
            open_cells = self.get_available_actions(np.asarray(next_state))
            next_values = [self.q_table.get((next_state, a), 0) for a in open_cells]
            if next_values:
                future_value = self._best_value(
                    next_values, self._player_to_move(next_state)
                )

        self.q_table[(state, action)] = old_value + self.alpha * (
            reward + self.gamma * future_value - old_value
        )

    def train(self, episodes=10000, visualize=True, delay=0.5):
        """Play ``episodes`` self-play games, updating the Q-table every move.

        Args:
            episodes: Number of games to play.
            visualize: Print the board after every move when true.
            delay: Seconds to pause after every move; 0 or less disables it.
                Pass ``visualize=False, delay=0`` for real training runs.
        """
        for _ in range(episodes):
            board = np.zeros((3, 3), dtype=int)
            done = False
            turn = 1  # 1 for player X, -1 for player O

            while not done:
                state = self.get_state(board)
                action = self.choose_action(board)
                board.flat[action] = turn
                if visualize:
                    self.visualize_board(board)
                next_state = self.get_state(board)

                reward = 0  # Also the reward for a draw or a non-final move
                if self.check_winner(board, turn):
                    reward = 1 if turn == 1 else -1
                    done = True
                elif not self.get_available_actions(board):
                    done = True

                self.update_q_table(state, action, reward, next_state, done)
                turn *= -1
                if delay > 0:
                    time.sleep(delay)  # Slow the game down for viewing

    def visualize_board(self, board):
        """Print the board as three rows of X / O / - followed by a rule."""
        symbols = {1: 'X', -1: 'O', 0: '-'}
        print("\n".join([" ".join([symbols[cell] for cell in row]) for row in board]))
        print("\n" + "-" * 10)

    def check_winner(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal."""
        lines = list(board) + list(board.T)
        lines.append(board.diagonal())
        lines.append(np.fliplr(board).diagonal())
        return any(bool(np.all(line == player)) for line in lines)

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename`` and print every entry."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)
        # Printing happens after the file is closed so the handle is not
        # held open for the duration of a potentially long dump.
        print("Q-Table:")
        for key, value in self.q_table.items():
            print(f"State: {key}, Action-Value: {value}")

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the one pickled in ``filename``.

        Raises FileNotFoundError if the file does not exist.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this program wrote itself.
        with open(filename, "rb") as f:
            self.q_table = pickle.load(f)

if __name__ == "__main__":
    # Short demo: two visualised self-play games, then persist (and print)
    # the resulting Q-table.
    agent = TicTacToeQLearning()
    agent.train(episodes=2)
    agent.save_q_table(filename="q_table.pkl")


- - -
- - -
- X -

----------
- - -
- - -
- X O

----------
- - -
X - -
- X O

----------
- - -
X - O
- X O

----------
- X -
X - O
- X O

----------
- X -
X O O
- X O

----------
- X X
X O O
- X O

----------
- X X
X O O
O X O

----------
X X X
X O O
O X O

----------
- - -
- - -
X - -

----------
- - O
- - -
X - -

----------
- - O
- - -
X - X

----------
- - O
- - O
X - X

----------
- - O
- - O
X X X

----------
Q-Table:
State: ((np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)), 7), Action-Value: 0.0
State: ((np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0)), 8), Action-Value: 0.0
State: ((np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(-1)), 3), Action-Value: 0.0
State: ((np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(-

In [None]:
import numpy as np
import random
import pickle
import time

class TicTacToeQLearning:
    """Tabular Q-learning tic-tac-toe agent with a human-vs-AI mode.

    The board is a 3x3 integer array in which 1 marks X, -1 marks O and 0 an
    empty cell; cells are addressed by their flattened index 0-8. X always
    moves first. One Q-table, keyed by ``(state, action)``, is shared by both
    players and holds values from X's point of view (+1 when X wins, -1 when
    O wins), so X greedily picks the highest-valued move and O the lowest.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = {}  # Q-value table: (state, action) -> value

    @staticmethod
    def _player_to_move(state):
        """Return 1 if X moves next in ``state``, otherwise -1.

        X opens every game, so the marks cancel out (sum 0) exactly when it
        is X's turn.
        """
        return 1 if int(np.sum(state)) == 0 else -1

    @staticmethod
    def _best_value(values, player):
        """Return the value ``player`` aims for: the max for X, the min for O."""
        return max(values) if player == 1 else min(values)

    def get_state(self, board):
        """Return the board as a hashable 9-tuple of plain Python ints."""
        # tolist() yields Python ints rather than np.int64; both hash and
        # compare equal, so tables keyed on either remain interchangeable.
        return tuple(np.asarray(board).flatten().tolist())

    def get_available_actions(self, board):
        """Return the flattened indices (0-8) of all empty cells."""
        cells = np.asarray(board).flatten()
        return [index for index, cell in enumerate(cells) if cell == 0]

    def choose_action(self, board, explore=True):
        """Pick a move for whichever player is to move on ``board``.

        With probability ``epsilon`` (only when ``explore`` is true) a random
        empty cell is returned. Otherwise the move with the best Q-value for
        the player to move is returned, ties broken at random.
        """
        state = self.get_state(board)
        available_actions = self.get_available_actions(board)

        if explore and random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)

        q_values = [self.q_table.get((state, a), 0) for a in available_actions]
        # Values are from X's perspective, so O must minimise, not maximise.
        target = self._best_value(q_values, self._player_to_move(state))
        return random.choice(
            [a for a, q in zip(available_actions, q_values) if q == target]
        )

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-learning update to ``(state, action)``.

        The bootstrap value is taken over the empty cells of ``next_state``
        only, using the max when X moves next and the min when O moves next.
        It is 0 when ``done`` is true.
        """
        old_value = self.q_table.get((state, action), 0)

        future_value = 0
        if not done:
            # Occupied cells are excluded: their implicit 0 would otherwise
            # mask every negative (or positive, for O) estimate.
            open_cells = self.get_available_actions(np.asarray(next_state))
            next_values = [self.q_table.get((next_state, a), 0) for a in open_cells]
            if next_values:
                future_value = self._best_value(
                    next_values, self._player_to_move(next_state)
                )

        self.q_table[(state, action)] = old_value + self.alpha * (
            reward + self.gamma * future_value - old_value
        )

    def train(self, episodes=10000, visualize=True, delay=0.5):
        """Play ``episodes`` self-play games, updating the Q-table every move.

        Args:
            episodes: Number of games to play.
            visualize: Print the board after every move when true.
            delay: Seconds to pause after every move; 0 or less disables it.
                Pass ``visualize=False, delay=0`` for real training runs.
        """
        for _ in range(episodes):
            board = np.zeros((3, 3), dtype=int)
            done = False
            turn = 1  # 1 for player X, -1 for player O

            while not done:
                state = self.get_state(board)
                action = self.choose_action(board)
                board.flat[action] = turn
                if visualize:
                    self.visualize_board(board)
                next_state = self.get_state(board)

                reward = 0  # Also the reward for a draw or a non-final move
                if self.check_winner(board, turn):
                    reward = 1 if turn == 1 else -1
                    done = True
                elif not self.get_available_actions(board):
                    done = True

                self.update_q_table(state, action, reward, next_state, done)
                turn *= -1
                if delay > 0:
                    time.sleep(delay)  # Slow the game down for viewing

    def visualize_board(self, board):
        """Print the board as three rows of X / O / - followed by a rule."""
        symbols = {1: 'X', -1: 'O', 0: '-'}
        print("\n".join([" ".join([symbols[cell] for cell in row]) for row in board]))
        print("\n" + "-" * 10)

    def check_winner(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal."""
        lines = list(board) + list(board.T)
        lines.append(board.diagonal())
        lines.append(np.fliplr(board).diagonal())
        return any(bool(np.all(line == player)) for line in lines)

    def play_human_vs_ai(self):
        """Play one interactive game: the human is X and moves first, the AI is O.

        The Q-table is reloaded from the default pickle file first, so that
        file must exist. The AI always plays greedily (no exploration).
        """
        board = np.zeros((3, 3), dtype=int)
        self.load_q_table()
        turn = 1  # Human starts as 'X'

        while True:
            self.visualize_board(board)

            if turn == 1:
                try:
                    action = int(input("Enter your move (0-8): "))
                except ValueError:
                    # Non-numeric input is treated like any other bad move
                    # instead of crashing the game.
                    print("Invalid move! Try again.")
                    continue
            else:
                action = self.choose_action(board, explore=False)
                print(f"AI chooses position: {action}")

            if action not in self.get_available_actions(board):
                print("Invalid move! Try again.")
                continue

            board.flat[action] = turn

            if self.check_winner(board, turn):
                self.visualize_board(board)
                print("Player X wins!" if turn == 1 else "AI wins!")
                break
            if not self.get_available_actions(board):
                self.visualize_board(board)
                print("It's a draw!")
                break

            turn *= -1

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the one pickled in ``filename``.

        Raises FileNotFoundError if the file does not exist.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this program wrote itself.
        with open(filename, "rb") as f:
            self.q_table = pickle.load(f)

if __name__ == "__main__":
    # Train briefly on visualised self-play and persist the table, since the
    # interactive mode reloads it from disk before the first move.
    agent = TicTacToeQLearning()
    agent.train(episodes=10)
    agent.save_q_table(filename="q_table.pkl")

    print("Starting human vs AI mode!")
    agent.play_human_vs_ai()


- - -
X - -
- - -

----------
O - -
X - -
- - -

----------
O - -
X - X
- - -

----------
O - -
X - X
- - O

----------
O - -
X - X
X - O

----------
O - -
X - X
X O O

----------
O - X
X - X
X O O

----------
O - X
X O X
X O O

----------
- - -
X - -
- - -

----------
- - -
X - -
- - O

----------
- - X
X - -
- - O

----------
O - X
X - -
- - O

----------
O - X
X X -
- - O

----------
O - X
X X O
- - O

----------
O - X
X X O
- X O

----------
O O X
X X O
- X O

----------
O O X
X X O
X X O

----------
- - -
X - -
- - -

----------
- - -
X - O
- - -

----------
- - -
X X O
- - -

----------
- - -
X X O
O - -

----------
- - -
X X O
O X -

----------
- - -
X X O
O X O

----------
- - X
X X O
O X O

----------
O - X
X X O
O X O

----------
O X X
X X O
O X O

----------
- - -
- - -
X - -

----------
- - -
O - -
X - -

----------
- - -
O X -
X - -

----------
- O -
O X -
X - -

----------
- O -
O X -
X - X

----------
- O O
O X -
X - X

----------
X O O
O X -
X - X

----------
X - -
- - 

In [None]:
import numpy as np
import random
import pickle
import time

class TicTacToeQLearning:
    """Tabular Q-learning tic-tac-toe agent that also learns from human games.

    The board is a 3x3 integer array in which 1 marks X, -1 marks O and 0 an
    empty cell; cells are addressed by their flattened index 0-8. X always
    moves first. One Q-table, keyed by ``(state, action)``, is shared by both
    players and holds values from X's point of view (+1 when X wins, -1 when
    O wins), so X greedily picks the highest-valued move and O the lowest.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.q_table = {}  # Q-value table: (state, action) -> value

    @staticmethod
    def _player_to_move(state):
        """Return 1 if X moves next in ``state``, otherwise -1.

        X opens every game, so the marks cancel out (sum 0) exactly when it
        is X's turn.
        """
        return 1 if int(np.sum(state)) == 0 else -1

    @staticmethod
    def _best_value(values, player):
        """Return the value ``player`` aims for: the max for X, the min for O."""
        return max(values) if player == 1 else min(values)

    def get_state(self, board):
        """Return the board as a hashable 9-tuple of plain Python ints."""
        # tolist() yields Python ints rather than np.int64; both hash and
        # compare equal, so tables keyed on either remain interchangeable.
        return tuple(np.asarray(board).flatten().tolist())

    def get_available_actions(self, board):
        """Return the flattened indices (0-8) of all empty cells."""
        cells = np.asarray(board).flatten()
        return [index for index, cell in enumerate(cells) if cell == 0]

    def choose_action(self, board, explore=True):
        """Pick a move for whichever player is to move on ``board``.

        With probability ``epsilon`` (only when ``explore`` is true) a random
        empty cell is returned. Otherwise the move with the best Q-value for
        the player to move is returned, ties broken at random.
        """
        state = self.get_state(board)
        available_actions = self.get_available_actions(board)

        if explore and random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)

        q_values = [self.q_table.get((state, a), 0) for a in available_actions]
        # Values are from X's perspective, so O must minimise, not maximise.
        target = self._best_value(q_values, self._player_to_move(state))
        return random.choice(
            [a for a, q in zip(available_actions, q_values) if q == target]
        )

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-learning update to ``(state, action)``.

        The bootstrap value is taken over the empty cells of ``next_state``
        only, using the max when X moves next and the min when O moves next.
        It is 0 when ``done`` is true.
        """
        old_value = self.q_table.get((state, action), 0)

        future_value = 0
        if not done:
            # Occupied cells are excluded: their implicit 0 would otherwise
            # mask every negative (or positive, for O) estimate.
            open_cells = self.get_available_actions(np.asarray(next_state))
            next_values = [self.q_table.get((next_state, a), 0) for a in open_cells]
            if next_values:
                future_value = self._best_value(
                    next_values, self._player_to_move(next_state)
                )

        self.q_table[(state, action)] = old_value + self.alpha * (
            reward + self.gamma * future_value - old_value
        )

    def train(self, episodes=10000, visualize=True, delay=0.5):
        """Play ``episodes`` self-play games, updating the Q-table every move.

        Args:
            episodes: Number of games to play.
            visualize: Print the board after every move when true.
            delay: Seconds to pause after every move; 0 or less disables it.
                Pass ``visualize=False, delay=0`` for real training runs.
        """
        for _ in range(episodes):
            board = np.zeros((3, 3), dtype=int)
            done = False
            turn = 1  # 1 for player X, -1 for player O

            while not done:
                state = self.get_state(board)
                action = self.choose_action(board)
                board.flat[action] = turn
                if visualize:
                    self.visualize_board(board)
                next_state = self.get_state(board)

                reward = 0  # Also the reward for a draw or a non-final move
                if self.check_winner(board, turn):
                    reward = 1 if turn == 1 else -1
                    done = True
                elif not self.get_available_actions(board):
                    done = True

                self.update_q_table(state, action, reward, next_state, done)
                turn *= -1
                if delay > 0:
                    time.sleep(delay)  # Slow the game down for viewing

    def visualize_board(self, board):
        """Print the board as three rows of X / O / - followed by a rule."""
        symbols = {1: 'X', -1: 'O', 0: '-'}
        print("\n".join([" ".join([symbols[cell] for cell in row]) for row in board]))
        print("\n" + "-" * 10)

    def check_winner(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal."""
        lines = list(board) + list(board.T)
        lines.append(board.diagonal())
        lines.append(np.fliplr(board).diagonal())
        return any(bool(np.all(line == player)) for line in lines)

    def _learn_from_game(self, history, final_reward):
        """Replay a finished game backwards through ``update_q_table``.

        ``history`` holds ``(state_before, action, state_after)`` for every
        move in order. Only the last move receives ``final_reward`` (from X's
        point of view); walking backwards lets that outcome flow into the
        earlier moves through their bootstrap values in a single pass.
        """
        for steps_back, (state, action, next_state) in enumerate(reversed(history)):
            is_final_move = steps_back == 0
            self.update_q_table(
                state,
                action,
                final_reward if is_final_move else 0,
                next_state,
                is_final_move,
            )

    def play_human_vs_ai(self):
        """Play one interactive game and learn from its outcome.

        The human is X and moves first; the AI is O and plays greedily. The
        Q-table is reloaded from the default pickle file first, so that file
        must exist. After the game the moves are replayed into the Q-table
        and the table is saved back to the default file, because the next
        call reloads from disk and would otherwise discard what was learnt.
        """
        board = np.zeros((3, 3), dtype=int)
        self.load_q_table()  # Load the saved Q-table
        turn = 1  # Human starts as 'X'
        history = []  # (state_before, action, state_after) per move played

        while True:
            self.visualize_board(board)

            if turn == 1:
                try:
                    action = int(input("Enter your move (0-8): "))
                except ValueError:
                    # Non-numeric input is treated like any other bad move
                    # instead of crashing the game.
                    print("Invalid move! Try again.")
                    continue
            else:
                action = self.choose_action(board, explore=False)  # AI's turn
                print(f"AI chooses position: {action}")

            if action not in self.get_available_actions(board):
                print("Invalid move! Try again.")
                continue

            # The state must be captured before the move: Q-values are looked
            # up by the position a player faces, not the one they leave.
            state = self.get_state(board)
            board.flat[action] = turn
            history.append((state, action, self.get_state(board)))

            if self.check_winner(board, turn):
                self.visualize_board(board)
                print("Player X wins!" if turn == 1 else "AI wins!")
                # Same sign convention as train(): +1 for X, -1 for O.
                self._learn_from_game(history, 1 if turn == 1 else -1)
                self.save_q_table()
                break
            if not self.get_available_actions(board):
                self.visualize_board(board)
                print("It's a draw!")
                self._learn_from_game(history, 0)
                self.save_q_table()
                break

            turn *= -1

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the one pickled in ``filename``.

        Raises FileNotFoundError if the file does not exist.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this program wrote itself.
        with open(filename, "rb") as f:
            self.q_table = pickle.load(f)

if __name__ == "__main__":
    # Train briefly and persist the table, then offer games until the user
    # answers anything other than "y".
    agent = TicTacToeQLearning()
    agent.train(episodes=10)
    agent.save_q_table(filename="q_table.pkl")

    keep_playing = True
    while keep_playing:
        print("Starting human vs AI mode!")
        agent.play_human_vs_ai()
        choice = input("Do you want to play again? (y/n): ")
        keep_playing = choice.lower() == 'y'

- - -
- - -
X - -

----------
- O -
- - -
X - -

----------
- O -
- - -
X - X

----------
- O -
- - O
X - X

----------
- O -
- X O
X - X

----------
- O O
- X O
X - X

----------
- O O
- X O
X X X

----------
- - -
- X -
- - -

----------
- - -
- X O
- - -

----------
X - -
- X O
- - -

----------
X - -
- X O
- - O

----------
X - -
- X O
- X O

----------
X O -
- X O
- X O

----------
X O X
- X O
- X O

----------
X O X
O X O
- X O

----------
X O X
O X O
X X O

----------
- - -
- - -
- - X

----------
- - -
- O -
- - X

----------
X - -
- O -
- - X

----------
X O -
- O -
- - X

----------
X O X
- O -
- - X

----------
X O X
- O -
O - X

----------
X O X
X O -
O - X

----------
X O X
X O O
O - X

----------
X O X
X O O
O X X

----------
- - -
- - X
- - -

----------
- - -
- - X
- - O

----------
- - -
- X X
- - O

----------
- - O
- X X
- - O

----------
- - O
- X X
X - O

----------
- - O
O X X
X - O

----------
X - O
O X X
X - O

----------
X O O
O X X
X - O

----------
X O O
O X 

Using Eligibility Traces

In [None]:
import numpy as np
import pickle
import random

class TicTacToeQLearning:
    """Tic-tac-toe Q-learning agent that uses eligibility traces.

    The board is a 3x3 integer array in which 1 marks X, -1 marks O and 0 an
    empty cell; cells are addressed by their flattened index 0-8. X always
    moves first. One Q-table, keyed by ``(state, action)``, is shared by both
    players and holds values from X's point of view (+1 when X wins, -1 when
    O wins), so X greedily picks the highest-valued move and O the lowest.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2, lambda_=0.8):
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.lambda_ = lambda_  # Trace decay factor

        self.q_table = {}  # Q-value table: (state, action) -> value
        self.e_table = {}  # Eligibility trace table: (state, action) -> trace

    @staticmethod
    def _player_to_move(state):
        """Return 1 if X moves next in ``state``, otherwise -1.

        X opens every game, so the marks cancel out (sum 0) exactly when it
        is X's turn.
        """
        return 1 if int(np.sum(state)) == 0 else -1

    @staticmethod
    def _best_value(values, player):
        """Return the value ``player`` aims for: the max for X, the min for O."""
        return max(values) if player == 1 else min(values)

    def get_state(self, board):
        """Return the board as a hashable 9-tuple of plain Python ints."""
        # tolist() yields Python ints rather than np.int64; both hash and
        # compare equal, so tables keyed on either remain interchangeable.
        return tuple(np.asarray(board).flatten().tolist())

    def get_available_actions(self, board):
        """Return the flattened indices (0-8) of all empty cells."""
        cells = np.asarray(board).flatten()
        return [index for index, cell in enumerate(cells) if cell == 0]

    def choose_action(self, board, explore=True):
        """Pick a move for whichever player is to move on ``board``.

        With probability ``epsilon`` (only when ``explore`` is true) a random
        empty cell is returned. Otherwise the move with the best Q-value for
        the player to move is returned, ties broken at random.
        """
        state = self.get_state(board)
        available_actions = self.get_available_actions(board)

        if explore and random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)  # Explore

        q_values = [self.q_table.get((state, a), 0) for a in available_actions]
        # Values are from X's perspective, so O must minimise, not maximise.
        # With no empty cell the candidate list below is empty and
        # random.choice raises IndexError.
        if q_values:
            target = self._best_value(q_values, self._player_to_move(state))
        else:
            target = 0
        return random.choice(
            [a for a, q in zip(available_actions, q_values) if q == target]
        )

    def update_q_table(self, reward, next_state, done):
        """Update every traced Q-value towards the current target, then decay traces.

        The target is ``reward + gamma * next_q``. ``next_q`` is 0 when
        ``done`` is true; otherwise it is the best Q-value over the empty
        cells of ``next_state`` for the player who moves there (max for X,
        min for O). Each trace is multiplied by ``gamma * lambda_`` afterwards.
        """
        next_q = 0
        if not done:
            open_cells = self.get_available_actions(np.array(next_state))
            next_values = [self.q_table.get((next_state, a), 0) for a in open_cells]
            if next_values:
                next_q = self._best_value(
                    next_values, self._player_to_move(next_state)
                )

        decay = self.gamma * self.lambda_
        for key, trace in list(self.e_table.items()):
            old_value = self.q_table.get(key, 0)
            # NOTE(review): each traced pair uses its own current value in
            # the error, rather than one shared error taken from the latest
            # pair as in textbook Q(lambda) - confirm this is intended.
            td_error = reward + self.gamma * next_q - old_value

            # Update Q-value in proportion to the pair's eligibility trace
            self.q_table[key] = old_value + self.alpha * td_error * trace

            # Decay eligibility trace
            self.e_table[key] = trace * decay

    def train_ai_vs_ai(self, episodes=10000):
        """Train by self-play for ``episodes`` games, printing progress every 1000."""
        for episode in range(1, episodes + 1):
            board = np.zeros((3, 3), dtype=int)
            turn = 1  # AI-1 (X) starts
            self.e_table.clear()  # Traces never carry over between games

            while True:
                state = self.get_state(board)
                action = self.choose_action(board, explore=True)
                board.flat[action] = turn
                next_state = self.get_state(board)

                # Mark the pair just played as (more) eligible for updates
                self.e_table[(state, action)] = self.e_table.get((state, action), 0) + 1

                if self.check_winner(board, turn):
                    reward = 1 if turn == 1 else -1
                    self.update_q_table(reward, next_state, done=True)
                    break
                if not self.get_available_actions(board):
                    self.update_q_table(0, next_state, done=True)  # Draw
                    break

                self.update_q_table(0, next_state, done=False)
                turn *= -1  # Switch turns

            if episode % 1000 == 0:
                print(f"Training Episode {episode}/{episodes} completed.")

    def visualize_board(self, board):
        """Print the board as three rows of X / O / - followed by a rule."""
        symbols = {1: 'X', -1: 'O', 0: '-'}
        print("\n".join([" ".join([symbols[cell] for cell in row]) for row in board]))
        print("\n" + "-" * 10)

    def check_winner(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal."""
        lines = list(board) + list(board.T)
        lines.append(board.diagonal())
        lines.append(np.fliplr(board).diagonal())
        return any(bool(np.all(line == player)) for line in lines)

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)

    def load_q_table(self, filename="q_table.pkl"):
        """Load the Q-table pickled in ``filename``; keep the current one if missing."""
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this program wrote itself.
        try:
            with open(filename, "rb") as f:
                self.q_table = pickle.load(f)
        except FileNotFoundError:
            print("No Q-table found, starting fresh.")

    def play_human_vs_ai(self):
        """Allows a human to play against the trained AI.

        The human chooses X (1) or O (-1); X always moves first and the AI
        plays greedily with the in-memory Q-table.
        """
        board = np.zeros((3, 3), dtype=int)

        print("Welcome to Tic-Tac-Toe!")
        # Any symbol other than 1 or -1 would never match `turn`, leaving the
        # AI to play both sides with a mark the board printer cannot show.
        player_symbol = None
        while player_symbol not in (1, -1):
            try:
                player_symbol = int(input("Choose your symbol: 1 for X, -1 for O: "))
            except ValueError:
                player_symbol = None
            if player_symbol not in (1, -1):
                print("Please enter 1 or -1.")
        ai_symbol = -player_symbol

        turn = 1  # X always starts
        while True:
            self.visualize_board(board)

            if turn == player_symbol:
                # Human turn
                available_moves = self.get_available_actions(board)
                move = -1
                while move not in available_moves:
                    try:
                        move = int(input("Your turn! Choose a position (0-8): "))
                        if move not in available_moves:
                            print("Invalid move! Try again.")
                    except ValueError:
                        print("Enter a valid integer between 0 and 8.")
                board.flat[move] = player_symbol
            else:
                # AI turn
                print("AI is thinking...")
                move = self.choose_action(board, explore=False)
                board.flat[move] = ai_symbol

            # Check for win or draw
            if self.check_winner(board, turn):
                self.visualize_board(board)
                if turn == player_symbol:
                    print("Congratulations! You win!")
                else:
                    print("AI wins! Better luck next time.")
                break
            if not self.get_available_actions(board):
                self.visualize_board(board)
                print("It's a draw!")
                break

            turn *= -1  # Switch turns

if __name__ == "__main__":
    # Resume from a previously saved table when one exists, train further by
    # self-play, persist the result, then start an interactive game.
    agent = TicTacToeQLearning()
    agent.load_q_table(filename="q_table.pkl")

    print("Training AI vs AI...")
    agent.train_ai_vs_ai(10000)  # Train AI for 10,000 games
    agent.save_q_table(filename="q_table.pkl")
    print("Training complete. Q-table saved!")

    # Play Human vs AI
    agent.play_human_vs_ai()

Training AI vs AI...
Training Episode 1000/10000 completed.
Training Episode 2000/10000 completed.
Training Episode 3000/10000 completed.
Training Episode 4000/10000 completed.
Training Episode 5000/10000 completed.
Training Episode 6000/10000 completed.
Training Episode 7000/10000 completed.
Training Episode 8000/10000 completed.
Training Episode 9000/10000 completed.
Training Episode 10000/10000 completed.
Training complete. Q-table saved!
Welcome to Tic-Tac-Toe!
Choose your symbol: 1 for X, -1 for O: -1
- - -
- - -
- - -

----------
AI is thinking...
- - -
- X -
- - -

----------
Your turn! Choose a position (0-8): 1
- O -
- X -
- - -

----------
AI is thinking...
- O -
- X -
X - -

----------
Your turn! Choose a position (0-8): 2
- O O
- X -
X - -

----------
AI is thinking...
- O O
- X X
X - -

----------
Your turn! Choose a position (0-8): 3
- O O
O X X
X - -

----------
AI is thinking...
- O O
O X X
X - X

----------
Your turn! Choose a position (0-8): 0
O O O
O X X
X - X

-----