In [None]:
import numpy as np
import random

class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        return self.board.flatten()

    def make_move(self, action):
        row = action // 3
        col = action % 3

        if self.board[row, col] == 0:
            self.board[row, col] = self.current_player
            self.current_player = 3 - self.current_player  # Switch player (1 -> 2, 2 -> 1)
            return True
        else:
            return False  # Invalid move

    def check_winner(self):
        # Check rows, columns, and diagonals
        for i in range(3):
            if np.all(self.board[i, :] == self.current_player) or np.all(self.board[:, i] == self.current_player):
                return 3- self.current_player
        if np.all(np.diag(self.board) == self.current_player) or np.all(np.diag(np.fliplr(self.board)) == self.current_player):
             return 3 - self.current_player

        # Check for a draw
        if np.all(self.board != 0):
            return 0  # Draw

        return None  # Game not over


class QLearningAgent:
    def __init__(self, alpha=0.5, gamma=0.9, epsilon=0.1):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_q_value(self, state, action):
        state_tuple = tuple(state)
        if (state_tuple, action) not in self.q_table:
            self.q_table[(state_tuple, action)] = 0.0
        return self.q_table[(state_tuple, action)]

    def choose_action(self, state, available_actions):
         if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)  # Exploration
         else:
             q_values = [self.get_q_value(state, a) for a in available_actions]
             return available_actions[np.argmax(q_values)] # Exploitation

    def update_q_value(self, state, action, reward, next_state, next_available_actions):
         state_tuple = tuple(state)
         if next_available_actions: # handles terminal state
             max_next_q = max([self.get_q_value(next_state, a) for a in next_available_actions])
         else:
             max_next_q = 0
         self.q_table[(state_tuple, action)] += self.alpha * (reward + self.gamma * max_next_q - self.get_q_value(state, action))

def train_agent(agent, env, num_episodes):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            available_actions = [i for i in range(9) if env.board.flatten()[i] == 0]
            action = agent.choose_action(state, available_actions)

            env.make_move(action)
            next_state = env.get_state()
            reward = 0
            winner = env.check_winner()
            if winner == 1: # Agent's player number
                 reward = 1
            elif winner is not None:
                reward = -1
            done = winner is not None


            next_available_actions = [i for i in range(9) if env.board.flatten()[i] == 0]
            agent.update_q_value(state, action, reward, next_state, next_available_actions)

            state = next_state




def play_against_human(agent, env):
    state = env.reset()
    done = False
    while not done:
        print(env.board)
        if env.current_player == 1: # agent's turn
            available_actions = [i for i in range(9) if env.board.flatten()[i] == 0]
            action = agent.choose_action(state, available_actions)
            print("Agent chooses:", action)
            env.make_move(action)


        else:
            while True:
                try:
                    action = int(input("Enter your move (0-8): "))
                    if 0 <= action <= 8 and env.board.flatten()[action] == 0:
                        env.make_move(action)
                        break
                    else:
                        print("Invalid move. Try again.")
                except ValueError:
                    print("Invalid input. Please enter a number.")
        winner = env.check_winner()
        if winner is not None:
            print(env.board)
            if winner == 0:
                print("It's a draw!")
            elif winner == 1:
                print("Agent wins!")
            else:
                print("You win!")
            done = True

        state = env.get_state()



# Example usage:
env = TicTacToe()
agent = QLearningAgent()

train_agent(agent, env, num_episodes=50000) # Adjust for more training


[[0 0 0]
 [0 0 0]
 [0 0 0]]
Agent chooses: 0
[[1 0 0]
 [0 0 0]
 [0 0 0]]
[[1 0 0]
 [0 0 2]
 [0 0 0]]
Agent chooses: 1
[[1 1 0]
 [0 0 2]
 [0 0 0]]
[[1 1 2]
 [0 0 2]
 [0 0 0]]
Agent chooses: 7
[[1 1 2]
 [0 0 2]
 [0 1 0]]
[[1 1 2]
 [0 2 2]
 [0 1 0]]
Agent chooses: 3
[[1 1 2]
 [1 2 2]
 [0 1 0]]
[[1 1 2]
 [1 2 2]
 [2 1 0]]
Agent chooses: 8
[[1 1 2]
 [1 2 2]
 [2 1 1]]
Agent wins!


In [11]:
play_against_human(agent, env)

[[0 0 0]
 [0 0 0]
 [0 0 0]]
Agent chooses: 3
[[0 0 0]
 [1 0 0]
 [0 0 0]]
[[0 0 0]
 [1 2 0]
 [0 0 0]]
Agent chooses: 0
[[1 0 0]
 [1 2 0]
 [0 0 0]]
[[1 0 0]
 [1 2 0]
 [2 0 0]]
Agent chooses: 1
[[1 1 0]
 [1 2 0]
 [2 0 0]]
[[1 1 2]
 [1 2 0]
 [2 0 0]]
Agent chooses: 5
[[1 1 2]
 [1 2 1]
 [2 0 0]]
Agent wins!
