In [44]:
import numpy as np

class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))  # 3x3 grid initialized to 0
        self.done = False  # Track if the game is over
        self.current_player = 1  # Player 1 starts

    def reset(self):
        self.board = np.zeros((3, 3))
        self.done = False
        self.current_player = 1
        return self.board

    def available_actions(self):
        return np.argwhere(self.board == 0)  # Get all empty positions

    def step(self, action):
        if self.board[action[0], action[1]] == 0:
            self.board[action[0], action[1]] = self.current_player
            reward = self.check_winner()
            self.current_player = -self.current_player  # Switch player
            return self.board, reward, self.done
        else:
            return self.board, -1, self.done  # Invalid move results in a penalty

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:  # Check rows
                self.done = True
                return 1 if self.current_player == 1 else -1
            if abs(sum(self.board[:, i])) == 3:  # Check columns
                self.done = True
                return 1 if self.current_player == 1 else -1

        # Check diagonals
        if abs(self.board.trace()) == 3 or abs(np.fliplr(self.board).trace()) == 3:
            self.done = True
            return 1 if self.current_player == 1 else -1

        if np.all(self.board != 0):
            self.done = True  # It's a draw
            return 0

        return 0  # No winner yet

env = TicTacToe()



In [45]:
import random

class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = {}  # Dictionary to store state-action values
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def get_q_value(self, state, action):
        state_str = str(state)
        return self.q_table.get((state_str, tuple(action)), 0.0)

    def set_q_value(self, state, action, value):
        state_str = str(state)
        self.q_table[(state_str, tuple(action))] = value

    def choose_action(self, state, available_actions):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)  # Explore
        else:
            # Exploit - choose the action with the highest Q-value
            q_values = [self.get_q_value(state, action) for action in available_actions]
            max_q_value = max(q_values)
            return available_actions[q_values.index(max_q_value)]

    def learn(self, state, action, reward, next_state, next_available_actions, done):
        old_q_value = self.get_q_value(state, action)
        
        if done:
            target = reward  # No future reward if the game is over
        else:
        # Get Q-values for the next state and its available actions 
            future_q_values = [self.get_q_value(next_state, a) for a in next_available_actions]
            target = reward + self.gamma * max(future_q_values)  # Bellman equation

    # Update Q-value using the learning rate (alpha)
            new_q_value = (1 - self.alpha) * old_q_value + self.alpha * target
            self.set_q_value(state, action, new_q_value)


agent = QLearningAgent(alpha=0.1, gamma=0.9, epsilon=0.1)



In [46]:
def train(agent, env, episodes=10000):
    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            available_actions = env.available_actions()
            action = agent.choose_action(state, available_actions)
            next_state, reward, done = env.step(action)

            # Update available actions for the next state
            next_available_actions = env.available_actions()

            # Learn from the action and outcome
            agent.learn(state, action, reward, next_state, next_available_actions, done)
            state = next_state

        if episode % 1000 == 0:
            print(f"Episode {episode}/{episodes} completed")

    print("Training finished!")


In [47]:
def test(agent, env):
    state = env.reset()
    done = False

    while not done:
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)
        state, reward, done = env.step(action)
        print("Current board state:\n", state)  # This prints the board after each move

        if done:
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("Opponent wins!")
            else:
                print("It's a draw!")


In [48]:
# Define the environment
env = TicTacToe()

# Define the agent
agent = QLearningAgent(alpha=0.1, gamma=0.9, epsilon=0.1)

# Train the agent
train(agent, env)


Episode 0/10000 completed
Episode 1000/10000 completed
Episode 2000/10000 completed
Episode 3000/10000 completed
Episode 4000/10000 completed
Episode 5000/10000 completed
Episode 6000/10000 completed
Episode 7000/10000 completed
Episode 8000/10000 completed
Episode 9000/10000 completed
Training finished!


In [49]:
def test(agent, env):
    state = env.reset()
    done = False

    print("Starting a new game:")
    print(state)

    while not done:
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)
        state, reward, done = env.step(action)

        print("Agent's move:")
        print(state)

        if done:
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("Opponent wins!")
            else:
                print("It's a draw!")


In [50]:
def play_against_agent(agent, env):
    state = env.reset()
    done = False

    while not done:
        # Player (Human) move
        print("Your turn (You are Player -1):")
        print(state)
        row, col = map(int, input("Enter row and col (0-2) for your move: ").split())
        state, reward, done = env.step([row, col])

        if done:
            print("Game Over!")
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("You win!")
            else:
                print("It's a draw!")
            break

        # Agent's move
        print("Agent's turn:")
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)
        state, reward, done = env.step(action)

        print(state)  # Print board after agent's move

        if done:
            print("Game Over!")
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("You win!")
            else:
                print("It's a draw!")


In [51]:
def track_training(agent, env, episodes=10000):
    results = {"win": 0, "loss": 0, "draw": 0}

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            available_actions = env.available_actions()
            action = agent.choose_action(state, available_actions)
            next_state, reward, done = env.step(action)
            state = next_state

        if reward == 1:
            results["win"] += 1
        elif reward == -1:
            results["loss"] += 1
        else:
            results["draw"] += 1

    return results


In [52]:
results = track_training(agent, env, episodes=1000)
print(f"Agent's performance over 1000 games: {results}")


Agent's performance over 1000 games: {'win': 823, 'loss': 121, 'draw': 56}


In [64]:
def test(agent, env):
    state = env.reset()
    done = False

    print("Initial Board:")
    print(state)

    while not done:
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)  # Agent chooses an action
        state, reward, done = env.step(action)
        print("Current board state:\n", state)  # Print the board after agent's move

        if done:
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("Opponent wins!")
            else:
                print("It's a draw!")
            break

        # Opponent's move (Random choice for simplicity)
        opponent_action = random.choice(env.available_actions())
        state, reward, done = env.step(opponent_action)
        print("Opponent's move:\n", state)

        if done:
            if reward == 1:
                print("Agent wins!")
            elif reward == -1:
                print("Opponent wins!")
            else:
                print("It's a draw!")


In [65]:
test(agent, env)


Initial Board:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Current board state:
 [[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Opponent's move:
 [[ 1.  0.  0.]
 [-1.  0.  0.]
 [ 0.  0.  0.]]
Current board state:
 [[ 1.  1.  0.]
 [-1.  0.  0.]
 [ 0.  0.  0.]]
Opponent's move:
 [[ 1.  1. -1.]
 [-1.  0.  0.]
 [ 0.  0.  0.]]
Current board state:
 [[ 1.  1. -1.]
 [-1.  1.  0.]
 [ 0.  0.  0.]]
Opponent's move:
 [[ 1.  1. -1.]
 [-1.  1.  0.]
 [ 0.  0. -1.]]
Current board state:
 [[ 1.  1. -1.]
 [-1.  1.  1.]
 [ 0.  0. -1.]]
Opponent's move:
 [[ 1.  1. -1.]
 [-1.  1.  1.]
 [-1.  0. -1.]]
Current board state:
 [[ 1.  1. -1.]
 [-1.  1.  1.]
 [-1.  1. -1.]]
Agent wins!
