In [1]:
import numpy as np
import random

In [8]:
class TicTacToe:
    """Minimal two-player tic-tac-toe environment on a flat 9-cell board.

    Cells are indexed 0-8 in row-major order and hold 'X', 'O' or ' ' (empty).
    After a reset it is always 'X' to move.
    """

    # Index triples that form a winning line.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board, give the turn to 'X' and return the initial state."""
        self.board = [' '] * 9  # 3x3 board
        self.current_player = 'X'
        return self.get_state()

    def available_actions(self):
        """Return the indices of all empty cells."""
        return [i for i, cell in enumerate(self.board) if cell == ' ']

    def step(self, action, player=None):
        """Place a mark on the board and advance the turn.

        Args:
            action: Index (0-8) of the cell to mark.
            player: Optional mark ('X' or 'O') to place. When omitted, the
                environment's own ``current_player`` is used. Callers that
                track the turn themselves may pass it explicitly.

        Returns:
            A ``(state, reward, done)`` tuple. ``reward`` is 1 when the move
            completes a line for the mover, 0.5 when the game ends otherwise
            (board full), 0 while the game continues, and -10 (with
            ``done=True``) when the chosen cell is already occupied; in that
            case the board and the turn are left untouched.

        Raises:
            ValueError: If ``player`` is given but is neither 'X' nor 'O'.
        """
        if player is None:
            player = self.current_player
        elif player not in ('X', 'O'):
            raise ValueError(f"player must be 'X' or 'O', got {player!r}")

        if self.board[action] != ' ':
            return self.get_state(), -10, True  # Invalid move penalty

        self.board[action] = player
        winner = self.check_winner()
        done = winner is not None or ' ' not in self.board

        if winner == player:
            reward = 1
        elif done:
            reward = 0.5  # Draw reward
        else:
            reward = 0

        # Hand the turn to whoever did NOT just move, so the implicit
        # single-argument form keeps alternating correctly afterwards.
        self.current_player = 'O' if player == 'X' else 'X'
        return self.get_state(), reward, done

    def check_winner(self):
        """Return 'X' or 'O' if that mark fills a complete line, else None."""
        for a, b, c in self._WIN_LINES:
            if self.board[a] == self.board[b] == self.board[c] != ' ':
                return self.board[a]
        return None

    def get_state(self):
        """Return the board as an immutable (hashable) tuple."""
        return tuple(self.board)

    def render(self):
        """Print the board as three rows separated by ' | '."""
        print("\n")
        for i in range(0, 9, 3):
            print(self.board[i], "|", self.board[i+1], "|", self.board[i+2])
        print("\n")

In [3]:
# b. Defining Q-learning
# Q-table keyed by (state, action); pairs that were never updated read as 0.0.
Q = dict()
alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration


def get_Q(state, action):
    """Look up the stored Q-value for ``(state, action)``, defaulting to 0.0."""
    key = (state, action)
    if key in Q:
        return Q[key]
    return 0.0

# c. Training the model
# The learning agent plays 'X' with an epsilon-greedy policy; 'O' moves uniformly at random.
env = TicTacToe()
for episode in range(50000):
    state = env.reset()
    while True:
        # Epsilon-greedy choice among the currently empty cells.
        actions = env.available_actions()
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)
        else:
            q_vals = [get_Q(state, a) for a in actions]
            action = actions[np.argmax(q_vals)]

        # step() takes only the action: the environment tracks whose turn it is
        # (reset() starts with 'X' and every valid move hands the turn over).
        next_state, reward, done = env.step(action)
        if done:
            # Terminal after the agent's own move: nothing left to bootstrap from.
            Q[(state, action)] = get_Q(state, action) + alpha * (reward - get_Q(state, action))
            break

        # Opponent ('O') replies with a random legal move.
        opp_action = random.choice(env.available_actions())
        next_state, opp_reward, done = env.step(opp_action)
        if done:
            # The opponent's reward of 1 means it just won, which is a loss for the agent.
            reward = -1 if opp_reward == 1 else 0
            Q[(state, action)] = get_Q(state, action) + alpha * (reward - get_Q(state, action))
            break

        # Non-terminal: standard Q-learning update using the best value
        # available from the state the agent faces after the opponent's reply.
        next_actions = env.available_actions()
        q_next = max((get_Q(next_state, a) for a in next_actions), default=0)
        Q[(state, action)] = get_Q(state, action) + alpha * (reward + gamma * q_next - get_Q(state, action))
        state = next_state

print("Training done!")


Training done!


In [6]:
# d. Testing the model
def play_game():
    """Play one game: greedy agent ('X') against a random opponent ('O').

    The agent always picks the available action with the highest learned
    Q-value (no exploration). The final board and the result from the agent's
    point of view are printed; nothing is returned.
    """
    env = TicTacToe()
    state = env.reset()
    while True:
        actions = env.available_actions()

        q_vals = [get_Q(state, a) for a in actions]
        action = actions[np.argmax(q_vals)]
        # step() takes only the action; the environment alternates X/O itself,
        # starting with 'X' after reset().
        state, reward, done = env.step(action)
        if done:
            print(np.array(state).reshape(3,3))
            print("Result:", "Win" if reward==1 else "Draw" if reward==0.5 else "Lose")
            break

        opp_action = random.choice(env.available_actions())
        state, reward, done = env.step(opp_action)
        if done:
            # 'X' moves first, so the board can only fill up on X's turn;
            # a game that ends on O's move is therefore an O win.
            print(np.array(state).reshape(3,3))
            print("Result:", "Lose")
            break

In [7]:
# e. Play a few test games
# Each call plays one full game and prints its final board and result.
for _ in range(3): play_game()


[['X' 'X' 'X']
 [' ' ' ' 'O']
 [' ' 'O' ' ']]
Result: Win
[['X' 'X' 'X']
 [' ' ' ' 'O']
 ['O' ' ' ' ']]
Result: Win
[['X' 'X' 'X']
 [' ' ' ' 'O']
 [' ' 'O' ' ']]
Result: Win
