In [None]:
import numpy as np
import random
import pickle
from IPython.display import clear_output
from collections import defaultdict

# --- Constants and Game Class (Unchanged from Phase 1) ---
# Board dimensions: Connect4Game allocates a ROWS x COLS integer grid.
ROWS = 6
COLS = 7
# Values stored in board cells; 0 (the np.zeros default) marks an empty cell.
# In this cell the Q-learning agent drops PLAYER_PIECE and the heuristic bot
# drops AI_PIECE.
PLAYER_PIECE = 1
AI_PIECE = 2
# Glyphs print_board uses for PLAYER_PIECE, AI_PIECE and empty cells.
PLAYER_EMOJI = '🔵'
AI_EMOJI = '🔴'
EMPTY_EMOJI = '⚫'

class Connect4Game:
    """Mutable Connect 4 position plus turn / game-over bookkeeping.

    ``board`` is a ``ROWS x COLS`` integer array. Row 0 is the bottom of the
    grid (discs stack upwards), 0 marks an empty cell, and any non-zero value
    is the piece id of the side that owns the disc. All methods except
    ``__init__`` read the dimensions from ``board`` itself.
    """

    # Number of aligned discs required to win.
    WIN_LENGTH = 4
    # (row step, column step): horizontal, vertical, rising and falling diagonal.
    _DIRECTIONS = ((0, 1), (1, 0), (1, 1), (-1, 1))

    def __init__(self):
        self.board = np.zeros((ROWS, COLS), dtype=int)
        self.game_over = False
        self.turn = 0  # 0 for Player, 1 for AI

    def drop_piece(self, row, col, piece):
        """Store ``piece`` in cell ``(row, col)``.

        Raises:
            ValueError: if ``row`` is None, which is what
                ``get_next_open_row`` returns for a full column.
        """
        # Indexing a NumPy array with None inserts a new axis, so writing to
        # board[None][0] would silently overwrite the entire board.
        if row is None:
            raise ValueError(f"column {col} is full; there is no open row")
        self.board[row, col] = piece

    def is_valid_location(self, col):
        """Return True when ``col`` is on the board and its top cell is empty."""
        # Explicit bounds check: a negative index would otherwise wrap around
        # to a column counted from the right-hand side.
        if not 0 <= col < self.board.shape[1]:
            return False
        return bool(self.board[-1, col] == 0)

    def get_next_open_row(self, col):
        """Return the lowest empty row index in ``col``, or None if it is full."""
        for r in range(self.board.shape[0]):
            if self.board[r, col] == 0:
                return r
        return None

    def print_board(self):
        """Print column indices, a separator, and the grid with the top row first."""
        glyphs = {PLAYER_PIECE: PLAYER_EMOJI, AI_PIECE: AI_EMOJI}
        n_cols = self.board.shape[1]
        print("  ".join(map(str, range(n_cols))))
        print("-" * (n_cols * 3))
        # Row 0 is the bottom of the grid, so flip vertically for display.
        for cells in np.flip(self.board, 0):
            print("".join(glyphs.get(int(cell), EMPTY_EMOJI) + " " for cell in cells))
        print("\n")

    def winning_move(self, piece):
        """Return True if ``piece`` has four in a row in any direction."""
        n_rows, n_cols = self.board.shape
        reach = self.WIN_LENGTH - 1
        for d_row, d_col in self._DIRECTIONS:
            for r in range(n_rows):
                # Skip start cells whose line would leave the board vertically.
                if not 0 <= r + reach * d_row < n_rows:
                    continue
                # Column steps are never negative, so only the right edge matters.
                for c in range(n_cols - reach * d_col):
                    if all(self.board[r + i * d_row, c + i * d_col] == piece
                           for i in range(self.WIN_LENGTH)):
                        return True
        return False

    def get_valid_locations(self):
        """Return the list of column indices that can still accept a disc."""
        return [col for col in range(self.board.shape[1]) if self.is_valid_location(col)]

# --- Heuristic Bot Logic (Unchanged, our agent's opponent) ---
def heuristic_move(board):
    """Pick a column for the heuristic bot (which plays ``AI_PIECE``).

    Priority order: a column that wins immediately for ``AI_PIECE``, then a
    column that would win immediately for ``PLAYER_PIECE`` (i.e. a block),
    otherwise a uniformly random open column. Returns None when every column
    is full. ``board`` itself is never modified; trials run on copies.
    """
    open_cols = [c for c in range(COLS) if board[ROWS-1][c] == 0]

    # First pass looks for our own win, second pass for the opponent's.
    for piece in (AI_PIECE, PLAYER_PIECE):
        for candidate in open_cols:
            trial = board.copy()
            landing_row = get_next_open_row_for_board(trial, candidate)
            trial[landing_row][candidate] = piece
            if winning_move_for_board(trial, piece):
                return candidate

    if not open_cols:
        return None
    return random.choice(open_cols)

def get_next_open_row_for_board(board, col):
    """Return the lowest empty row index in column ``col`` of ``board``.

    Row 0 is treated as the bottom of the grid. The row count is taken from
    ``board`` itself, so any 2-D board size works. Returns None when the
    column has no empty (zero) cell.
    """
    for r in range(board.shape[0]):
        if board[r, col] == 0:
            return r
    return None

def winning_move_for_board(board, piece):
    """Return True if ``piece`` occupies four aligned cells of ``board``.

    Checks horizontal, vertical and both diagonal lines. The scan bounds come
    from ``board.shape``, so the function works for any 2-D board size rather
    than only the module-level ``ROWS x COLS`` grid.
    """
    n_rows, n_cols = board.shape
    run = 4
    reach = run - 1  # offset from the first to the last cell of a line
    # (row step, column step): horizontal, vertical, rising and falling diagonal.
    for d_row, d_col in ((0, 1), (1, 0), (1, 1), (-1, 1)):
        for r in range(n_rows):
            # Skip start cells whose line would leave the board vertically.
            if not 0 <= r + reach * d_row < n_rows:
                continue
            # Column steps are never negative, so only the right edge matters.
            for c in range(n_cols - reach * d_col):
                if all(board[r + i * d_row, c + i * d_col] == piece for i in range(run)):
                    return True
    return False

# --- NEW: Q-Learning Agent Class ---
class QLearningAgent:
    """Tabular Q-learning agent for Connect 4 with epsilon-greedy exploration.

    The Q-table maps a board's raw bytes to an array with one value per
    column. States that were never updated are treated as all-zero.
    """

    def __init__(self, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.9999, epsilon_min=0.01):
        # Use defaultdict to handle unseen states gracefully
        self.q_table = defaultdict(lambda: np.zeros(COLS))
        self.alpha = alpha         # Learning rate
        self.gamma = gamma         # Discount factor for future rewards
        self.epsilon = epsilon     # Exploration rate
        self.epsilon_decay = epsilon_decay  # Multiplier applied after each update
        self.epsilon_min = epsilon_min      # Decay stops once epsilon is at or below this

    def get_state_key(self, board):
        """Return a hashable key (the array's raw bytes) for ``board``."""
        return board.tobytes()

    def choose_action(self, board, valid_locations):
        """Pick a column from ``valid_locations`` using epsilon-greedy selection.

        With probability ``epsilon`` a random valid column is returned;
        otherwise the valid column with the highest Q-value, ties going to the
        earliest entry of ``valid_locations``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_locations)  # Explore

        # .get() instead of indexing: a state that was never updated is all
        # zeros anyway, so there is no need to insert a row for it.
        q_values = self.q_table.get(self.get_state_key(board))
        if q_values is None:
            return valid_locations[0]
        return max(valid_locations, key=lambda col: q_values[col])  # Exploit

    def update_q_table(self, board_state, action, reward, next_board_state):
        """Apply one Q-learning update for ``(board_state, action)``.

        The bootstrap term is the best Q-value among the columns that are
        still open in ``next_board_state`` (top row equal to 0). It is 0 when
        the next state has never been updated or has no open column.
        Epsilon is decayed once per call, i.e. per update rather than per
        episode.
        """
        state_key = self.get_state_key(board_state)
        next_q = self.q_table.get(self.get_state_key(next_board_state))

        next_max = 0.0
        if next_q is not None:
            # Full columns are never chosen, so their Q-value stays at its
            # initial 0; including them would inflate the target whenever all
            # legal moves have negative value.
            open_mask = next_board_state[-1] == 0
            if open_mask.any():
                next_max = float(next_q[open_mask].max())

        # The Q-learning formula
        q_row = self.q_table[state_key]
        q_row[action] += self.alpha * (reward + self.gamma * next_max - q_row[action])

        # Decay epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_q_table(self, filename="q_table.pkl"):
        """Pickle the Q-table to ``filename`` as a plain dict."""
        with open(filename, 'wb') as f:
            # The default factory is a lambda, which pickle cannot serialise,
            # so only the dict contents are stored.
            pickle.dump(dict(self.q_table), f)
            print(f"Q-table saved to {filename}")

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the dict pickled in ``filename``.

        Raises whatever ``open`` / ``pickle.load`` raise (e.g.
        FileNotFoundError when the file is missing).
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files produced by save_q_table from a trusted location.
        with open(filename, 'rb') as f:
            self.q_table = defaultdict(lambda: np.zeros(COLS), pickle.load(f))
            print(f"Q-table loaded from {filename}")


# --- NEW: Training Loop ---
# Rewards handed to the Q-update for each kind of transition.
WIN_REWARD = 20     # Agent completed four in a row
LOSS_REWARD = -20   # Opponent's reply completed four in a row
DRAW_REWARD = 5     # Board filled up with no winner
STEP_REWARD = -0.5  # Game continues; small penalty to encourage faster wins

agent = QLearningAgent()
episodes = 100000 # The number of games the AI will play to learn
wins = 0
losses = 0
draws = 0

print("Starting Q-learning training...")

for episode in range(episodes):
    game = Connect4Game()
    # The Q-learning agent plays PLAYER_PIECE and always moves first.
    # `state` / `col` hold its latest decision, which is scored either right
    # away (on a win) or after the opponent's reply; reset them every episode
    # so nothing leaks in from the previous game.
    state = None
    col = None

    while not game.game_over:
        if game.turn == 0: # Agent's turn
            state = game.board.copy()
            # Never empty here: a full board ends the game in the draw checks
            # below before control returns to this branch.
            valid_locations = game.get_valid_locations()
            col = agent.choose_action(state, valid_locations)

            row = game.get_next_open_row(col)
            game.drop_piece(row, col, PLAYER_PIECE)

            if game.winning_move(PLAYER_PIECE):
                agent.update_q_table(state, col, WIN_REWARD, game.board)
                wins += 1
                game.game_over = True
            elif not game.get_valid_locations():
                agent.update_q_table(state, col, DRAW_REWARD, game.board)
                draws += 1
                game.game_over = True

            game.turn = 1

        else: # Opponent's (Heuristic Bot) turn
            opponent_col = heuristic_move(game.board)
            opponent_won = False
            if opponent_col is not None:
                row = game.get_next_open_row(opponent_col)
                game.drop_piece(row, opponent_col, AI_PIECE)
                opponent_won = game.winning_move(AI_PIECE)

            # Score the agent's previous move exactly once, based on where the
            # opponent's reply left the game. In particular a draw gets only
            # the draw reward, not a step penalty followed by a second update
            # for the same transition.
            if opponent_won:
                agent.update_q_table(state, col, LOSS_REWARD, game.board)
                losses += 1
                game.game_over = True
            elif not game.get_valid_locations():
                agent.update_q_table(state, col, DRAW_REWARD, game.board)
                draws += 1
                game.game_over = True
            else:
                agent.update_q_table(state, col, STEP_REWARD, game.board)

            game.turn = 0

    if (episode + 1) % 1000 == 0:
        clear_output(wait=True)
        print(f"Episode: {episode + 1}/{episodes}")
        win_rate = (wins / (episode + 1)) * 100
        print(f"Win Rate: {win_rate:.2f}% ({wins} W / {losses} L / {draws} D)")
        print(f"Current Epsilon: {agent.epsilon:.4f}")

print("\nTraining complete!")
agent.save_q_table() # Save the learned strategy

In [None]:
import numpy as np
import random
import pickle
import time
from IPython.display import clear_output
from collections import defaultdict

# --- We need to include the class definitions again in this new cell ---

# --- Constants and Game Class ---
# Board dimensions: Connect4Game allocates a ROWS x COLS integer grid.
ROWS = 6
COLS = 7
# Values stored in board cells; 0 (the np.zeros default) marks an empty cell.
# In this cell the human drops PLAYER_PIECE and the Q-learning agent drops
# AI_PIECE.
PLAYER_PIECE = 1
AI_PIECE = 2
# Glyphs print_board uses for PLAYER_PIECE, AI_PIECE and empty cells.
PLAYER_EMOJI = '🔵' # You are Player 1
AI_EMOJI = '🔴'   # The Q-learning agent is Player 2
EMPTY_EMOJI = '⚫'

class Connect4Game:
    """Mutable Connect 4 position plus turn / game-over bookkeeping.

    ``board`` is a ``ROWS x COLS`` integer array. Row 0 is the bottom of the
    grid (discs stack upwards), 0 marks an empty cell, and any non-zero value
    is the piece id of the side that owns the disc. All methods except
    ``__init__`` read the dimensions from ``board`` itself.
    """

    # Number of aligned discs required to win.
    WIN_LENGTH = 4
    # (row step, column step): horizontal, vertical, rising and falling diagonal.
    _DIRECTIONS = ((0, 1), (1, 0), (1, 1), (-1, 1))

    def __init__(self):
        self.board = np.zeros((ROWS, COLS), dtype=int)
        self.game_over = False
        self.turn = 0  # 0 for Player, 1 for AI

    def drop_piece(self, row, col, piece):
        """Store ``piece`` in cell ``(row, col)``.

        Raises:
            ValueError: if ``row`` is None, which is what
                ``get_next_open_row`` returns for a full column.
        """
        # Indexing a NumPy array with None inserts a new axis, so writing to
        # board[None][0] would silently overwrite the entire board.
        if row is None:
            raise ValueError(f"column {col} is full; there is no open row")
        self.board[row, col] = piece

    def is_valid_location(self, col):
        """Return True when ``col`` is on the board and its top cell is empty."""
        # Explicit bounds check: a negative index would otherwise wrap around
        # to a column counted from the right-hand side.
        if not 0 <= col < self.board.shape[1]:
            return False
        return bool(self.board[-1, col] == 0)

    def get_next_open_row(self, col):
        """Return the lowest empty row index in ``col``, or None if it is full."""
        for r in range(self.board.shape[0]):
            if self.board[r, col] == 0:
                return r
        return None

    def print_board(self):
        """Print column indices, a separator, and the grid with the top row first."""
        glyphs = {PLAYER_PIECE: PLAYER_EMOJI, AI_PIECE: AI_EMOJI}
        n_cols = self.board.shape[1]
        print("  ".join(map(str, range(n_cols))))
        print("-" * (n_cols * 3))
        # Row 0 is the bottom of the grid, so flip vertically for display.
        for cells in np.flip(self.board, 0):
            print("".join(glyphs.get(int(cell), EMPTY_EMOJI) + " " for cell in cells))
        print("\n")

    def winning_move(self, piece):
        """Return True if ``piece`` has four in a row in any direction."""
        n_rows, n_cols = self.board.shape
        reach = self.WIN_LENGTH - 1
        for d_row, d_col in self._DIRECTIONS:
            for r in range(n_rows):
                # Skip start cells whose line would leave the board vertically.
                if not 0 <= r + reach * d_row < n_rows:
                    continue
                # Column steps are never negative, so only the right edge matters.
                for c in range(n_cols - reach * d_col):
                    if all(self.board[r + i * d_row, c + i * d_col] == piece
                           for i in range(self.WIN_LENGTH)):
                        return True
        return False

    def get_valid_locations(self):
        """Return the list of column indices that can still accept a disc."""
        return [col for col in range(self.board.shape[1]) if self.is_valid_location(col)]

# --- Q-Learning Agent Class ---
class QLearningAgent:
    """Play-time Q-learning agent: holds a Q-table and picks columns from it.

    The table maps a board's raw bytes to an array of per-column values;
    looking up a state that is not in the table yields (and stores) zeros.
    """

    def __init__(self, alpha=0.1, gamma=0.99, epsilon=1.0):
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q_table = defaultdict(lambda: np.zeros(COLS))

    def get_state_key(self, board):
        """Return the hashable table key for ``board`` (its raw bytes)."""
        return board.tobytes()

    def choose_action(self, board, valid_locations):
        """Return a column from ``valid_locations`` via epsilon-greedy choice."""
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            return random.choice(valid_locations)

        # Greedy branch: highest stored value among the allowed columns, the
        # earliest such column winning ties.
        column_values = self.q_table[self.get_state_key(board)]
        scored = {}
        for candidate in valid_locations:
            scored[candidate] = column_values[candidate]
        return max(scored, key=lambda candidate: scored[candidate])

    def load_q_table(self, filename="q_table.pkl"):
        """Replace the Q-table with the dict pickled in ``filename``.

        A missing file is reported on stdout and the FileNotFoundError is
        re-raised to the caller.
        """
        try:
            with open(filename, 'rb') as handle:
                stored_table = pickle.load(handle)
                self.q_table = defaultdict(lambda: np.zeros(COLS), stored_table)
                print(f"Q-table loaded successfully from {filename}")
        except FileNotFoundError:
            print(f"Error: Could not find {filename}. The agent has not been trained.")
            raise

# --- NEW: Interactive Playing Loop ---

# 1. Initialize the agent and the game
agent = QLearningAgent()
game = Connect4Game()

# 2. Load the trained "brain"
agent.load_q_table("q_table.pkl")
agent.epsilon = 0 # IMPORTANT: Set epsilon to 0 to always use the best strategy

# You are Blue (PLAYER_PIECE); the agent is Red (AI_PIECE).
# The Q-table was learned by an agent whose own discs were PLAYER_PIECE and
# that always made the first move, so every state it knows has equal disc
# counts with the agent to move. To hit those states here the agent must open
# the game and be shown the board from its own perspective (labels swapped).
game.turn = 1 # Agent starts

clear_output()
game.print_board()

while not game.game_over:
    # Text to show under the redrawn board. Printing it before the redraw
    # would let clear_output(wait=True) erase it immediately.
    message = None

    if game.turn == 0: # Your turn
        try:
            col = int(input(f"Your move, Player {PLAYER_EMOJI} (0-6): "))
        except ValueError:
            message = "Invalid input. Please enter a number."
        else:
            if 0 <= col < COLS and game.is_valid_location(col):
                row = game.get_next_open_row(col)
                game.drop_piece(row, col, PLAYER_PIECE)

                if game.winning_move(PLAYER_PIECE):
                    message = "CONGRATULATIONS! YOU WIN!"
                    game.game_over = True

                game.turn = 1 # Switch to AI's turn
            else:
                message = "Invalid column. Please try again."

    else: # Agent's turn
        print("AI is thinking...")
        time.sleep(1) # Add a small delay to simulate thinking

        valid_locations = game.get_valid_locations()
        if not valid_locations:
            break

        # Swap the two piece labels so the agent sees its own discs as
        # PLAYER_PIECE, matching the encoding of the states it was trained on.
        # Masks are computed on the untouched board, so the two assignments
        # cannot interfere; copy() keeps dtype and layout for the bytes key.
        agent_view = game.board.copy()
        agent_view[game.board == AI_PIECE] = PLAYER_PIECE
        agent_view[game.board == PLAYER_PIECE] = AI_PIECE

        col = agent.choose_action(agent_view, valid_locations)
        row = game.get_next_open_row(col)
        game.drop_piece(row, col, AI_PIECE)

        if game.winning_move(AI_PIECE):
            message = "THE Q-LEARNING AGENT WINS!"
            game.game_over = True

        game.turn = 0 # Switch to your turn

    # Check for a draw
    if not game.game_over and not game.get_valid_locations():
        message = "IT'S A DRAW!"
        game.game_over = True

    clear_output(wait=True)
    game.print_board()
    if message:
        print(message)