In [63]:
# To train the Monte Carlo Tree Search agent  and Q-learning agents do it in two steps :

# Train MCTS using random moves: This will allow the MCTS agent to explore and learn by playing against random moves, which should give it a chance to discover various strategies.
# Train Q learning with MCTS and random moves: The Q learning agent will train against the MCTS agent (with a mix of random moves), allowing it to explore various strategies.
# adjust the epsilon decay in the Q learning agent to ensure that it explores adequately in the beginning.



In [65]:
# 1. Training MCTS with Random Moves

In [67]:
import random
import numpy as np
from collections import defaultdict
import time
start_time = time.time()
# Connect Four Environment

start_time = time.time()

# Connect Four Environment
class ConnectFour:
    def __init__(self):
        self.rows = 6
        self.cols = 7
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board.fill(0)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        return tuple(map(tuple, self.board))

    def is_valid_move(self, col):
        return self.board[0, col] == 0

    def make_move(self, col):
        if not self.is_valid_move(col):
            return False, None
        for row in range(self.rows - 1, -1, -1):
            if self.board[row, col] == 0:
                self.board[row, col] = self.current_player
                reward, done = self.check_game_status()
                self.current_player = 3 - self.current_player
                return reward, done
        return 0, False

    def check_game_status(self):
        for row in range(self.rows):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 0, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols):
                if self.check_sequence(row, col, 1, 0):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 1, 1) or self.check_sequence(row + 3, col, -1, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        if not any(self.is_valid_move(c) for c in range(self.cols)):
            return (0, True)  # Draw case
        return (0, False)

    def check_sequence(self, row, col, row_delta, col_delta):
        piece = self.board[row, col]
        if piece == 0:
            return False
        for i in range(1, 4):
            if self.board[row + i * row_delta, col + i * col_delta] != piece:
                return False
        return True

# Q learning Agent
class QLearningAgent:
    def __init__(self, alpha=0.3, gamma=0.95, epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05):
        self.q_table = defaultdict(lambda: np.zeros(7))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def choose_action(self, state, valid_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_moves)
        q_values = self.q_table[state]
        return max(valid_moves, key=lambda col: q_values[col])

    def update_q_table(self, state, action, reward, next_state, done):
        q_values = self.q_table[state]
        if done:
            q_values[action] += self.alpha * (reward - q_values[action])
        else:
            next_q_values = self.q_table[next_state]
            q_values[action] += self.alpha * (reward + self.gamma * np.max(next_q_values) - q_values[action])
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

# MCTS Node
class Node:
    def __init__(self, state):
        self.state = state
        self.visits = 0
        self.wins = 0
        self.children = {}

    def best_child(self):
        return max(self.children.items(), key=lambda x: x[1].wins / (x[1].visits + 1e-6))[0]





    
class MCTS:
    def __init__(self, simulations=50):
        self.simulations = simulations

    def search(self, env, root):
        for _ in range(self.simulations):
            node = root
            temp_env = ConnectFour()
            temp_env.board = np.copy(env.board)
            temp_env.current_player = env.current_player

            # Simulate the game from the current node state
            while node.children:
                action = node.best_child()
                temp_env.make_move(action)
                node = node.children[action]

            # Add children nodes for all valid moves
            valid_moves = [c for c in range(env.cols) if temp_env.is_valid_move(c)]
            for move in valid_moves:
                temp_env.make_move(move)
                node.children[move] = Node(temp_env.get_state())

            # Simulate outcome
            reward, done = temp_env.check_game_status()
            if done:
                for child in node.children.values():
                    child.wins += reward
                    child.visits += 1

        return root.best_child()

def train_mcts_random(env, mcts_agent, games=100):
    random_wins, mcts_wins, draws = 0, 0, 0

    for game in range(games):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            action = random.choice(valid_moves)  # Random agent
            reward, done = env.make_move(action)
            state = env.get_state()

        if reward == 1:
            random_wins += 1
        elif reward == -1:
            mcts_wins += 1
        else:
            draws += 1

        #print(f"Game {game + 1}/{games} completed.")

    print(f"Results after {games} games (MCTS vs Random):\nRandom Agent Wins: {random_wins}\nMCTS Agent Wins: {mcts_wins}\nDraws: {draws}")

# Train MCTS agent against random moves
train_mcts_random(ConnectFour(), MCTS(simulations=40), games=100)


Results after 100 games (MCTS vs Random):
Random Agent Wins: 54
MCTS Agent Wins: 46
Draws: 0


In [69]:
# 2. Training Q-learning with MCTS and Random Moves
# Now,  train the Q-learning agent by having it play against the MCTS agent and a random agent.
# use a mix of epsilon-greedy exploration for the Q-learning agent and ensure that the agent explores sufficiently early on.



In [71]:
# Q-learning Agent (same as before with epsilon decay adjustment)
class QLearningAgent:
    def __init__(self, alpha=0.3, gamma=0.95, epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05):
        self.q_table = defaultdict(lambda: np.zeros(7))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def choose_action(self, state, valid_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_moves)
        q_values = self.q_table[state]
        return max(valid_moves, key=lambda col: q_values[col])

    def update_q_table(self, state, action, reward, next_state, done):
        q_values = self.q_table[state]
        if done:
            q_values[action] += self.alpha * (reward - q_values[action])
        else:
            next_q_values = self.q_table[next_state]
            q_values[action] += self.alpha * (reward + self.gamma * np.max(next_q_values) - q_values[action])
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

# Training function with Q-learning playing against MCTS and random moves
def train_q_with_mcts_and_random(q_agent, mcts_agent, episodes=50000):
    env = ConnectFour()
    
    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            
            # Choose between Q learning, MCTS , or random
            if random.uniform(0, 1) < 0.33:
                action = q_agent.choose_action(state, valid_moves)  # Q-learning chooses move
            elif random.uniform(0, 1) < 0.66:
                root = Node(state)
                action = mcts_agent.search(env, root)  # MCTS chooses move
            else:
                action = random.choice(valid_moves)  # Random move

            reward, done = env.make_move(action)
            next_state = env.get_state()
            q_agent.update_q_table(state, action, reward, next_state, done)
            state = next_state

        # Adjust exploration exploitation balance during training
        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{episodes} - Epsilon: {q_agent.epsilon:.4f}")

# Test the Q learning agent against MCTS and random agents after training
def test_q_agent(q_agent, mcts_agent, games=100):
    env = ConnectFour()
    q_wins, mcts_wins, draws = 0, 0, 0
    
    for game in range(games):
        state = env.reset()
        done = False
        
        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            action = q_agent.choose_action(state, valid_moves) if random.uniform(0, 1) < 0.5 else mcts_agent.search(env, Node(state))
            reward, done = env.make_move(action)
            state = env.get_state()
        
        if reward == 1:
            q_wins += 1
        elif reward == -1:
            mcts_wins += 1
        else:
            draws += 1
        
        print(f"Game {game + 1}/{games} completed.")
    
    print(f"Results after {games} games:\nQ-learning Agent Wins: {q_wins}\nMCTS Agent Wins: {mcts_wins}\nDraws: {draws}")
start_time = time.time()
# Train Q learning with MCTS and random agent , then test the agent
q_agent = QLearningAgent()
mcts_agent = MCTS(simulations=20)
train_q_with_mcts_and_random(q_agent, mcts_agent, episodes=10000)
test_q_agent(q_agent, mcts_agent, games=100)
print("--- %s seconds ---" % (time.time() - start_time))

Episode 500/10000 - Epsilon: 0.0500
Episode 1000/10000 - Epsilon: 0.0500
Episode 1500/10000 - Epsilon: 0.0500
Episode 2000/10000 - Epsilon: 0.0500
Episode 2500/10000 - Epsilon: 0.0500
Episode 3000/10000 - Epsilon: 0.0500
Episode 3500/10000 - Epsilon: 0.0500
Episode 4000/10000 - Epsilon: 0.0500
Episode 4500/10000 - Epsilon: 0.0500
Episode 5000/10000 - Epsilon: 0.0500
Episode 5500/10000 - Epsilon: 0.0500
Episode 6000/10000 - Epsilon: 0.0500
Episode 6500/10000 - Epsilon: 0.0500
Episode 7000/10000 - Epsilon: 0.0500
Episode 7500/10000 - Epsilon: 0.0500
Episode 8000/10000 - Epsilon: 0.0500
Episode 8500/10000 - Epsilon: 0.0500
Episode 9000/10000 - Epsilon: 0.0500
Episode 9500/10000 - Epsilon: 0.0500
Episode 10000/10000 - Epsilon: 0.0500
Game 1/100 completed.
Game 2/100 completed.
Game 3/100 completed.
Game 4/100 completed.
Game 5/100 completed.
Game 6/100 completed.
Game 7/100 completed.
Game 8/100 completed.
Game 9/100 completed.
Game 10/100 completed.
Game 11/100 completed.
Game 12/100 comp

In [72]:
 # both Q learning and MCTS are learning from each other while also training with random moves. The goal here is to train both algorithms simultaneously by allowing them to interact and adjust based on each other's actions , including random moves.

#Key changes:
# Q learning updates should occur based on the move chosen by MCTS or random moves.
# MCTS updates should occur based on the reward from the environment after each move, similar to how Q learning works.
# Random moves will also be involved in training by either the Q-learning or MCTS agent, simulating a diverse range of actions and scenarios.
# Here is a complete rewrite that incorporates these updates:
          

In [74]:
import random
import numpy as np
from collections import defaultdict
import time
from concurrent.futures import ThreadPoolExecutor


class ConnectFour:
    def __init__(self):
        self.rows = 6
        self.cols = 7
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board.fill(0)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        return tuple(map(tuple, self.board))

    def is_valid_move(self, col):
        return self.board[0, col] == 0

    def make_move(self, col):
        if not self.is_valid_move(col):
            return False, None
        for row in range(self.rows - 1, -1, -1):
            if self.board[row, col] == 0:
                self.board[row, col] = self.current_player
                reward, done = self.check_game_status()
                self.current_player = 3 - self.current_player
                return reward, done
        return 0, False

    def check_game_status(self):
        for row in range(self.rows):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 0, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols):
                if self.check_sequence(row, col, 1, 0):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 1, 1) or self.check_sequence(row + 3, col, -1, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        if not any(self.is_valid_move(c) for c in range(self.cols)):
            return (0, True)  # Draw case
        return (0, False)

    def check_sequence(self, row, col, row_delta, col_delta):
        piece = self.board[row, col]
        if piece == 0:
            return False
        for i in range(1, 4):
            if self.board[row + i * row_delta, col + i * col_delta] != piece:
                return False
        return True


# Q learning Agent
class QLearningAgent:
    def __init__(self, alpha=0.3, gamma=0.95, epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05):
        self.q_table = defaultdict(lambda: np.zeros(7))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def choose_action(self, state, valid_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_moves)
        q_values = self.q_table[state]
        return max(valid_moves, key=lambda col: q_values[col])

    def update_q_table(self, state, action, reward, next_state, done):
        q_values = self.q_table[state]
        if done:
            q_values[action] += self.alpha * (reward - q_values[action])
        else:
            next_q_values = self.q_table[next_state]
            q_values[action] += self.alpha * (reward + self.gamma * np.max(next_q_values) - q_values[action])
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)


# MCTS Node
class Node:
    def __init__(self, state):
        self.state = state
        self.visits = 0
        self.wins = 0
        self.children = {}

    def best_child(self):
        return max(self.children.items(), key=lambda x: x[1].wins / (x[1].visits + 1e-6))[0]


class MCTS:
    def __init__(self, simulations=20, q_agent=None): 
        self.simulations = simulations
        self.q_agent = q_agent  # Allow MCTS to use Q-learning agent

    def search(self, env, root):
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.run_simulation, env, root) for _ in range(self.simulations)]
            for future in futures:
                future.result()

        return root.best_child()

    def run_simulation(self, env, root):
        node = root
        temp_env = ConnectFour()
        temp_env.board = np.copy(env.board)
        temp_env.current_player = env.current_player

        # Selection
        while node.children:
            action = node.best_child()
            temp_env.make_move(action)
            node = node.children[action]

        # Expansion
        valid_moves = [c for c in range(env.cols) if temp_env.is_valid_move(c)]
        for move in valid_moves:
            temp_env.make_move(move)
            node.children[move] = Node(temp_env.get_state())

        # Simulation: Random or Q learning move
        state = temp_env.get_state()
        done = False
        reward = 0  # Initialize reward to ensure it's always assigned 
        while not done:
            valid_moves = [c for c in range(env.cols) if temp_env.is_valid_move(c)]
            if not valid_moves:  # Handle case where no moves are left
                break

            if random.uniform(0, 1) < 0.5 and self.q_agent:
                action = self.q_agent.choose_action(state, valid_moves)
            else:
                action = random.choice(valid_moves)
            reward, done = temp_env.make_move(action) 
            state = temp_env.get_state()
        
        # Backpropagation
        for child in node.children.values():
            child.wins += reward
            child.visits += 1


# Simultaneous Training of Q learning and MCTS with Random Moves
def train_both_agents(env, q_agent, mcts_agent, episodes=50000):


    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            
            # Alternate between Q learning , MCTS, or Random Move
            move_choice = random.uniform(0, 1)
            if move_choice < 0.33:
                action = q_agent.choose_action(state, valid_moves)  # Q learning move
            elif move_choice < 0.66:
                root = Node(state)
                action = mcts_agent.search(env, root)  # MCTS move
            else:
                action = random.choice(valid_moves)  # Random move

            reward, done = env.make_move(action)
            next_state = env.get_state()

            if move_choice < 0.33:  # Only update Q learning if it chose the move
                q_agent.update_q_table(state, action, reward, next_state, done)

            state = next_state

        # Print progress every 500 episodes
        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{episodes} - Epsilon: {q_agent.epsilon:.4f}")


# Testing the trained Q-learning agent vs. MCTS agent
def test_agents(q_agent, mcts_agent, games=100):
    env = ConnectFour()
    q_wins, mcts_wins, draws = 0, 0, 0
    
    for game in range(games):
        state = env.reset()
        done = False
        
        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            
            # Alternate between Q-learning and MCTS during the game
            move_choice = random.uniform(0, 1)
            if move_choice < 0.5:
                action = q_agent.choose_action(state, valid_moves)
            else:
                root = Node(state)
                action = mcts_agent.search(env, root)
            
            reward, done = env.make_move(action)
            state = env.get_state()
        
        if reward == 1:
            q_wins += 1
        elif reward == -1:
            mcts_wins += 1
        else:
            draws += 1
        
        #print(f"Game {game + 1}/{games} completed.")
    
    print(f"Results after {games} games:\nQ-learning Agent Wins: {q_wins}\nMCTS Agent Wins: {mcts_wins}\nDraws: {draws}")


# Start training both agents
start_time = time.time()
q_agent = QLearningAgent()
mcts_agent = MCTS(simulations=20)
env = ConnectFour()

# Train both agents simultaneously with random moves
#train_both_agents(env, q_agent, mcts_agent, episodes=10000)
train_both_agents(env, q_agent, mcts_agent, episodes=50000)
# Test agents after training
#test_agents(q_agent, mcts_agent, games=100)

print("--- %s seconds ---" % (time.time() - start_time))


Episode 500/50000 - Epsilon: 0.1358
Episode 1000/50000 - Epsilon: 0.0500
Episode 1500/50000 - Epsilon: 0.0500
Episode 2000/50000 - Epsilon: 0.0500
Episode 2500/50000 - Epsilon: 0.0500
Episode 3000/50000 - Epsilon: 0.0500
Episode 3500/50000 - Epsilon: 0.0500
Episode 4000/50000 - Epsilon: 0.0500
Episode 4500/50000 - Epsilon: 0.0500
Episode 5000/50000 - Epsilon: 0.0500
Episode 5500/50000 - Epsilon: 0.0500
Episode 6000/50000 - Epsilon: 0.0500
Episode 6500/50000 - Epsilon: 0.0500
Episode 7000/50000 - Epsilon: 0.0500
Episode 7500/50000 - Epsilon: 0.0500
Episode 8000/50000 - Epsilon: 0.0500
Episode 8500/50000 - Epsilon: 0.0500
Episode 9000/50000 - Epsilon: 0.0500
Episode 9500/50000 - Epsilon: 0.0500
Episode 10000/50000 - Epsilon: 0.0500
Episode 10500/50000 - Epsilon: 0.0500
Episode 11000/50000 - Epsilon: 0.0500
Episode 11500/50000 - Epsilon: 0.0500
Episode 12000/50000 - Epsilon: 0.0500
Episode 12500/50000 - Epsilon: 0.0500
Episode 13000/50000 - Epsilon: 0.0500
Episode 13500/50000 - Epsilon: 0

In [76]:
import random
import numpy as np
from collections import defaultdict
import time
from concurrent.futures import ThreadPoolExecutor

# Connect Four Environment
class ConnectFour:
    def __init__(self):
        self.rows = 6
        self.cols = 7
        self.board = np.zeros((self.rows, self.cols), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board.fill(0)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        return tuple(map(tuple, self.board))

    def is_valid_move(self, col):
        return self.board[0, col] == 0

    def make_move(self, col):
        if not self.is_valid_move(col):
            return False, None
        for row in range(self.rows - 1, -1, -1):
            if self.board[row, col] == 0:
                self.board[row, col] = self.current_player
                reward, done = self.check_game_status()
                self.current_player = 3 - self.current_player
                return reward, done
        return 0, False

    def check_game_status(self):
        for row in range(self.rows):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 0, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols):
                if self.check_sequence(row, col, 1, 0):
                    return (1, True) if self.current_player == 1 else (-1, True)
        for row in range(self.rows - 3):
            for col in range(self.cols - 3):
                if self.check_sequence(row, col, 1, 1) or self.check_sequence(row + 3, col, -1, 1):
                    return (1, True) if self.current_player == 1 else (-1, True)
        if not any(self.is_valid_move(c) for c in range(self.cols)):
            return (0, True)  # Draw case
        return (0, False)

    def check_sequence(self, row, col, row_delta, col_delta):
        piece = self.board[row, col]
        if piece == 0:
            return False
        for i in range(1, 4):
            if self.board[row + i * row_delta, col + i * col_delta] != piece:
                return False
        return True


# Q learning Agent
class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.8, epsilon=1.0, epsilon_decay=0.999, epsilon_min=0.1):
        self.q_table = defaultdict(lambda: np.zeros(7))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def choose_action(self, state, valid_moves):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(valid_moves)
        q_values = self.q_table[state]
        return max(valid_moves, key=lambda col: q_values[col])

    def update_q_table(self, state, action, reward, next_state, done):
        q_values = self.q_table[state]
        if done:
            q_values[action] += self.alpha * (reward - q_values[action])
        else:
            next_q_values = self.q_table[next_state]
            q_values[action] += self.alpha * (reward + self.gamma * np.max(next_q_values) - q_values[action])
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)


# MCTS Node
class Node:
    def __init__(self, state):
        self.state = state
        self.visits = 0
        self.wins = 0
        self.children = {}

    def best_child(self):
        return max(self.children.items(), key=lambda x: x[1].wins / (x[1].visits + 1e-6))[0]


class MCTS:
    def __init__(self, simulations=100, q_agent=None):  
        self.simulations = simulations
        self.q_agent = q_agent  # Allow MCTS to use Q learning agent

    def search(self, env, root):
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.run_simulation, env, root) for _ in range(self.simulations)]
            for future in futures:
                future.result()

        return root.best_child()

    def run_simulation(self, env, root):
        node = root
        temp_env = ConnectFour()
        temp_env.board = np.copy(env.board)
        temp_env.current_player = env.current_player

        # Selection
        while node.children:
            action = node.best_child()
            temp_env.make_move(action)
            node = node.children[action]

        # Expansion
        valid_moves = [c for c in range(env.cols) if temp_env.is_valid_move(c)]
        for move in valid_moves:
            temp_env.make_move(move)
            node.children[move] = Node(temp_env.get_state())

        # Simulation: More weight towards Q-learning moves
        state = temp_env.get_state()
        done = False
        reward = 0  
        while not done:
            valid_moves = [c for c in range(env.cols) if temp_env.is_valid_move(c)]
            if not valid_moves:  # Handle case where no moves are left
                break

            if random.uniform(0, 1) < 0.7 and self.q_agent:  # 70% chance to use Q learning
                action = self.q_agent.choose_action(state, valid_moves)
            else:
                action = random.choice(valid_moves)  # 30% random move
            reward, done = temp_env.make_move(action) 
            state = temp_env.get_state()
        
        # Backpropagation
        for child in node.children.values():
            child.wins += reward
            child.visits += 1


# Simultaneous Training of Q-learning and MCTS with Random Moves
def train_both_agents(env, q_agent, mcts_agent, episodes=50000):
    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            
            # Alternate between Q-learning, MCTS, or Random Move
            move_choice = random.uniform(0, 1)
            if move_choice < 0.25:  # 25% chance for Q-learning
                action = q_agent.choose_action(state, valid_moves)  # Q-learning move
            elif move_choice < 0.5:  # 25% chance for MCTS
                root = Node(state)
                action = mcts_agent.search(env, root)  # MCTS move
            else:  # 50% random move
                action = random.choice(valid_moves)

            reward, done = env.make_move(action)
            next_state = env.get_state()

            if move_choice < 0.25:  # Only update Q-learning if it chose the move
                q_agent.update_q_table(state, action, reward, next_state, done)

            state = next_state

        # Print progress every 500 episodes
        if (episode + 1) % 500 == 0:
            print(f"Episode {episode + 1}/{episodes} - Epsilon: {q_agent.epsilon:.4f}")


# Testing the trained Q-learning agent vs. MCTS agent
def test_agents(q_agent, mcts_agent, games=100):
    env = ConnectFour()
    q_wins, mcts_wins, draws = 0, 0, 0
    
    for game in range(games):
        state = env.reset()
        done = False
        
        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]
            
            # Alternate between Q-learning and MCTS during the game
            move_choice = random.uniform(0, 1)
            if move_choice < 0.5:
                action = q_agent.choose_action(state, valid_moves)
            else:
                root = Node(state)
                action = mcts_agent.search(env, root)
            
            reward, done = env.make_move(action)
            state = env.get_state()
        
        if reward == 1:
            q_wins += 1
        elif reward == -1:
            mcts_wins += 1
        else:
            draws += 1
    
    print(f"Results after {games} games:\nQ-learning Agent Wins: {q_wins}\nMCTS Agent Wins: {mcts_wins}\nDraws: {draws}")


# Start training both agents
#start_time = time.time()
#q_agent = QLearningAgent(alpha=0.1, gamma=0.8, epsilon=1.0, epsilon_decay=0.999, epsilon_min=0.1)
#mcts_agent = MCTS(simulations=100)  # Increased simulations
#env = ConnectFour()

#train_both_agents(env, q_agent, mcts_agent, episodes=50000)

# Start training both agents with optimized parameters
start_time = time.time()
#q_agent = QLearningAgent(alpha=0.2, gamma=0.8, epsilon=1.0, epsilon_decay=0.9999, epsilon_min=0.2)
#mcts_agent = MCTS(simulations=50)  # Reduced simulations for faster training
q_agent = QLearningAgent(alpha=0.25, gamma=0.8, epsilon=1.0, epsilon_decay=0.9999, epsilon_min=0.15)
mcts_agent = MCTS(simulations=75)  # More simulations to improve MCTS decisions

env = ConnectFour()

train_both_agents(env, q_agent, mcts_agent, episodes=5000)  # Reduced episodes for quicker results

print("--- %s seconds ---" % (time.time() - start_time))

#print("--- %s seconds ---" % (time.time() - start_time))


Episode 500/5000 - Epsilon: 0.7618
Episode 1000/5000 - Epsilon: 0.5723
Episode 1500/5000 - Epsilon: 0.4272
Episode 2000/5000 - Epsilon: 0.3208
Episode 2500/5000 - Epsilon: 0.2386
Episode 3000/5000 - Epsilon: 0.1782
Episode 3500/5000 - Epsilon: 0.1500
Episode 4000/5000 - Epsilon: 0.1500
Episode 4500/5000 - Epsilon: 0.1500
Episode 5000/5000 - Epsilon: 0.1500
--- 863.4163637161255 seconds ---


In [78]:
# Here are separate functions for testing:

#MCTS vs. Q-Learning
#MCTS vs. Random
#Q-Learning vs. Random


In [81]:
def test_mcts_vs_qlearning(mcts_agent, q_agent, games=100):
    """ Test Monte Carlo Tree Search vs. Q-learning Agent """
    env = ConnectFour()
    mcts_wins, q_wins, draws = 0, 0, 0
    
    for _ in range(games):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]

            if env.current_player == 1:  # MCTS move
                root = Node(state)
                action = mcts_agent.search(env, root)
            else:  # Q learning move
                action = q_agent.choose_action(state, valid_moves)

            reward, done = env.make_move(action)
            state = env.get_state()

        if reward == 1:  # MCTS wins
            mcts_wins += 1
        elif reward == -1:  # Q learning wins
            q_wins += 1
        else:  # Draw
            draws += 1

    print(f"MCTS vs. Q-Learning ({games} games): MCTS Wins: {mcts_wins}, Q-Learning Wins: {q_wins}, Draws: {draws}")


def test_mcts_vs_random(mcts_agent, games=100):
    """ Test Monte Carlo Tree Search vs. Random Moves """
    env = ConnectFour()
    mcts_wins, random_wins, draws = 0, 0, 0
    
    for _ in range(games):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]

            if env.current_player == 1:  # MCTS move
                root = Node(state)
                action = mcts_agent.search(env, root)
            else:  # Random move
                action = random.choice(valid_moves)

            reward, done = env.make_move(action)
            state = env.get_state()

        if reward == 1:  # MCTS wins
            mcts_wins += 1
        elif reward == -1:  # Random wins
            random_wins += 1
        else:  # Draw
            draws += 1

    print(f"MCTS vs. Random ({games} games): MCTS Wins: {mcts_wins}, Random Wins: {random_wins}, Draws: {draws}")


def test_qlearning_vs_random(q_agent, games=100):
    """ Test Q-learning Agent vs. Random Moves """
    env = ConnectFour()
    q_wins, random_wins, draws = 0, 0, 0
    
    for _ in range(games):
        state = env.reset()
        done = False

        while not done:
            valid_moves = [c for c in range(env.cols) if env.is_valid_move(c)]

            if env.current_player == 1:  # Q-learning move
                action = q_agent.choose_action(state, valid_moves)
            else:  # Random move
                action = random.choice(valid_moves)

            reward, done = env.make_move(action)
            state = env.get_state()

        if reward == 1:  # Q-learning wins
            q_wins += 1
        elif reward == -1:  # Random wins
            random_wins += 1
        else:  # Draw
            draws += 1

    print(f"Q-Learning vs. Random ({games} games): Q-Learning Wins: {q_wins}, Random Wins: {random_wins}, Draws: {draws}")

# Initialize agents


# Run tests
test_mcts_vs_qlearning(mcts_agent, q_agent, games=100)
test_mcts_vs_random(mcts_agent, games=100)
test_qlearning_vs_random(q_agent, games=100)


MCTS vs. Q-Learning (100 games): MCTS Wins: 62, Q-Learning Wins: 33, Draws: 5
MCTS vs. Random (100 games): MCTS Wins: 90, Random Wins: 10, Draws: 0
Q-Learning vs. Random (100 games): Q-Learning Wins: 78, Random Wins: 22, Draws: 0


In [82]:
# interface 

In [None]:
import pygame
import sys
import numpy as np
import random
from collections import defaultdict

# Initialize pygame
pygame.init()

# Board dimensions
ROW_COUNT = 6
COLUMN_COUNT = 7
SQUARESIZE = 100
width = COLUMN_COUNT * SQUARESIZE
height = (ROW_COUNT + 1) * SQUARESIZE  # Extra row for input
RADIUS = SQUARESIZE // 2 - 5

# Colors
BLUE = (0, 0, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
YELLOW = (255, 255, 0)

size = (width, height)
screen = pygame.display.set_mode(size)

class ConnectFour:
    def __init__(self):
        self.board = np.zeros((ROW_COUNT, COLUMN_COUNT), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board.fill(0)
        self.current_player = 1

    def is_valid_move(self, col):
        return self.board[0, col] == 0

    def make_move(self, col):
        for row in range(ROW_COUNT - 1, -1, -1):
            if self.board[row, col] == 0:
                self.board[row, col] = self.current_player
                self.current_player = 3 - self.current_player
                return True
        return False

    def check_winner(self):
        for r in range(ROW_COUNT):
            for c in range(COLUMN_COUNT - 3):
                if self.board[r, c] == self.board[r, c+1] == self.board[r, c+2] == self.board[r, c+3] != 0:
                    return self.board[r, c]
        for r in range(ROW_COUNT - 3):
            for c in range(COLUMN_COUNT):
                if self.board[r, c] == self.board[r+1, c] == self.board[r+2, c] == self.board[r+3, c] != 0:
                    return self.board[r, c]
        for r in range(ROW_COUNT - 3):
            for c in range(COLUMN_COUNT - 3):
                if self.board[r, c] == self.board[r+1, c+1] == self.board[r+2, c+2] == self.board[r+3, c+3] != 0:
                    return self.board[r, c]
                if self.board[r+3, c] == self.board[r+2, c+1] == self.board[r+1, c+2] == self.board[r, c+3] != 0:
                    return self.board[r+3, c]
        return 0

    def get_valid_moves(self):
        return [c for c in range(COLUMN_COUNT) if self.is_valid_move(c)]

def draw_board(board):
    screen.fill(BLACK)
    for r in range(ROW_COUNT):
        for c in range(COLUMN_COUNT):
            pygame.draw.rect(screen, BLUE, (c*SQUARESIZE, (r+1)*SQUARESIZE, SQUARESIZE, SQUARESIZE))
            color = BLACK
            if board[r, c] == 1:
                color = RED
            elif board[r, c] == 2:
                color = YELLOW
            pygame.draw.circle(screen, color, (c*SQUARESIZE + SQUARESIZE//2, (r+1)*SQUARESIZE + SQUARESIZE//2), RADIUS)
    pygame.display.update()

def play_game(ai_type):
    env = ConnectFour()
    running = True
    turn = 1  # Human starts

    draw_board(env.board)
    
    while running:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

            if event.type == pygame.MOUSEMOTION:
                pygame.draw.rect(screen, BLACK, (0, 0, width, SQUARESIZE))
                posx = event.pos[0]
                pygame.draw.circle(screen, RED if turn == 1 else YELLOW, (posx, SQUARESIZE//2), RADIUS)
                pygame.display.update()

            if event.type == pygame.MOUSEBUTTONDOWN:
                if turn == 1:  # Human move
                    col = event.pos[0] // SQUARESIZE
                    if env.is_valid_move(col):
                        env.make_move(col)
                        draw_board(env.board)
                        if env.check_winner():
                            print(f"Player {turn} wins!")
                            running = False
                        turn = 2

        if turn == 2 and running:  # AI move
            pygame.time.delay(500)
            valid_moves = env.get_valid_moves()
            if ai_type == "random":
                col = random.choice(valid_moves)
            elif ai_type == "mcts":
                col = random.choice(valid_moves)  # Placeholder for MCTS 
            elif ai_type == "qlearning":
                col = random.choice(valid_moves)  # Placeholder for Q learning 
            env.make_move(col)
            draw_board(env.board)
            if env.check_winner():
                print(f"AI ({ai_type}) wins!")
                running = False
            turn = 1

        pygame.time.delay(100)

if __name__ == "__main__":
    ai_choice = input("Choose AI opponent: (random, mcts, qlearning): ").lower()
    play_game(ai_choice)
