# useful packages

In [None]:
# Install necessary libraries
!pip install numpy
!pip install stable-baselines3 gym
!pip install torch torchvision
!nvidia-smi

In [None]:
import copy
import pickle
import random
from collections import defaultdict, deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from gym import spaces
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

# Game logic

In [None]:
class LiarDiceGame:
    """Two-player Liar's Dice: hidden dice, escalating bids, challenges and scores.

    Players are keyed 1 and 2. A bid is a ``(quantity, face_value)`` tuple. The
    game starts with the placeholder bid ``(1, 1)`` and resets the bid to
    ``(0, 0)`` after every challenge. A face value of 1 is wild when a challenge
    is resolved against any other face.
    """

    DICE_PER_PLAYER = 5    # dice each player starts with
    MAX_BID_QUANTITY = 10  # largest quantity a bid may name (all dice in play)

    def __init__(self):
        self.dice_count = {1: self.DICE_PER_PLAYER, 2: self.DICE_PER_PLAYER}
        # Create both hands first so roll_dice() has keys to iterate over.
        self.players = {1: [], 2: []}
        self.roll_dice()
        self.current_bid = (1, 1)  # Minimum bid to start each round
        self.current_player = 1
        self.last_action_was_challenge = False
        self.scores = {1: 0, 2: 0}
        self.player_names = {1: 'Player 1', 2: 'Player 2'}  # default player names

    def set_player_names(self, player1_name, player2_name):
        """Replace the display names used in challenge result messages."""
        self.player_names[1] = player1_name
        self.player_names[2] = player2_name

    def roll_dice(self):
        """Re-roll every player's hand using their current dice count."""
        for player in self.players:
            self.players[player] = [random.randint(1, 6) for _ in range(self.dice_count[player])]

    def reveal_dice(self):
        """Return the dict mapping each player to their list of dice (not a copy)."""
        return self.players

    def make_bid(self, player, quantity, face_value):
        """Try to place a bid and pass the turn.

        Args:
            player: Accepted for interface symmetry; the bid is not checked
                against ``current_player``.
            quantity: Number of dice claimed, 1..10.
            face_value: Face claimed, 1..6.

        Returns:
            True if the bid was accepted (the bid is stored and the turn passes),
            False if it was rejected (no state changes).
        """
        if face_value not in range(1, 7):
            return False

        if quantity > self.MAX_BID_QUANTITY or quantity < 1:
            return False

        # The opening placeholder (1, 1) can be followed by any in-range bid.
        # Every other bid must be beaten: more dice, or same quantity and a higher face.
        if self.current_bid != (1, 1):
            previous_quantity, previous_face = self.current_bid
            if quantity < previous_quantity or (quantity == previous_quantity and face_value <= previous_face):
                return False

        self.current_bid = (quantity, face_value)
        self.last_action_was_challenge = False
        self.switch_player()
        return True

    def adjust_scores(self, winner, loser):
        """Give the winner 100 points and take 100 from the loser, floored at 0."""
        self.scores[winner] += 100
        self.scores[loser] = max(self.scores[loser] - 100, 0)

    def challenge(self, challenger):
        """Resolve a challenge against the current bid.

        All dice are counted; ones also count toward the bid face unless the bid
        face itself is 1. If the count reaches the bid quantity the challenge
        fails and the challenger loses a die, otherwise the player who becomes
        current after the turn switch loses a die. Scores are adjusted, the bid
        is reset to ``(0, 0)`` and all dice are re-rolled.

        Returns:
            A human-readable result string that starts with "Challenge failed."
            or "Challenge successful." and lists both hands as they were when
            the challenge was made.
        """
        players_dice = self.reveal_dice()
        bid_quantity, bid_face = self.current_bid
        if bid_face == 1:
            total_quantity = sum(dice.count(1) for dice in players_dice.values())
        else:
            total_quantity = sum(dice.count(bid_face) + dice.count(1) for dice in players_dice.values())

        # Capture the hands for the message before the re-roll replaces them.
        dice_faces = {player: " ".join(str(die) for die in dice) for player, dice in players_dice.items()}

        self.switch_player()
        if total_quantity >= bid_quantity:
            result = f"Challenge failed. Total dice count is {total_quantity}. {self.player_names[challenger]} loses a dice and 100 points. {self.player_names[self.current_player]} wins 100 points.\n" \
                     f"{self.player_names[1]}'s dice: {dice_faces[1]}\n{self.player_names[2]}'s dice: {dice_faces[2]}"
            winner, loser = self.current_player, challenger
        else:
            result = f"Challenge successful. Total dice count is {total_quantity}. {self.player_names[self.current_player]} loses a dice and 100 points. {self.player_names[challenger]} wins 100 points.\n" \
                     f"{self.player_names[1]}'s dice: {dice_faces[1]}\n{self.player_names[2]}'s dice: {dice_faces[2]}"
            winner, loser = challenger, self.current_player

        # The loser's hand shrinks via dice_count; roll_dice() below rebuilds the hands.
        self.dice_count[loser] -= 1
        self.adjust_scores(winner, loser)

        self.last_action_was_challenge = True
        self.current_bid = (0, 0)
        self.roll_dice()
        return result

    def switch_player(self):
        """Pass the turn to the other player."""
        self.current_player = 1 if self.current_player == 2 else 2

    def is_game_over(self):
        """Return True once any player has no dice left."""
        return any(count == 0 for count in self.dice_count.values())

    def get_winner(self):
        """Return 2 if player 1 is out of dice, 1 if player 2 is, otherwise None."""
        if self.dice_count[1] == 0:
            return 2
        elif self.dice_count[2] == 0:
            return 1
        return None

    def random_bid(self):
        """Return a random ``(quantity, face_value)`` whose quantity exceeds the current
        bid's quantity, capped at the maximum bid quantity."""
        total_dice = self.MAX_BID_QUANTITY
        min_quantity = min(self.current_bid[0] + 1, total_dice)
        quantity = random.randint(min_quantity, total_dice)
        face_value = random.randint(1, 6)
        return quantity, face_value

    def get_dice_counts(self):
        """Return the dice counts as a list, in player order."""
        return list(self.dice_count.values())

    def get_game_state(self):
        """Return a dict snapshot of the game; contained dicts are the live objects."""
        return {
            "dice_count": self.get_dice_counts(),  # Use the method to get a list
            "players": self.players,
            "current_bid": self.current_bid,
            "current_player": self.current_player,
            "last_action_was_challenge": self.last_action_was_challenge,
            "player_names": self.player_names,
            "scores": self.scores,
        }

# env

In [None]:
class LiarDiceEnv(gym.Env):
    """Gym wrapper around ``LiarDiceGame``.

    An action is either a ``(action_type, quantity, face_value)`` tuple or a
    single integer index in ``0..119`` that is decoded into such a tuple.
    ``action_type`` 0 places a bid, 1 challenges the current bid. Rewards are
    computed from the point of view of the player who took the action.
    """

    def __init__(self):
        super(LiarDiceEnv, self).__init__()
        self.game = LiarDiceGame()

        self.action_space = spaces.Tuple((
            spaces.Discrete(2),  # 0: make_bid, 1: challenge
            spaces.Discrete(11),  # quantity: 1-10
            spaces.Discrete(6)   # face_value: 1-6
        ))

        self.observation_space = spaces.Dict({
            "dice_count": spaces.Box(low=0, high=5, shape=(2,), dtype=np.int32),
            "current_bid": spaces.Box(low=0, high=10, shape=(2,), dtype=np.int32),
            "scores": spaces.Box(low=0, high=np.inf, shape=(2,), dtype=np.int32),
            "current_player": spaces.Discrete(2),
            "players_dice": spaces.Box(low=0, high=6, shape=(2, 5), dtype=np.int32)  # Padded players' dice values
        })

    def reset(self):
        """Start a fresh game and return its first observation."""
        self.game = LiarDiceGame()
        return self._get_obs()

    def _get_obs(self):
        """Build the observation dict from the current game state."""
        state = self.game.get_game_state()
        players_dice_padded = [self.pad_dice(self.game.players[1]), self.pad_dice(self.game.players[2])]
        return {
            "dice_count": np.array(state['dice_count']),
            "current_bid": np.array(state['current_bid']),
            "scores": np.array([state['scores'][1], state['scores'][2]]),
            "current_player": state['current_player'] - 1,  # adjust to 0-indexed
            "players_dice": np.array(players_dice_padded)  # Padded players' dice values
        }

    def pad_dice(self, dice, max_length=5):
        """Right-pad a hand with zeros so every hand has ``max_length`` entries."""
        return dice + [0] * (max_length - len(dice))

    def step(self, action):
        """Apply one action for the player whose turn it is.

        Args:
            action: ``(action_type, quantity, face_value)`` or an integer index.
                An index ``a`` decodes to type ``a // 60``, quantity
                ``(a % 60) // 6 + 1`` (1..10) and face ``a % 6 + 1`` (1..6).

        Returns:
            ``(observation, reward, done, info)``. An invalid bid ends the episode
            with reward -10. A valid bid earns 1. A challenge earns +100 or -100,
            and when the game ends the acting player gets a further +100 for
            winning or -100 for losing.

        Raises:
            ValueError: if ``action_type`` is neither 0 nor 1.
        """
        if isinstance(action, (int, np.integer)):
            # np.argmax and friends return numpy integers, so accept those too.
            index = int(action)
            action_type = index // 60
            quantity = (index % 60) // 6 + 1  # +1 so the decoded quantity is 1..10, never 0
            face_value = index % 6 + 1
        else:
            action_type, quantity, face_value = action

        # Remember who is acting: the game switches current_player during both
        # bids and challenges, so it cannot be read back afterwards.
        acting_player = self.game.current_player

        if action_type == 0:  # make_bid
            if not self.game.make_bid(acting_player, quantity, face_value):
                # Penalize invalid bids and end the episode
                return self._get_obs(), -10, True, {}
            reward = 1  # Small reward for valid bids
        elif action_type == 1:  # challenge
            result = self.game.challenge(acting_player)
            reward = 100 if "successful" in result else -100
        else:
            raise ValueError(f"Unknown action type: {action_type!r} (expected 0 or 1)")

        done = self.game.is_game_over()
        if done:
            if self.game.get_winner() == acting_player:
                reward += 100  # Large reward for winning
            else:
                reward -= 100  # Large penalty for losing

        return self._get_obs(), reward, done, {}

    def random_bid(self):
        """Return a random bid action ``(0, quantity, face_value)`` drawn by the game."""
        quantity, face_value = self.game.random_bid()
        return (0, quantity, face_value)

    def render(self, mode='human'):
        """Print the raw game state dict."""
        state = self.game.get_game_state()
        print(state)

    def close(self):
        """Nothing to release."""
        pass


# models

## Q-Learning

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in a ``defaultdict`` keyed by a hashable state; unseen states
    start as a zero vector of length ``action_size``.
    """

    def __init__(self, state_size, action_size, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size, self.action_size = state_size, action_size
        self.q_table = defaultdict(lambda: np.zeros(action_size))
        self.alpha, self.gamma = alpha, gamma
        self.epsilon, self.epsilon_decay, self.epsilon_min = epsilon, epsilon_decay, epsilon_min

    def preprocess_state(self, state):
        """Flatten an observation dict into a tuple usable as a Q-table key."""
        pieces = [
            state["dice_count"],
            state["current_bid"],
            state["scores"],
            [state["current_player"]],
            state["players_dice"].flatten(),
        ]
        flat = np.concatenate(pieces)
        return tuple(flat)

    def get_action_space(self):
        """List every bid ``(0, quantity, face)`` followed by the challenge ``(1, 0, 0)``."""
        actions = []
        for quantity in range(1, 11):
            for face_value in range(1, 7):
                actions.append((0, quantity, face_value))
        actions.append((1, 0, 0))  # Challenge action
        return actions

    def get_action(self, state):
        """Pick a random action index with probability epsilon, else the greedy one."""
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update toward the greedy value of ``next_state``."""
        next_values = self.q_table[next_state]
        greedy_next = np.argmax(next_values)
        target = reward + self.gamma * next_values[greedy_next]
        current_values = self.q_table[state]
        error = target - current_values[action]
        current_values[action] += self.alpha * error

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor while it is above the minimum."""
        if self.epsilon <= self.epsilon_min:
            return
        self.epsilon *= self.epsilon_decay

    def reward_shaping(self, reward, action, done):
        """Amplify challenge outcomes and end-of-episode outcomes by sign of reward."""
        is_challenge = action // 60 == 1
        if is_challenge:
            # +50 for a successful challenge, -50 for a failed one
            reward += 50 if reward > 0 else -50
        if done:
            # +100 for winning, -100 for losing
            reward += 100 if reward > 0 else -100
        return reward

    def save(self, filename):
        """Pickle the Q-table as a plain dict."""
        plain_table = dict(self.q_table)
        with open(filename, 'wb') as f:
            pickle.dump(plain_table, f)

    def load(self, filename):
        """Replace the Q-table with one unpickled from ``filename``."""
        with open(filename, 'rb') as f:
            stored = pickle.load(f)
            self.q_table = defaultdict(lambda: np.zeros(self.action_size), stored)


## Deep Q Network - DQN

In [66]:
class DQNetwork(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out."""

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Two ReLU hidden layers followed by a linear output layer."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class DQNAgent:
    """Epsilon-greedy DQN agent with a replay memory and a target network."""

    def __init__(self, state_size, action_size, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, batch_size=64, memory_size=10000):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = DQNetwork(state_size, action_size).to(self.device)
        self.target_model = DQNetwork(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.update_target_model()

    def _to_batch(self, state):
        """Convert one state vector into a float32 tensor of shape (1, n) on the agent's device."""
        return torch.as_tensor(np.asarray(state), dtype=torch.float32, device=self.device).unsqueeze(0)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return a random action index with probability epsilon, else the greedy one."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Inference only: no autograd graph is needed to pick an action.
        with torch.no_grad():
            act_values = self.model(self._to_batch(state))
        return torch.argmax(act_values, dim=1).item()

    def replay(self):
        """Train on a random minibatch, one gradient step per sampled transition.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        Epsilon is decayed once per call while it is above ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            q_values = self.model(self._to_batch(state))

            # The regression target is a constant: the current predictions with
            # only the taken action's entry replaced by the bootstrapped return.
            with torch.no_grad():
                target_value = float(reward)
                if not done:
                    target_value += self.gamma * self.target_model(self._to_batch(next_state)).max().item()
                target_q = q_values.detach().clone()
                target_q[0, action] = target_value

            loss = F.mse_loss(q_values, target_q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, filename):
        """Pickle hyperparameters, replay memory, and model/optimizer state to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump({
                'state_size': self.state_size,
                'action_size': self.action_size,
                'gamma': self.gamma,
                'epsilon': self.epsilon,
                'epsilon_decay': self.epsilon_decay,
                'epsilon_min': self.epsilon_min,
                'batch_size': self.batch_size,
                'memory_size': self.memory.maxlen,
                'memory': list(self.memory),
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, f)

    def load(self, filename):
        """Restore an agent saved with ``save``.

        Checkpoints written before ``memory_size`` was stored keep this agent's
        configured replay capacity.
        """
        # NOTE: pickle.load can execute arbitrary code; only load checkpoints you trust.
        with open(filename, 'rb') as f:
            checkpoint = pickle.load(f)
        self.state_size = checkpoint['state_size']
        self.action_size = checkpoint['action_size']
        self.gamma = checkpoint['gamma']
        self.epsilon = checkpoint['epsilon']
        self.epsilon_decay = checkpoint['epsilon_decay']
        self.epsilon_min = checkpoint['epsilon_min']
        self.batch_size = checkpoint['batch_size']
        memory_size = checkpoint.get('memory_size', self.memory.maxlen)
        self.memory = deque(checkpoint['memory'], maxlen=memory_size)
        self.model = DQNetwork(self.state_size, self.action_size).to(self.device)
        self.target_model = DQNetwork(self.state_size, self.action_size).to(self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer = optim.Adam(self.model.parameters())
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.update_target_model()

## Bayesian model

In [84]:
class BayesianAgent:
    """Heuristic Liar's Dice agent that combines observed dice with past bids.

    Count vectors have ``num_faces + 1`` entries so a die value indexes them
    directly; index 0 collects zero-valued (padding) entries.
    """

    def __init__(self, num_dice, num_faces=6, bluff_probability=0.1):
        self.num_dice = num_dice
        self.num_faces = num_faces
        self.bluff_probability = bluff_probability
        self.opponent_bids = []

    def update_beliefs(self, observed_dice):
        """Return a per-face count vector for ``observed_dice`` (no state is changed)."""
        counts = np.zeros(self.num_faces + 1)
        for die in observed_dice:
            counts[die] += 1
        return counts

    def make_bid(self, current_bid, observed_dice):
        """Choose the next bid as a ``(quantity, face_value)`` pair.

        With probability ``bluff_probability`` a bluff bid is returned; otherwise
        the candidate with the highest ``calculate_expected_value`` is chosen.
        Candidates always raise the quantity above ``current_bid[0]``.
        """
        counts = self.update_beliefs(observed_dice)
        estimated_opponent_counts = self.estimate_opponent_dice()
        total_estimated_counts = counts + estimated_opponent_counts

        possible_actions = [(q, f) for q in range(current_bid[0] + 1, min(self.num_dice * 2 + 1, 10 + 1)) for f in range(1, self.num_faces + 1)]

        if not possible_actions:
            print("Error: No possible actions available.")
            possible_actions = [(current_bid[0] + 1, 1)]

        if self.should_bluff():
            best_action = self.make_bluff_bid(current_bid)
        else:
            best_action = None
            max_expected_value = -np.inf
            for action in possible_actions:
                expected_value = self.calculate_expected_value(action, total_estimated_counts)
                if expected_value > max_expected_value:
                    max_expected_value = expected_value
                    best_action = action

            if best_action is None:
                best_action = random.choice(possible_actions)

        print(f"Making bid: Current bid {current_bid}, New bid {best_action}, Observed counts {counts}, Estimated opponent counts {estimated_opponent_counts}")
        return best_action

    def calculate_expected_value(self, action, counts):
        """Score a bid as quantity times the share of ``counts`` on the bid's face."""
        quantity, face_value = action
        total_count = sum(counts)
        if total_count == 0:
            return 0
        probability = counts[face_value] / total_count
        return quantity * probability

    def should_bluff(self):
        """Return True with probability ``bluff_probability``."""
        return np.random.rand() < self.bluff_probability

    def make_bluff_bid(self, current_bid):
        """Raise the quantity by two (capped at 10) and move to the next face, wrapping."""
        return (min(current_bid[0] + 2, 10), (current_bid[1] % self.num_faces) + 1)

    def should_challenge(self, current_bid, observed_dice):
        """Decide whether to challenge ``current_bid``.

        The bid is first recorded in ``opponent_bids``. The estimate of dice that
        satisfy the bid is the observed plus estimated count on the bid's face,
        plus the count of ones when the bid face is not 1 (ones are wild when the
        game resolves a challenge). Challenge when that estimate is below the
        bid quantity.
        """
        counts = self.update_beliefs(observed_dice)

        self.update_opponent_bids(current_bid)
        estimated_opponent_counts = self.estimate_opponent_dice()
        total_estimated_counts = counts + estimated_opponent_counts

        # Only dice that can satisfy the bid are relevant, not the total of all faces.
        bid_quantity, bid_face = current_bid[0], current_bid[1]
        total_count_with_estimates = total_estimated_counts[bid_face]
        if bid_face != 1:
            total_count_with_estimates += total_estimated_counts[1]

        challenge = total_count_with_estimates < bid_quantity
        print(f"Challenge decision: Current bid {current_bid}, Observed counts {counts}, Estimated opponent counts {estimated_opponent_counts}, Total count with estimates {total_count_with_estimates}, Challenge {challenge}")
        return challenge

    def update_opponent_bids(self, bid):
        """Record a bid in the history used by ``estimate_opponent_dice``."""
        self.opponent_bids.append(bid)

    def estimate_opponent_dice(self):
        """Return per-face counts averaged over every recorded bid (zeros if none)."""
        estimated_counts = np.zeros(self.num_faces + 1)
        if not self.opponent_bids:
            return estimated_counts
        history_length = len(self.opponent_bids)
        for bid in self.opponent_bids:
            estimated_counts[bid[1]] += bid[0] / history_length
        return estimated_counts

    def save(self, filename):
        """Pickle the agent's configuration and bid history to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump({
                'num_dice': self.num_dice,
                'num_faces': self.num_faces,
                'bluff_probability': self.bluff_probability,
                'opponent_bids': self.opponent_bids,
            }, f)

    def load(self, filename):
        """Restore an agent saved with ``save``.

        Files written without ``bluff_probability`` leave the current value unchanged.
        """
        # NOTE: pickle.load can execute arbitrary code; only load files you trust.
        with open(filename, 'rb') as f:
            checkpoint = pickle.load(f)
        self.num_dice = checkpoint['num_dice']
        self.num_faces = checkpoint['num_faces']
        self.bluff_probability = checkpoint.get('bluff_probability', self.bluff_probability)
        self.opponent_bids = checkpoint['opponent_bids']

## SARSA model

In [None]:
class SARSAAgent:
    """Tabular SARSA agent with an epsilon-greedy policy.

    Unseen states start with a zero vector of ``action_size`` Q-values.
    """

    def __init__(self, state_size, action_size, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size, self.action_size = state_size, action_size
        self.alpha, self.gamma = alpha, gamma
        self.epsilon, self.epsilon_decay, self.epsilon_min = epsilon, epsilon_decay, epsilon_min
        self.q_table = defaultdict(lambda: np.zeros(action_size))

    def get_action(self, state):
        """Return a random action index with probability epsilon, else the greedy one."""
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, next_action):
        """Apply one on-policy update using the action actually chosen next."""
        bootstrap = self.q_table[next_state][next_action]
        target = reward + self.gamma * bootstrap
        row = self.q_table[state]
        error = target - row[action]
        row[action] += self.alpha * error

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor while it is above the minimum."""
        if self.epsilon <= self.epsilon_min:
            return
        self.epsilon *= self.epsilon_decay

    def save(self, filename):
        """Pickle the Q-table as a plain dict."""
        plain_table = dict(self.q_table)
        with open(filename, 'wb') as f:
            pickle.dump(plain_table, f)

    def load(self, filename):
        """Replace the Q-table with one unpickled from ``filename``."""
        with open(filename, 'rb') as f:
            stored = pickle.load(f)
            self.q_table = defaultdict(lambda: np.zeros(self.action_size), stored)


## MCTS model

In [68]:
class MCTSAgent:
    """Monte Carlo Tree Search planner that searches on copies of the environment.

    Every tree node owns its own deep copy of the environment and rollouts run
    on a further copy, so the environment passed to ``select_action`` is never
    stepped by the search.
    """

    def __init__(self, num_simulations=500):
        self.num_simulations = num_simulations

    def select_action(self, state, env):
        """Run ``num_simulations`` simulations from ``state`` and return the best action index.

        Raises:
            ValueError: if no action could be expanded from the root.
        """
        # Snapshot the live environment; all planning happens on copies of it.
        root = Node(state, None, None, copy.deepcopy(env))
        for _ in range(self.num_simulations):
            leaf = self.traverse(root, root.env)
            reward = self.rollout(leaf.state, leaf.env)
            self.backpropagate(leaf, reward)
        return self.best_action(root)

    def traverse(self, node, env):
        """Descend by UCT through fully expanded nodes, expanding the first one that is not."""
        while not node.is_terminal():
            if node.is_fully_expanded():
                node = self.best_child(node)
            else:
                return self.expand(node, env)
        return node

    def expand(self, node, env):
        """Create a child of ``node`` for one untried action.

        The action is applied to a copy of ``node.env``; ``env`` is accepted for
        interface compatibility and is not stepped.
        """
        action = node.untried_actions.pop()
        # NOTE(review): this decoding yields quantities 0..9. It is kept as is so that
        # the index returned by select_action means the same thing to callers that
        # decode it with the same formula.
        action_type, quantity, face_value = action // 60, (action % 60) // 6, action % 6 + 1
        child_env = copy.deepcopy(node.env)
        next_state, _, done, _ = child_env.step((action_type, quantity, face_value))
        child_node = Node(next_state, node, action, child_env)
        node.children.append(child_node)
        return child_node

    def rollout(self, state, env):
        """Play random bids on a copy of ``env`` and return the summed reward.

        Stops when the copied environment reports ``done`` or after 50 steps.
        """
        sim_env = copy.deepcopy(env)  # never step the caller's environment
        current_state = state
        done = False
        total_reward = 0
        steps = 0  # Limit the depth of the rollout to avoid long rollouts

        while not done and steps < 50:  # Limit rollout depth to 50 steps
            action = self.rollout_policy(sim_env, current_state)
            next_state, reward, done, _ = sim_env.step(action)
            current_state = next_state
            total_reward += reward
            steps += 1

        return total_reward

    def rollout_policy(self, env, state):
        """Return a uniformly random bid ``(0, quantity, face_value)``, quantity 1..10, face 1..6."""
        # This can be replaced with a more sophisticated policy
        action_space = [(0, quantity, face_value) for quantity in range(1, 11) for face_value in range(1, 7)]
        return random.choice(action_space)

    def backpropagate(self, node, reward):
        """Add one visit and ``reward`` to ``node`` and each of its ancestors."""
        while node is not None:
            node.visits += 1
            node.reward += reward
            node = node.parent

    def best_action(self, node):
        """Return the action of the child with the highest mean reward.

        Raises:
            ValueError: if ``node`` has no children.
        """
        if not node.children:
            raise ValueError("MCTS produced no candidate actions (root has no children)")
        best_child = max(node.children, key=lambda child: child.reward / child.visits)
        return best_child.action

    def best_child(self, node):
        """Select a child by UCT (Upper Confidence Bound for Trees)."""
        C = 1.41  # Exploration-exploitation balance constant
        return max(node.children, key=lambda child: child.reward / child.visits + C * np.sqrt(np.log(node.visits) / child.visits))

    def save(self, filename):
        """Pickle this agent to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def load(self, filename):
        """Unpickle an agent, copy its attributes onto this one, and return the loaded agent."""
        # NOTE: pickle.load can execute arbitrary code; only load files you trust.
        with open(filename, 'rb') as f:
            loaded = pickle.load(f)
        self.__dict__.update(loaded.__dict__)
        return loaded

class Node:
    """One node of the search tree: a state, its statistics and the actions not yet tried."""

    def __init__(self, state, parent, action, env):
        self.state, self.parent, self.action, self.env = state, parent, action, env
        self.children = []
        self.visits = 0
        self.reward = 0
        # 2 action types x 10 quantities x 6 face values = 120 action indices
        self.untried_actions = [index for index in range(120)]

    def is_terminal(self):
        """Report whether the game held by this node's environment is over."""
        game = self.env.game
        return game.is_game_over()

    def is_fully_expanded(self):
        """True once every action index has been popped from ``untried_actions``."""
        return not self.untried_actions

# Training

## Q-learning

In [None]:
def train_q_learning_agent(env, agent, episodes=1000):
    """Train a tabular Q-learning agent and return the shaped reward of each episode.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        agent: Agent exposing ``preprocess_state``, ``get_action_space``,
            ``get_action``, ``reward_shaping``, ``update_q_table``,
            ``decay_epsilon``, ``save`` and an ``epsilon`` attribute.
        episodes: Number of episodes to play.

    Returns:
        List with the summed shaped reward of every episode.

    A checkpoint is written every 50 episodes and once more at the end.
    """
    rewards = []
    # The index-to-action table does not change during training; build it once.
    action_space = agent.get_action_space()
    for e in range(episodes):
        state = env.reset()
        state = agent.preprocess_state(state)
        done = False
        episode_reward = 0
        step = 0
        while not done:
            action_index = agent.get_action(state)
            action = action_space[action_index]
            next_state, reward, done, _ = env.step(action)
            next_state = agent.preprocess_state(next_state)
            shaped_reward = agent.reward_shaping(reward, action_index, done)
            agent.update_q_table(state, action_index, shaped_reward, next_state)
            state = next_state
            episode_reward += shaped_reward
            step += 1

            # Detailed logging for diagnosis
            if e % 1000 == 0:
                print(f"Episode: {e+1}, Step: {step}, Action: {action}, Reward: {reward}, Shaped Reward: {shaped_reward}, Next State: {next_state}")

        agent.decay_epsilon()
        rewards.append(episode_reward)
        if e % 50 == 0:
            agent.save(f'q_learning_agent_{e}.pkl')
        print(f"Episode: {e+1}/{episodes}, Steps: {step}, Reward: {episode_reward}, Epsilon: {agent.epsilon}")
    agent.save('q_learning_agent_final.pkl')
    return rewards

# Build the environment and the tabular Q-learning agent
env = LiarDiceEnv()
# dice_count(2) + current_bid(2) + scores(2) + current_player(1) + players_dice(10)
state_size = 17
# 10 quantities x 6 face values of bid actions, plus the single challenge action
action_size = 10 * 6 + 1
q_learning_agent = QLearningAgent(state_size, action_size)

# Run training and keep the per-episode rewards
q_learning_rewards = train_q_learning_agent(env, q_learning_agent, episodes=10000)

# Reward curve over training
plt.plot(q_learning_rewards)
plt.title('Q-Learning Agent Training Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()


## DQN

In [None]:
def _flatten_observation(observation):
    """Concatenate an observation dict into one flat vector for the network."""
    return np.concatenate([
        observation["dice_count"],
        observation["current_bid"],
        observation["scores"],
        [observation["current_player"]],
        observation["players_dice"].flatten(),
    ])


def train_dqn_agent(env, agent, episodes=1000):
    """Train a DQN agent and return the total reward of each episode.

    Args:
        env: Environment exposing ``reset()`` and ``step(action_tuple)``.
        agent: Agent exposing ``act``, ``remember``, ``replay``,
            ``update_target_model``, ``save`` and an ``epsilon`` attribute.
        episodes: Number of episodes to play.

    Returns:
        List with the summed reward of every episode.

    An action index ``a`` in 0..119 is decoded as type ``a // 60``, quantity
    ``(a % 60) // 6 + 1`` and face ``a % 6 + 1``. ``replay`` runs once per
    episode and a checkpoint is written every 50 episodes and at the end.
    """
    rewards = []
    for e in range(episodes):
        state = _flatten_observation(env.reset())
        done = False
        episode_reward = 0
        while not done:
            action = agent.act(state)
            # +1 on the quantity: a bid of 0 dice is never valid, and 10 must be reachable.
            action_type, quantity, face_value = action // 60, (action % 60) // 6 + 1, action % 6 + 1
            next_observation, reward, done, _ = env.step((action_type, quantity, face_value))
            next_state = _flatten_observation(next_observation)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                agent.update_target_model()
                print(f"Episode: {e+1}/{episodes}, Reward: {episode_reward}, Epsilon: {agent.epsilon}")
        agent.replay()
        rewards.append(episode_reward)
        if e % 50 == 0:
            agent.save(f'dqn_agent_{e}.pkl')
    agent.save('dqn_agent_final.pkl')
    return rewards

# Build the environment and the DQN agent
env = LiarDiceEnv()
# dice_count(2) + current_bid(2) + scores(2) + current_player(1) + players_dice(10)
state_size = 17
# 2 action types x 10 quantities x 6 face values
action_size = 120
dqn_agent = DQNAgent(state_size, action_size)

# Run training and keep the per-episode rewards
dqn_rewards = train_dqn_agent(env, dqn_agent, episodes=10000)

# Reward curve over training
plt.plot(dqn_rewards)
plt.title('DQN Agent Training Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()

## Bayesian

In [None]:
def train_bayesian_agent(env, agent, episodes=1000):
    """Play the Bayesian agent (player index 0) against random bids (player index 1).

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and ``random_bid()``.
        agent: Agent exposing ``should_challenge``, ``make_bid`` and ``save``.
        episodes: Number of episodes to play.

    Returns:
        List with the reward the agent collected on its own turns in each episode.

    The loop does not record bids itself: ``BayesianAgent.should_challenge``
    records the bid it is shown, so each bid the agent faces is logged once and
    the agent's own bids are never logged as opponent bids. A checkpoint is
    written every 50 episodes and once more at the end.
    """
    rewards = []
    for e in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        step = 0

        while not done:
            current_player = state["current_player"]
            agent_to_move = current_player == 0

            if agent_to_move:
                opponent = 1
                # NOTE(review): the agent is shown the opponent's row of players_dice,
                # not its own — confirm this is intended.
                observed_dice = state["players_dice"][opponent]
                current_bid = state["current_bid"]
                if agent.should_challenge(current_bid, observed_dice):
                    action = (1, 0, 0)  # Challenge
                else:
                    quantity, face_value = agent.make_bid(current_bid, observed_dice)
                    action = (0, quantity, face_value)
            else:
                action = env.random_bid()

            next_state, reward, done, _ = env.step(action)

            # Only the agent's own moves count toward its episode reward.
            if agent_to_move:
                episode_reward += reward

            state = next_state
            step += 1

            if e % 1000 == 0:
                print(f"Episode: {e+1}, Step: {step}, Action: {action}, Reward: {reward}, Next State: {next_state}")

        rewards.append(episode_reward)
        if e % 50 == 0:
            agent.save(f'bayesian_agent_{e}.pkl')
        print(f"Episode: {e+1}/{episodes}, Steps: {step}, Reward: {episode_reward}")

    agent.save('bayesian_agent_final.pkl')
    return rewards

# Build the environment and a Bayesian agent that starts with five dice
env = LiarDiceEnv()
bayesian_agent = BayesianAgent(num_dice=5)

# Run training and keep the per-episode rewards
bayesian_rewards = train_bayesian_agent(env, bayesian_agent, episodes=10000)

# Reward curve over training
plt.plot(bayesian_rewards)
plt.title('Bayesian Agent Training Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()

## SARSA

In [None]:
def _sarsa_state_key(observation):
    """Build the hashable Q-table key: dice counts, current bid, scores and player to move."""
    return (
        tuple(observation["dice_count"])
        + tuple(observation["current_bid"])
        + tuple(observation["scores"])
        + (observation["current_player"],)
    )


def train_sarsa_agent(env, agent, episodes=1000):
    """Train a tabular SARSA agent and return the total reward of each episode.

    Args:
        env: Environment exposing ``reset()`` and ``step(action_tuple)``.
        agent: Agent exposing ``get_action``, ``update_q_table``,
            ``decay_epsilon``, ``save`` and an ``epsilon`` attribute.
        episodes: Number of episodes to play.

    Returns:
        List with the summed reward of every episode.

    An action index ``a`` in 0..119 is decoded as type ``a // 60``, quantity
    ``(a % 60) // 6 + 1`` and face ``a % 6 + 1``. The reward of the final step
    is clipped to +100 / -100 by sign. A checkpoint is written every 50 episodes
    and once more at the end.
    """
    rewards = []
    for e in range(episodes):
        state = _sarsa_state_key(env.reset())
        done = False
        episode_reward = 0
        action = agent.get_action(state)

        while not done:
            # +1 on the quantity: a bid of 0 dice is never valid, and 10 must be reachable.
            action_type, quantity, face_value = action // 60, (action % 60) // 6 + 1, action % 6 + 1
            next_observation, reward, done, _ = env.step((action_type, quantity, face_value))
            next_state = _sarsa_state_key(next_observation)

            if done:
                if reward > 0:
                    reward = 100  # Winning the game
                elif reward < 0:
                    reward = -100  # Losing the game

            next_action = agent.get_action(next_state)
            agent.update_q_table(state, action, reward, next_state, next_action)
            episode_reward += reward

            state = next_state
            action = next_action

        agent.decay_epsilon()
        rewards.append(episode_reward)
        if e % 50 == 0:
            agent.save(f'simple_sarsa_agent_{e}.pkl')
        print(f"Episode: {e+1}/{episodes}, Total Reward: {episode_reward}, Epsilon: {agent.epsilon}")
    agent.save('simple_sarsa_agent_final.pkl')
    return rewards

# Build the SARSA agent; state_size, action_size and env are the values set by earlier cells
sarsa_agent = SARSAAgent(
    state_size,
    action_size,
    alpha=0.1,
    gamma=0.99,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.01,
)

# Run training and keep the per-episode rewards
sarsa_rewards = train_sarsa_agent(env, sarsa_agent, episodes=10000)

# Reward curve over training
plt.plot(sarsa_rewards)
plt.title('SARSA Agent Training Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()

## MCTS

In [None]:
def train_mcts_agent(env, agent, episodes=10000):
    """Play ``episodes`` games with the MCTS agent and return the reward totals.

    For every step the agent picks a flat action index via
    ``agent.select_action(state, env)``; the index is decoded into
    ``(action_type, quantity, face_value)`` and passed to ``env.step``.
    After all episodes, summary statistics are printed and the agent is
    saved to ``mcts_agent.pkl``.

    Args:
        env: Object exposing ``reset()`` and ``step(action)``.
        agent: Object exposing ``select_action(state, env)`` and ``save(path)``.
        episodes: Number of episodes to play.

    Returns:
        list: Total reward of each episode.
    """
    rewards = []

    for e in range(episodes):
        state, done, episode_reward = env.reset(), False, 0

        while not done:
            action = agent.select_action(state, env)
            # Flat index -> (action_type, quantity, face_value).
            action_type, within_type = divmod(action, 60)
            quantity, face_offset = divmod(within_type, 6)
            state, step_reward, done, _ = env.step((action_type, quantity, face_offset + 1))
            episode_reward += step_reward

        rewards.append(episode_reward)
        print(f"Episode: {e+1}/{episodes}, Reward: {episode_reward}")

    # Compute every statistic before printing any of them.
    average_reward, std_reward, min_reward, max_reward = (
        np.mean(rewards),
        np.std(rewards),
        np.min(rewards),
        np.max(rewards),
    )

    print(
        f"Average Reward over {episodes} episodes: {average_reward}",
        f"Standard Deviation of Reward: {std_reward}",
        f"Minimum Reward: {min_reward}",
        f"Maximum Reward: {max_reward}",
        sep="\n",
    )

    agent.save('mcts_agent.pkl')

    return rewards

# Create the MCTS agent (num_simulations=500) and run it for 10,000 episodes.
mcts_agent = MCTSAgent(num_simulations=500)

print("Training MCTS Agent")
mcts_rewards = train_mcts_agent(env, mcts_agent, episodes=10000)

# Raw (unsmoothed) reward curve.
plt.plot(mcts_rewards)
plt.title('MCTS Agent Training Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.show()

## Statistics

In [None]:
def plot_training_statistics(rewards, title):
    """Plot one agent's raw per-episode rewards on a new 12x6 figure and show it.

    Note: a four-argument function with the same name is defined further
    down in this notebook; once that cell has run it replaces this one.

    Args:
        rewards: Sequence of per-episode rewards.
        title: Title placed above the plot.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(rewards, label='Rewards')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_title(title)
    ax.legend()
    plt.show()

# One raw reward curve per agent, in the order the agents were trained above.
plot_training_statistics(rewards=q_learning_rewards, title="Q-Learning Agent Training Rewards")
plot_training_statistics(rewards=dqn_rewards, title="DQN Agent Training Rewards")
plot_training_statistics(rewards=bayesian_rewards, title="Bayesian Agent Training Rewards")
plot_training_statistics(rewards=sarsa_rewards, title="SARSA Agent Training Rewards")
plot_training_statistics(rewards=mcts_rewards, title="MCTS Agent Training Rewards")

In [None]:
def plot_rolling_average(rewards, window_size=50, title='Reward Trend'):
    """Plot the rolling mean of ``rewards`` on a new 12x6 figure and show it.

    Args:
        rewards: Sequence of per-episode rewards.
        window_size: Number of consecutive episodes averaged per point.
        title: Title placed above the plot.
    """
    reward_series = pd.Series(rewards)
    smoothed = reward_series.rolling(window=window_size).mean()

    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(smoothed)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_title(title)
    ax.grid(True)
    plt.show()

# Smoothed reward trend (default 50-episode window) for each agent.
plot_rolling_average(rewards=q_learning_rewards, title='Q-Learning Agent Reward Trend')
plot_rolling_average(rewards=dqn_rewards, title='DQN Agent Reward Trend')
plot_rolling_average(rewards=bayesian_rewards, title='Bayesian Agent Reward Trend')
plot_rolling_average(rewards=sarsa_rewards, title='SARSA Agent Reward Trend')
plot_rolling_average(rewards=mcts_rewards, title='MCTS Agent Reward Trend')

# Testing

In [91]:
class HeuristicAgent:
    """Fixed-rule opponent: always raises minimally, challenges large bids.

    Attributes are ``num_dice`` (dice per player) and ``num_faces`` (faces
    per die, 6 by default).
    """

    def __init__(self, num_dice, num_faces=6):
        self.num_dice, self.num_faces = num_dice, num_faces

    def make_bid(self, current_bid):
        """Return the next bid after ``current_bid`` as ``(quantity, face_value)``.

        The face value is raised by one while it is below ``num_faces``;
        otherwise the quantity is raised by one and the face resets to 1.
        """
        quantity, face_value = current_bid
        can_raise_face = face_value < self.num_faces
        return (quantity, face_value + 1) if can_raise_face else (quantity + 1, 1)

    def should_challenge(self, current_bid, observed_dice):
        """Return True when the bid quantity exceeds half of all dice in play.

        ``observed_dice`` is accepted for interface compatibility but is not
        used by this rule.
        """
        half_of_all_dice = (self.num_dice * 2) // 2
        return current_bid[0] > half_of_all_dice


In [92]:
def preprocess_state(state):
    """Flatten an observation dict into a single 1-D NumPy array.

    The fields are concatenated in this order: ``dice_count``,
    ``current_bid``, ``scores``, ``current_player`` (wrapped in a list) and
    the flattened ``players_dice`` array.
    """
    leading_fields = [
        state["dice_count"],
        state["current_bid"],
        state["scores"],
        [state["current_player"]],
    ]
    flattened_dice = state["players_dice"].flatten()
    return np.concatenate(leading_fields + [flattened_dice])


def get_action_space():
    """Return the list of env actions: 60 bids followed by one challenge.

    Bids are ``(0, quantity, face_value)`` for quantity 1..10 and face value
    1..6 (face value varies fastest); the last entry is the challenge
    ``(1, 0, 0)``.
    """
    actions = []
    for quantity in range(1, 11):
        for face_value in range(1, 7):
            actions.append((0, quantity, face_value))
    actions.append((1, 0, 0))  # Challenge action
    return actions


def _choose_evaluated_agent_action(agent, state, env, action_space):
    """Return the evaluated agent's move as an ``(action_type, quantity, face_value)`` tuple.

    Dispatches on the interface the agent exposes so that agents driven
    through ``get_action`` or ``select_action`` elsewhere in this notebook can
    be evaluated too, not only those that provide ``act``.

    Raises:
        ValueError: If the agent exposes none of the supported methods.
    """
    if isinstance(agent, BayesianAgent):
        opponent = 1 if state["current_player"] == 0 else 0
        observed_dice = state["players_dice"][opponent]
        current_bid = state["current_bid"]
        if agent.should_challenge(current_bid, observed_dice):
            return (1, 0, 0)  # Challenge
        quantity, face_value = agent.make_bid(current_bid, observed_dice)
        return (0, quantity, face_value)

    # Agents with ``act`` keep the original behaviour exactly.
    if hasattr(agent, "act"):
        return action_space[agent.act(preprocess_state(state))]

    # Agents that pick a flat index via ``get_action``.
    if hasattr(agent, "get_action"):
        return action_space[agent.get_action(preprocess_state(state))]

    if hasattr(agent, "select_action"):
        # The training loops in this notebook call select_action with the raw
        # observation dict, so the same is done here.
        # NOTE(review): the return type of select_action is not visible here;
        # a tuple is used as-is, anything else is treated as an index into
        # ``action_space`` — confirm against the agent implementation.
        chosen = agent.select_action(state, env)
        if isinstance(chosen, (tuple, list)):
            return tuple(chosen)
        return action_space[chosen]

    raise ValueError("Unsupported agent type")


def test_against_heuristic(env, agent, heuristic_agent, episodes=100):
    """Play ``agent`` (player 0) against ``heuristic_agent`` and report average rewards.

    On each step the reward returned by ``env.step`` is credited to the player
    who acted. Averages over all episodes are printed and returned.

    Args:
        env: Environment whose observations are dicts with at least
            ``current_player``, ``players_dice`` and ``current_bid``.
        agent: Agent under evaluation; a ``BayesianAgent`` or any object
            exposing ``act``, ``get_action`` or ``select_action``.
        heuristic_agent: Opponent exposing ``should_challenge`` and ``make_bid``.
        episodes: Number of games to play.

    Returns:
        tuple: ``(avg_reward, avg_heuristic_reward)``.

    Raises:
        ValueError: If ``agent`` exposes none of the supported methods.
    """
    # The action table never changes, so build it once instead of every step.
    action_space = get_action_space()
    rewards = []
    heuristic_rewards = []

    for e in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        heuristic_episode_reward = 0

        while not done:
            current_player = state["current_player"]

            if current_player == 0:
                action = _choose_evaluated_agent_action(agent, state, env, action_space)
            else:
                opponent = 1 if current_player == 0 else 0
                observed_dice = state["players_dice"][opponent]
                current_bid = state["current_bid"]
                if heuristic_agent.should_challenge(current_bid, observed_dice):
                    action = (1, 0, 0)  # Challenge
                else:
                    quantity, face_value = heuristic_agent.make_bid(current_bid)
                    action = (0, quantity, face_value)

            next_state, reward, done, _ = env.step(action)

            # Credit the reward to whoever just moved.
            if current_player == 0:
                episode_reward += reward
            else:
                heuristic_episode_reward += reward

            state = next_state

        rewards.append(episode_reward)
        heuristic_rewards.append(heuristic_episode_reward)

    avg_reward = np.mean(rewards)
    avg_heuristic_reward = np.mean(heuristic_rewards)

    print(f"Average Reward for Agent over {episodes} episodes: {avg_reward}")
    print(f"Average Reward for Heuristic Agent over {episodes} episodes: {avg_heuristic_reward}")

    return avg_reward, avg_heuristic_reward

In [None]:
# Initialize environment and agents
env = LiarDiceEnv()
# Length of the vector produced by preprocess_state: 2 + 2 + 2 + 1 plus the
# flattened players_dice.
# NOTE(review): 17 assumes players_dice flattens to 10 values — confirm against LiarDiceEnv.
state_size = 17

# Print the action space size
action_space = get_action_space()
action_size = len(action_space)
print(f"Action Size: {action_size}")

# Initialize agents
# NOTE(review): the training loops above decode actions with `action // 60`,
# which implies a different action-space size than len(get_action_space()) —
# confirm the saved agents match this action_size.
q_learning_agent = QLearningAgent(state_size, action_size)
dqn_agent = DQNAgent(state_size, action_size)
bayesian_agent = BayesianAgent(num_dice=5)
sarsa_agent = SARSAAgent(state_size, action_size)
mcts_agent = MCTSAgent(num_simulations=1000)

# Load trained agents
# NOTE(review): the .pkl extension suggests pickle files; only load files you trust.
q_learning_agent.load('q_learning_agent_final.pkl')
dqn_agent.load('dqn_agent_final.pkl')
bayesian_agent.load('bayesian_agent_final.pkl')
sarsa_agent.load('simple_sarsa_agent_final.pkl')
mcts_agent.load('mcts_agent.pkl')

# Initialize heuristic agent
heuristic_agent = HeuristicAgent(num_dice=5)

### Test agents against heuristic

In [None]:
# 100 evaluation games: Q-Learning agent versus the heuristic opponent.
print("Testing Q-Learning Agent against Heuristic")
q_learning_avg_reward, q_learning_heuristic_avg_reward = test_against_heuristic(
    env=env,
    agent=q_learning_agent,
    heuristic_agent=heuristic_agent,
    episodes=100,
)


In [None]:
# 100 evaluation games: DQN agent versus the heuristic opponent.
print("Testing DQN Agent against Heuristic")
dqn_avg_reward, dqn_heuristic_avg_reward = test_against_heuristic(
    env=env,
    agent=dqn_agent,
    heuristic_agent=heuristic_agent,
    episodes=100,
)


In [None]:
# 100 evaluation games: Bayesian agent versus the heuristic opponent.
print("Testing Bayesian Agent against Heuristic")
bayesian_avg_reward, bayesian_heuristic_avg_reward = test_against_heuristic(
    env=env,
    agent=bayesian_agent,
    heuristic_agent=heuristic_agent,
    episodes=100,
)


In [None]:
# 100 evaluation games: SARSA agent versus the heuristic opponent.
print("Testing SARSA Agent against Heuristic")
sarsa_avg_reward, sarsa_heuristic_avg_reward = test_against_heuristic(
    env=env,
    agent=sarsa_agent,
    heuristic_agent=heuristic_agent,
    episodes=100,
)


In [None]:
# 100 evaluation games: MCTS agent versus the heuristic opponent.
print("Testing MCTS Agent against Heuristic")
mcts_avg_reward, mcts_heuristic_avg_reward = test_against_heuristic(
    env=env,
    agent=mcts_agent,
    heuristic_agent=heuristic_agent,
    episodes=100,
)

## Testing 2

In [None]:
def preprocess_state(state):
    """Turn an observation dict into one flat NumPy vector.

    Order of the concatenated pieces: ``dice_count``, ``current_bid``,
    ``scores``, ``[current_player]``, then ``players_dice`` flattened.
    This repeats the definition given earlier in the notebook.
    """
    pieces = [state[key] for key in ("dice_count", "current_bid", "scores")]
    pieces.append([state["current_player"]])
    pieces.append(state["players_dice"].flatten())
    return np.concatenate(pieces)

def get_action_space():
    """Build the action table: every bid, then the single challenge action.

    Bids are ``(0, quantity, face_value)`` with quantity 1..10 (outer) and
    face value 1..6 (inner). This repeats the definition given earlier in the
    notebook.
    """
    challenge = (1, 0, 0)  # Challenge action
    bids = [
        (0, quantity, face_value)
        for quantity in range(1, 11)
        for face_value in range(1, 7)
    ]
    return bids + [challenge]

def test_agent(env, agent, episodes=1000, render=False):
    """Run ``agent`` in ``env`` for ``episodes`` games and return the reward totals.

    The agent picks every move. The raw observation dict is kept next to its
    flattened form because the Bayesian and MCTS branches need the dict while
    the DQN / tabular branches take the flattened vector.

    Args:
        env: Environment returning observation dicts.
        agent: A ``DQNAgent``, ``QLearningAgent``, ``SARSAAgent``,
            ``BayesianAgent`` or ``MCTSAgent`` instance.
        episodes: Number of episodes to play.
        render: When True, ``env.render()`` is called after every step.

    Returns:
        list: Total reward of each episode.

    Raises:
        ValueError: If ``agent`` is of an unsupported type.
    """
    rewards = []
    action_space = get_action_space()
    action_size = len(action_space)
    print(f"Testing agent with action size: {action_size}")

    for e in range(episodes):
        raw_state = env.reset()
        state = preprocess_state(raw_state)
        done = False
        episode_reward = 0
        while not done:
            # NOTE(review): the `continue` branches below retry without
            # advancing the env; an agent that keeps returning the same invalid
            # index would loop forever.
            if isinstance(agent, DQNAgent):
                action_index = agent.act(state)
                if action_index >= action_size or action_index < 0: # Skip invalid action index
                    print(f"Invalid action index {action_index} for DQNAgent with action size {action_size}")
                    continue
            elif isinstance(agent, QLearningAgent) or isinstance(agent, SARSAAgent):
                action_index = agent.get_action(state)
                if action_index >= action_size or action_index < 0: # Skip invalid action index
                    print(f"Invalid action index {action_index} for agent with action size {action_size}")
                    continue
            elif isinstance(agent, BayesianAgent):
                # Use the current observation; resetting the env here would
                # restart the game on every step.
                opponent = 1 if raw_state["current_player"] == 0 else 0
                observed_dice = raw_state["players_dice"][opponent]
                current_bid = raw_state["current_bid"]
                if agent.should_challenge(current_bid, observed_dice):
                    action_index = len(action_space) - 1  # Challenge action
                else:
                    quantity, face_value = agent.make_bid(current_bid, observed_dice)
                    action_index = action_space.index((0, quantity, face_value))
            elif isinstance(agent, MCTSAgent):
                # The training loops pass the raw observation dict to
                # select_action, so the same is done here.
                # NOTE(review): select_action's return type is not visible
                # here; a tuple is looked up in the action table, anything else
                # is treated as an index into it — confirm against MCTSAgent.
                action = agent.select_action(raw_state, env)
                if isinstance(action, (tuple, list)):
                    action_index = action_space.index(tuple(action))
                else:
                    action_index = action
                    if action_index >= action_size or action_index < 0: # Skip invalid action index
                        print(f"Invalid action index {action_index} for agent with action size {action_size}")
                        continue
            else:
                raise ValueError("Unsupported agent type")

            action = action_space[action_index]
            raw_state, reward, done, _ = env.step(action)
            state = preprocess_state(raw_state)
            episode_reward += reward
            if render:
                env.render()
        rewards.append(episode_reward)
    return rewards

# Initialize environment and agents
env = LiarDiceEnv()
# Length of the vector produced by preprocess_state: 2 + 2 + 2 + 1 plus the
# flattened players_dice.
# NOTE(review): 17 assumes players_dice flattens to 10 values — confirm against LiarDiceEnv.
state_size = 17

# Print the action space size
action_space = get_action_space()
action_size = len(action_space)
print(f"Action Size: {action_size}")

# Initialize agents
# These rebind the agent variables used earlier in the notebook to fresh
# instances before the saved parameters are loaded.
q_learning_agent = QLearningAgent(state_size, action_size)
dqn_agent = DQNAgent(state_size, action_size)
bayesian_agent = BayesianAgent(num_dice=5)
sarsa_agent = SARSAAgent(state_size, action_size)
mcts_agent = MCTSAgent(num_simulations=1000)

# Load trained agents
# NOTE(review): the .pkl extension suggests pickle files; only load files you trust.
q_learning_agent.load('q_learning_agent_final.pkl')
dqn_agent.load('dqn_agent_final.pkl')
bayesian_agent.load('bayesian_agent_final.pkl')
sarsa_agent.load('simple_sarsa_agent_final.pkl')
mcts_agent.load('mcts_agent.pkl')

In [None]:
# Test agents: 100 episodes with the Q-Learning agent, no rendering.
print("Testing Q-Learning Agent")
q_learning_test_rewards = test_agent(
    env=env,
    agent=q_learning_agent,
    episodes=100,
    render=False,
)

In [None]:
# 100 test episodes with the DQN agent, no rendering.
print("Testing DQN Agent")
dqn_test_rewards = test_agent(
    env=env,
    agent=dqn_agent,
    episodes=100,
    render=False,
)

In [None]:
# 100 test episodes with the Bayesian agent, no rendering.
print("Testing Bayesian Agent")
bayesian_test_rewards = test_agent(
    env=env,
    agent=bayesian_agent,
    episodes=100,
    render=False,
)

In [None]:
# 100 test episodes with the SARSA agent, no rendering.
print("Testing SARSA Agent")
sarsa_test_rewards = test_agent(
    env=env,
    agent=sarsa_agent,
    episodes=100,
    render=False,
)

In [None]:
# 100 test episodes with the MCTS agent, no rendering.
print("Testing MCTS Agent")
mcts_test_rewards = test_agent(
    env=env,
    agent=mcts_agent,
    episodes=100,
    render=False,
)

In [None]:
# Summarise the test rewards of every agent in one table
# (mean, standard deviation, minimum and maximum per agent).
_test_runs = [
    q_learning_test_rewards,
    dqn_test_rewards,
    bayesian_test_rewards,
    sarsa_test_rewards,
    mcts_test_rewards,
]
data = {
    "Agent": ["Q-Learning", "DQN", "Bayesian", "SARSA", "MCTS"],
    "Average Reward": [np.mean(run) for run in _test_runs],
    "Std Dev Reward": [np.std(run) for run in _test_runs],
    "Min Reward": [np.min(run) for run in _test_runs],
    "Max Reward": [np.max(run) for run in _test_runs],
}
del _test_runs  # temporary helper only; keep the notebook namespace unchanged

df = pd.DataFrame(data)
print("\nAgent Performance Summary:")
print(df)


In [None]:
# Box plot of per-episode rewards, one box per agent.
# NOTE(review): this uses the *_rewards series from the training cells, not the
# *_test_rewards summarised in `df` — confirm that is the intended data.
plt.figure(figsize=(14, 8))
sns.boxplot(
    data=[q_learning_rewards, dqn_rewards, bayesian_rewards, sarsa_rewards, mcts_rewards],
    palette="Set2",
)
plt.xticks(
    ticks=range(5),
    labels=["Q-Learning", "DQN", "Bayesian", "SARSA", "MCTS"],
)
plt.title("Reward Distribution for Each Agent")
plt.xlabel("Agent")
plt.ylabel("Rewards")
plt.grid(True)
plt.show()

# Mean test reward per agent with +/- one standard deviation as error bars,
# both taken from the summary table `df`.
agents = df["Agent"]
mean_rewards = df["Average Reward"]
std_rewards = df["Std Dev Reward"]

plt.figure(figsize=(14, 8))
plt.errorbar(
    agents,
    mean_rewards,
    yerr=std_rewards,
    fmt='o',
    capsize=5,
    capthick=2,
    ecolor='gray',
)
plt.title("Average Rewards with Standard Deviation for Each Agent")
plt.xlabel("Agent")
plt.ylabel("Average Reward")
plt.grid(True)
plt.show()

# Plotting the minimum and maximum rewards for each agent
# (grouped bars: one min/max pair per agent, taken from the summary table `df`).
x = np.arange(len(df["Agent"]))
width = 0.35  # width of each bar; the pair is centred on the tick

# Create the figure and its axes in a single call. A separate plt.figure()
# before plt.subplots() would leave an empty extra figure and draw the bars
# on a default-sized one.
fig, ax = plt.subplots(figsize=(14, 8))
rects1 = ax.bar(x - width/2, df["Min Reward"], width, label='Min Reward')
rects2 = ax.bar(x + width/2, df["Max Reward"], width, label='Max Reward')

ax.set_xlabel('Agent')
ax.set_ylabel('Reward')
ax.set_title('Minimum and Maximum Rewards for Each Agent')
ax.set_xticks(x)
ax.set_xticklabels(df["Agent"])
ax.legend()

ax.grid(True)
plt.show()

# Cumulative Reward Plot: running sum of the *_rewards series of each agent.
plt.figure(figsize=(14, 8))
plt.plot(np.cumsum(q_learning_rewards), label="Q-Learning")
plt.plot(np.cumsum(dqn_rewards), label="DQN")
plt.plot(np.cumsum(bayesian_rewards), label="Bayesian")
plt.plot(np.cumsum(sarsa_rewards), label="SARSA")
plt.plot(np.cumsum(mcts_rewards), label="MCTS")
plt.title("Cumulative Reward Over Episodes for Each Agent")
plt.xlabel("Episodes")
plt.ylabel("Cumulative Reward")
plt.grid(True)
plt.legend()
plt.show()

# Reward Trends Plot (Rolling Average): all agents on one figure, each series
# smoothed with the same window.
window_size = 50
plt.figure(figsize=(14, 8))
plt.plot(pd.Series(q_learning_rewards).rolling(window=window_size).mean(), label="Q-Learning")
plt.plot(pd.Series(dqn_rewards).rolling(window=window_size).mean(), label="DQN")
plt.plot(pd.Series(bayesian_rewards).rolling(window=window_size).mean(), label="Bayesian")
plt.plot(pd.Series(sarsa_rewards).rolling(window=window_size).mean(), label="SARSA")
plt.plot(pd.Series(mcts_rewards).rolling(window=window_size).mean(), label="MCTS")
plt.title(f"Reward Trends Over Episodes (Window Size = {window_size})")
plt.xlabel("Episodes")
plt.ylabel("Reward (Rolling Average)")
plt.grid(True)
plt.legend()
plt.show()

# All Agents

In [None]:
def switch_agent(agent1, agent2, current_player):
    """Return the agent that moves for ``current_player``.

    ``agent2`` is returned when ``current_player`` equals 1; any other value
    selects ``agent1``.
    """
    if current_player == 1:
        return agent2
    return agent1

def train_agents_against_each_other(env, agent1, agent2, episodes=1000):
    """Let two agents play each other and collect each one's reward per episode.

    ``agent1`` moves when ``current_player`` is 0 and ``agent2`` otherwise.
    The reward returned by ``env.step`` is credited to the player who made
    the move. Learning hooks are called per agent type: DQN agents store the
    transition with ``remember``; Q-learning and SARSA agents call
    ``update_q_table``; Bayesian and MCTS agents only act. DQN target models
    are refreshed every 50 episodes.

    Args:
        env: Environment returning observation dicts.
        agent1: Agent controlling player 0.
        agent2: Agent controlling the other player.
        episodes: Number of games to play.

    Returns:
        tuple: ``(rewards_agent1, rewards_agent2)``, two lists with one
        total per episode.

    Raises:
        ValueError: If the agent to move is of an unsupported type.
    """

    def as_tuple(obs):
        # Hashable state used by the tabular (Q-learning / SARSA) agents.
        return (
            tuple(obs["dice_count"])
            + tuple(obs["current_bid"])
            + tuple(obs["scores"])
            + (obs["current_player"],)
        )

    def as_array(obs):
        # Same fields as as_tuple, as a NumPy vector for the DQN agent.
        return np.concatenate([obs["dice_count"], obs["current_bid"], obs["scores"], [obs["current_player"]]])

    def decode(action):
        # Flat index -> (action_type, quantity, face_value).
        return action // 60, (action % 60) // 6, action % 6 + 1

    rewards_agent1 = []
    rewards_agent2 = []

    for e in range(episodes):
        state = env.reset()
        current_agent = agent1 if state["current_player"] == 0 else agent2
        done = False
        episode_reward_agent1 = 0
        episode_reward_agent2 = 0

        while not done:
            # Remember who is moving before the env advances; after the step
            # ``current_player`` refers to the next state's player.
            acting_player = state["current_player"]

            if isinstance(current_agent, DQNAgent):
                state_np = as_array(state)
                action = current_agent.act(state_np)
                next_state, reward, done, _ = env.step(decode(action))
                current_agent.remember(state_np, action, reward, as_array(next_state), done)
            elif isinstance(current_agent, QLearningAgent):
                state_key = as_tuple(state)
                action = current_agent.get_action(state_key)
                next_state, reward, done, _ = env.step(decode(action))
                current_agent.update_q_table(state_key, action, reward, as_tuple(next_state))
            elif isinstance(current_agent, SARSAAgent):
                state_key = as_tuple(state)
                action = current_agent.get_action(state_key)
                next_state, reward, done, _ = env.step(decode(action))
                next_key = as_tuple(next_state)
                # The follow-up action is only used for the update; it is not
                # carried over to the next turn.
                next_action = current_agent.get_action(next_key)
                current_agent.update_q_table(state_key, action, reward, next_key, next_action)
            elif isinstance(current_agent, BayesianAgent):
                observed_dice = state["players_dice"][1] if state["current_player"] == 0 else state["players_dice"][0]
                current_bid = state["current_bid"]
                if current_agent.should_challenge(current_bid, observed_dice):
                    action = (1, 0, 0)  # Challenge
                else:
                    quantity, face_value = current_agent.make_bid(current_bid, observed_dice)
                    action = (0, quantity, face_value)
                next_state, reward, done, _ = env.step(action)
            elif isinstance(current_agent, MCTSAgent):
                action = current_agent.select_action(state, env)
                next_state, reward, done, _ = env.step(decode(action))
            else:
                raise ValueError("Unsupported agent type")

            state = next_state

            # Credit the reward to the agent that made the move.
            if acting_player == 0:
                episode_reward_agent1 += reward
            else:
                episode_reward_agent2 += reward

            current_agent = switch_agent(agent1, agent2, state["current_player"])

        rewards_agent1.append(episode_reward_agent1)
        rewards_agent2.append(episode_reward_agent2)

        # Update target model for DQN agents periodically
        if isinstance(agent1, DQNAgent) and e % 50 == 0:
            agent1.update_target_model()
        if isinstance(agent2, DQNAgent) and e % 50 == 0:
            agent2.update_target_model()

        print(f"Episode: {e+1}/{episodes}, Reward Agent 1: {episode_reward_agent1}, Reward Agent 2: {episode_reward_agent2}")

    return rewards_agent1, rewards_agent2


# Initializing the environment and agents
# env = LiarDiceEnv()
# state_size = 17  # NOTE: dice_count(2) + current_bid(2) + scores(2) + current_player(1) is only 7; 17 also counts the flattened players_dice values appended by preprocess_state
# action_size = 2 * 10 * 6  # 2 types, 10 quantities, 6 face values
# q_learning_agent = QLearningAgent(state_size, action_size)
# dqn_agent = DQNAgent(state_size, action_size)
# bayesian_agent = BayesianAgent(num_dice=5)
# sarsa_agent = SARSAAgent(state_size, action_size)
# mcts_agent = MCTSAgent(state_size, action_size)

In [None]:
# Q-Learning agent (player 0) against the DQN agent, 1000 games.
rewards_q_learning_vs_dqn = train_agents_against_each_other(
    env=env, agent1=q_learning_agent, agent2=dqn_agent, episodes=1000
)

# Q-Learning agent (player 0) against the Bayesian agent, 1000 games.
rewards_q_learning_vs_bayesian = train_agents_against_each_other(
    env=env, agent1=q_learning_agent, agent2=bayesian_agent, episodes=1000
)

In [None]:
# DQN agent (player 0) against the Bayesian agent, 1000 games.
rewards_dqn_vs_bayesian = train_agents_against_each_other(
    env=env, agent1=dqn_agent, agent2=bayesian_agent, episodes=1000
)

In [None]:
# SARSA agent (player 0) against each of the other agents, 1000 games per matchup.
rewards_sarsa_vs_dqn = train_agents_against_each_other(
    env=env, agent1=sarsa_agent, agent2=dqn_agent, episodes=1000
)
rewards_sarsa_vs_q_learning = train_agents_against_each_other(
    env=env, agent1=sarsa_agent, agent2=q_learning_agent, episodes=1000
)
rewards_sarsa_vs_bayesian = train_agents_against_each_other(
    env=env, agent1=sarsa_agent, agent2=bayesian_agent, episodes=1000
)
rewards_sarsa_vs_mcts = train_agents_against_each_other(
    env=env, agent1=sarsa_agent, agent2=mcts_agent, episodes=1000
)

In [None]:
# MCTS agent (player 0) against each of the other agents, 1000 games per matchup.
rewards_mcts_vs_dqn = train_agents_against_each_other(
    env=env, agent1=mcts_agent, agent2=dqn_agent, episodes=1000
)
rewards_mcts_vs_q_learning = train_agents_against_each_other(
    env=env, agent1=mcts_agent, agent2=q_learning_agent, episodes=1000
)
rewards_mcts_vs_bayesian = train_agents_against_each_other(
    env=env, agent1=mcts_agent, agent2=bayesian_agent, episodes=1000
)
rewards_mcts_vs_sarsa = train_agents_against_each_other(
    env=env, agent1=mcts_agent, agent2=sarsa_agent, episodes=1000
)

In [None]:
def plot_smoothed_training_statistics(rewards, title, window=50):
    """Plot the rolling mean of ``rewards`` on a new 12x6 figure and show it.

    Args:
        rewards: Sequence of per-episode rewards.
        title: Title placed above the plot.
        window: Number of consecutive episodes averaged per point.
    """
    reward_series = pd.Series(rewards)
    smoothed_rewards = reward_series.rolling(window=window).mean()

    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(smoothed_rewards, label='Smoothed Rewards')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_title(title)
    ax.legend()
    plt.show()

# Smoothed reward curves per matchup. Index 0 of each result holds the rewards
# of the first-named agent, index 1 those of its opponent.

# Q-Learning and DQN matchups
plot_smoothed_training_statistics(rewards=rewards_q_learning_vs_dqn[0], title="Q-Learning Agent vs DQN Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_q_learning_vs_dqn[1], title="DQN Agent vs Q-Learning Agent (Smoothed)")

plot_smoothed_training_statistics(rewards=rewards_q_learning_vs_bayesian[0], title="Q-Learning Agent vs Bayesian Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_q_learning_vs_bayesian[1], title="Bayesian Agent vs Q-Learning Agent (Smoothed)")

plot_smoothed_training_statistics(rewards=rewards_dqn_vs_bayesian[0], title="DQN Agent vs Bayesian Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_dqn_vs_bayesian[1], title="Bayesian Agent vs DQN Agent (Smoothed)")

# SARSA matchups
plot_smoothed_training_statistics(rewards=rewards_sarsa_vs_dqn[0], title="SARSA Agent vs DQN Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_sarsa_vs_dqn[1], title="DQN Agent vs SARSA Agent (Smoothed)")

plot_smoothed_training_statistics(rewards=rewards_sarsa_vs_q_learning[0], title="SARSA Agent vs Q-Learning Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_sarsa_vs_q_learning[1], title="Q-Learning Agent vs SARSA Agent (Smoothed)")

# MCTS matchups
plot_smoothed_training_statistics(rewards=rewards_mcts_vs_dqn[0], title="MCTS Agent vs DQN Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_mcts_vs_dqn[1], title="DQN Agent vs MCTS Agent (Smoothed)")

plot_smoothed_training_statistics(rewards=rewards_mcts_vs_q_learning[0], title="MCTS Agent vs Q-Learning Agent (Smoothed)")
plot_smoothed_training_statistics(rewards=rewards_mcts_vs_q_learning[1], title="Q-Learning Agent vs MCTS Agent (Smoothed)")


In [None]:
# Plot both agents' raw reward curves of one matchup on a shared figure.
def plot_training_statistics(rewards1, rewards2, title1, title2):
    """Plot two per-episode reward series on one 12x6 figure and show it.

    Note: this reuses the name of the two-argument ``plot_training_statistics``
    defined earlier in the notebook and replaces it once this cell has run.

    Args:
        rewards1: Reward series of the first agent.
        rewards2: Reward series of the second agent.
        title1: Legend label for ``rewards1``; also the first half of the title.
        title2: Legend label for ``rewards2``; also the second half of the title.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    for series, label in ((rewards1, title1), (rewards2, title2)):
        ax.plot(series, label=label)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_title(f'{title1} vs {title2}')
    ax.legend()
    plt.show()

# Raw reward curves for every matchup. Each result is the
# (rewards_agent1, rewards_agent2) pair returned by
# train_agents_against_each_other, unpacked into the first two arguments.
plot_training_statistics(*rewards_q_learning_vs_dqn, "Q-Learning Agent", "DQN Agent")
plot_training_statistics(*rewards_q_learning_vs_bayesian, "Q-Learning Agent", "Bayesian Agent")
plot_training_statistics(*rewards_dqn_vs_bayesian, "DQN Agent", "Bayesian Agent")
plot_training_statistics(*rewards_sarsa_vs_dqn, "SARSA Agent", "DQN Agent")
plot_training_statistics(*rewards_sarsa_vs_q_learning, "SARSA Agent", "Q-Learning Agent")
plot_training_statistics(*rewards_sarsa_vs_bayesian, "SARSA Agent", "Bayesian Agent")
plot_training_statistics(*rewards_sarsa_vs_mcts, "SARSA Agent", "MCTS Agent")
plot_training_statistics(*rewards_mcts_vs_dqn, "MCTS Agent", "DQN Agent")
plot_training_statistics(*rewards_mcts_vs_q_learning, "MCTS Agent", "Q-Learning Agent")
plot_training_statistics(*rewards_mcts_vs_bayesian, "MCTS Agent", "Bayesian Agent")
plot_training_statistics(*rewards_mcts_vs_sarsa, "MCTS Agent", "SARSA Agent")
