## Dice Game - Parameter Optimisation Using WandB
### This notebook is a reference for parameter tuning only

In [None]:
!pip install wandb

In [None]:
from dice_game import DiceGame
import numpy as np

In [None]:
from abc import ABC, abstractmethod

class DiceGameAgent(ABC):
    """Abstract base class for agents that pick actions in a dice game.

    Concrete agents must override ``play``; the base class only stores the
    game object it was constructed with.
    """

    def __init__(self, game):
        """Keep a reference to ``game`` on the instance as ``self.game``."""
        self.game = game

    @abstractmethod
    def play(self, state):
        """Return the action to take in ``state`` (implemented by subclasses)."""
        return None

def play_game_with_agent(agent, game, verbose=False):
    """Play one full game, letting ``agent`` choose every action.

    The game is reset first, then the agent is repeatedly asked for an action
    for the current dice state and that action is passed to ``game.roll``
    until the game reports that it is over.

    param agent: object exposing ``play(state)`` that returns an action
    param game: object exposing ``reset()``, ``roll(action)`` and ``score``
    param verbose: when true, print the dice and chosen action at every step

    returns: ``game.score`` once the game has finished
    """
    dice = game.reset()

    if verbose:
        print(f"Testing agent: \n\t{type(agent).__name__}")
        print(f"Starting dice: \n\t{dice}\n")

    turn = 0
    finished = False
    while not finished:
        chosen = agent.play(dice)
        turn += 1

        if verbose:
            print(f"Action {turn}: \t{chosen}")

        # roll() returns (reward, new_state, game_over); the reward is not
        # needed here because the running total is read from game.score.
        _, dice, finished = game.roll(chosen)

        if verbose and not finished:
            print(f"Dice: \t\t{dice}")

    if verbose:
        print(f"\nFinal dice: {dice}, score: {game.score}")

    return game.score

In [None]:
class MyAgent(DiceGameAgent):
    """Dice-game agent whose policy is precomputed with value iteration."""

    def __init__(self, game, theta, gamma):
        """
        Build the agent and immediately solve for its policy.

        Value iteration is run during construction, so ``self.V`` is ready
        before the first call to ``play``.

        param/attribute game: the dice game object
        param/attribute theta: the convergence threshold
        param/attribute gamma: the discount rate

        attribute num_actions: the number of entries in ``game.actions``
        attribute V: the state:[expected_reward, optimal policy] dictionary
        """
        super().__init__(game)
        self.gamma = gamma
        self.theta = theta
        self.num_actions = len(self.game.actions)
        self.V = self.value_iteration()

    def one_step_look_ahead(self, V, state):
        """
        Compute the expected reward of every action available from ``state``.

        For each action the game is asked for the possible next states and
        their probabilities. Each outcome contributes its probability times
        the reward, plus (when the game is not over) the discounted value
        currently stored in ``V`` for the next state.

        param V: the (converged or unconverged) dictionary of
                 states: [expected reward, optimal policy]
        param state: the current state of the game

        returns: a dictionary mapping each action to its expected reward
        """
        action_values = dict.fromkeys(self.game.actions, 0)

        for action in action_values:
            # One game_over flag and one reward cover every outcome of the action
            next_states, game_over, reward, probabilities = self.game.get_next_states(action, state)

            expected = 0
            for next_state, probability in zip(next_states, probabilities):
                if game_over:
                    # Terminal outcome: there is no successor value to add
                    expected += probability * reward
                else:
                    # Bellman update using the current estimate of the successor
                    expected += probability * (reward + self.gamma * V[next_state][0])

            action_values[action] = expected

        return action_values

    def value_iteration(self):
        """
        Run value iteration over every state of the game.

        States are swept repeatedly, and each state's entry is overwritten as
        soon as it is recomputed, so states later in a sweep already see the
        updated values. Sweeping stops once the largest change seen in a sweep
        is below ``self.theta``.

        returns: the dictionary of states: [expected maximum reward, optimal policy]
        """
        # Every state starts with value 0 and no chosen action
        values = {s: [0, None] for s in self.game.states}

        converged = False
        while not converged:
            largest_change = 0

            for s in self.game.states:
                action_values = self.one_step_look_ahead(values, s)

                best_value = max(action_values.values())
                best_action = max(action_values, key=action_values.get)

                # Track the biggest absolute change in value over this sweep
                change = np.abs(best_value - values[s][0])
                if change > largest_change:
                    largest_change = change

                values[s] = [best_value, best_action]

            converged = largest_change < self.theta

        return values

    def play(self, state):
        """
        Look up the precomputed action for ``state``.

        param state: the current state of the game

        returns: the action stored for that state in ``self.V``
        """
        _, best_action = self.V[state]
        return best_action

In [None]:
# Import and log in to wandb
import wandb
!wandb login

In [None]:
# Sweep configuration: the search method, the metric to optimise, and the
# hyperparameter values to iterate over.
# - "method": "grid" tries every combination of the listed values.
# - "metric": the name must match the key logged by train() ("Avg Score").
#   The goal must be spelled "maximize" or "minimize"; the British spelling
#   "maximise" is not a value W&B recognises.
sweep_config = {
    "method": "grid",
    "metric": {"name": "Avg Score", "goal": "maximize"},
    "parameters": {
        # Discount rates to try
        "gamma": {
            "values": [0.7, 0.8, 0.9, 0.99, 1],
        },
        # Convergence thresholds to try
        "theta": {
            "values": [
                0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10,
                2e-2, 2e-3, 2e-4, 2e-5, 2e-6, 2e-7, 2e-8, 2e-9, 2e-10,
            ],
        },
    },
}

In [None]:
import time

def train():
    """
    Evaluate one hyperparameter combination for the wandb sweep.

    Starts a wandb run, builds a MyAgent from the run's ``theta`` and
    ``gamma``, plays a fixed number of games with it, then logs the average
    score and the average CPU time per game. The time spent constructing the
    agent is included in the total before averaging.
    """
    wandb.init()
    config = wandb.config
    np.random.seed(10)  # fixed seed for NumPy's global random state
    num_games = 10000

    game = DiceGame()
    score_sum = 0
    cpu_seconds = 0

    # Time the agent construction, which runs value iteration
    tick = time.process_time()
    test_agent = MyAgent(game, config.theta, config.gamma)
    cpu_seconds += time.process_time() - tick

    for _ in range(num_games):
        # Only the game itself is timed, not the score bookkeeping
        tick = time.process_time()
        game_score = play_game_with_agent(test_agent, game, verbose=False)
        cpu_seconds += time.process_time() - tick
        score_sum += game_score

    wandb.log({
        "Theta": config.theta,
        "Gamma": config.gamma,
        "Avg Score": score_sum / num_games,
        "Avg Time": cpu_seconds / num_games,
    })
    print(f"Average score: {score_sum/num_games}")

In [None]:
# A sweep is the name of the iterative parameter selection process.
# Register the sweep described by sweep_config under the 'DiceGame' project
# and keep the returned id so the agent below can be pointed at it.
sweep_id = wandb.sweep(sweep_config, project='DiceGame')

# Run the sweep: the agent invokes train() for the runs of this sweep
wandb.agent(sweep_id, function=train)