# Tutorial 1: Learn to play games with RL

**By Neuromatch Academy**

In [None]:
!pip3 install -U torch tqdm ipywidgets datatops vibecheck

In [None]:
#@markdown What is your Pennkey and pod? (text, not numbers, e.g. bfranklin)
# Student identifiers filled in through the form fields; all default to empty strings.
my_pennkey = '' #@param {type:"string"}
my_pod = '' #@param {type:"string"}
my_email = '' #@param {type:"string"}
# Tutorial code; the feedback widgets later in this notebook use names starting with "W12D1_".
tutorial = 'W12D1'

In [None]:
from vibecheck import DatatopsContentReviewContainer
from datatops import Datatops

# Names and user keys of the two Datatops projects used by this notebook.
# NOTE(review): these keys are hard-coded in the notebook source; confirm that
# they are intended to be public before sharing.
feedback_dtid = "62a48t3w"
feedback_name = "cis522_feedback"
quiz_dtid = "lxx8szk1"
quiz_name = "cis522_quiz"
dt_url = "https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab/"

# Instantiate the Datatops client
dt = Datatops(dt_url)
# Handle to the quiz project. The feedback_* values are not used here; they are
# passed to the DatatopsContentReviewContainer widgets further down.
quizdt = dt.get_project(quiz_name, user_key=quiz_dtid)


---
# Tutorial Objectives

In this tutorial, you will learn how to implement a game loop and improve the performance of a random player. 

The specific objectives for this tutorial:
*   Understand the format of two-player games
*   Learn about value networks and policy networks

In the Bonus sections you will learn about Monte Carlo Tree Search (MCTS) and compare its performance to policy-based and value-based players.

---
# Setup

In [None]:
# @title Install dependencies
!pip install coloredlogs --quiet

!pip install git+https://github.com/NeuromatchAcademy/evaltools --quiet

In [None]:
# Imports
import os
import math
import time
import torch
import random
import logging
import coloredlogs

import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm.notebook import tqdm
from pickle import Unpickler

log = logging.getLogger(__name__)
coloredlogs.install(level="INFO")  # Change this to DEBUG to see more info.


In [None]:
# @title Set random seed

# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL its critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random
import torch


def set_seed(seed=None, seed_torch=True):
    """
    Seed Python's `random`, NumPy and (optionally) PyTorch so runs are repeatable.

    The `random`, `numpy` (as `np`) and `torch` modules must already be imported.

    Args:
      seed : Integer
        A non-negative integer that defines the random state. When `None`
        (the default) a seed is drawn at random from [0, 2**32).
      seed_torch : Boolean
        If `True` (the default), also seeds PyTorch (CPU and CUDA) and switches
        cuDNN to deterministic, non-benchmark mode.

    Returns:
      Nothing. The seed that was used is printed.
    """
    if seed is None:
        # np.random.choice returns a NumPy integer scalar. Convert it to a
        # built-in int: random.seed() raises TypeError for other types on
        # Python 3.11+.
        seed = int(np.random.choice(2**32))
    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        # Trade cuDNN autotuning speed for reproducible kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f"Random seed {seed} has been set.")


# In case that `DataLoader` is used
def seed_worker(worker_id):
    """
    Seed NumPy and Python's `random` from torch's current initial seed.

    Intended for use as a `DataLoader` `worker_init_fn`; see
    https://pytorch.org/docs/stable/data.html#data-loading-randomness

    Args:
      worker_id: integer
        ID of the subprocess being seeded. It is accepted for signature
        compatibility and is not used in the body.

    Returns:
      Nothing
    """
    # Reduce torch's seed into the 32-bit range accepted by np.random.seed.
    derived_seed = torch.initial_seed() % (2**32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)


In [None]:
# @title Set device (GPU or CPU). Execute `set_device()`
# especially if torch modules used.

# Inform the user if the notebook uses GPU or CPU.


def set_device():
    """
    Pick the compute device and tell the user which one is in use.

    Args:
      None

    Returns:
      device: string
        "cuda" when `torch.cuda.is_available()` is true, otherwise "cpu"
    """
    if torch.cuda.is_available():
        print("GPU is enabled in this notebook.")
        return "cuda"

    # No GPU: point the user at the Colab runtime setting.
    print(
        "WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` "
    )
    return "cpu"


In [None]:
# Seed every random generator and choose the compute device once; later cells
# reuse SEED to re-seed before each experiment and DEVICE when building `args`.
SEED = 2021
set_seed(seed=SEED)
DEVICE = set_device()

In [None]:
# @title Download the modules

# @markdown Run this cell!

# Download from OSF. Original repo: https://github.com/raymondchua/nma_rl_games.git

import os, io, sys, shutil, zipfile
from urllib.request import urlopen

# download from github repo directly
#!git clone git://github.com/raymondchua/nma_rl_games.git --quiet
REPO_PATH = "nma_rl_games"
# Folder inside the extracted archive that holds the importable modules.
MODULES_PATH = f"{REPO_PATH}/alpha-zero"

# Always start from a clean copy: drop a previously extracted repo, if any.
if os.path.exists(REPO_PATH):
    download_string = "Redownloading"
    shutil.rmtree(REPO_PATH)
else:
    download_string = "Downloading"

zipurl = "https://osf.io/kf4p9/download"
print(f"{download_string} and unzipping the file... Please wait.")
# The whole archive is read into memory and extracted into the current
# working directory.
# NOTE(review): extractall() trusts the member paths inside the downloaded archive.
with urlopen(zipurl) as zipresp, zipfile.ZipFile(
    io.BytesIO(zipresp.read())
) as zfile:
    zfile.extractall()
print("Download completed.")

print(f"Add the {REPO_PATH} in the path and import the modules.")
# Add the repo to the import path; guarded so that re-running this cell does
# not append the same entry again.
if MODULES_PATH not in sys.path:
    sys.path.append(MODULES_PATH)

# @markdown Import modules designed for use in this notebook
import Arena

from utils import *
from Game import Game
from MCTS import MCTS
from NeuralNet import NeuralNet

from othello.OthelloPlayers import *
from othello.OthelloLogic import Board
from othello.OthelloGame import OthelloGame
from othello.pytorch.NNet import NNetWrapper as NNet


The hyperparameters used throughout the notebook.

In [None]:
# Hyperparameters shared by the whole notebook.
# `dotdict` is not defined in this notebook; presumably it comes from the
# `from utils import *` star import above and provides attribute-style access,
# which later cells rely on (e.g. `args.num_channels`, `args.device`).
args = dotdict(
    {
        "numIters": 1,  # In training, number of iterations = 1000 and num of episodes = 100
        "numEps": 1,  # Number of complete self-play games to simulate during a new iteration.
        "tempThreshold": 15,  # To control exploration and exploitation
        "updateThreshold": 0.6,  # During arena playoff, new neural net will be accepted if threshold or more of games are won.
        "maxlenOfQueue": 200,  # Number of game examples to train the neural networks.
        "numMCTSSims": 15,  # Number of games moves for MCTS to simulate.
        "arenaCompare": 10,  # Number of games to play during arena play to determine if new net will be accepted.
        "cpuct": 1,
        "maxDepth": 5,  # Maximum number of rollouts
        "numMCsims": 5,  # Number of monte carlo simulations
        "mc_topk": 3,  # Top k actions for monte carlo rollout
        "checkpoint": "./temp/",
        "load_model": False,
        "load_folder_file": ("/dev/models/8x100x50", "best.pth.tar"),
        "numItersForTrainExamplesHistory": 20,
        # Define neural network arguments
        "lr": 0.001,  # lr: Learning Rate
        "dropout": 0.3,
        "epochs": 10,
        "batch_size": 64,
        "device": DEVICE,  # Value returned by set_device() above
        "num_channels": 512,
    }
)


---
# Section 0: Introduction

In [None]:
# @title Video 0: Introduction
from ipywidgets import widgets

# Render the video inside an Output widget so it can be placed in a tab.
out1 = widgets.Output()
with out1:
    # `display` is imported explicitly instead of relying on the name that
    # IPython injects into interactive sessions.
    from IPython.display import YouTubeVideo, display

    video = YouTubeVideo(id="710MdAhq4ZY", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1])
out.set_title(0, "Youtube")


display(out)


**NOTE:** We have converted some of the original exercises to regular code blocks to lift the workload a bit this week. Some videos still reference these as exercises – just make sure you understand the code we provided, no need to actively code anything in these parts yourself :) 

---
# Section 1: Create a game/agent loop for RL

In [None]:
# @title Video 1: A game loop for RL
from ipywidgets import widgets

# Each video source is rendered into its own Output widget; both are then
# shown as tabs.
out2 = widgets.Output()
with out2:
    # `display` is imported explicitly instead of relying on the name that
    # IPython injects into interactive sessions.
    from IPython.display import IFrame, display

    class BiliVideo(IFrame):
        """IFrame pointing at the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1Wy4y1V7bt", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="aH2Hs8f6KrQ", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


### Introduction to OthelloGame

**Game Components**: 
1. A square 8x8 board (you could use a chess board)
2. 64 discs coloured black on one side and white on the opposite side.

**Setup**:
The board will start with 2 black discs and 2 white discs at the centre of the board. They are arranged with black forming a North-East to South-West direction. White is forming a North-West to South-East direction. The goal is to get the majority of colour discs on the board at the end of the game.

**Strategy**:
Two players ~ each player gets 32 discs and black always starts the game. Then the game alternates between white and black until:

    - One player can not make a valid move to outflank the opponent.
    - Both players have no valid moves.

When a player has no valid moves, they pass their turn and the opponent continues.
A player cannot voluntarily forfeit their turn.
When both players can not make a valid move the game ends.

You can play Othello online: https://www.eothello.com/ if you like!


***Goal***: Learn how to set up a game environment with multiple players for reinforcement learning experiments.

***Steps***: 

*   Build an agent that plays random moves
*   Connect the agent to the Othello game
*   Generate games including wins and losses

In [None]:
class OthelloGame(Game):
    """
    Othello on an n x n board, implementing the `Game` interface.

    Boards are passed around as NumPy arrays; each method that needs the game
    rules copies the array into a fresh `Board` and works on that copy.
    """

    # Character printed for each stored square value.
    square_content = {-1: "X", +0: "-", +1: "O"}

    @staticmethod
    def getSquarePiece(piece):
        """Return the display character for a stored square value."""
        return OthelloGame.square_content[piece]

    def __init__(self, n):
        """
        Args:
          n: Integer
            Side length of the square board
        """
        self.n = n

    def getInitBoard(self):
        """Return the initial board as a NumPy array built from `Board(n).pieces`."""
        b = Board(self.n)
        return np.array(b.pieces)

    def getBoardSize(self):
        """Return the board dimensions as an (n, n) tuple."""
        return (self.n, self.n)

    def getActionSize(self):
        """Return the number of actions: one per square plus one pass (no-op) action."""
        return self.n * self.n + 1

    def getCanonicalForm(self, board, player):
        """Return `board` if player == 1, else the sign-flipped board if player == -1."""
        return player * board

    def stringRepresentation(self, board):
        """Return the raw bytes of the board array."""
        return board.tobytes()

    def stringRepresentationReadable(self, board):
        """Return the board as one string of `square_content` characters, row by row."""
        board_s = "".join(
            self.square_content[square] for row in board for square in row
        )
        return board_s

    def getScore(self, board, player):
        """Return `Board.countDiff(player)` evaluated on a copy of `board`."""
        b = Board(self.n)
        b.pieces = np.copy(board)
        return b.countDiff(player)

    @staticmethod
    def display(board):
        """Print the board with column/row indices, using `square_content` characters."""
        n = board.shape[0]
        print("   ", end="")
        for y in range(n):
            print(y, end=" ")
        print("")
        print("-----------------------")
        for y in range(n):
            print(y, "|", end="")  # Print the row
            for x in range(n):
                piece = board[y][x]  # Get the piece to print
                print(OthelloGame.square_content[piece], end=" ")
            print("|")
        print("-----------------------")

    def getNextState(self, board, player, action):
        """
        Apply `action` for `player` and return the resulting state.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player (1 or -1)
          action: Integer
            Index of the move: `n * row + col` for a square, or `n * n` for
            the pass action. Must be a valid move.

        Returns:
          (board, player) tuple signifying next state; the player is always
          the opponent (-player). For the pass action the input board is
          returned as is (not copied).
        """
        if action == self.n * self.n:
            return (board, -player)
        b = Board(self.n)
        b.pieces = np.copy(board)
        # Integer arithmetic throughout: decode the flat index into (row, col).
        move = (int(action // self.n), int(action % self.n))
        b.execute_move(move, player)
        return (b.pieces, -player)

    def getValidMoves(self, board, player):
        """
        Compute which actions are legal for `player` on `board`.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player

        Returns:
          valids: np.ndarray
            Binary vector of length `getActionSize()`. Entry `n * x + y` is 1
            for every legal move (x, y). When there is no legal move, only
            the last entry (the pass action) is 1.
        """
        valids = [0] * self.getActionSize()
        b = Board(self.n)
        b.pieces = np.copy(board)
        legalMoves = b.get_legal_moves(player)
        if len(legalMoves) == 0:
            valids[-1] = 1
            return np.array(valids)
        for x, y in legalMoves:
            valids[self.n * x + y] = 1
        return np.array(valids)

    def getGameEnded(self, board, player):
        """
        Report whether the game is over and, if so, the outcome for `player`.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player

        Returns:
          0 while either side still has a legal move; otherwise 1 if
          `countDiff(player)` is positive and -1 if it is not. A zero
          difference therefore also yields -1.
        """
        b = Board(self.n)
        b.pieces = np.copy(board)
        if b.has_legal_moves(player):
            return 0
        if b.has_legal_moves(-player):
            return 0
        if b.countDiff(player) > 0:
            return 1
        return -1

    def getSymmetries(self, board, pi):
        """
        Get mirror/rotational configurations of a board and its policy vector

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          pi: sequence
            Policy vector of length n*n + 1; the last entry belongs to the
            pass action and is carried over unchanged.

        Returns:
          l: list
            Eight (board, pi) pairs: each of the four 90-degree rotations,
            with and without a left-right flip. Each pi is a flat list.
        """
        assert len(pi) == self.n**2 + 1  # 1 for pass
        pi_board = np.reshape(pi[:-1], (self.n, self.n))
        l = []

        for i in range(1, 5):
            for j in [True, False]:
                # Transform the board and the policy grid identically.
                newB = np.rot90(board, i)
                newPi = np.rot90(pi_board, i)
                if j:
                    newB = np.fliplr(newB)
                    newPi = np.fliplr(newPi)
                l += [(newB, list(newPi.ravel()) + [pi[-1]])]
        return l


## Section 1.1: Create a random player

In [None]:
class RandomPlayer:
    """Player that chooses uniformly at random among the valid moves."""

    def __init__(self, game):
        """
        Args:
          game: game instance
            Object providing `getValidMoves` and `getActionSize`
        """
        self.game = game

    def play(self, board):
        """
        Pick a random valid action for the given board.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          a: int
            Randomly chosen move
        """
        # Valid moves are always requested for player 1.
        legal_mask = self.game.getValidMoves(board, 1)
        # Normalise the binary mask into a uniform distribution over legal actions.
        move_probs = legal_mask / legal_mask.sum()
        n_actions = self.game.getActionSize()
        return np.random.choice(n_actions, p=move_probs)


## Section 1.2. Initiate the game board


In [None]:
# Display the board
# A 6 x 6 board is used here, smaller than the 8 x 8 board described above.
set_seed(seed=SEED)
game = OthelloGame(6)
board = game.getInitBoard()
game.display(board)


In [None]:
# Observe the game board size: an (n, n) tuple
print(f"Board size = {game.getBoardSize()}")

# Observe the action size: n * n squares plus one pass (no-op) action
print(f"Action size = {game.getActionSize()}")


## Section 1.3. Create two random agents to play against each other

In [None]:
# Define the players: both are random players. The arena receives each
# player's bound `play` method (a callable taking a board), not the object.
player1 = RandomPlayer(game).play
player2 = RandomPlayer(game).play

# Define number of games
num_games = 20

# Start the competition
set_seed(seed=SEED)
# To see the steps of the competition set "display=OthelloGame.display"
arena = Arena.Arena(
    player1, player2, game, display=None
)
# `result` is a tuple: (number of games won by player1,
# number of games won by player2, number of games won by nobody)
result = arena.playGames(
    num_games, verbose=False
)
print(f"\n\n{result}")


## Section 1.4. Compute win rate for the random player (player 1)

In [None]:
# result[0] / result[1] are the wins of player1 / player2 from arena.playGames.
print(
    f"Number of games won by player1 = {result[0]}, "
    f"Number of games won by player2 = {result[1]} out of {num_games} games"
)
win_rate_player1 = result[0] / num_games
# Report the actual number of games rather than a hard-coded "20", so the
# message stays correct when num_games is changed.
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


In [None]:
#@title .
# Render the content-review (feedback) widget for this section, using the
# feedback project name and key defined in the Datatops setup cell.
DatatopsContentReviewContainer(
    "",
    "W12D1_RandomVsRandom",
    {
        "url": dt_url,
        "name": feedback_name,
        "user_key": feedback_dtid,
    }
).render()

---
# Section 2: Train a value function from expert game data



**Goal:** Learn how to train a value function from a dataset of games played by an expert.

**Exercise:** 

* Load a dataset of expert generated games.
* Train a network to minimize MSE for win/loss predictions given board states sampled throughout the game. This will be done on a very small number of games. We will provide a network trained on a larger dataset.

In [None]:
# @title Video 2: Train a value function
from ipywidgets import widgets

# Each video source is rendered into its own Output widget; both are then
# shown as tabs.
out2 = widgets.Output()
with out2:
    # `display` is imported explicitly instead of relying on the name that
    # IPython injects into interactive sessions.
    from IPython.display import IFrame, display

    class BiliVideo(IFrame):
        """IFrame pointing at the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1pg411j7f7", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="f9lZq0WQJFg", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Section 2.1. Load expert data

In [None]:
def loadTrainExamples(folder, filename):
    """
    Load pickled training examples stored next to a model checkpoint.

    The examples are read from `<folder>/<filename>.examples`. When that file
    does not exist the user is asked whether to continue.

    Args:
      folder: string
        Directory containing the examples file
      filename: string
        Checkpoint file name; ".examples" is appended to it

    Returns:
      trainExamplesHistory: list
        The object unpickled from the examples file, or an empty list when
        the file is missing and the user answers "y".

    Raises:
      SystemExit: the file is missing and the user does not answer "y".
    """
    trainExamplesHistory = []
    examplesFile = os.path.join(folder, filename) + ".examples"
    if not os.path.isfile(examplesFile):
        print(f'File "{examplesFile}" with trainExamples not found!')
        r = input("Continue? [y|n]")
        if r != "y":
            sys.exit()
        # Continue without data: return the empty list instead of falling off
        # the end of the function and returning None.
        return trainExamplesHistory

    print("File with train examples found. Loading it...")
    # NOTE: unpickling can execute arbitrary code contained in the file; only
    # load example files from a trusted source.
    with open(examplesFile, "rb") as f:
        trainExamplesHistory = Unpickler(f).load()
    print("Loading done!")
    return trainExamplesHistory


In [None]:
# loadTrainExamples appends ".examples" to the file name, so the data is read
# from "<path>checkpoint_1.pth.tar.examples".
path = "nma_rl_games/alpha-zero/pretrained_models/data/"
loaded_games = loadTrainExamples(folder=path, filename="checkpoint_1.pth.tar")


## Section 2.2. Define the Neural Network Architecture for Othello


### Coding Exercise 2.2: Implement the NN `OthelloNNet` for Othello

In [None]:
class OthelloNNet(nn.Module):
    """
    Convolutional network for Othello with a shared trunk and two output
    layers (one over actions, one scalar value).

    Instantiate Othello Neural Net with the following configuration
    nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) # Convolutional Layer 1
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1, padding=1) # Convolutional Layer 2
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1) # Convolutional Layer 3
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1) # Convolutional Layer 4
    nn.BatchNorm2d(args.num_channels) X 4
    nn.Linear(args.num_channels * (self.board_x - 4) * (self.board_y - 4), 1024) # Fully-connected Layer 1
    nn.BatchNorm1d(1024)
    nn.Linear(1024, 512) # Fully-connected Layer 2
    nn.BatchNorm1d(512)
    nn.Linear(512, self.action_size) # Fully-connected Layer 3 (action output)
    nn.Linear(512, 1) # Fully-connected Layer 4 (value output)
    """

    def __init__(self, game, args):
        """
        Initialise game parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above; provides the board size
            and the action size
          args: attribute-accessible mapping
            Hyperparameters; this class reads `args.num_channels` here and
            `args.dropout` in `forward`

        Returns:
          Nothing
        """
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        super(OthelloNNet, self).__init__()
        self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(
            args.num_channels, args.num_channels, 3, stride=1, padding=1
        )
        # conv3 and conv4 use no padding, so each one shrinks both spatial
        # dimensions by 2; hence the (board - 4) factors in fc1 below.
        self.conv3 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1)
        self.conv4 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(args.num_channels)
        self.bn2 = nn.BatchNorm2d(args.num_channels)
        self.bn3 = nn.BatchNorm2d(args.num_channels)
        self.bn4 = nn.BatchNorm2d(args.num_channels)

        self.fc1 = nn.Linear(
            args.num_channels * (self.board_x - 4) * (self.board_y - 4), 1024
        )
        self.fc_bn1 = nn.BatchNorm1d(1024)

        self.fc2 = nn.Linear(1024, 512)
        self.fc_bn2 = nn.BatchNorm1d(512)

        # Two output layers on top of the shared 512-dimensional features.
        self.fc3 = nn.Linear(512, self.action_size)

        self.fc4 = nn.Linear(512, 1)

    def forward(self, s):
        """
        Controls forward pass of OthelloNNet

        Args:
          s: torch.Tensor
            Board(s), reshaped internally to (batch_size x 1 x board_x x board_y)

        Returns:
          Probability distribution over actions at the current state and the value of the current state.
          As provided, the final step is left as an exercise: the method
          raises NotImplementedError until the TODO below is completed.
        """
        s = s.view(
            -1, 1, self.board_x, self.board_y
        )  # batch_size x 1 x board_x x board_y
        s = F.relu(
            self.bn1(self.conv1(s))
        )  # batch_size x num_channels x board_x x board_y
        s = F.relu(
            self.bn2(self.conv2(s))
        )  # batch_size x num_channels x board_x x board_y
        s = F.relu(
            self.bn3(self.conv3(s))
        )  # batch_size x num_channels x (board_x-2) x (board_y-2)
        s = F.relu(
            self.bn4(self.conv4(s))
        )  # batch_size x num_channels x (board_x-4) x (board_y-4)
        # Flatten the feature maps for the fully-connected layers.
        s = s.view(-1, self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))

        # Dropout is only active in training mode (training=self.training).
        s = F.dropout(
            F.relu(self.fc_bn1(self.fc1(s))),
            p=self.args.dropout,
            training=self.training,
        )  # batch_size x 1024
        s = F.dropout(
            F.relu(self.fc_bn2(self.fc2(s))),
            p=self.args.dropout,
            training=self.training,
        )  # batch_size x 512

        pi = self.fc3(s)  # batch_size x action_size
        v = self.fc4(s)  # batch_size x 1
        #################################################
        ## TODO for students: Please compute a probability distribution over 'pi' using log softmax (for numerical stability)
        # Fill out function and remove
        raise NotImplementedError(
            "Calculate the probability distribution and the value"
        )
        #################################################
        # Returns probability distribution over actions at the current state and the value of the current state.
        return ..., ...


## Section 2.3. Define the Value network
 During training, the ground truth is loaded from the **MCTS simulations** stored in `checkpoint_x.pth.tar.examples`.

### Coding Exercise 2.3: Implement the `ValueNetwork`

In [None]:
class ValueNetwork(NeuralNet):
    """
    Wrapper that trains and queries the value output of an `OthelloNNet`.

    Reads its hyperparameters from the notebook-level `args`.
    """

    def __init__(self, game):
        """
        Initialise network parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;

        Returns:
          Nothing
        """
        self.nnet = OthelloNNet(game, args)
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.nnet.to(args.device)

    def train(self, games):
        """
        Function to train value network

        Args:
          games: list
            List of example collections; each example is of form (board, pi, v).
            Every collection is trained on for `args.epochs` epochs.

        Returns:
          Nothing
        """
        # NOTE(review): Adam runs with its default learning rate; `args.lr`
        # is defined but not passed here - confirm this is intended.
        optimizer = optim.Adam(self.nnet.parameters())
        for examples in games:
            for epoch in range(args.epochs):
                print("EPOCH ::: " + str(epoch + 1))
                self.nnet.train()
                v_losses = []  # To store the losses per epoch
                batch_count = int(
                    len(examples) / args.batch_size
                )  # len(examples)=200, batch-size=64, batch_count=3
                t = tqdm(range(batch_count), desc="Training Value Network")
                for _ in t:
                    # Batches are sampled with replacement from the examples.
                    sample_ids = np.random.randint(
                        len(examples), size=args.batch_size
                    )  # Read the ground truth information from MCTS simulation using the loaded examples
                    boards, pis, vs = list(
                        zip(*[examples[i] for i in sample_ids])
                    )  # Length of boards, pis, vis = 64
                    boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                    target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                    # Predict
                    # To run on GPU if available
                    boards, target_vs = boards.contiguous().to(
                        args.device
                    ), target_vs.contiguous().to(args.device)

                    #################################################
                    ## TODO for students:
                    ## 1. Compute the value predicted by OthelloNNet() ##
                    ## 2. First implement the loss_v() function below and then use it to update the value loss. ##
                    # Fill out function and remove
                    raise NotImplementedError("Compute the output")
                    #################################################
                    # Compute output
                    _, out_v = ...
                    l_v = ...  # Total loss

                    # Record loss
                    v_losses.append(l_v.item())
                    t.set_postfix(Loss_v=l_v.item())

                    # Compute gradient and do SGD step
                    optimizer.zero_grad()
                    l_v.backward()
                    optimizer.step()

    def predict(self, board):
        """
        Function to perform prediction

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          v: np.ndarray
            First row of the network's value output, moved to the CPU and
            converted to NumPy
        """
        # Preparing input
        board = torch.FloatTensor(board.astype(np.float64))
        board = board.contiguous().to(args.device)
        board = board.view(1, self.board_x, self.board_y)
        # Evaluation mode and no gradient tracking for inference.
        self.nnet.eval()
        with torch.no_grad():
            _, v = self.nnet(board)
        return v.data.cpu().numpy()[0]

    def loss_v(self, targets, outputs):
        """
        Calculates Mean squared error

        Args:
          targets: torch.Tensor
            Ground Truth variables corresponding to input
          outputs: torch.Tensor
            Predictions of Network

        Returns:
          MSE Loss calculated as: square of the difference between your model's predictions
          and the ground truth and average across the whole dataset
        """
        #################################################
        ## TODO for students: Please compute Mean squared error and return as output. ##
        # Fill out function and remove
        raise NotImplementedError("Calculate the loss")
        #################################################
        # Mean squared error (MSE)
        return ...

    def save_checkpoint(self, folder="checkpoint", filename="checkpoint.pth.tar"):
        """
        Save the network weights to `<folder>/<filename>`

        Args:
          folder: string
            Directory for the checkpoint; created if it does not exist
          filename: string
            File name of the checkpoint

        Returns:
          Nothing
        """
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print(
                "Checkpoint Directory does not exist! Making directory {}".format(
                    folder
                )
            )
            # makedirs also creates missing parent directories, so nested
            # checkpoint folders work.
            os.makedirs(folder)
        else:
            print("Checkpoint Directory exists! ")
        torch.save(
            {
                "state_dict": self.nnet.state_dict(),
            },
            filepath,
        )
        print("Model saved! ")

    def load_checkpoint(self, folder="checkpoint", filename="checkpoint.pth.tar"):
        """
        Load network weights from `<folder>/<filename>`

        Args:
          folder: string
            Directory containing the checkpoint
          filename: string
            File name of the checkpoint

        Returns:
          Nothing

        Raises:
          FileNotFoundError: if the checkpoint file does not exist
        """
        # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            # Raise a real exception; raising a plain string is itself a TypeError.
            raise FileNotFoundError("No model in path {}".format(filepath))

        checkpoint = torch.load(filepath, map_location=args.device)
        self.nnet.load_state_dict(checkpoint["state_dict"])


## Section 2.4. Train the value network and observe the MSE loss progress

**Important:** Run this cell ONLY if you do not have access to the pretrained models in the `nma_rl_games` repository.

In [None]:
# Train from scratch only when the pretrained-models directory is empty.
# os.listdir raises if the directory itself does not exist, so the repository
# must have been downloaded first.
if not os.listdir("nma_rl_games/alpha-zero/pretrained_models/models/"):
    set_seed(seed=SEED)
    game = OthelloGame(6)
    vnet = ValueNetwork(game)
    vnet.train(loaded_games)


In [None]:
#@title .
# Render the content-review (feedback) widget for this section, using the
# feedback project name and key defined in the Datatops setup cell.
DatatopsContentReviewContainer(
    "",
    "W12D1_NeuralOthello",
    {
        "url": dt_url,
        "name": feedback_name,
        "user_key": feedback_dtid,
    }
).render()

---
# Section 3: Use a trained value network to play games


**Goal**: Learn how to use a value function in order to make a player that works better than a random player.

**Exercise:**
* Sample random valid moves and use the value function to rank them
* Choose the best move as the action and play it
* Show that doing so beats the random player

**Hint:** You might need to change the sign of the value based on the player.

In [None]:
# @title Video 3: Play games using a value function
from ipywidgets import widgets

# Each video source is rendered into its own Output widget; both are then
# shown as tabs.
out2 = widgets.Output()
with out2:
    # `display` is imported explicitly instead of relying on the name that
    # IPython injects into interactive sessions.
    from IPython.display import IFrame, display

    class BiliVideo(IFrame):
        """IFrame pointing at the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1Ug411j7ig", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="tvmzVHPBKKs", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Coding Exercise 3: Value-based player

In [None]:
# Build a fresh value network and load the pretrained weights into it.
# Note that `path` is rebound here: in Section 2.1 it pointed at the data folder.
model_save_name = "ValueNetwork.pth.tar"
path = "nma_rl_games/alpha-zero/pretrained_models/models/"
set_seed(seed=SEED)
game = OthelloGame(6)
vnet = ValueNetwork(game)
vnet.load_checkpoint(folder=path, filename=model_save_name)


In [None]:
class ValueBasedPlayer:
    """
    Player that ranks a few randomly chosen valid moves with a value network
    and plays the one that sorts first (exercise scaffold: `play` must be
    completed by the student)
    """

    def __init__(self, game, vnet):
        """
        Initialise value based player parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          vnet: Value Network instance
            Instance of the Value Network class above;

        Returns:
          Nothing
        """
        self.game = game
        self.vnet = vnet

    def play(self, board):
        """
        Choose a move for the current board

        Up to `max_num_actions` valid moves are examined in random order. Each is
        stored in `candidates` as a (value, action) tuple, the list is sorted in
        ascending order, and the action of the first tuple is returned.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          action: int
            Action taken from the first candidate after sorting, i.e. the one
            with the smallest stored value (this is why the stored value has to
            be sign-flipped, see the TODO below)
        """
        valids = self.game.getValidMoves(board, 1)
        candidates = []
        max_num_actions = 4  # at most this many moves are evaluated
        va = np.where(valids)[0]  # indices of the valid actions
        va_list = va.tolist()
        random.shuffle(va_list)  # evaluate the valid actions in random order
        #################################################
        ## TODO for students: In the first part, please return the next board state using getNextState(), then predict
        ## the value of next state using value network, and finally add the value and action as a tuple to the candidate list.
        ## Note that you need to reverse the sign of the value. In zero-sum games the players flip every turn. In detail, we train
        ## a value function to think about the game from one player's (either black or white) perspective. In order to use the same
        ## value function to estimate how good the position is for the other player, we need to take the negative of the output of
        ## the function. E.g., if the value function is trained for white's perspective and says that white is likely to win the game
        ## from the current state with an output of 0.75, this similarly means that it would suggest that black is very unlikely (-0.75)
        ## to win the game from the current state.##
        # Fill out function and remove
        raise NotImplementedError("Implement the value-based player")
        #################################################
        for a in va_list:
            # Return next board state using getNextState() function
            nextBoard, _ = ...
            # Predict the value of next state using value network
            value = ...
            # Add the value and the action as a tuple to the candidate lists, note that you might need to change the sign of the value based on the player
            # (`candidates` is a list, so `+=` extends it: the right-hand side must be a list containing the tuple)
            candidates += ...

            if len(candidates) == max_num_actions:
                break

        # Tuples sort by their first element (the stored value), ascending
        candidates.sort()

        return candidates[0][1]


# Playing games between a value-based player and a random player
set_seed(seed=SEED)
num_games = 20
# The arena takes the players as callables: each `.play` maps a board to an action
player1 = ValueBasedPlayer(game, vnet).play
player2 = RandomPlayer(game).play
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
## Uncomment the code below to check your code!
## (`result` is read by the next code cell, so it only runs after this is uncommented)
# result = arena.playGames(num_games, verbose=False)
# print(f"\n\n{result}")


**Result of pitting a value-based player against a random player**

In [None]:
# result[0] and result[1] are the numbers of games won by player1 and player2
win_rate_player1 = result[0] / num_games

print(
    f"Number of games won by player1 = {result[0]}, "
    f"Number of games won by player2 = {result[1]}, out of {num_games} games"
)
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


In [None]:
#@title .
# Build and render the vibecheck content-review widget, passing it the Datatops
# feedback project settings defined at the top of the notebook.
# NOTE(review): "W12D1_ValuePlayer" looks like the id of the section being
# reviewed — confirm against the vibecheck documentation.
DatatopsContentReviewContainer(
    "",
    "W12D1_ValuePlayer",
    {
        "url": dt_url,
        "name": feedback_name,
        "user_key": feedback_dtid,
    }
).render()

---
# Section 4: Train a policy network from expert game data


**Goal**: Learn how to train a policy network via supervised learning / behavioural cloning.

**Steps**:
* Train a network to predict the next move in an expert dataset by maximizing the log likelihood of the next action.

## Quiz!

Before you run the code, let's explore our intuitions. 

**We provide this model with experts' behavior, and the model uses this to learn how to play the game. If we instead trained the model on _deliberately bad_ moves, and then continued training on new (unsupervised) games as before, how would you expect the model to perform as #-epochs goes to infinity? Would the model perform poorly forever? Or would it eventually learn to play well? Explain your intuition in one or two sentences.**

In [None]:
# Free-text answer to the quiz above; it is submitted by the `quizdt.store` cell in the Wrap-up section.
eventual_othello_bad_training = "" #@param {type:"string"}

In [None]:
# @title Video 4: Train a policy network
from ipywidgets import widgets

# Each player is rendered into its own Output widget so the two can be shown
# side by side as tabs.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame that embeds the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1hQ4y127GJ", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="vj9gKNJ19D8", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Section 4.1. Implement `PolicyNetwork`

In this section, we'll implement the policy network.

In [None]:
class PolicyNetwork(NeuralNet):
    """
    Policy network: wraps an OthelloNNet and trains / queries only its policy output
    """

    def __init__(self, game):
        """
        Initialise policy network parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;

        Returns:
          Nothing
        """
        self.nnet = OthelloNNet(game, args)
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.nnet.to(args.device)

    def train(self, games):
        """
        Function for Policy Network Training

        Args:
          games: list
            List of example collections; every example in a collection is of
            form (board, pi, v). Only board and pi are used here.

        Returns:
          Nothing
        """
        optimizer = optim.Adam(self.nnet.parameters())

        for examples in games:
            # Number of mini-batches per epoch. Batches are drawn at random with
            # replacement, so an epoch is not an exact pass over `examples`.
            batch_count = len(examples) // args.batch_size

            for epoch in range(args.epochs):
                print("EPOCH ::: " + str(epoch + 1))
                self.nnet.train()

                t = tqdm(range(batch_count), desc="Training Policy Network")
                for _ in t:
                    sample_ids = np.random.randint(len(examples), size=args.batch_size)
                    boards, pis, _vs = list(zip(*[examples[i] for i in sample_ids]))
                    boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                    target_pis = torch.FloatTensor(np.array(pis))

                    # Move the batch to the training device
                    boards = boards.contiguous().to(args.device)
                    target_pis = target_pis.contiguous().to(args.device)

                    # Compute output; the second network output is ignored here
                    out_pi, _ = self.nnet(boards)
                    l_pi = self.loss_pi(target_pis, out_pi)

                    # Show the running loss in the progress bar
                    t.set_postfix(Loss_pi=l_pi.item())

                    # Compute gradient and do an optimizer step
                    optimizer.zero_grad()
                    l_pi.backward()
                    optimizer.step()

    def predict(self, board):
        """
        Function to perform prediction

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          np.ndarray
            exp() of the network's policy output for this board, i.e. the
            action probabilities if the network outputs log-probabilities
        """
        # Preparing input: add a leading batch dimension of size 1
        board = torch.FloatTensor(board.astype(np.float64))
        board = board.contiguous().to(args.device)
        board = board.view(1, self.board_x, self.board_y)
        self.nnet.eval()
        with torch.no_grad():
            pi, _ = self.nnet(board)
        # Drop the batch dimension again
        return torch.exp(pi).detach().cpu().numpy()[0]

    def loss_pi(self, targets, outputs):
        """
        Calculates Negative Log Likelihood(NLL) of Targets

        Args:
          targets: torch.Tensor
            Ground truth policy targets, one row per example
          outputs: torch.Tensor
            Policy output of the network for the same examples
            (treated as log-probabilities; `predict` exponentiates them)

        Returns:
          torch.Tensor
            Scalar loss: -sum(targets * outputs) divided by the number of
            examples. The loss is low when the network puts high log-probability
            on the target actions and high when it puts low log-probability
            on them.
        """
        ## For more information, here is a reference that connects the expression to the neg-log-prob: https://gombru.github.io/2018/05/23/cross_entropy_loss/
        return -torch.sum(targets * outputs) / targets.size()[0]

    def save_checkpoint(self, folder="checkpoint", filename="checkpoint.pth.tar"):
        """
        Save the network weights to folder/filename

        Args:
          folder: string
            Directory in which the checkpoint is written (created if missing)
          filename: string
            File name of the checkpoint

        Returns:
          Nothing
        """
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print(
                "Checkpoint Directory does not exist! Making directory {}".format(
                    folder
                )
            )
            # makedirs also creates missing parent directories (os.mkdir would fail on them)
            os.makedirs(folder, exist_ok=True)
        else:
            print("Checkpoint Directory exists! ")
        torch.save(
            {
                "state_dict": self.nnet.state_dict(),
            },
            filepath,
        )
        print("Model saved! ")

    def load_checkpoint(self, folder="checkpoint", filename="checkpoint.pth.tar"):
        """
        Load the network weights from folder/filename

        Args:
          folder: string
            Directory that contains the checkpoint
          filename: string
            File name of the checkpoint

        Returns:
          Nothing

        Raises:
          FileNotFoundError
            If folder/filename does not exist
        """
        # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            # Raising a plain string is a TypeError in Python 3; raise a real exception
            raise FileNotFoundError("No model in path {}".format(filepath))

        checkpoint = torch.load(filepath, map_location=args.device)
        self.nnet.load_state_dict(checkpoint["state_dict"])


### Train the policy network

**Important:** Only run this cell if you do not have access to the pretrained models in the `rl_for_games` repository.

In [None]:
# Train from scratch only when no pretrained models are available, i.e. the
# models directory is missing or empty. The directory check comes first because
# os.listdir raises FileNotFoundError on a path that does not exist.
models_dir = "nma_rl_games/alpha-zero/pretrained_models/models/"
if not os.path.isdir(models_dir) or not os.listdir(models_dir):
    set_seed(seed=SEED)
    game = OthelloGame(6)
    pnet = PolicyNetwork(game)
    pnet.train(loaded_games)


---
# Section 5: Use a trained policy network to play games



**Goal**: Learn how to use a policy network to play games.

**Exercise:** 
* Use the policy network to give probabilities for the next move.
* Build a player that takes the move to which the network assigns the maximum probability.
* Compare this to another player that samples moves according to the probability distribution output by the network.

In [None]:
# @title Video 5: Play games using a policy network
from ipywidgets import widgets

# Each player is rendered into its own Output widget so the two can be shown
# side by side as tabs.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame that embeds the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1aq4y1S7o4", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="yHtVqT2Nstk", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Coding Exercise 5: Implement the `PolicyBasedPlayer`

In [None]:
# Build a policy network for a size-6 Othello game and load its saved weights.
model_save_name = "PolicyNetwork.pth.tar"
path = "nma_rl_games/alpha-zero/pretrained_models/models/"
set_seed(seed=SEED)  # seed before the network is constructed
game = OthelloGame(6)
pnet = PolicyNetwork(game)
pnet.load_checkpoint(folder=path, filename=model_save_name)


In [None]:
class PolicyBasedPlayer:
    """
    Player that picks its move from the policy network's action probabilities
    (exercise scaffold: the first part of `play` must be completed by the student)
    """

    def __init__(self, game, pnet, greedy=True):
        """
        Initialize Policy based player parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          pnet: Policy Network instance
            Instance of the Policy Network class above
          greedy: Boolean
            If true, implement greedy approach
            Else, implement random sample policy based player

        Returns:
          Nothing
        """
        self.game = game
        self.pnet = pnet
        self.greedy = greedy

    def play(self, board):
        """
        Choose a move for the current board

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          a: int
            Index of the chosen action.
            If greedy, the first action with the highest masked probability;
            else, an action sampled from the masked, renormalized probabilities
        """
        valids = self.game.getValidMoves(board, 1)
        #################################################
        ## TODO for students:  ##
        ## 1. Compute the action probabilities using policy network pnet()
        ## 2. Mask invalid moves using valids variable and the action probabilities computed above.
        ## 3. Compute the sum over valid actions and store them in sum_vap.
        # Fill out function and remove
        raise NotImplementedError("Define the play")
        #################################################
        action_probs = ...
        vap = ...  # Masking invalid moves
        sum_vap = ...

        if sum_vap > 0:
            vap /= sum_vap  # Renormalize
        else:
            # If all valid moves were masked we make all valid moves equally probable
            print("All valid moves were masked, doing a workaround.")
            vap = vap + valids
            vap /= np.sum(vap)

        if self.greedy:
            # Greedy policy player: first index that attains the maximum probability
            a = np.where(vap == np.max(vap))[0][0]
        else:
            # Sample-based policy player: draw an action index with probabilities vap
            a = np.random.choice(self.game.getActionSize(), p=vap)

        return a


# Playing games: greedy policy-based player against a random player
set_seed(seed=SEED)
num_games = 20
# The arena takes the players as callables: each `.play` maps a board to an action
player1 = PolicyBasedPlayer(game, pnet, greedy=True).play
player2 = RandomPlayer(game).play
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
## Uncomment below to test!
# result = arena.playGames(num_games, verbose=False)
# print(f"\n\n{result}")
# win_rate_player1 = result[0] / num_games
# print(f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%")


In [None]:
# Rebuild the policy network and load its saved weights.
# NOTE(review): these statements are identical to the checkpoint-loading cell
# that precedes the PolicyBasedPlayer exercise — confirm the repetition is intended.
model_save_name = "PolicyNetwork.pth.tar"
path = "nma_rl_games/alpha-zero/pretrained_models/models/"
set_seed(seed=SEED)
game = OthelloGame(6)
pnet = PolicyNetwork(game)
pnet.load_checkpoint(folder=path, filename=model_save_name)


### Comparing a policy based player versus a random player

The results are often noisy because we run the players for only a small number of games (20, due to compute and time costs). So when you run the cells you might not get the expected result. To better measure the strength of the players, you can run more games!

In [None]:
# Sample-based policy player (greedy=False) against a random player
set_seed(seed=SEED)
num_games = 20
game = OthelloGame(6)
player1 = PolicyBasedPlayer(game, pnet, greedy=False).play
player2 = RandomPlayer(game).play
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
result = arena.playGames(num_games, verbose=False)
print(f"\n\n{result}")


In [None]:
# result[0] is used as player1's win count; divide by the games played to get the win rate
win_rate_player1 = result[0] / num_games
print(f"Win rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%")


### Compare greedy policy based player versus value based player 

In [None]:
# Greedy policy-based player (the default greedy=True) against the value-based player
set_seed(seed=SEED)
num_games = 20
game = OthelloGame(6)
player1 = PolicyBasedPlayer(game, pnet).play
player2 = ValueBasedPlayer(game, vnet).play
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
result = arena.playGames(num_games, verbose=False)
print(f"\n\n{result}")


In [None]:
# result[0] is used as player1's win count; divide by the games played to get the win rate
win_rate_player1 = result[0] / num_games
print(
    f"Win rate for player 1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


### Compare greedy policy based player versus sample-based policy player 

In [None]:
# Two policy-based players sharing the same network: greedy against sample-based
set_seed(seed=SEED)
num_games = 20
game = OthelloGame(6)
player1 = PolicyBasedPlayer(game, pnet).play  # greedy player
player2 = PolicyBasedPlayer(game, pnet, greedy=False).play  # sample-based player
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
result = arena.playGames(num_games, verbose=False)
print(f"\n\n{result}")


In [None]:
# result[0] is used as player1's win count; divide by the games played to get the win rate
win_rate_player1 = result[0] / num_games
print(
    f"Win rate for player 1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


In [None]:
#@title .
# Build and render the vibecheck content-review widget, passing it the Datatops
# feedback project settings defined at the top of the notebook.
# NOTE(review): "W12D1_PolicyNetwork" looks like the id of the section being
# reviewed — confirm against the vibecheck documentation.
DatatopsContentReviewContainer(
    "",
    "W12D1_PolicyNetwork",
    {
        "url": dt_url,
        "name": feedback_name,
        "user_key": feedback_dtid,
    }
).render()

---
# Section 6: Plan using Monte Carlo Rollouts



**Goal**: Teach the students the core idea behind using simulated rollouts to look ahead and estimate the value of actions.

**Steps**:
* Build a loop to run Monte Carlo simulations using the policy network.
* Use this to obtain better estimates of the value of moves.

In [None]:
# @title Video 6: Play using Monte-Carlo rollouts
from ipywidgets import widgets

# Each player is rendered into its own Output widget so the two can be shown
# side by side as tabs.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame that embeds the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1Rb4y1U7BW", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="DtCWDIlSo18", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Section 6.1. `MonteCarlo` 

In [None]:
class MonteCarlo:
    """
    Implementation of Monte Carlo Algorithm
    """

    def __init__(self, game, nnet, args):
        """
        Initialize Monte Carlo Parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          nnet: OthelloNet instance
            Object whose predict(board) returns (policy, value);
          args: dictionary
            Settings accessed as attributes; this class reads args.maxDepth
            (maximum number of moves played in one rollout)

        Returns:
          Nothing
        """
        self.game = game
        self.nnet = nnet
        self.args = args

        self.Ps = {}  # Stores the masked, renormalized policy for board s
        self.Es = {}  # Stores game.getGameEnded ended for board s

    # Call this rollout
    def simulate(self, canonicalBoard):
        """
        Helper function to simulate one Monte Carlo rollout

        Starting from canonicalBoard, moves are sampled from the network policy
        (restricted to valid moves) for at most args.maxDepth steps, or until a
        finished game is reached.

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]

        Returns:
          value:
            Estimate of the outcome from the point of view of the player to move
            on canonicalBoard: the game result if the rollout reached a finished
            game, otherwise the network value of the last position evaluated.
            0 if args.maxDepth is 0.
        """
        board = canonicalBoard
        s = self.game.stringRepresentation(board)
        value = 0
        # +1 while the player to move is the one who moves on canonicalBoard,
        # -1 while it is the opponent. Values reported for a position are from
        # the point of view of the player to move there, so they are multiplied
        # by this sign before being returned.
        # NOTE(review): assumes the players strictly alternate and that
        # getGameEnded returns a signed result for the player to move — confirm
        # against OthelloGame.
        perspective = 1

        for _ in range(self.args.maxDepth):
            if s not in self.Es:
                self.Es[s] = self.game.getGameEnded(board, 1)
            if self.Es[s] != 0:
                # Terminal state
                value = perspective * self.Es[s]
                break

            self.Ps[s], v = self.nnet.predict(board)
            valids = self.game.getValidMoves(board, 1)
            self.Ps[s] = self.Ps[s] * valids  # Masking invalid moves
            sum_Ps_s = np.sum(self.Ps[s])

            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # Renormalize
            else:
                # If all valid moves were masked make all valid moves equally probable
                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                logging.getLogger(__name__).error(
                    "All valid moves were masked, doing a workaround."
                )
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            # Value of the position just evaluated, seen from the starting player
            value = perspective * v

            # Take a random action
            a = np.random.choice(self.game.getActionSize(), p=self.Ps[s])
            # Find the next state and the next player
            next_s, next_player = self.game.getNextState(board, 1, a)

            # Advance the rollout: the next iteration must evaluate the successor
            # position (and cache under its key), not the starting board again
            board = self.game.getCanonicalForm(next_s, next_player)
            s = self.game.stringRepresentation(board)
            perspective = -perspective

        return value


---
# Section 7: Use Monte Carlo simulations to play games

**Goal:** Teach students how to use simple Monte Carlo planning to play games.

In [None]:
# @title Video 7: Play with planning
from ipywidgets import widgets

# Each player is rendered into its own Output widget so the two can be shown
# side by side as tabs.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame that embeds the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1bh411B7S4", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="plmFzAy3H5s", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)

## Coding Exercise 7: Monte-Carlo simulations

* Incorporate Monte Carlo simulations into an agent.
* Run the resulting player versus the random, value-based, and policy-based players.

In [None]:
# Load MC model from the repository
mc_model_save_name = "MC.pth.tar"
path = "nma_rl_games/alpha-zero/pretrained_models/models/"


In [None]:
class MonteCarloBasedPlayer:
    """
    Simulate Player based on Monte Carlo Algorithm
    (exercise scaffold: the loop header in `play` must be completed by the student)
    """

    def __init__(self, game, nnet, args):
        """
        Initialize Monte Carlo Parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          nnet: OthelloNet instance
            Object whose predict(board) returns (policy, value);
          args: dictionary
            Settings accessed as attributes; this class reads args.mc_topk
            (number of candidate actions) and args.numMCsims (rollouts per
            candidate), and passes args on to MonteCarlo

        Returns:
          Nothing
        """
        self.game = game
        self.nnet = nnet
        self.args = args
        self.mc = MonteCarlo(game, nnet, args)
        self.K = self.args.mc_topk

    def play(self, canonicalBoard):
        """
        Simulate Play on Canonical Board

        The network policy is masked to valid moves, the (at most) K most
        probable actions are kept, each gets args.numMCsims rollouts, and the
        action with the highest average rollout value is returned.

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]

        Returns:
          best_action:
            The action (not the (avg_value, action) tuple) with the highest
            average rollout value among the top-K candidates
        """
        self.qsa = []  # (avg_value, action) pairs, rebuilt on every call
        s = self.game.stringRepresentation(canonicalBoard)
        Ps, v = self.nnet.predict(canonicalBoard)
        valids = self.game.getValidMoves(canonicalBoard, 1)
        Ps = Ps * valids  # Masking invalid moves
        sum_Ps_s = np.sum(Ps)

        if sum_Ps_s > 0:
            Ps /= sum_Ps_s  # Renormalize
        else:
            # If all valid moves were masked make all valid moves equally probable
            # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
            # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
            log = logging.getLogger(__name__)
            log.error("All valid moves were masked, doing a workaround.")
            Ps = Ps + valids
            Ps /= np.sum(Ps)

        # Number of actions with non-zero probability after masking
        num_valid_actions = np.shape(np.nonzero(Ps))[1]

        if num_valid_actions < self.K:
            # Fewer candidates than K: keep every action with non-zero probability
            top_k_actions = np.argpartition(Ps, -num_valid_actions)[-num_valid_actions:]
        else:
            top_k_actions = np.argpartition(Ps, -self.K)[
                -self.K :
            ]  # To get actions that belongs to top k prob
        #################################################
        ## TODO for students:
        ## 1. For each action in the top-k actions
        ## 2. Get the next state using getNextState() function. You can find the implementation of this function in Section 1 in the OthelloGame() class.
        ## 3. Get the canonical form of the getNextState().
        # Fill out function and remove
        raise NotImplementedError("Loop for the top actions")
        #################################################
        for action in ...:
            next_s, next_player = self.game.getNextState(..., ..., ...)
            next_s = self.game.getCanonicalForm(..., ...)

            values = []

            # Do some rollouts
            # NOTE(review): the rollouts start from canonicalBoard, not from next_s,
            # so next_s is never used and the rollout values do not depend on
            # `action` — confirm whether simulate(next_s) (with the matching
            # sign convention) was intended.
            for rollout in range(self.args.numMCsims):
                value = self.mc.simulate(canonicalBoard)
                values.append(value)

            # Average out values
            avg_value = np.mean(values)
            self.qsa.append((avg_value, action))

        # Sort by average value, highest first
        self.qsa.sort(key=lambda a: a[0])
        self.qsa.reverse()
        best_action = self.qsa[0][1]
        return best_action

    def getActionProb(self, canonicalBoard, temp=1):
        """
        Helper function to get probabilities associated with each action

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]
          temp: Integer
            Not used by this implementation

        Returns:
          action_probs: np.ndarray
            Vector of length getActionSize(): all zeros if the game has ended,
            otherwise 1 at the action chosen by `play` and 0 elsewhere
        """
        if self.game.getGameEnded(canonicalBoard, 1) != 0:
            return np.zeros((self.game.getActionSize()))

        else:
            action_probs = np.zeros((self.game.getActionSize()))
            best_action = self.play(canonicalBoard)
            action_probs[best_action] = 1

        return action_probs


set_seed(seed=SEED)
game = OthelloGame(6)
# Run the resulting player versus the random player
rp = RandomPlayer(game).play
num_games = 20  # Feel free to change this number

n1 = NNet(game)  # nNet players
n1.load_checkpoint(folder=path, filename=mc_model_save_name)
# numMCsims: rollouts per candidate action, maxDepth: moves per rollout,
# mc_topk: number of candidate actions.
# NOTE(review): maxRollouts is not read by the MonteCarlo classes in this section.
args1 = dotdict({"numMCsims": 10, "maxRollouts": 5, "maxDepth": 5, "mc_topk": 3})

## Uncomment below to check Monte Carlo agent!
## (`n1p` defined here is also used by the two comparison cells that follow)
# print('\n******MC player versus random player******')
# mc1 = MonteCarloBasedPlayer(game, n1, args1)
# n1p = lambda x: np.argmax(mc1.getActionProb(x))
# arena = Arena.Arena(n1p, rp, game, display=OthelloGame.display)
# MC_result = arena.playGames(num_games, verbose=False)
# print(f"\n\n{MC_result}")
# print(f"\nNumber of games won by player1 = {MC_result[0]}, "
#       f"number of games won by player2 = {MC_result[1]}, out of {num_games} games")
# win_rate_player1 = MC_result[0]/num_games
# print(f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%")


### Monte-Carlo player against Value-based player

In [None]:
# `n1p` (the Monte Carlo player) is only defined once the check in the
# Coding Exercise 7 cell has been uncommented and run.
print("\n******MC player versus value-based player******")
set_seed(seed=SEED)
vp = ValueBasedPlayer(game, vnet).play  # Value-based player
arena = Arena.Arena(n1p, vp, game, display=OthelloGame.display)
MC_result = arena.playGames(num_games, verbose=False)
print(f"\n\n{MC_result}")
print(
    f"\nNumber of games won by player1 = {MC_result[0]}, "
    f"number of games won by player2 = {MC_result[1]}, out of {num_games} games"
)
win_rate_player1 = MC_result[0] / num_games
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


### Monte-Carlo player against Policy-based player

In [None]:
# `n1p` (the Monte Carlo player) is only defined once the check in the
# Coding Exercise 7 cell has been uncommented and run.
print("\n******MC player versus policy-based player******")
set_seed(seed=SEED)
pp = PolicyBasedPlayer(game, pnet).play  # Policy player
arena = Arena.Arena(n1p, pp, game, display=OthelloGame.display)
MC_result = arena.playGames(num_games, verbose=False)
print(f"\n\n{MC_result}")
print(
    f"\nNumber of games won by player1 = {MC_result[0]}, "
    f"number of games won by player2 = {MC_result[1]}, out of {num_games} games"
)
win_rate_player1 = MC_result[0] / num_games
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


In [None]:
#@title .
# Build and render the vibecheck content-review widget, passing it the Datatops
# feedback project settings defined at the top of the notebook.
# NOTE(review): "W12D1_MonteCarlo" looks like the id of the section being
# reviewed — confirm against the vibecheck documentation.
DatatopsContentReviewContainer(
    "",
    "W12D1_MonteCarlo",
    {
        "url": dt_url,
        "name": feedback_name,
        "user_key": feedback_dtid,
    }
).render()

---
# Section 8: Ethical aspects

In [None]:
# @title Video 8: Unstoppable opponents
from ipywidgets import widgets

# Each player is rendered into its own Output widget so the two can be shown
# side by side as tabs.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame that embeds the Bilibili player for the given video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            self.id = id
            src = "https://player.bilibili.com/player.html?bvid={0}&page={1}".format(
                id, page
            )
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1WA411w7mw", width=854, height=480, fs=1)
    print("Video available at https://www.bilibili.com/video/{0}".format(video.id))
    display(video)

out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="q7181lvoNpM", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Quiz

**Modern reinforcement learning is now commonly used to refine the behavior of large language models, with reward signals granted by human annotators (see [this article on RLHF](https://huggingface.co/blog/rlhf)). Ignoring implementation details and technical challenges, what are TWO ethical concerns you have about this approach? Explain in a few sentences.**

In [None]:
# Free-text answer to the quiz above; it is submitted by the `quizdt.store` cell in the Wrap-up section.
rlhf_ethics_concerns = "" #@param {type:"string"}

---
# Summary

In this tutorial, you have learned how to implement a game loop and improve the performance of a random player. More specifically, you are now able to understand the format of two-player games. We learned about value-based and policy-based players, and we compared them with a Monte Carlo planning player (the MCTS method is covered in the Bonus sections).

In [None]:
# @title Video 9: Outro
from ipywidgets import widgets

# The outro is only embedded from YouTube, so the tab widget has a single tab.
out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="uEe5ErMpH_U", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1])
out.set_title(0, "Youtube")

display(out)


---
## Wrap-up

In [None]:
#@markdown Submit your answers (run this to submit)
# One record per submission: who is answering, which tutorial this is, and the
# free-text quiz answers typed into the form cells earlier in the notebook.
quizdt.store(
    dict(
        my_pennkey=my_pennkey,
        my_pod=my_pod,
        my_email=my_email,
        tutorial=tutorial,
        eventual_othello_bad_training=eventual_othello_bad_training,
        rlhf_ethics_concerns=rlhf_ethics_concerns,
    )
)

## Feedback
How could this session have been better? How happy are you in your group? How do you feel right now?

Feel free to use the embedded form below or use this link:
<a target="_blank" rel="noopener noreferrer" href="https://airtable.com/shr1TzujOOOY21kFQ">https://airtable.com/shr1TzujOOOY21kFQ</a>

In [None]:
# @title Feedback form
# Imported locally so this cell does not depend on an earlier video cell having
# brought `IFrame` into the kernel namespace.
from IPython.display import IFrame, display

display(
    IFrame(src="https://airtable.com/embed/shr1TzujOOOY21kFQ", width=800, height=400)
)


---
# Bonus 1: Plan using Monte Carlo Tree Search (MCTS)

*Time estimate: ~30mins*

**Goal:** Teach students to understand the core ideas behind Monte Carlo Tree Search (MCTS).

In [None]:
# @title Video 10: Plan with MCTS
from ipywidgets import widgets

# Bilibili player, rendered into its own Output so it can live in a tab.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame pointing at the Bilibili web player for one video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            # Keep the id so the plain link can be printed next to the player.
            self.id = id
            src = f"https://player.bilibili.com/player.html?bvid={id}&page={page}"
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV1yQ4y127Sr", width=854, height=480, fs=1)
    print(f"Video available at https://www.bilibili.com/video/{video.id}")
    display(video)

# YouTube player, shown in the first tab.
out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="Hhw6Ed0Zmco", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Bonus Coding Exercise 1: MCTS planner

* Plug together pre-built Selection, Expansion & Backpropagation code to complete an MCTS planner.
* Deploy the MCTS planner to understand an interesting position, producing value estimates and action counts.

In [None]:
class MCTS:
    """
    Monte Carlo Tree Search (MCTS) planner.

    Search statistics are kept in dictionaries keyed either by the string
    representation `s` of a canonical board or by `(s, a)` pairs for edges.
    This is an exercise skeleton: the action-selection loop in `search` has to
    be completed before the class can be used.
    """

    def __init__(self, game, nnet, args):
        """
        Initialize parameters of MCTS

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          nnet: OthelloNet instance
            Instance of the OthelloNNet class above; only `predict` is called
            on it in this class;
          args: dictionary
            Configuration object, stored as `self.args`. The usage cell further
            down builds it as dotdict({"numMCTSSims": 50, "cpuct": 1.0}).
            NOTE(review): `cpuct` is presumably the exploration constant of the
            upper-confidence-bound formula in `search` - confirm against the
            solution. Nothing visible in this class reads the neural-network
            settings (learning rate, dropout, epochs, batch size, channels).

        Returns:
          Nothing
        """
        self.game = game
        self.nnet = nnet
        self.args = args
        self.Qsa = {}  # (s, a) -> running mean of the values backed up through edge s,a
        self.Nsa = {}  # (s, a) -> number of times edge s,a was visited
        self.Ns = {}  # s -> number of times board s was visited
        self.Ps = {}  # s -> initial policy (from the neural net, masked and renormalized)
        self.Es = {}  # s -> cached result of game.getGameEnded for board s
        self.Vs = {}  # s -> cached result of game.getValidMoves for board s

    def search(self, canonicalBoard):
        """
        This function performs one iteration of MCTS. It is recursively called
        till a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.
        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.
        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]

        Returns:
            v: Float
              The negative of the value of the current canonicalBoard
        """
        # All caches are keyed by the string form of the board.
        s = self.game.stringRepresentation(canonicalBoard)

        # Cache the game-ended result the first time this board is seen.
        if s not in self.Es:
            self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
        if self.Es[s] != 0:
            # Terminal node: propagate the negated outcome
            return -self.Es[s]

        if s not in self.Ps:
            # Leaf node: first visit, so query the network instead of recursing
            self.Ps[s], v = self.nnet.predict(canonicalBoard)
            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # Masking invalid moves
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # Renormalize
            else:
                # If all valid moves were masked, make all valid moves equally probable.
                # NB! All valid moves may be masked if your NNet architecture is insufficient, if the network is overfitting, or for some other reason.
                # If you get dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                log = logging.getLogger(__name__)
                log.error("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            # Remember the valid-move mask and start the visit counter for s.
            self.Vs[s] = valids
            self.Ns[s] = 0

            return -v

        # Already-expanded node: choose the next action among the cached valid moves.
        valids = self.Vs[s]
        cur_best = -float("inf")
        best_act = -1

        #################################################
        ## TODO for students:
        ## Implement the highest upper confidence bound depending on whether we have already observed the state-action pair, i.e. whether it is stored in self.Qsa[(s, a)].
        ## You can find the formula on slide 52 of the "Plan with MCTS" video above (the original text called it "video 8" - TODO confirm the slide number).
        # Fill out function and remove
        raise NotImplementedError("Complete the for loop")
        #################################################
        # Pick the action with the highest upper confidence bound
        for a in range(self.game.getActionSize()):
            if valids[a]:
                if (s, a) in self.Qsa:
                    u = ... + ... * ... * math.sqrt(...) / (1 + ...)
                else:
                    u = ... * ... * math.sqrt(... + 1e-8)

                if u > cur_best:
                    cur_best = u
                    best_act = a

        # Play the chosen action and switch to the next player's canonical view.
        a = best_act
        next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
        next_s = self.game.getCanonicalForm(next_s, next_player)

        # The recursive call returns the value already negated (see NOTE above).
        v = self.search(next_s)

        if (s, a) in self.Qsa:
            # Incremental update of the running mean over the visits of this edge.
            self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[(s, a)] + v) / (
                self.Nsa[(s, a)] + 1
            )
            self.Nsa[(s, a)] += 1

        else:
            # First traversal of this edge.
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return -v

    def getNsa(self):
        """
        Return the edge visit counts.

        Returns:
          Nsa: dictionary
            The internal (s, a) -> visit count dictionary itself, not a copy.
        """
        return self.Nsa


[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main//tutorials/W3D3_ReinforcementLearningForGames/solutions/W3D3_Tutorial1_Solution_6150f11c.py)



---
# Bonus 2: Use MCTS to play games

*Time estimate: ~10mins*


**Goal:** Teach the students how to use the results of MCTS to play games.

**Exercise:** 
* Plug the MCTS planner into an agent.
* Play games against other agents.
* Explore the contributions of prior network, value function, number of simulations/time to play and explore/exploit parameters.

In [None]:
# @title Video 11: Play with MCTS
from ipywidgets import widgets

# Bilibili player, rendered into its own Output so it can live in a tab.
out2 = widgets.Output()
with out2:
    from IPython.display import IFrame

    class BiliVideo(IFrame):
        """IFrame pointing at the Bilibili web player for one video id."""

        def __init__(self, id, page=1, width=400, height=300, **kwargs):
            # Keep the id so the plain link can be printed next to the player.
            self.id = id
            src = f"https://player.bilibili.com/player.html?bvid={id}&page={page}"
            super().__init__(src, width, height, **kwargs)

    video = BiliVideo(id="BV13q4y1H7H6", width=854, height=480, fs=1)
    print(f"Video available at https://www.bilibili.com/video/{video.id}")
    display(video)

# YouTube player, shown in the first tab.
out1 = widgets.Output()
with out1:
    from IPython.display import YouTubeVideo

    video = YouTubeVideo(id="1BRXb-igKAU", width=854, height=480, fs=1, rel=0)
    print("Video available at https://youtube.com/watch?v=" + video.id)
    display(video)

out = widgets.Tab([out1, out2])
out.set_title(0, "Youtube")
out.set_title(1, "Bilibili")

display(out)


## Bonus Coding Exercise 2: Agent that uses an MCTS planner

* Plug the MCTS planner into an agent.
* Play games against other agents.
* Explore the contributions of prior network, value function, number of simulations/time to play and explore/exploit parameters.

In [None]:
# Location of the pretrained MCTS checkpoint. Nothing is loaded in this cell:
# the file name and folder are passed to `load_checkpoint` in the next code cell.
mcts_model_save_name = "MCTS.pth.tar"
path = "nma_rl_games/alpha-zero/pretrained_models/models/"


In [None]:
class MonteCarloTreeSearchBasedPlayer:
    """
    Player that chooses its moves from the visit counts of an MCTS planner.

    This is an exercise skeleton: the two TODO blocks in `play` have to be
    completed before the player can be used.
    """

    def __init__(self, game, nnet, args):
        """
        Store the game, network and settings, and create the MCTS planner

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          nnet: OthelloNet instance
            Instance of the OthelloNNet class above;
          args: dictionary
            Configuration object, passed on to MCTS. `play` reads
            `args.numMCTSSims` (number of search iterations per move) from it.

        Returns:
          Nothing
        """
        self.game = game
        self.nnet = nnet
        self.args = args
        # The planner lives as long as the player does.
        self.mcts = MCTS(game, nnet, args)

    def play(self, canonicalBoard, temp=1):
        """
        Run the MCTS simulations and choose a move from the visit counts

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]
          temp: Integer
            Temperature applied to the visit counts. With temp == 0 one of the
            most-visited actions is selected (ties broken at random); otherwise
            the counts are raised to the power 1/temp before being normalized.

        Returns:
          If temp is 0: list with a 1 at the selected action and 0 elsewhere
          Otherwise: index of the action with the largest normalized count
          (np.argmax, so the first such action on ties)
        """
        for i in range(self.args.numMCTSSims):
            #################################################
            ## TODO for students:
            #  Run MCTS search function.
            #  Fill out function and remove
            raise NotImplementedError("Plug the planner")
            #################################################
            ...

        s = self.game.stringRepresentation(canonicalBoard)
        #################################################
        ## TODO for students:
        #  Call the Nsa function from MCTS class and store it in the self.Nsa
        #  Fill out function and remove
        raise NotImplementedError("Compute Nsa (number of times edge s,a was visited)")
        #################################################
        self.Nsa = ...
        # Visit count of every action from this board; edges never visited count as 0.
        self.counts = [
            self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
            for a in range(self.game.getActionSize())
        ]

        if temp == 0:
            # Select uniformly at random among the most-visited actions.
            bestAs = np.array(np.argwhere(self.counts == np.max(self.counts))).flatten()
            bestA = np.random.choice(bestAs)
            probs = [0] * len(self.counts)
            probs[bestA] = 1
            return probs

        # Temperature-scale the counts and normalize them to sum to 1.
        self.counts = [x ** (1.0 / temp) for x in self.counts]
        self.counts_sum = float(sum(self.counts))
        probs = [x / self.counts_sum for x in self.counts]
        return np.argmax(probs)

    def getActionProb(self, canonicalBoard, temp=1):
        """
        Helper function returning the chosen action as a one-hot vector

        Args:
          canonicalBoard: np.ndarray
            Canonical Board of size n x n [6x6 in this case]
          temp: Integer
            Accepted but not used: `play` is called with its default temp=1,
            so the value passed here has no effect on the result.

        Returns:
          action_probs: np.ndarray
            Vector of length game.getActionSize() with 1 at the action
            returned by `play` and 0 elsewhere
        """
        action_probs = np.zeros((self.game.getActionSize()))
        # `temp` is deliberately not forwarded here; see the docstring.
        best_action = self.play(canonicalBoard)
        action_probs[best_action] = 1

        return action_probs


# Seed the random number generators before building the players.
set_seed(seed=SEED)
game = OthelloGame(6)  # Size argument 6; the docstrings above describe a 6x6 board
rp = RandomPlayer(game).play  # Move function of the random player (the opponent here)
num_games = 20  # Number of arena games per match-up
n1 = NNet(game)  # Network given to the MCTS player
n1.load_checkpoint(folder=path, filename=mcts_model_save_name)  # Checkpoint file named in the cell above
# MCTS settings: `numMCTSSims` is the number of search iterations run per move in
# `play`. NOTE(review): `cpuct` is presumably the exploration constant used in
# the upper-confidence-bound formula of `MCTS.search` - confirm.
args1 = dotdict({"numMCTSSims": 50, "cpuct": 1.0})

# `n1p`, defined in the block below, is also used by the two comparison cells
# that follow, so this block has to be uncommented and run before them.
## Uncomment below to check your agent!
# print('\n******MCTS player versus random player******')
# mcts1 = MonteCarloTreeSearchBasedPlayer(game, n1, args1)
# n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))
# arena = Arena.Arena(n1p, rp, game, display=OthelloGame.display)
# MCTS_result = arena.playGames(num_games, verbose=False)
# print(f"\n\n{MCTS_result}")
# print(f"\nNumber of games won by player1 = {MCTS_result[0]}, "
#       f"number of games won by player2 = {MCTS_result[1]}, out of {num_games} games")
# win_rate_player1 = MCTS_result[0]/num_games
# print(f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%")


[*Click for solution*](https://github.com/NeuromatchAcademy/course-content-dl/tree/main//tutorials/W3D3_ReinforcementLearningForGames/solutions/W3D3_Tutorial1_Solution_e2bee612.py)



```
Number of games won by player1 = 19, number of games won by player2 = 1, out of 20 games

Win rate for player1 over 20 games: 95.0%
```

### MCTS player against Value-based player

In [None]:
# Requires `n1p` (the MCTS move function), which only exists once the
# commented-out check in the previous code cell has been uncommented and run.
# `vnet` is not defined in this cell either; it must already be in the kernel.
print("\n******MCTS player versus value-based player******")
set_seed(seed=SEED)  # Re-seed before this match-up
vp = ValueBasedPlayer(game, vnet).play  # Value-based player
# The MCTS player is passed first and the value-based player second.
arena = Arena.Arena(n1p, vp, game, display=OthelloGame.display)
MC_result = arena.playGames(num_games, verbose=False)
print(f"\n\n{MC_result}")
# Entries 0 and 1 of the result are reported as the win counts of player1 and player2.
print(
    f"\nNumber of games won by player1 = {MC_result[0]}, "
    f"number of games won by player2 = {MC_result[1]}, out of {num_games} games"
)
win_rate_player1 = MC_result[0] / num_games
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


```
Number of games won by player1 = 14, number of games won by player2 = 6, out of 20 games

Win rate for player1 over 20 games: 70.0%
```

### MCTS player against Policy-based player

In [None]:
# Requires `n1p` (the MCTS move function), which only exists once the
# commented-out check in the MCTS-versus-random cell has been uncommented and run.
# `pnet` is not defined in this cell either; it must already be in the kernel.
print("\n******MCTS player versus policy-based player******")
set_seed(seed=SEED)  # Re-seed before this match-up
pp = PolicyBasedPlayer(game, pnet).play  # Policy-based player
# The MCTS player is passed first and the policy-based player second.
arena = Arena.Arena(n1p, pp, game, display=OthelloGame.display)
MC_result = arena.playGames(num_games, verbose=False)
print(f"\n\n{MC_result}")
# Entries 0 and 1 of the result are reported as the win counts of player1 and player2.
print(
    f"\nNumber of games won by player1 = {MC_result[0]}, "
    f"number of games won by player2 = {MC_result[1]}, out of {num_games} games"
)
win_rate_player1 = MC_result[0] / num_games
print(
    f"\nWin rate for player1 over {num_games} games: {round(win_rate_player1*100, 1)}%"
)


```
Number of games won by player1 = 20, number of games won by player2 = 0, out of 20 games

Win rate for player1 over 20 games: 100.0%
```

In [None]:
#@title .
# Render the content-review widget for the "W12D1_MCTS" section, using the
# feedback project settings defined at the top of the notebook.
DatatopsContentReviewContainer(
    "",
    "W12D1_MCTS",
    dict(url=dt_url, name=feedback_name, user_key=feedback_dtid),
).render()