Notebook demonstrating the GOPS card game: a human player versus a policy-network bot

In [29]:
from copy import copy
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical

import matplotlib.pyplot as plt

In [30]:
# Environment
# Number of cards in each suit: the value deck and each player's hand hold SIZE
# cards, so the state row has SIZE * 3 + 1 entries and a player has SIZE actions.
SIZE = 3


class GOPS:
    """Two-player simultaneous-bid card game stored in a single NumPy row.

    State layout, shape ``(1, 3 * size + 1)``:

    * ``state[0, 0]``                          index of the value card currently
      being contested (``-1`` once the game is over).
    * ``state[0, 1 : size + 1]``               value deck, 1 = card not yet drawn.
    * ``state[0, size + 1 : 2 * size + 1]``    player 1 hand, 1 = card still held.
    * ``state[0, 2 * size + 1 : 3 * size + 1]`` player 2 hand, 1 = card still held.

    ``score`` is from player 1's point of view: it grows by the contested card's
    index when player 1 plays the higher card and shrinks by it when player 2 does.

    Args:
        size: Number of cards per suit. Defaults to the module-level ``SIZE``.
    """

    def __init__(self, size=None):
        self.size = SIZE if size is None else int(size)
        if self.size < 1:
            raise ValueError('size must be at least 1, got {}'.format(self.size))
        self.state, self.score = np.zeros((1, self.size * 3 + 1)), 0.0  # Placeholders
        self.reset()

    def reset(self):
        """Restore full hands and deck, zero the score, and draw the first value card."""
        self.state = np.ones((1, self.size * 3 + 1))
        self.score = 0.0
        self.draw()

    def draw(self):
        """Pick a random undrawn value card and make it the contested card."""
        remaining = np.nonzero(self.state[0, 1:self.size + 1])[0]
        # Extract a plain int: writing a 1-element array into a scalar slot is
        # deprecated in recent NumPy releases.
        card = int(np.random.choice(remaining))

        # Update the state
        self.state[0, 0] = card
        self.state[0, card + 1] = 0  # Deck slots are offset by one (slot 0 is the current card)

    def get_illegal_actions(self, player=1):
        """Return the card indices ``player`` (1 or 2) has already played."""
        illegal_actions = np.where(self.state[0, self.size * player + 1:self.size * (player + 1) + 1] == 0)[0]
        return illegal_actions

    def _require_playable(self, action, player):
        """Raise ``ValueError`` unless ``player`` still holds card ``action``."""
        if not 0 <= action < self.size:
            raise ValueError(
                'Player {} action {} is outside 0..{}'.format(player, action, self.size - 1))
        if self.state[0, self.size * player + 1 + action] == 0:
            raise ValueError(
                'Player {} has already played card {}'.format(player, action))

    def step(self, action, action_opp):
        """Play one round.

        Args:
            action: Card index played by player 1.
            action_opp: Card index played by player 2.

        Returns:
            ``(state_copy, score, done)`` where ``state_copy`` is an independent
            copy of the state row, ``score`` is the running score (including the
            end-of-game bonus when ``done``), and ``done`` is True once every
            value card has been contested.

        Raises:
            ValueError: If either card is out of range or no longer in hand.
                The state is left untouched in that case.
        """
        action, action_opp = int(action), int(action_opp)
        # Validate both moves before mutating anything; an unchecked index would
        # otherwise overwrite a slot belonging to the deck or the other player.
        self._require_playable(action, 1)
        self._require_playable(action_opp, 2)

        self.state[0, action + self.size + 1] = 0
        self.state[0, action_opp + self.size * 2 + 1] = 0

        # Update score
        card_value = float(self.state[0, 0])
        if action > action_opp:
            self.score += card_value
        elif action < action_opp:
            self.score -= card_value

        # Game end conditions
        done = False
        if np.sum(self.state[0, 1:self.size + 1]) == 0:
            # Best score for one player is nChoose2 so use n^2 to denote winning
            bonus = self.size ** 2
            if self.score > 0:
                self.score += bonus
            elif self.score < 0:
                self.score -= bonus
            done = True
            self.state[0, 0] = -1  # This signifies no current value card
        else:
            self.draw()
        # The state must be copied so callers do not alias the live array;
        # the score is an immutable float and needs no copy.
        return self.state.copy(), self.score, done

    def show(self):
        """Print the raw state row followed by the current score."""
        print(self.state, self.score)

In [31]:
# Interface for human moves
class HumanAgent:
    """Agent that asks a person on standard input for every move."""

    def __init__(self):
        return

    def get_action(self, state, illegal_actions):
        """Prompt until the user enters a playable card index.

        Args:
            state: Game state row of shape ``(1, 3 * n + 1)``; only its width is
                used, to work out how many card indices ``n`` exist. May be
                ``None``, in which case no upper bound is enforced.
            illegal_actions: Collection of card indices that may not be played.

        Returns:
            The chosen card index as an ``int``.
        """
        num_actions = None
        if state is not None:
            num_actions = (np.shape(state)[-1] - 1) // 3

        prompt = 'Enter a move: '
        while True:
            try:
                action = int(input(prompt))
            except ValueError:
                # Not an integer: treat like any other rejected move and ask again.
                action = None

            if (action is not None
                    and action >= 0
                    and (num_actions is None or action < num_actions)
                    and action not in illegal_actions):
                return action
            prompt = 'Illegal move! Enter a move: '


# class RandomAgent:
#     def __init__(self):
#         return
    
#     def get_action(self, state, illegal_actions):
#         return np.random.choice(legal_actions)

In [32]:
# Bots
# Checkpoint location placeholder. NOTE(review): nothing in the visible cells
# reads PATH (PolicyNet takes its own `path` argument) -- confirm it is still needed.
PATH = None


class PolicyNet(nn.Module):
    """Fully connected softmax policy over the cards a player can play.

    Args:
        widths: Hidden layer widths, in order. An empty sequence gives a single
            linear layer from features to actions.
        path: Optional path to a saved ``state_dict`` to load after construction.
        size: Number of cards per suit. Defaults to the module-level ``SIZE``.
    """

    def __init__(self, widths=(8, 8), path=None, size=None):
        super().__init__()

        size = SIZE if size is None else int(size)
        self.num_features = size * 3 + 1  # Player hands and value cards, and the current card
        self.num_actions = size

        # Build the module list first; the resulting indices (and therefore the
        # state_dict keys) match appending to the Sequential one layer at a time.
        dims = [self.num_features] + list(widths)
        modules = []
        for n_in, n_out in zip(dims[:-1], dims[1:]):
            modules.append(nn.Linear(n_in, n_out))
            modules.append(nn.ReLU())
        modules.append(nn.Linear(dims[-1], self.num_actions))
        modules.append(nn.Softmax(dim=1))
        self.layers = nn.Sequential(*modules)

        if path is not None:
            # NOTE(review): torch.load unpickles the file, which can execute
            # arbitrary code -- only load checkpoints from a trusted source.
            self.load_state_dict(torch.load(path))

    def forward(self, state):
        """Return action probabilities of shape ``(batch, num_actions)``.

        ``state`` may be a NumPy array (the environment's native type), a nested
        sequence, or a tensor; it is converted to a detached float32 tensor.
        """
        if isinstance(state, torch.Tensor):
            # Same detach-and-cast semantics as torch.tensor(tensor), minus the warning.
            state = state.detach().to(torch.float32)
        else:
            state = torch.tensor(state, dtype=torch.float32)  # Environment is numpy-based; convert
        action_probs = self.layers(state)
        return action_probs

    def get_action(self, state, illegal_actions):
        """Sample a legal card index for the single state row in ``state``.

        Args:
            state: State of shape ``(1, num_features)``.
            illegal_actions: Card indices that must not be chosen.

        Returns:
            The sampled action as a Python ``int``.
        """
        # Only an int leaves this method, so no graph is needed; disabling
        # autograd also makes the in-place masking below safe.
        with torch.no_grad():
            action_probs = self.forward(state)

        masked = torch.tensor(np.asarray(illegal_actions), dtype=torch.long).reshape(-1)
        action_probs[0, masked] = 0

        # If every legal action had (numerically) zero probability, fall back to
        # a uniform choice among the legal ones instead of sampling from NaNs.
        if float(action_probs[0].sum()) <= 0.0:
            action_probs = torch.ones_like(action_probs)
            action_probs[0, masked] = 0

        cat = Categorical(probs=action_probs)  # Renormalises the masked probabilities
        action = cat.sample()
        return action.item()

In [35]:
# Playing the game, and benchmarking functions
def play_game(agent, agent_opp, game=None):
    """Play one game to completion, printing the state each turn and the final score.

    Args:
        agent: Player 1; must provide ``get_action(state, illegal_actions)``.
        agent_opp: Player 2; same interface. Both agents receive the same state.
        game: Optional environment to play on. A fresh ``GOPS()`` is created
            when omitted.

    Returns:
        None. The final score is printed, not returned.
    """
    if game is None:
        game = GOPS()
    state = game.state.copy()
    done = False

    # Main loop
    while not done:
        game.show()
        action = agent.get_action(state, game.get_illegal_actions(player=1))
        action_opp = agent_opp.get_action(state, game.get_illegal_actions(player=2))

        state, _, done = game.step(action, action_opp)

    # The score lives on the game object; there is no `self` in a free function.
    print('Final Score: {}'.format(game.score))
    return

In [34]:
# Demo: a human (prompted on standard input) plays as player 1 against a
# PolicyNet built with no checkpoint path, i.e. with freshly initialised weights.
A = HumanAgent()
B = PolicyNet()

play_game(A, B)

[[2. 1. 1. 0. 1. 1. 1. 1. 1. 1.]] 0.0
Enter a move: 2
[[1. 1. 0. 0. 1. 1. 0. 0. 1. 1.]] 2.0
Enter a move: 1
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]] 2.0
Enter a move: 0
won
