In [1]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from environment import TrucoEnvironment
from agent import Agent, save_agent, load_agent
from actions import game_actions, game_actions_list
import random
import itertools
import torch as T
import numpy as np

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if T.cuda.is_available():
    device = T.device("cuda:0")
else:
    device = T.device("cpu")

In [3]:
# Roster configuration: one Player and one learning Agent per name.
num_players = 6
ids = list(range(0, num_players))
names = ['John', 'Holie', 'Alice', 'Bob', 'Chris', 'Kevin']

# Each player is identified by its name.
players = [Player(name) for name in names]

# Build one agent per player. The roster size comes from num_players instead of
# a repeated literal, so the two cannot drift apart. Every agent shares the same
# hyper-parameters except the replay buffer, which grows by 10000 entries with
# the agent's position in the roster.
agents = [
    Agent(
        player=players[i],
        state_space_dim=339,
        action_space_dim=game_actions.shape[0],
        model_type='shallow',
        device=device,
        replay_buffer_size=15000 + 10000*i,
        batch_size=64,
        target_update_freq=1500,
        epsilon_end=0.05,
        epsilon_decay=1000000,
        learning_rate=0.001,
    )
    for i in range(num_players)
]

# Randomise the pairings, then seat neighbouring agents (0-1, 2-3, ...) at the
# same table. With an odd number of agents the last one is left without a table.
random.shuffle(agents)

envs = [TrucoEnvironment([a1.player, a2.player]) for a1, a2 in zip(agents[0::2], agents[1::2])]

In [4]:
def get_env_by_agent(envs, agent):
    """Return the first environment whose ``players`` contains ``agent.player``.

    Environments are checked in iteration order. ``None`` is returned when no
    environment seats the agent's player.
    """
    seated_in = (table for table in envs if agent.player in table.players)
    return next(seated_in, None)

def get_agents_by_env(agents, env):
    """Return the first two agents whose player is seated in ``env``.

    Agents are matched in the order they appear in ``agents``. The result is a
    2-tuple; an ``IndexError`` is raised when fewer than two agents match.
    """
    seated = [candidate for candidate in agents if candidate.player in env.players]
    first, second = seated[0], seated[1]
    return first, second

In [5]:
# Warm-up: fill every agent's replay buffer with transitions from random play
# until it holds agent.min_replay_size entries.
for env in envs:

    playing_agents = get_agents_by_env(agents, env)

    for agent in playing_agents:

        # Start each agent's warm-up from a freshly reset game.
        starting_player, legal_actions, game_state = env.reset(False)

        # Every step taken here (whichever player is on turn) is stored in
        # this agent's buffer.
        while len(agent.replay_buffer) < agent.min_replay_size:
            # Pick a legal action uniformly at random. np.random.choice is
            # uniform by default and returns a scalar when no size is given.
            # The previous call passed the probability list positionally, where
            # it landed in `replace` rather than `p`, and produced a 1-element
            # array instead of a single action.
            action = np.random.choice(legal_actions)
            # Translate the sampled action into its position in the global action list.
            action = game_actions_list.index(action)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(starting_player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            # Advance to the next turn.
            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # NOTE(review): this reset uses the default arguments while the
                # initial reset above passes False — confirm this is intended.
                starting_player, legal_actions, game_state = env.reset()

In [6]:
# Number of training steps a pairing plays before the training loop moves on
# to the next environment.
PLAYER_SWAP_FREQ = 50_000

In [11]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Self-play training. Each epoch lets every table play PLAYER_SWAP_FREQ steps,
# then reshuffles the agents into new pairings. The epoch loop has no exit
# condition; interrupt the kernel to stop training.
for epoch in itertools.count():
    print(f"Starting epoch: {epoch}")

    for env in envs:

        starting_player, legal_actions, game_state = env.reset(False, goes_first=1)
        playing_agents = get_agents_by_env(agents, env)

        # Main training loop
        for step in itertools.count():
            # The agent whose player is on turn acts, stores the transition and learns.
            agent = [candidate for candidate in playing_agents if candidate.player == starting_player][0]

            action = agent.choose_action(legal_actions, game_state)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(agent.player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # Give agents their episode reward.
                # A distinct loop name is used so that `agent` stays bound to the
                # acting agent; reusing `agent` here made the learn() call below
                # run on the last seated agent instead.
                for seated_agent in playing_agents:
                    seated_agent.save_reward(env.game.get_score(seated_agent.player))

                starting_player, legal_actions, game_state = env.reset(False)

            # Learn from replay buffer
            agent.learn()

            # Logging
            if step % 25000 == 0 and step > 0:
                for reported in playing_agents:
                    avg_reward = np.mean(reported.reward_buffer)
                    # env.games_won is iterated as (player, wins) pairs; pick this player's count.
                    wins = max([w if p == reported.player else 0 for p, w in env.games_won])
                    print(f"Step: {step} | Player: {reported.player.get_id()} | Avg reward: {avg_reward} | W/L: {wins/env.games_played} | Games: {env.games_played}")

            # Break out of training loop when swap rate is reached
            if step % PLAYER_SWAP_FREQ == 0 and step > 0:
                print(f"Player {playing_agents[0].player.get_id()} vs Player {playing_agents[1].player.get_id()} finished.")

                # Clear GPU cache
                T.cuda.empty_cache()
                break

    # Destroy all envs
    envs.clear()

    random.shuffle(agents)

    # Create new Envs
    envs = [TrucoEnvironment([a1.player, a2.player]) for a1, a2 in zip(agents[0::2], agents[1::2])]

    # Save models on every even epoch after the first.
    if epoch % 2 == 0 and epoch > 0:
        for trained_agent in agents:
            save_agent(trained_agent)
    

Starting epoch: 0
Step: 25000 | Player: Alice | Avg reward: 1.09 | W/L: 0.5362177250523378 | Games: 7165
Step: 25000 | Player: John | Avg reward: 0.95 | W/L: 0.4637822749476622 | Games: 7165
Step: 50000 | Player: Alice | Avg reward: 1.07 | W/L: 0.5342197035745423 | Games: 13764
Step: 50000 | Player: John | Avg reward: 1.1 | W/L: 0.4657802964254577 | Games: 13764
Player Alice vs Player John finished.
Step: 25000 | Player: Bob | Avg reward: 1.0 | W/L: 0.5246563060370592 | Games: 6692
Step: 25000 | Player: Kevin | Avg reward: 1.03 | W/L: 0.47534369396294085 | Games: 6692
Step: 50000 | Player: Bob | Avg reward: 1.01 | W/L: 0.5223010833022039 | Games: 13385
Step: 50000 | Player: Kevin | Avg reward: 0.9 | W/L: 0.477698916697796 | Games: 13385
Player Bob vs Player Kevin finished.
Step: 25000 | Player: Chris | Avg reward: 1.0 | W/L: 0.5338167825958265 | Games: 6757
Step: 25000 | Player: Holie | Avg reward: 1.1 | W/L: 0.46618321740417346 | Games: 6757
Step: 50000 | Player: Chris | Avg reward: 1

NameError: name 'save_agent' is not defined

In [4]:
# Re-import the agent helpers so this cell does not rely on an earlier import cell.
from agent import Agent, load_agent, save_agent

# Load the agent stored under the id 'John' onto the selected device
# (presumably one written earlier by save_agent — confirm).
a = load_agent('John', device)