In [1]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from environment import TrucoEnvironment
from agent import Agent, save_agent, load_agent, RandomAgent
from actions import game_actions, game_actions_list
import random
import itertools
import torch as T
import numpy as np
import logging

In [2]:
# Use the first CUDA device when PyTorch reports one; otherwise stay on the CPU.
device = T.device("cuda:0") if T.cuda.is_available() else T.device("cpu")
# Logging level handed to every TrucoEnvironment created in this notebook.
log_level = logging.WARNING

In [3]:
# Names passed to load_agent (together with `device`) to obtain the non-random agents.
load_names = ['Coors']

loaded_agents = [load_agent(agent_name, device) for agent_name in load_names]

# Agent pool: every loaded agent plus one RandomAgent opponent.
agents = loaded_agents + [RandomAgent(Player("Random Bob"))]

# Randomize who gets paired with whom.
random.shuffle(agents)

# Pair neighbours (0,1), (2,3), ... into one environment each. zip() stops at
# the shorter slice, so with an odd pool size the last agent gets no environment.
envs = [
    TrucoEnvironment([left.player, right.player], logging_level=log_level)
    for left, right in zip(agents[::2], agents[1::2])
]

In [4]:
def get_env_by_agent(envs, agent):
    """Return the first environment in ``envs`` that contains ``agent.player``.

    Args:
        envs: Iterable of environments, each exposing a ``players`` collection.
        agent: Object exposing a ``player`` attribute.

    Returns:
        The first matching environment, or ``None`` when no environment
        lists the agent's player.
    """
    player = agent.player
    return next((candidate for candidate in envs if player in candidate.players), None)

def get_agents_by_env(agents, env):
    """Return the first two agents whose player takes part in ``env``.

    Args:
        agents: Iterable of objects exposing a ``player`` attribute.
        env: Environment exposing a ``players`` collection.

    Returns:
        A 2-tuple with the first two matching agents, in ``agents`` order.

    Raises:
        IndexError: If fewer than two agents have a player in ``env.players``.
    """
    seated = [candidate for candidate in agents if candidate.player in env.players]
    return seated[0], seated[1]

In [5]:
# Step count at which the training loop stops playing the current pairing so
# that the agents can be reshuffled into new environments.
PLAYER_SWAP_FREQ = 35_000

In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from timeit import default_timer as timer
import datetime

# How often (in environment steps) per-agent statistics are printed. Named
# here instead of being a bare literal inside the loop so that it is visibly
# separate from PLAYER_SWAP_FREQ and can be tuned on its own.
LOG_FREQ = 35000

# Endless self-play: every epoch plays each environment for PLAYER_SWAP_FREQ
# steps, then reshuffles the agents into fresh pairings. The loop has no exit
# condition of its own and runs until the kernel is interrupted.
for epoch in itertools.count():
    print(f"Starting epoch: {epoch}")
    epoch_start = timer()

    for env in envs:

        env_start = timer()

        starting_player, legal_actions, game_state = env.reset(False)
        playing_agents = get_agents_by_env(agents, env)

        # Main training loop
        for step in itertools.count():
            # Pick the agent that controls the player whose turn it is.
            agent = [a for a in playing_agents if a.player == starting_player][0]

            action = agent.choose_action(legal_actions, game_state)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(agent.player, action)

            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # Give agents their episode reward. A separate loop variable is
                # used so the acting `agent` above is not shadowed.
                for playing_agent in playing_agents:
                    playing_agent.save_reward(env.game.get_score(playing_agent.player))

                starting_player, legal_actions, game_state = env.reset(False)

            # Logging
            if step % LOG_FREQ == 0 and step > 0:
                for playing_agent in playing_agents:
                    avg_reward = np.mean(playing_agent.reward_buffer)
                    wins = max([w if p == playing_agent.player else 0 for p, w in env.games_won])
                    # Guard the ratio: if no game has been recorded yet the
                    # division would raise ZeroDivisionError.
                    win_ratio = wins / env.games_played if env.games_played else 0.0
                    elapsed = datetime.timedelta(seconds=(timer() - env_start))
                    print(
                        f"Step: {step} | Player: {playing_agent.player.get_id()} | "
                        f"Avg reward: {avg_reward} | W/L: {win_ratio} | "
                        f"Games: {env.games_played} | Time: {elapsed}"
                    )

            # Break out of training loop when swap rate is reached
            if step % PLAYER_SWAP_FREQ == 0 and step > 0:
                print(f"Player {playing_agents[0].player.get_id()} vs Player {playing_agents[1].player.get_id()} finished.")

                # Clear GPU cache
                T.cuda.empty_cache()
                break

    # Destroy all envs
    envs.clear()

    random.shuffle(agents)

    # Create new Envs
    envs = [TrucoEnvironment([a1.player, a2.player], logging_level=log_level) for a1, a2 in zip(agents[0::2], agents[1::2])]

    print(f"Epoch {epoch} finished in {str(datetime.timedelta(seconds=(timer() - epoch_start)))}")
    

Starting epoch: 0
Step: 35000 | Player: Random Bob | Avg reward: 0.96 | W/L: 0.5081256771397616 | Games: 923 | Time: 0:00:20.776738
Step: 35000 | Player: Coors | Avg reward: 0.88 | W/L: 0.49187432286023836 | Games: 923 | Time: 0:00:20.776963
Player Random Bob vs Player Coors finished.
Epoch 0 finished in 0:00:20.777065
Starting epoch: 1
Step: 35000 | Player: Coors | Avg reward: 1.02 | W/L: 0.5111607142857143 | Games: 896 | Time: 0:00:20.614623
Step: 35000 | Player: Random Bob | Avg reward: 1.02 | W/L: 0.4888392857142857 | Games: 896 | Time: 0:00:20.614784
Player Coors vs Player Random Bob finished.
Epoch 1 finished in 0:00:20.614890
Starting epoch: 2
Step: 35000 | Player: Random Bob | Avg reward: 0.96 | W/L: 0.5423925667828107 | Games: 861 | Time: 0:00:20.528799
Step: 35000 | Player: Coors | Avg reward: 1.03 | W/L: 0.45760743321718933 | Games: 861 | Time: 0:00:20.528960
Player Random Bob vs Player Coors finished.
Epoch 2 finished in 0:00:20.529069
Starting epoch: 3
Step: 35000 | Player

KeyboardInterrupt: 