In [8]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from environment import TrucoEnvironment
from agent import Agent, save_agent, load_agent
from actions import game_actions, game_actions_list
import random
import itertools
import torch as T
import numpy as np
import logging

In [9]:
# Run on the first CUDA GPU when PyTorch reports one; otherwise use the CPU.
if T.cuda.is_available():
    device = T.device("cuda:0")
else:
    device = T.device("cpu")

In [10]:
# One agent (and one Player) is created per name.
names = ['test1', 'test2', 'test3', 'test4']

# Hyperparameters shared by every agent; only the Player differs per agent.
agent_hyperparams = dict(
    state_space_dim=339,
    action_space_dim=game_actions.shape[0],
    model_type='shallow',
    device=device,
    replay_buffer_size=25000,
    batch_size=48,
    target_update_freq=1500,
    epsilon_end=0.05,
    epsilon_decay=1500000,
    learning_rate=0.0005,
)

agents = [Agent(player=Player(name), **agent_hyperparams) for name in names]
# Alternative: load previously saved agents instead of creating fresh ones.
#agents = [load_agent(name, device) for name in names]


# Randomise who plays whom before pairing.
random.shuffle(agents)

# Pair consecutive agents (0-1, 2-3, ...) into two-player environments.
# zip() silently drops a trailing unpaired agent if the count is odd.
envs = [
    TrucoEnvironment([first.player, second.player], logging_level=logging.WARNING)
    for first, second in zip(agents[0::2], agents[1::2])
]

In [11]:
def get_env_by_agent(envs, agent):
    """Return the first environment whose ``players`` contains ``agent.player``.

    Environments are checked in the order given; ``None`` is returned when no
    environment contains the agent's player.
    """
    return next(
        (candidate for candidate in envs if agent.player in candidate.players),
        None,
    )

def get_agents_by_env(agents, env):
    """Return the first two agents whose player appears in ``env.players``.

    The pair is returned as a tuple, in the order the agents occur in
    ``agents``. Any further matching agents are ignored.

    Raises:
        IndexError: if fewer than two agents have a player in ``env.players``.
    """
    seated = [candidate for candidate in agents if candidate.player in env.players]
    first, second = seated[0], seated[1]
    return first, second

In [12]:
# Warm-up: before training, fill each agent's replay buffer with transitions
# generated by uniformly random legal play in that agent's environment.
for env in envs:

    playing_agents = get_agents_by_env(agents, env)

    for agent in playing_agents:

        # reset env
        starting_player, legal_actions, game_state = env.reset(False)

        # Initialize the ReplayBuffer: loop while it is below min_replay_size.
        # NOTE(review): every transition is saved to `agent`, regardless of
        # which player is to move — confirm that this is intended.
        while len(agent.replay_buffer) < agent.min_replay_size:
            # Pick one legal action uniformly at random. Called without `size`,
            # np.random.choice returns a single element rather than a
            # 1-element array. The previous call passed the probability list as
            # the third positional argument, which is `replace`, not `p`.
            chosen_action = np.random.choice(legal_actions)
            # env.step() is given the action's position in game_actions_list.
            action = game_actions_list.index(chosen_action)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(starting_player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            # Advance to the state returned by the environment.
            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # NOTE(review): this reset omits the `False` argument used for
                # the initial reset above — confirm the difference is intended.
                starting_player, legal_actions, game_state = env.reset()

In [13]:
# Number of training steps a pair of agents plays in one environment before
# the training loop moves on to the next environment.
PLAYER_SWAP_FREQ = 35_000

In [14]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from timeit import default_timer as timer
import datetime

# Self-play training. itertools.count() never ends, so this cell runs until the
# kernel is interrupted. Each epoch trains every pair of agents for
# PLAYER_SWAP_FREQ steps, then reshuffles the pairings.
for epoch in itertools.count():
    print(f"Starting epoch: {epoch}")
    epoch_start = timer()

    for env in envs:

        env_start = timer()

        starting_player, legal_actions, game_state = env.reset(False)
        playing_agents = get_agents_by_env(agents, env)

        # Main training loop
        for step in itertools.count():
            # The agent whose player is to move acts on this step.
            agent = next(a for a in playing_agents if a.player == starting_player)

            action = agent.choose_action(legal_actions, game_state)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(agent.player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # Give agents their episode reward. A separate loop variable is
                # used so `agent` still refers to the acting agent when
                # agent.learn() runs below.
                for scored_agent in playing_agents:
                    scored_agent.save_reward(env.game.get_score(scored_agent.player))

                starting_player, legal_actions, game_state = env.reset(False)

            # Learn from replay buffer (the agent that just acted).
            agent.learn()

            # Logging
            if step % 100 == 0 and step > 0:
                for logged_agent in playing_agents:
                    avg_reward = np.mean(logged_agent.reward_buffer)
                    # env.games_won is iterated as (player, wins) pairs.
                    wins = max(
                        (w if p == logged_agent.player else 0 for p, w in env.games_won),
                        default=0,
                    )
                    # A freshly created env may not have finished a game yet.
                    win_rate = wins / env.games_played if env.games_played else 0.0
                    elapsed = str(datetime.timedelta(seconds=(timer() - env_start)))
                    print(f"Step: {step} | Player: {logged_agent.player.get_id()} | Avg reward: {avg_reward} | W/L: {win_rate} | Games: {env.games_played} | Time: {elapsed}")

            # Break out of training loop when swap rate is reached
            if step % PLAYER_SWAP_FREQ == 0 and step > 0:
                print(f"Player {playing_agents[0].player.get_id()} vs Player {playing_agents[1].player.get_id()} finished.")

                # Clear GPU cache
                T.cuda.empty_cache()
                break

    # Re-pair the agents for the next epoch.
    random.shuffle(agents)

    # Create new envs; rebinding `envs` discards the old ones. The logging
    # level matches the one used for the initial environments.
    envs = [
        TrucoEnvironment([a1.player, a2.player], logging_level=logging.WARNING)
        for a1, a2 in zip(agents[0::2], agents[1::2])
    ]

    # Save models on every second epoch (skipping epoch 0).
    if epoch % 2 == 0 and epoch > 0:
        for agent in agents:
            save_agent(agent)

    print(f"Epoch {epoch} finished in {str(datetime.timedelta(seconds=(timer() - epoch_start)))}")
    

Starting epoch: 0
Step: 100 | Player: test4 | Avg reward: 1.121212121212121 | W/L: 0.5233968804159446 | Games: 577 | Time: 0:00:00.477810
Step: 100 | Player: test1 | Avg reward: 0.7575757575757576 | W/L: 0.47660311958405543 | Games: 577 | Time: 0:00:00.477974
Step: 200 | Player: test4 | Avg reward: 1.078125 | W/L: 0.5279605263157895 | Games: 608 | Time: 0:00:00.853357
Step: 200 | Player: test1 | Avg reward: 0.828125 | W/L: 0.4720394736842105 | Games: 608 | Time: 0:00:00.853554
Step: 300 | Player: test4 | Avg reward: 1.0449438202247192 | W/L: 0.5244865718799369 | Games: 633 | Time: 0:00:01.227324
Step: 300 | Player: test1 | Avg reward: 0.9213483146067416 | W/L: 0.4755134281200632 | Games: 633 | Time: 0:00:01.227576
Step: 400 | Player: test4 | Avg reward: 0.94 | W/L: 0.5210843373493976 | Games: 664 | Time: 0:00:01.597163
Step: 400 | Player: test1 | Avg reward: 0.98 | W/L: 0.4789156626506024 | Games: 664 | Time: 0:00:01.597359
Step: 500 | Player: test4 | Avg reward: 0.89 | W/L: 0.52098408

KeyboardInterrupt: 

In [None]:
# Persist each agent in `agents` by calling save_agent on it.
for trained_agent in agents:
    save_agent(trained_agent)

In [9]:
# Print each agent's configuration, preceded by a blank line.
for agent in agents:
    print("", agent.get_config(), sep="\n")


{'model_type': 'shallow', 'action_space_dim': 48, 'state_space_dim': 339, 'save_freq': 200000, 'batch_size': 48, 'target_update_freq': 1500, 'min_replay_size': 1000, 'learning_rate': 0.0005, 'replay_buffer_size': 25000, 'reward_buffer_size': 100, 'epsilon_start': 1.0, 'epsilon_end': 0.05, 'epsilon_decay': 1500000, 'gamma': 0.99, 'step': 5757726}

{'model_type': 'shallow', 'action_space_dim': 48, 'state_space_dim': 339, 'save_freq': 200000, 'batch_size': 48, 'target_update_freq': 1500, 'min_replay_size': 1000, 'learning_rate': 0.0005, 'replay_buffer_size': 25000, 'reward_buffer_size': 100, 'epsilon_start': 1.0, 'epsilon_end': 0.05, 'epsilon_decay': 1500000, 'gamma': 0.99, 'step': 5730169}

{'model_type': 'shallow', 'action_space_dim': 48, 'state_space_dim': 339, 'save_freq': 200000, 'batch_size': 48, 'target_update_freq': 1500, 'min_replay_size': 1000, 'learning_rate': 0.0005, 'replay_buffer_size': 25000, 'reward_buffer_size': 100, 'epsilon_start': 1.0, 'epsilon_end': 0.05, 'epsilon_de