In [1]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from environment import TrucoEnvironment
from agent import Agent, save_agent, load_agent
from actions import game_actions, game_actions_list
import random
import itertools
import torch as T
import numpy as np
import logging

In [2]:
# Run on the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
device = T.device("cuda:0") if T.cuda.is_available() else T.device("cpu")

In [5]:
# Identifiers of the previously saved agents that this run resumes from.
load_names = ['Red', 'Green', 'Blue', 'Yellow']

# Disabled alternative: build brand-new players/agents instead of loading saved ones.
# Kept for reference only; note that `names` is not defined anywhere in the visible notebook.
#players = [Player(id) for id in names]
#agents = [Agent(player=players[i], state_space_dim=339, action_space_dim=game_actions.shape[0],model_type='shallow', device=device, replay_buffer_size=25000, batch_size=48, target_update_freq=1500, epsilon_end=0.05, epsilon_decay=1000000, learning_rate=0.0005) for i in range(len(names))]

# Restore one agent per name onto the selected device.
agents = [load_agent(agent_name, device) for agent_name in load_names]

# Randomise the order so that the pairings below differ between runs.
random.shuffle(agents)

# Pair neighbouring agents (0 with 1, 2 with 3, ...) and give each pair its own environment.
envs = [
    TrucoEnvironment([left.player, right.player], logging_level=logging.WARNING)
    for left, right in zip(agents[::2], agents[1::2])
]

In [6]:
def get_env_by_agent(envs, agent):
    """Return the first environment in `envs` whose `players` contains `agent.player`.

    Returns None when no environment contains that player.
    """
    return next((candidate for candidate in envs if agent.player in candidate.players), None)

def get_agents_by_env(agents, env):
    """Return, as a 2-tuple, the first two agents whose player is in `env.players`.

    Agents are considered in the order they appear in `agents`; any matches
    beyond the second are ignored. Raises IndexError when fewer than two
    agents match.
    """
    seated = [candidate for candidate in agents if candidate.player in env.players]
    first, second = seated[0], seated[1]
    return first, second

In [7]:
# Warm-up: fill every agent's replay buffer with random-play transitions until it
# holds at least `agent.min_replay_size` entries.
for env in envs:

    playing_agents = get_agents_by_env(agents, env)

    for agent in playing_agents:

        # Start a fresh game for this agent's warm-up.
        starting_player, legal_actions, game_state = env.reset(False)

        # NOTE(review): every transition below is stored in `agent`'s buffer no matter
        # which player (`starting_player`) is acting - confirm this is intended.
        while len(agent.replay_buffer) < agent.min_replay_size:
            # Pick one legal action uniformly at random. np.random.choice samples
            # uniformly when `p` is omitted; the previous call passed the probability
            # list as the third positional argument, which is `replace`, not `p`.
            action = np.random.choice(legal_actions)
            # Translate the sampled action into its position in the global action list,
            # which is what gets passed to env.step and stored in the transition.
            action = game_actions_list.index(action)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(starting_player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            # Advance to the next turn.
            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # NOTE(review): this calls reset() with its default argument while the
                # initial reset above passes False - confirm the difference is intended.
                starting_player, legal_actions, game_state = env.reset()

In [8]:
# Number of training steps a pairing plays before the training loop moves on to the next one.
PLAYER_SWAP_FREQ = 35_000

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from timeit import default_timer as timer
import datetime

# Number of steps between progress reports for the pairing currently training.
LOG_FREQ = 15000

# Train indefinitely (itertools.count never ends; stop by interrupting the kernel).
# Each epoch lets every pairing play until PLAYER_SWAP_FREQ steps are reached, then
# reshuffles the agents into new pairings.
for epoch in itertools.count():
    print(f"Starting epoch: {epoch}")
    epoch_start = timer()

    for env in envs:

        env_start = timer()

        starting_player, legal_actions, game_state = env.reset(False)
        playing_agents = get_agents_by_env(agents, env)

        # Main training loop
        for step in itertools.count():
            # The agent whose player is due to act this step.
            agent = [candidate for candidate in playing_agents if candidate.player == starting_player][0]

            action = agent.choose_action(legal_actions, game_state)

            # Take action, observe outcome
            rew, done, next_player, next_legal_actions, new_game_state = env.step(agent.player, action)

            # Save transition for training later
            transition = (game_state, action, rew, done, new_game_state)
            agent.save_transition(transition)

            starting_player = next_player
            legal_actions = next_legal_actions
            game_state = new_game_state

            if done:
                # Give agents their episode reward. A separate loop variable is used so
                # that `agent` keeps referring to the agent that acted; reusing `agent`
                # here made the learn() call below run on the last agent of the pair.
                for finished_agent in playing_agents:
                    finished_agent.save_reward(env.game.get_score(finished_agent.player))

                starting_player, legal_actions, game_state = env.reset(False)

            # Learn from replay buffer (the agent that just acted)
            agent.learn()

            # Logging
            if step % LOG_FREQ == 0 and step > 0:
                for logged_agent in playing_agents:
                    avg_reward = np.mean(logged_agent.reward_buffer)
                    # Wins recorded for this agent's player; 0 when it has no entry.
                    wins = max([w if p == logged_agent.player else 0 for p, w in env.games_won], default=0)
                    # Avoid ZeroDivisionError if no game has finished yet.
                    win_rate = wins / env.games_played if env.games_played else float("nan")
                    print(f"Step: {step} | Player: {logged_agent.player.get_id()} | Avg reward: {avg_reward} | W/L: {win_rate} | Games: {env.games_played} | Time: {str(datetime.timedelta(seconds=(timer() - env_start)))}")

            # Break out of training loop when swap rate is reached
            if step % PLAYER_SWAP_FREQ == 0 and step > 0:
                print(f"Player {playing_agents[0].player.get_id()} vs Player {playing_agents[1].player.get_id()} finished.")

                # Clear GPU cache
                T.cuda.empty_cache()
                break

    # Destroy all envs
    envs.clear()

    random.shuffle(agents)

    # Create new Envs. The logging level is passed explicitly so these environments
    # are configured the same way as the ones built when the agents were first paired.
    envs = [TrucoEnvironment([a1.player, a2.player], logging_level=logging.WARNING) for a1, a2 in zip(agents[0::2], agents[1::2])]

    # Save models on every second epoch (skipping epoch 0)
    if epoch % 2 == 0 and epoch > 0:
        for trained_agent in agents:
            save_agent(trained_agent)

    print(f"Epoch {epoch} finished in {str(datetime.timedelta(seconds=(timer() - epoch_start)))}")
    

Starting epoch: 0
Step: 15000 | Player: Yellow | Avg reward: 1.07 | W/L: 0.5038265306122449 | Games: 784 | Time: 0:00:44.118636
Step: 15000 | Player: Red | Avg reward: 1.01 | W/L: 0.4961734693877551 | Games: 784 | Time: 0:00:44.118806
Step: 30000 | Player: Yellow | Avg reward: 0.98 | W/L: 0.5005107252298263 | Games: 979 | Time: 0:01:30.659024
Step: 30000 | Player: Red | Avg reward: 0.96 | W/L: 0.49948927477017363 | Games: 979 | Time: 0:01:30.659222
Player Yellow vs Player Red finished.
Step: 15000 | Player: Green | Avg reward: 0.98 | W/L: 0.5298804780876494 | Games: 753 | Time: 0:00:46.560126
Step: 15000 | Player: Blue | Avg reward: 1.01 | W/L: 0.4701195219123506 | Games: 753 | Time: 0:00:46.560294
Step: 30000 | Player: Green | Avg reward: 1.2 | W/L: 0.5285565939771547 | Games: 963 | Time: 0:01:33.275296
Step: 30000 | Player: Blue | Avg reward: 0.76 | W/L: 0.4714434060228453 | Games: 963 | Time: 0:01:33.275461
Player Green vs Player Blue finished.
Epoch 0 finished in 0:03:35.248071
Sta

In [8]:
# Persist every agent via save_agent, e.g. after interrupting the training loop.
for trained_agent in agents:
    save_agent(trained_agent)

Model Blue saved. 
Model Yellow saved. 
Model Green saved. 
Model Red saved. 


In [9]:
# Show each agent's `step` attribute, one value per line.
for trained_agent in agents:
    print(trained_agent.step)

19783433
19791951
19770908
19799143
