In [None]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from actions import game_actions
from environment import TrucoEnvironment

In [None]:
# Build one Player per id in [0, num_players) and hand them all to the environment.
num_players = 2
players = [Player(player_id) for player_id in range(num_players)]

# Short handles for the first two players.
p0, p1 = players[0], players[1]

# The environment is constructed around the same Player objects.
env = TrucoEnvironment(players)

In [None]:
import torch
from torch import nn
from collections import deque
import itertools
import numpy as np
import random

In [None]:
# Hyperparameters for the DQN experiment.
GAMA = 0.99                # presumably the discount factor (gamma); not referenced in the cells shown - TODO confirm
BATCH_SIZE = 32            # presumably the training minibatch size; not referenced in the cells shown - TODO confirm
BUFFER_SIZE = 50_000       # maxlen of the replay buffer deque
MIN_REPLAY_SIZE = 1_000    # number of transitions the warm-up loop collects
EPSILON_START = 1.0        # epsilon at step 0 of the interpolation schedule
EPSILON_END = 0.02         # epsilon once EPSILON_DECAY steps have elapsed
EPSILON_DECAY = 10_000     # number of steps over which epsilon is interpolated
TARGET_UPDATE_FREQ = 1_000 # presumably steps between target-network syncs; not referenced in the cells shown - TODO confirm

In [None]:
class DQNetwork(nn.Module):
    """Small fully connected Q-network: a state vector in, one value per action index out.

    The architecture is Linear -> Tanh -> Linear.

    Args:
        state_space_dim: Number of input features.
        action_space_dim: Number of outputs (one per action index).
        hidden_dim: Width of the single hidden layer. Defaults to 64, the
            value that used to be hard-coded.
    """

    def __init__(self, state_space_dim, action_space_dim, hidden_dim=64):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(state_space_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_space_dim),
        )

    def forward(self, x):
        """Run ``x`` through the network and return the raw outputs."""
        return self.net(x)

    def act(self, game_state_t):
        """Return the Q-values for a single, unbatched state tensor.

        Despite the name, this returns the vector of Q-values rather than a
        chosen action; callers pick the action themselves (e.g. via argmax).

        Args:
            game_state_t: Tensor holding one state (no batch dimension).

        Returns:
            Tensor of shape ``(action_space_dim,)`` that is not attached to
            the autograd graph.
        """
        # Inference only: skip building the autograd graph instead of
        # building it and detaching afterwards.
        with torch.no_grad():
            q_values = self(game_state_t.unsqueeze(0))

        # Drop only the batch dimension added above. A bare squeeze() would
        # also collapse the action dimension when action_space_dim == 1.
        return q_values.squeeze(0)

In [None]:
# Transition store; the deque discards its oldest entry once BUFFER_SIZE is reached.
replay_buffer = deque(maxlen=BUFFER_SIZE)

# Sliding window over the last 100 episode rewards, seeded with a single 0.0.
reward_buffer = deque(maxlen=100)
reward_buffer.append(0.0)

# Running reward total for the episode currently in progress.
episode_reward = 0.0

In [None]:
def initialize_agent():
    """Create an (online, target) pair of Q-networks that start with identical weights.

    Both networks are sized from the module-level ``env``: ``env.state_space_dim``
    inputs and ``env.action_space_dim`` outputs.

    Returns:
        tuple: ``(online_net, target_net)``.
    """

    def _new_network():
        # Fresh, randomly initialised network matching the environment's dimensions.
        return DQNetwork(
            state_space_dim=env.state_space_dim,
            action_space_dim=env.action_space_dim,
        )

    online_net = _new_network()
    target_net = _new_network()

    # Copy the online weights so both networks begin from the same parameters.
    target_net.load_state_dict(online_net.state_dict())

    return online_net, target_net

# One independent (online_net, target_net) pair per player, stored as (player, nets).
agents = [(player, initialize_agent()) for player in players]

In [None]:
# Start a fresh game before collecting warm-up experience.
starting_player, legal_actions, game_state_t = env.reset()

# Pre-fill the replay buffer with MIN_REPLAY_SIZE transitions of uniformly random legal play.
while len(replay_buffer) < MIN_REPLAY_SIZE:
    # Uniform draw over the legal actions. The probability list that used to be
    # passed as the third positional argument was bound to `replace`, not `p`;
    # uniform is already the default, so it is dropped. size=1 is kept so that
    # env.step receives the same length-1 array as before.
    action = np.random.choice(legal_actions, size=1)

    # Apply the action for the player whose turn it is and observe the outcome.
    rew, done, next_player, next_legal_actions, new_game_state_t = env.step(starting_player, action)

    # Save the transition for training later.
    transition = (game_state_t, action, rew, done, new_game_state_t)
    replay_buffer.append(transition)

    if done:
        # Episode over: start a new game.
        starting_player, legal_actions, game_state_t = env.reset()
    else:
        # Otherwise carry on from the state the environment just returned.
        starting_player = next_player
        legal_actions = next_legal_actions
        game_state_t = new_game_state_t
        

In [None]:
# NOTE(review): this empties replay_buffer, discarding every transition the
# warm-up loop collected. Confirm it is intentional before running the
# notebook top to bottom.
replay_buffer.clear()

In [9]:
# Start a new game: env.reset() returns the player to act first, the legal
# actions available and the initial game state.
starting_player, legal_actions, game_state_t = env.reset()


04-Oct-21 00:54:54 - INFO - New Game.


In [None]:
from actions import game_actions

# Main training loop.
# NOTE(review): as written this cell only plays and records transitions; it
# contains no gradient step or target-network sync, and it runs until interrupted.
# NOTE(review): the counter starts at 500, so epsilon begins partway down its
# schedule rather than at EPSILON_START - confirm that is intended.
for step in itertools.count(500):
    # Linearly interpolate epsilon from EPSILON_START to EPSILON_END over
    # EPSILON_DECAY steps; np.interp holds the end value beyond that range.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

    use_random = random.random() <= epsilon

    if use_random:
        # Explore: uniform draw over the legal actions. The probability list
        # that used to be passed as the third positional argument was bound to
        # `replace`, not `p`; uniform is the default, so it is dropped.
        # size=1 keeps the length-1 array that env.step has been receiving.
        action = np.random.choice(legal_actions, size=1)
    else:
        # Exploit: query the online network of the player whose turn it is.
        online_net = next(nets[0] for player, nets in agents if player == starting_player)
        q_values = online_net.act(game_state_t)
        # NOTE(review): this is an index over the full network output. It is
        # not restricted to legal_actions and, unlike the random branch, is not
        # an element of legal_actions - confirm env.step accepts it.
        action = np.argmax(q_values, axis=0)

    # Apply the action and observe the outcome.
    rew, done, next_player, next_legal_actions, new_game_state_t = env.step(starting_player, action)

    # Save the transition for training later.
    transition = (game_state_t, action, rew, done, new_game_state_t)
    replay_buffer.append(transition)

    episode_reward += rew

    if done:
        # Episode over: start a new game and record the finished episode's reward.
        starting_player, legal_actions, game_state_t = env.reset()

        reward_buffer.append(episode_reward)
        episode_reward = 0.0  # float, matching how episode_reward is first initialised
    else:
        starting_player = next_player
        legal_actions = next_legal_actions
        game_state_t = new_game_state_t

04-Oct-21 00:54:54 - INFO - Player 0 | Cards: ['O11', 'C5'] played E7
04-Oct-21 00:54:54 - INFO - Player 1 | Cards: ['B2', 'O12', 'C11'] called ['envido'].
04-Oct-21 00:54:54 - INFO - Player 0 | Cards: ['O11', 'C5'] called ['no quiero'] envido
04-Oct-21 00:54:54 - DEBUG - Player 1 | Cards: ['B2', 'O12', 'C11'] was rewarded 1 for winning envido.
04-Oct-21 00:54:54 - INFO - Player 1 | Cards: ['B2', 'C11'] played O12
04-Oct-21 00:54:54 - DEBUG - Player 1 | Cards: ['B2', 'C11'] won the round playing O12. They will start the next one.
04-Oct-21 00:54:54 - DEBUG - Round finished
04-Oct-21 00:54:54 - INFO - Player 0 | Cards: ['O11', 'C5'] folded.
04-Oct-21 00:54:54 - DEBUG - Player 1 | Cards: ['B2', 'C11'] was rewarded 1 for winning hand.
04-Oct-21 00:54:54 - INFO - Hand finished.
04-Oct-21 00:54:54 - INFO - Player 0 | Cards: ['O11', 'C5'] scored 0
04-Oct-21 00:54:54 - INFO - Player 1 | Cards: ['B2', 'C11'] scored 2
04-Oct-21 00:54:54 - INFO - New Game.
04-Oct-21 00:54:54 - INFO - Player 0 | 

In [None]:
from actions import game_actions

# Epsilon-greedy selection written as an explicit probability distribution over
# game_actions: every legal action receives epsilon / n_legal, and the best
# legal action receives the remaining (1 - epsilon) on top of that.

# Step 0 of the schedule, i.e. EPSILON_START.
epsilon = np.interp(0, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])

# Q-values from the online network of the player whose turn it is.
online_net = next(nets[0] for player, nets in agents if player == starting_player)
# Assumes the network emits one Q-value per entry of game_actions, in iteration
# order - TODO confirm against env.action_space_dim.
q_values = np.asarray(online_net.act(game_state_t), dtype=float)

# Position i of every array below corresponds to the i-th action in game_actions.
all_actions = list(game_actions)
legal_mask = np.array([action in legal_actions for action in all_actions], dtype=bool)
if not legal_mask.any():
    raise ValueError("none of the legal actions is present in game_actions")

# Exploration mass, spread uniformly over the legal actions. Dividing by the
# mask count (rather than len(legal_actions)) keeps the total at exactly epsilon.
probs = legal_mask.astype(float) * epsilon / legal_mask.sum()

# Exploitation mass goes to the best *legal* action; illegal ones are masked out
# so they can never receive probability.
best_action_index = int(np.argmax(np.where(legal_mask, q_values, -np.inf)))
probs[best_action_index] += 1.0 - epsilon

# action_index is a position in game_actions (what probs is aligned with), so
# look the action up there rather than in legal_actions.
action_index = np.random.choice(len(probs), p=probs)
action = all_actions[action_index]
action