In [1]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from actions import game_actions
from environment import TrucoEnvironment
from actions import game_actions, game_actions_list

In [2]:
num_players = 2

players = [Player(id) for id in range(num_players)]

p0 = players[0]
p1 = players[1]

env = TrucoEnvironment(players)

In [3]:
import torch as T
from torch import nn
from collections import deque
import itertools
import numpy as np
import random

In [4]:
device = T.device("cuda:0" if T.cuda.is_available() else "cpu")

In [5]:
class DQNetwork(nn.Module):
    
    def __init__(self, state_space_dim, action_space_dim):
        super().__init__()
        
        self.net = nn.Sequential(
            nn.Linear(state_space_dim, 256),
            nn.Tanh(),
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, action_space_dim)
        )
        
    def forward(self, x):
        return self.net(x)
    
    def act(self, game_state_t):
        q_values = self(game_state_t.unsqueeze(0))
        
        return q_values.detach().squeeze()

In [6]:
class Agent:
    def __init__(self, 
                 player, 
                 state_space_dim, 
                 action_space_dim, 
                 device, 
                 batch_size=32,
                 target_update_freq=1000,
                 min_replay_size=1000,
                 learning_rate=5e-4, 
                 replay_buffer_size=50000, 
                 reward_buffer_size=100, 
                 epsilon_start=1.0, 
                 epsilon_end=0.02, 
                 epsilon_decay=10000,
                 gamma=0.99
                ):
        
        self.player = player
        
        # Initialize the NNs
        self.online_net = DQNetwork(
            state_space_dim, 
            action_space_dim
        ).to(device)

        self.target_net = DQNetwork(
            state_space_dim, 
            action_space_dim
        ).to(device)

        # Initialize both with the same weight
        self.target_net.load_state_dict(self.online_net.state_dict())

        # Initialize optimizer with online_net 
        self.optimizer = T.optim.Adam(self.online_net.parameters(), lr=learning_rate)
        
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.min_replay_size = min_replay_size
        
        self.reward_buffer = deque([0.0], maxlen=reward_buffer_size)
        
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma

In [7]:
agents = [Agent(player, env.state_space_dim, env.action_space_dim, device, batch_size=64) for player in players]

In [8]:
for agent in agents:

    # reset env
    starting_player, legal_actions, game_state_t = env.reset()

    # Initialize the ReplayBuffer
    while len(agent.replay_buffer) < agent.min_replay_size:
        # pick legal action given uniform distribution
        action = np.random.choice(legal_actions, 1, [1/len(legal_actions) for i in legal_actions])
        action = game_actions_list.index(action)

        # Take action, observer outcome
        rew, done, next_player, next_legal_actions, new_game_state_t = env.step(starting_player, action)

        #Save transition for training later
        transition = (game_state_t, action, rew, done, new_game_state_t)
        agent.replay_buffer.append(transition)

        starting_player = next_player
        legal_actions = next_legal_actions
        game_state_t = new_game_state_t

        if done:
            starting_player, legal_actions, game_state_t = env.reset()

In [9]:
# reset env
starting_player, legal_actions, game_state_t = env.reset()
episode_reward = 0.0
best_avg_reward = -10
worst_avg_reward = 10

In [None]:


# Main training loop
for step in itertools.count():
    agent = [agent for agent in agents if agent.player == starting_player][0]
    # Linearly decrease epsilon 
    epsilon = np.interp(step, [0, agent.epsilon_decay], [agent.epsilon_start, agent.epsilon_end])
    
    use_random = random.random() <= epsilon
    
    action = None
    if use_random:
        # pick legal action given uniform distribution
        action = np.random.choice(legal_actions, 1, [1/len(legal_actions) for i in legal_actions])
        action = game_actions_list.index(action)
    else: 
        # Compute Q-Values
        q_values = agent.online_net.act(game_state_t)
        # Get index of best action
        action = np.argmax(q_values, axis=0)
        
    # Take action, observer outcome
    rew, done, next_player, next_legal_actions, new_game_state_t = env.step(starting_player, action)
    
    #Save transition for training later
    transition = (game_state_t, action, rew, done, new_game_state_t)
    agent.replay_buffer.append(transition)
    
    starting_player = next_player
    legal_actions = next_legal_actions
    game_state_t = new_game_state_t
    
    episode_reward += rew
    
    if done:
        starting_player, legal_actions, game_state_t = env.reset()
        
        agent.reward_buffer.append(episode_reward)
        episode_reward = 0
    
    # Start Gradient Step
    transitions = random.sample(agent.replay_buffer, agent.batch_size)
    
    obs_t = np.vstack([t[0] for t in transitions])
    actions_t = np.asarray([t[1] for t in transitions], dtype=np.int64)
    rews_t = np.asarray([t[2] for t in transitions], dtype=np.float32)
    dones_t = np.asarray([t[3] for t in transitions], dtype=np.float32)
    new_obs_t = np.vstack([t[4] for t in transitions])
    
 
    obs_t = T.as_tensor(obs_t).to(device)
    actions_t = T.as_tensor(actions_t).unsqueeze(-1).to(device)
    rews_t = T.as_tensor(rews_t).to(device)
    new_obs_t = T.as_tensor(new_obs_t).to(device)
    dones_t = T.as_tensor(dones_t).to(device)
    
    # Compute Targets
    target_q_values = agent.target_net(new_obs_t)
    max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]
    
    targets = rews_t + agent.gamma * (1 - dones_t) * max_target_q_values
    
    # Compute Loss
    q_values = agent.online_net(obs_t)
    
    action_q_values = T.gather(input=q_values, dim=1, index=actions_t)
    loss = nn.functional.smooth_l1_loss(action_q_values, max_target_q_values)

    # Gradient Descent
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
    
    # Update Target Network
    if step % agent.target_update_freq == 0:
        agent.target_net.load_state_dict(agent.online_net.state_dict())
        
    #Logging
    if step % 1000 == 0:
        avg_reward = np.mean(agent.reward_buffer)
        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
        if avg_reward < worst_avg_reward:
            worst_avg_reward = avg_reward
        print(f"Step: {step} --- Avg reward: {avg_reward} --- Best reward: {best_avg_reward} --- Worst reward: {}", end='\r')
    

Step: 591000 --- Avg reward: -1.47