In [1]:
# Install the pinned PyTorch build (1.9.1+cu111) into the current Jupyter kernel.
# `sys.executable` is the interpreter running this kernel, so pip installs into
# that same environment; `-f` adds the PyTorch wheel index as an extra source.
import sys
!{sys.executable} -m pip install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


You should consider upgrading via the 'c:\users\bautista\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [1]:
from player import Player
from dealer import Dealer
from game import TrucoGame
from environment import TrucoEnvironment
from actions import game_actions, game_actions_list

In [2]:
# Game setup: create the players, then the environment they share.
num_players = 2
players = [Player(player_id) for player_id in range(num_players)]

# Convenience handles for the first two players.
p0, p1 = players[0], players[1]

env = TrucoEnvironment(players)

In [3]:
import torch as T
from torch import nn
from collections import deque
import itertools
import numpy as np
import random

In [4]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if T.cuda.is_available():
    device = T.device("cuda:0")
else:
    device = T.device("cpu")

In [5]:
class DQNetwork(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    The hidden stack is 256 -> 128 -> 64 units with ReLU activations between
    the linear layers; the output layer has ``action_space_dim`` units and no
    activation.
    """

    def __init__(self, state_space_dim, action_space_dim):
        """Build the layer stack.

        Args:
            state_space_dim: Number of input features.
            action_space_dim: Number of outputs (one Q-value per action).
        """
        super().__init__()

        # Layers are created in the same order as a hand-written Sequential,
        # so parameter names (net.0, net.2, ...) and initialisation order match.
        widths = [state_space_dim, 256, 128, 64]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], action_space_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values produced by the layer stack for input ``x``."""
        return self.net(x)

    def act(self, game_state_t):
        """Return detached Q-values for a single, unbatched state tensor.

        A leading batch dimension is added before the forward pass and all
        size-1 dimensions are squeezed out of the result.
        """
        batched_state = game_state_t.unsqueeze(0)
        q_values = self(batched_state)
        return q_values.detach().squeeze()

In [6]:
import random
from actions import game_actions, game_actions_list

class Agent:
    """DQN agent that acts on behalf of a single player.

    Holds an online Q-network, a target Q-network initialised with the same
    weights, an Adam optimizer over the online network, a bounded replay
    buffer of transitions and a bounded buffer of episode rewards.
    """

    def __init__(self, 
                 player, 
                 state_space_dim, 
                 action_space_dim, 
                 device, 
                 batch_size=32,
                 target_update_freq=1000,
                 min_replay_size=1000,
                 learning_rate=5e-4, 
                 replay_buffer_size=50000, 
                 reward_buffer_size=100, 
                 epsilon_start=1.0, 
                 epsilon_end=0.02, 
                 epsilon_decay=10000,
                 gamma=0.99
                ):
        """Create the networks, optimizer and buffers.

        Args:
            player: The player object this agent controls.
            state_space_dim: Input size of the Q-networks.
            action_space_dim: Output size of the Q-networks.
            device: Torch device the networks are moved to.
            batch_size: Number of transitions sampled per gradient step.
            target_update_freq: Steps between target-network syncs.
            min_replay_size: Transitions to collect before training starts.
            learning_rate: Adam learning rate.
            replay_buffer_size: Maximum number of stored transitions.
            reward_buffer_size: Maximum number of stored episode rewards.
            epsilon_start: Exploration rate at step 0.
            epsilon_end: Exploration rate from step ``epsilon_decay`` onwards.
            epsilon_decay: Number of steps over which epsilon is interpolated.
            gamma: Discount factor.
        """
        self.player = player

        self.device = device

        # Initialize the NNs
        self.online_net = DQNetwork(
            state_space_dim, 
            action_space_dim
        ).to(device)

        self.target_net = DQNetwork(
            state_space_dim, 
            action_space_dim
        ).to(device)

        # Initialize both with the same weight
        self.target_net.load_state_dict(self.online_net.state_dict())

        # Initialize optimizer with online_net 
        self.optimizer = T.optim.Adam(self.online_net.parameters(), lr=learning_rate)

        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.min_replay_size = min_replay_size

        # Seeded with a single 0.0 so the mean is defined before any episode ends.
        self.reward_buffer = deque([0.0], maxlen=reward_buffer_size)

        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma

    # These three are real methods at class level. When nested inside
    # __init__ they were local functions, unreachable as agent.<name>(...).

    def save_transition(self, transition):
        """Append one transition to the replay buffer (oldest entries are evicted)."""
        self.replay_buffer.append(transition)

    def save_reward(self, reward):
        """Append one episode reward to the reward buffer (oldest entries are evicted)."""
        self.reward_buffer.append(reward)

    def choose_action(self, step, legal_actions, game_state):
        """Pick an action epsilon-greedily and return it as an action index.

        Args:
            step: Global training step, used to interpolate epsilon.
            legal_actions: Non-empty sequence of currently legal actions, each
                an element of ``game_actions_list``.
            game_state: Observation fed to the online network.

        Returns:
            int: With probability epsilon, the position in
            ``game_actions_list`` of a uniformly chosen legal action;
            otherwise the index of the largest Q-value.
        """
        # Linearly decrease epsilon 
        epsilon = np.interp(step, [0, self.epsilon_decay], [self.epsilon_start, self.epsilon_end])

        if random.random() <= epsilon:
            # Uniform pick. random.choice returns the element itself, so the
            # lookup below uses plain equality rather than a 1-element array.
            return game_actions_list.index(random.choice(legal_actions))

        # Compute Q-Values
        state_t = T.as_tensor(game_state, dtype=T.float32).to(self.device)
        q_values = self.online_net.act(state_t)

        # The argmax is already an action index, so it is returned directly
        # instead of being looked up in game_actions_list a second time.
        # NOTE(review): this branch considers every action, not only
        # `legal_actions` — confirm the environment tolerates illegal picks.
        return int(T.argmax(q_values, dim=0).item())

In [7]:
# One DQN agent per player, all created with the same hyperparameters.
agents = [
    Agent(
        player=player,
        state_space_dim=env.state_space_dim,
        action_space_dim=env.action_space_dim,
        device=device,
        batch_size=256,
        target_update_freq=1000,
        epsilon_end=0.1,
        epsilon_decay=250000,
    )
    for player in players
]

In [8]:
# Empty every agent's replay buffer so that re-running the fill step below
# starts from scratch instead of topping up leftovers from an earlier run.
for agent in agents:
    agent.replay_buffer.clear() 

In [9]:
# Warm-up: fill each agent's replay buffer with random-play transitions until
# it holds at least `min_replay_size` entries.
for agent in agents:

    # reset env
    starting_player, legal_actions, game_state = env.reset()

    # Initialize the ReplayBuffer
    while len(agent.replay_buffer) < agent.min_replay_size:
        # Pick a legal action uniformly and translate it to its action index.
        # np.random.choice(a, 1, probs) would pass `probs` as `replace` and
        # return a 1-element array; random.choice returns the element itself.
        action = game_actions_list.index(random.choice(legal_actions))

        # Take action, observe outcome
        rew, done, next_player, next_legal_actions, new_game_state = env.step(starting_player, action)

        # Save transition for training later
        transition = (game_state, action, rew, done, new_game_state)
        agent.replay_buffer.append(transition)

        starting_player = next_player
        legal_actions = next_legal_actions
        game_state = new_game_state

        # Start a new game as soon as the current one finishes.
        if done:
            starting_player, legal_actions, game_state = env.reset()



In [10]:
# Start the first training episode from a freshly reset environment.
starting_player, legal_actions, game_state = env.reset()

# Running reward of the current episode.
episode_reward = 0.0

# Initial bounds for the best / worst average reward tracked during training.
best_avg_reward, worst_avg_reward = -10, 10



In [14]:


# Main training loop: runs until interrupted. Each step lets the agent whose
# turn it is act once, stores the transition, and then performs one DQN
# gradient step on that agent's online network.
for step in itertools.count():
    # Agent controlling the player whose turn it is.
    agent = next(a for a in agents if a.player == starting_player)

    action = agent.choose_action(step, legal_actions, game_state)

    # Take action, observe outcome
    rew, done, next_player, next_legal_actions, new_game_state = env.step(agent.player, action)

    # Save transition for training later
    transition = (game_state, action, rew, done, new_game_state)
    agent.save_transition(transition)

    starting_player = next_player
    legal_actions = next_legal_actions
    game_state = new_game_state

    # NOTE(review): rewards of every acting agent are summed into one counter
    # and credited to whichever agent made the final move — confirm intended.
    episode_reward += rew

    if done:
        starting_player, legal_actions, game_state = env.reset()

        agent.save_reward(episode_reward)
        episode_reward = 0

    # Start Gradient Step
    transitions = random.sample(agent.replay_buffer, agent.batch_size)

    all_obs = np.vstack([t[0] for t in transitions])
    all_actions = np.asarray([t[1] for t in transitions], dtype=np.int64)
    all_rews = np.asarray([t[2] for t in transitions], dtype=np.float32)
    all_dones = np.asarray([t[3] for t in transitions], dtype=np.float32)
    all_new_obs = np.vstack([t[4] for t in transitions])

    # Per-sample scalars are shaped (batch, 1) so they line up with the
    # gathered Q-values instead of broadcasting to (batch, batch).
    obs_t = T.as_tensor(all_obs, dtype=T.float32).to(device)
    actions_t = T.as_tensor(all_actions).unsqueeze(-1).to(device)
    rews_t = T.as_tensor(all_rews).unsqueeze(-1).to(device)
    new_obs_t = T.as_tensor(all_new_obs, dtype=T.float32).to(device)
    dones_t = T.as_tensor(all_dones).unsqueeze(-1).to(device)

    # Compute Targets (no gradients flow through the target network)
    with T.no_grad():
        target_q_values = agent.target_net(new_obs_t)
        max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]

        # Bellman target: r + gamma * max_a' Q_target(s', a'), with the
        # bootstrap term zeroed on terminal transitions.
        targets = rews_t + agent.gamma * (1 - dones_t) * max_target_q_values

    # Compute Loss
    q_values = agent.online_net(obs_t)

    action_q_values = T.gather(input=q_values, dim=1, index=actions_t)
    # Regress onto the Bellman targets, not the raw max target Q-values.
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient Descent
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()

    # Update Target Network
    if step % agent.target_update_freq == 0:
        agent.target_net.load_state_dict(agent.online_net.state_dict())

        # Logging
        avg_reward = np.mean(agent.reward_buffer)
        if avg_reward > best_avg_reward and avg_reward != 0:
            best_avg_reward = avg_reward
        if avg_reward < worst_avg_reward:
            worst_avg_reward = avg_reward
        print(f"Step: {step} | Player: {agent.player.get_id()} | Avg reward: {avg_reward} | Best reward: {best_avg_reward} | Worst reward: {worst_avg_reward} | W/L: {max([w if p == agent.player else 0 for p, w in env.games_won])/env.games_played} | Games: {env.games_played} ", end='\r')
    

Step: 36000 | Player: 1 | Avg reward: -1.54 | Best reward: -0.5 | Worst reward: -1.83 | W/L: 0.47097877768757973 | Games: 34492 

KeyboardInterrupt: 

In [None]:
# Inspect the environment's win tally; the training loop's log line iterates
# it as (player, wins) pairs.
env.games_won

In [13]:
# Dump each agent's buffer of recent episode rewards (bounded deque, oldest first).
for agent in agents:
    print(agent.reward_buffer)

deque([-1, -3, -1, -3, -3, -1, 1, -2, -3, -1, -1, -3, -2, -1, -1, -4, -1, -1, -1, -6, -3, -1, -1, -1, -1, -2, -1, 1, -2, -3, -1, -1, -2, -2, 2, 3, -1, -3, -1, -4, -3, -3, -2, -2, -1, -1, -2, -2, -1, 2, -1, -3, -2, -3, 2, -2, -4, 1, -1, -1, -2, -3, -2, 2, -1, 4, 2, 2, -1, -3, -1, -1, -2, -2, 0, 4, -2, -2, -1, -2, -1, -1, -1, -1, -3, -1, -3, -1, -1, -2, -2, -2, -4, -3, -2, -1, -3, -1, -1, -2], maxlen=100)
deque([3, -1, -1, -3, 1, -2, -2, 3, -1, -2, -1, -2, -1, -1, 1, -3, -1, 0, -2, -1, -2, -1, -1, -5, -1, -1, 1, -1, -2, -1, -1, -4, -1, -1, 1, -1, -1, -2, 3, -5, -1, -3, -2, -1, -2, -1, -2, -1, 2, 3, -2, -2, -1, -1, -1, -3, -1, -1, 0, -2, -3, -2, -5, -1, -1, -7, -2, 2, -1, -1, -3, -1, -1, -2, -2, -3, -1, -2, 2, -2, -3, -2, -1, -1, -2, -2, -3, -3, -2, -5, -1, -1, -3, -1, -5, -1, -1, -1, -1, -1], maxlen=100)


In [31]:
from player import Player

# Scratch check of the reward rule: a player gains the points they scored
# between two scoreboards and loses the points scored by everyone else.
num_players = 2
players = [Player(player_id) for player_id in range(num_players)]

p0, p1 = players[0], players[1]

# Scoreboards as (player, points) pairs, before and after a round.
old_score = [(p0, 2), (p1, 1)]
new_score = [(p0, 3), (p1, 1)]

# Player whose reward is being computed.
player = p1

reward = 0
for (scorer, points_after), (_, points_before) in zip(new_score, old_score):
    gained = points_after - points_before
    if scorer == player:
        reward += gained
    else:
        reward -= gained
reward

-1