In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from pypokerengine.players import BasePokerPlayer
from pypokerengine.utils.card_utils import Card, Deck
from pypokerengine.api.game import setup_config, start_poker

import pickle
import tensorflow as tf
import random
import os
import scipy
import scipy.signal

import sys
sys.path.insert(0, '../scripts/')

import PlayerModels as pm
from MyEmulator import MyEmulator
# from DQNPlayer import DQNPlayer
from util import *

import threading
import multiprocessing

from random import choice
from time import sleep
from time import time

## Util

In [2]:
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

## Network

In [3]:
# class AC_Network():
#     def __init__(self,s_size,a_size,scope,trainer):
#         with tf.variable_scope(scope):
#             #Input and visual encoding layers
#             self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
#             self.imageIn = tf.reshape(self.inputs,shape=[-1,84,84,1])
#             self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
#                 inputs=self.imageIn,num_outputs=16,
#                 kernel_size=[8,8],stride=[4,4],padding='VALID')
#             self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
#                 inputs=self.conv1,num_outputs=32,
#                 kernel_size=[4,4],stride=[2,2],padding='VALID')
#             hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)
            
#             #Recurrent network for temporal dependencies
#             lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,state_is_tuple=True)
#             c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
#             h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
#             self.state_init = [c_init, h_init]
#             c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
#             h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
#             self.state_in = (c_in, h_in)
#             rnn_in = tf.expand_dims(hidden, [0])
#             step_size = tf.shape(self.imageIn)[:1]
#             state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
#             lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
#                 lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
#                 time_major=False)
#             lstm_c, lstm_h = lstm_state
#             self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
#             rnn_out = tf.reshape(lstm_outputs, [-1, 256])
            
#             #Output layers for policy and value estimations
#             self.policy = slim.fully_connected(rnn_out,a_size,
#                 activation_fn=tf.nn.softmax,
#                 weights_initializer=normalized_columns_initializer(0.01),
#                 biases_initializer=None)
#             self.value = slim.fully_connected(rnn_out,1,
#                 activation_fn=None,
#                 weights_initializer=normalized_columns_initializer(1.0),
#                 biases_initializer=None)
            
#             #Only the worker network need ops for loss functions and gradient updating.
#             if scope != 'global':
#                 self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
#                 self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
#                 self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
#                 self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

#                 self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

#                 #Loss functions
#                 self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
#                 self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
#                 self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages)
#                 self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

#                 #Get gradients from local network using local losses
#                 local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
#                 self.gradients = tf.gradients(self.loss,local_vars)
#                 self.var_norms = tf.global_norm(local_vars)
#                 grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)
                
#                 #Apply local gradients to global network
#                 global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
#                 self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [4]:
class A3CPlayer(BasePokerPlayer):
    '''
    DQN Player, bot wich using Double-Dueling-DQN architecture.

    Parametrs
    ---------
    h_size : shape of layer after conv part (also before double part too)

    lr : learning rate of the optimizer

    gradient_clip_norm : gradients of the loss function will be clipped by this value
    
    total_num_actions : the number of actions witch agent can choose

    is_double : whether or not to use the double architecture

    is_main : whether or not to use this agent as main (when using the dueling architecture)

    is_restore : wheter or not to use pretrained weight of the network

    is_train : whether or not to use this agent for training

    is_debug  wheter or not to print the debug information
    '''
    def __init__(self, a_size, scope, trainer, h_size=64):
        self.h_size = h_size
        
#         with tf.variable_scope(scope):
#             #Input and visual encoding layers
#             self.scalar_input = tf.placeholder(tf.float32, [None, 17 * 17 * 1])
#             self.features_input = tf.placeholder(tf.float32, [None, 20])

#             xavier_init = tf.contrib.layers.xavier_initializer()

#             self.img_in = tf.reshape(self.scalar_input, [-1, 17, 17, 1])
#             self.conv1 = tf.layers.conv2d(self.img_in, 32, 5, 2, activation=tf.nn.elu,
#                                           kernel_initializer=xavier_init)
#             self.conv2 = tf.layers.conv2d(self.conv1, 32, 3, activation=tf.nn.elu, kernel_initializer=xavier_init)
#             self.conv3 = tf.layers.conv2d(self.conv2, self.h_size, 5, activation=tf.nn.elu,
#                                           kernel_initializer=xavier_init)
#             self.conv3_flat = tf.contrib.layers.flatten(self.conv3)
# #             self.conv3_flat = tf.layers.dropout(self.conv3_flat)

#             self.d1 = tf.layers.dense(self.features_input, 32, activation=tf.nn.elu, kernel_initializer=xavier_init)
# #             self.d1 = tf.layers.dropout(self.d1)
#             self.d2 = tf.layers.dense(self.d1, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)
# #             self.d2 = tf.layers.dropout(self.d2)

#             self.merge = tf.concat([self.conv3_flat, self.d2], axis=1)
#             self.d3 = tf.layers.dense(self.merge, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)
# #             self.d3 = tf.layers.dropout(self.d3)
#             self.d4 = tf.layers.dense(self.d3, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)
        
# #             self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
# #             self.imageIn = tf.reshape(self.inputs,shape=[-1,84,84,1])
# #             self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
# #                 inputs=self.imageIn,num_outputs=16,
# #                 kernel_size=[8,8],stride=[4,4],padding='VALID')
# #             self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
# #                 inputs=self.conv1,num_outputs=32,
# #                 kernel_size=[4,4],stride=[2,2],padding='VALID')
# #             hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)
            
#             #Recurrent network for temporal dependencies
# #             lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,state_is_tuple=True)
# #             c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
# #             h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
# #             self.state_init = [c_init, h_init]
# #             c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
# #             h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
# #             self.state_in = (c_in, h_in)
# #             rnn_in = tf.expand_dims(hidden, [0])
# #             step_size = tf.shape(self.imageIn)[:1]
# #             state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
# #             lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
# #                 lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
# #                 time_major=False)
# #             lstm_c, lstm_h = lstm_state
# #             self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
# #             rnn_out = tf.reshape(lstm_outputs, [-1, 256])
            
#             #Output layers for policy and value estimations
#             self.policy = tf.layers.dense(self.d4, a_size,
#                 activation=tf.nn.softmax,
#                 kernel_initializer=normalized_columns_initializer(0.01))
#             self.value = tf.layers.dense(self.d4, 1,
#                 activation=None,
#                 kernel_initializer=normalized_columns_initializer(1.0))
            
#             #Only the worker network need ops for loss functions and gradient updating.
#             if scope != 'global':
#                 self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
#                 self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
#                 self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
#                 self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

#                 self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

#                 #Loss functions
#                 self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
#                 self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
#                 self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages)
#                 self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

#                 #Get gradients from local network using local losses
#                 local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
#                 self.gradients = tf.gradients(self.loss,local_vars)
#                 self.var_norms = tf.global_norm(local_vars)
#                 grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,100.0)
# #                 self.grad_norms = tf.global_norm(grads)
                
#                 #Apply local gradients to global network
#                 global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
#                 self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))
                
    def _print(self, *msg):
        if self.debug:
            print(msg)
        
    def declare_action(self, valid_actions, hole_card, round_state):
        street = round_state['street']
        bank = round_state['pot']['main']['amount']
        stack = [s['stack'] for s in round_state['seats'] if s['uuid'] == self.uuid][0]
        other_stacks = [s['stack'] for s in round_state['seats'] if s['uuid'] != self.uuid]
        dealer_btn = round_state['dealer_btn']
        small_blind_pos = round_state['small_blind_pos']
        big_blind_pos = round_state['big_blind_pos']
        next_player = round_state['next_player']
        round_count = round_state['round_count']
        estimation = self.hole_card_est[(hole_card[0], hole_card[1])]

        
        self.features = get_street(street)
        self.features.extend([bank, stack, dealer_btn, small_blind_pos, big_blind_pos, next_player, round_count])
        self.features.extend(other_stacks)
        self.features.append(estimation)
        
        img_state = img_from_state(hole_card, round_state)
        img_state = process_img(img_state)
        action_num = self.sess.run(self.predict, feed_dict={self.scalar_input: [img_state],
                                                            self.features_input: [self.features]})[0]
        qs = self.sess.run(self.Q_out, feed_dict={self.scalar_input: [img_state],
                                                  self.features_input: [self.features]})[0]
        self._print(qs)
        action, amount = get_action_by_num(action_num, valid_actions)                    

#         if not self.debug and np.random.rand() < 0.2:
#             self.action_num = np.random.randint(0, 5)
        return action, amount
        
    def receive_game_start_message(self, game_info):
        pass
    
    def receive_round_start_message(self, round_count, hole_card, seats):
        self._print(['Hole:', hole_card])        
        self.start_stack = [s['stack'] for s in seats if s['uuid'] == self.uuid][0]
        self._print(['Start stack:', self.start_stack])
        estimation = self.hole_card_est[(hole_card[0], hole_card[1])]
        self._print(['Estimation:', estimation])
    
    def receive_street_start_message(self, street, round_state):
        pass
            
    def receive_game_update_message(self, action, round_state):
        pass
    
    def receive_round_result_message(self, winners, hand_info, round_state):
        end_stack = [s['stack'] for s in round_state['seats'] if s['uuid'] == self.uuid][0]
        self._print(['End stack:', end_stack])

In [5]:
class AC_Network():
    def __init__(self, a_size, scope, trainer, h_size=64):
        self.h_size = h_size
        
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.scalar_input = tf.placeholder(tf.float32, [None, 17 * 17 * 1])
            self.features_input = tf.placeholder(tf.float32, [None, 20])

            xavier_init = tf.contrib.layers.xavier_initializer()

            self.img_in = tf.reshape(self.scalar_input, [-1, 17, 17, 1])
            self.conv1 = tf.layers.conv2d(self.img_in, 32, 5, 2, activation=tf.nn.elu,
                                          kernel_initializer=xavier_init)
            self.conv2 = tf.layers.conv2d(self.conv1, 32, 3, activation=tf.nn.elu, kernel_initializer=xavier_init)
            self.conv3 = tf.layers.conv2d(self.conv2, self.h_size, 5, activation=tf.nn.elu,
                                          kernel_initializer=xavier_init)
            self.conv3_flat = tf.contrib.layers.flatten(self.conv3)
    #             self.conv3_flat = tf.layers.dropout(self.conv3_flat)

            self.d1 = tf.layers.dense(self.features_input, 32, activation=tf.nn.elu, kernel_initializer=xavier_init)
    #             self.d1 = tf.layers.dropout(self.d1)
            self.d2 = tf.layers.dense(self.d1, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)
    #             self.d2 = tf.layers.dropout(self.d2)

            self.merge = tf.concat([self.conv3_flat, self.d2], axis=1)
            self.d3 = tf.layers.dense(self.merge, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)
    #             self.d3 = tf.layers.dropout(self.d3)
            self.d4 = tf.layers.dense(self.d3, self.h_size, activation=tf.nn.elu, kernel_initializer=xavier_init)

    #             self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
    #             self.imageIn = tf.reshape(self.inputs,shape=[-1,84,84,1])
    #             self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
    #                 inputs=self.imageIn,num_outputs=16,
    #                 kernel_size=[8,8],stride=[4,4],padding='VALID')
    #             self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
    #                 inputs=self.conv1,num_outputs=32,
    #                 kernel_size=[4,4],stride=[2,2],padding='VALID')
    #             hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)

            #Recurrent network for temporal dependencies
    #             lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,state_is_tuple=True)
    #             c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
    #             h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
    #             self.state_init = [c_init, h_init]
    #             c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
    #             h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
    #             self.state_in = (c_in, h_in)
    #             rnn_in = tf.expand_dims(hidden, [0])
    #             step_size = tf.shape(self.imageIn)[:1]
    #             state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
    #             lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
    #                 lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
    #                 time_major=False)
    #             lstm_c, lstm_h = lstm_state
    #             self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
    #             rnn_out = tf.reshape(lstm_outputs, [-1, 256])

            #Output layers for policy and value estimations
            self.policy = tf.layers.dense(self.d4, a_size,
                activation=tf.nn.softmax,
                kernel_initializer=normalized_columns_initializer(0.01))
            self.value = tf.layers.dense(self.d4, 1,
                activation=None,
                kernel_initializer=normalized_columns_initializer(1.0))

            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,100.0)
    #                 self.grad_norms = tf.global_norm(grads)

                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [6]:
# tf.reset_default_graph()
# AC_Network(3, 'f', None)

In [7]:
def init_emul(my_uuid_):
    global my_uuid
    my_uuid = my_uuid_

#     emul.register_player("1", pm.CallPlayer())
#     emul.register_player("2", pm.CallPlayer())
#     emul.register_player("3", pm.CallPlayer())
#     emul.register_player("4", pm.CallPlayer())
#     emul.register_player("5", pm.CallPlayer())
#     emul.register_player("6", pm.CallPlayer())
#     emul.register_player("7", pm.CallPlayer())
#     emul.register_player("8", pm.CallPlayer())
#     emul.register_player("9", pm.CallPlayer())

    emul.register_player("1", pm.CallPlayer())
    emul.register_player("2", pm.CallPlayer())
    emul.register_player("3", pm.FoldPlayer())
    emul.register_player("4", pm.FoldPlayer())
    emul.register_player("5", pm.HeuristicPlayer())
    emul.register_player("6", pm.HeuristicPlayer())
    emul.register_player("7", pm.RandomPlayer())
    emul.register_player("8", pm.RandomPlayer())
    emul.register_player("9", pm.CallPlayer())


    players_info = {
        "1": { "name": "CallPlayer1", "stack": 1500 },
        "2": { "name": "CallPlayer2", "stack": 1500 },
        "3": { "name": "FoldPlayer1", "stack": 1500 },
        "4": { "name": "FoldPlayer2", "stack": 1500 },
        "5": { "name": "HeuristicPlayer1", "stack": 1500 },
        "6": { "name": "HeuristicPlayer2", "stack": 1500 },
        "7": { "name": "RandomPlayer1", "stack": 1500 },
        "8": { "name": "RandomPlayer2", "stack": 1500 },
        "9": { "name": "DQN", "stack": 1500 }
    }

In [62]:
class Worker():
    def __init__(self, name, a_size, trainer, model_path, global_episodes):    
        with open('../cache/hole_card_estimation.pkl', 'rb') as f:
            self.hole_card_est = pickle.load(f)
            
        self.name = "worker_" + str(name)
        self.number = name        
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("../log/A3C/train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global paramters to local network
        self.local_AC = AC_Network(a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)    
        
        emul = MyEmulator()
        emul.set_game_rule(9, 50, 15, 0)
        self.my_uuid = '9'
        self.players_info = {
            "1": { "name": "f1", "stack": 1500 },
            "2": { "name": "f2", "stack": 1500 },
            "3": { "name": "f3", "stack": 1500 },
            "4": { "name": "f4", "stack": 1500 },
            "5": { "name": "f5", "stack": 1500 },
            "6": { "name": "f6", "stack": 1500 },
            "7": { "name": "f7", "stack": 1500 },
            "8": { "name": "f8", "stack": 1500 },
            "9": { "name": "f9", "stack": 1500 }
        }
    
        emul.register_player("1", pm.CallPlayer())
        emul.register_player("2", pm.CallPlayer())
        emul.register_player("3", pm.FoldPlayer())
        emul.register_player("4", pm.FoldPlayer())
        emul.register_player("5", pm.HeuristicPlayer())
        emul.register_player("6", pm.HeuristicPlayer())
        emul.register_player("7", pm.RandomPlayer())
        emul.register_player("8", pm.RandomPlayer())
        emul.register_player("9", pm.CallPlayer())
    
        self.actions = self.actions = np.identity(a_size,dtype=bool).tolist()
        self.env = emul
        
    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        last_img_states = rollout[:,0]
        last_features = rollout[:,1]
        last_actions_num = rollout[:,2]
        rewards = rollout[:,3]
        img_states = rollout[:,4]
        features = rollout[:,5]
        values = rollout[:,7]
        
        # Here we take the rewards and values from the rollout, and use them to 
        # generate the advantage and discounted returns. 
        # The advantage function uses "Generalized Advantage Estimation"
        
        # "discount" done earlier
#         self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
#         discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
#         advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
#         advantages = discount(advantages,gamma)
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        discounted_rewards = rewards

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.scalar_input:np.vstack(last_img_states),
            self.local_AC.features_input:np.vstack(last_features),
            self.local_AC.actions:last_actions_num,
            self.local_AC.advantages:advantages}
#             self.local_AC.state_in[0]:self.batch_rnn_state[0],
#             self.local_AC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, _ = sess.run([self.local_AC.value_loss, # self.batch_rnn_state,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
#             self.local_AC.state_out,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n, v_n
        
    def work(self,gamma,sess,coord,saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():                 
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                
                initial_state = self.env.generate_initial_game_state(self.players_info)
                msgs = []
                game_state, events = self.env.start_new_round(initial_state)
                is_last_round = False
#                 r_all = 0
#                 j = 0

                last_img_state = None
                last_features = None
                last_action_num = None
                last_v = None
                     
#                 self.env.new_episode()
#                 s = self.env.get_state().screen_buffer
#                 episode_frames.append(s)
#                 s = process_frame(s)
#                 rnn_state = self.local_AC.state_init
#                 self.batch_rnn_state = rnn_state
#                 while self.env.is_episode_finished() == False:

                round_buffer = []
                while not is_last_round:
                    #Take an action using probabilities from policy network output.
                    a = self.env.run_until_my_next_action(game_state, self.my_uuid, msgs)
                    
                    if len(a) == 4:
                        game_state, valid_actions, hole_card, round_state = a
                        img_state = img_from_state(hole_card, round_state)
                        img_state = process_img(img_state)

                        street = round_state['street']
                        bank = round_state['pot']['main']['amount']
                        stack = [s['stack'] for s in round_state['seats'] if s['uuid'] == self.my_uuid][0]
                        other_stacks = [s['stack'] for s in round_state['seats'] if s['uuid'] != self.my_uuid]
                        dealer_btn = round_state['dealer_btn']
                        small_blind_pos = round_state['small_blind_pos']
                        big_blind_pos = round_state['big_blind_pos']
                        next_player = round_state['next_player']
                        round_count = round_state['round_count']
                        estimation = self.hole_card_est[(hole_card[0], hole_card[1])]

                        features = get_street(street)
                        features.extend([bank, stack, dealer_btn, small_blind_pos, big_blind_pos, next_player,
                                         round_count])
                        features.extend(other_stacks)
                        features.append(estimation)
                     
                        # add to buffer last hand 
                        if last_img_state is not None:
                            round_buffer.append([last_img_state, last_features, last_action_num, 0, img_state,
                                                   features, 0, last_v[0, 0]])
                            episode_values.append(last_v[0, 0])
                     
                        pol_val = sess.run([self.local_AC.policy, self.local_AC.value],
                                              feed_dict={self.local_AC.scalar_input: [img_state],
                                                         self.local_AC.features_input: [features]})
                        a_dist, v = pol_val[0], pol_val[1]

                        a = np.random.choice(a_dist[0],p=a_dist[0])
                        a = np.argmax(a_dist == a)
                        action, amount = get_action_by_num(a, valid_actions)
                        game_state, msgs = self.env.apply_my_action(game_state, action, amount)

                        last_img_state = img_state.copy()
                        last_features = features.copy()
                        last_action_num = a
                        last_v = v
                    else: # round end
                        game_state, reward = a
                        reward /= 1000
                        episode_reward += reward

#                         if reward >= 0:
#                             reward = np.log(1 + reward)
#                         else:
#                             reward = -np.log(1 - reward)
#                         r_all += reward

                        # add to buffer last hand 
                        if last_img_state is not None:
                            round_buffer.append([last_img_state, last_features, last_action_num, reward,
                                                   last_img_state, last_features, 1, last_v[0, 0]])
                            episode_values.append(last_v[0,0])

                            # apply same reward for all states in round
                            for k in range(len(round_buffer)):
                                round_buffer[k][3] = reward
                                
                        episode_buffer.extend(round_buffer)
                        round_buffer = []

                        is_last_round = self.env._is_last_round(game_state, self.env.game_rule)
                        game_state, events = self.env.start_new_round(game_state)

                        last_img_state = None
                        last_action_num = None   
                        last_v = None
                        
                    self.episode_buffer = episode_buffer # for debug
                    self.episode_values = episode_values
#                     episode_buffer.append([s,a,r,s1,d,v[0,0]])
#                     episode_values.append(v[0,0])

#                     s = s1                    
                    total_steps += 1
                    episode_step_count += 1
                    
                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
#                     if len(episode_buffer) == 30 and d != True and episode_step_count != max_episode_length - 1:
#                         # Since we don't know what the true final return is, we "bootstrap" from our current
#                         # value estimation.
#                         v1 = sess.run(self.local_AC.value, 
#                             feed_dict={self.local_AC.inputs:[s],
#                             self.local_AC.state_in[0]:rnn_state[0],
#                             self.local_AC.state_in[1]:rnn_state[1]})[0,0]
#                         v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
#                         episode_buffer = []
#                         sess.run(self.update_local_ops)
#                     if d == True:
#                         break
                                            
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))
                
                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                                
                    
                if episode_count % 200 == 0 and self.name == 'worker_0':
                    saver.save(sess, self.model_path, episode_count)
                    print ("Saved Model", episode_count)
                     
                if episode_count % 1 == 0:
                    mean_reward = np.mean(self.episode_rewards[-3:])
                    mean_length = np.mean(self.episode_lengths[-3:])
                    mean_value = np.mean(self.episode_mean_values[-3:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

In [63]:
gamma = .99 # discount rate for advantage estimation and reward discounting
a_size = 5 # Agent can move Left, Right, or Fire
load_model = False
model_path = '../cache/models/A3C'

In [64]:
tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
with tf.device("/cpu:0"): 
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    master_network = AC_Network(a_size,'global',None) # Generate global network
    num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads
    workers = []
    # Create worker classes
    for i in range(num_workers):
        workers.append(Worker(i,a_size,trainer,model_path,global_episodes))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model == True:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
        
    # This is where the asynchronous magic happens.
    # Start the "work" process for each worker in a separate threat.
    worker_threads = []
    for worker in workers:
        worker_work = lambda: worker.work(gamma,sess,coord,saver)
        t = threading.Thread(target=(worker_work))
        t.start()
        sleep(0.5)
        worker_threads.append(t)
    coord.join(worker_threads)

Starting worker 0
Starting worker 1
Starting worker 2
Starting worker 3
Saved Model 0


Exception in thread Thread-34:
Traceback (most recent call last):
  File "/home/digitman/miniconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/digitman/miniconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-64-19e237ff1914>", line 30, in <lambda>
    worker_work = lambda: worker.work(gamma,sess,coord,saver)
  File "<ipython-input-62-43979b96175d>", line 159, in work
    self.local_AC.features_input: [features]})
  File "/home/digitman/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 778, in run
    run_metadata_ptr)
  File "/home/digitman/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 914, in _run
    raise RuntimeError('Attempted to use a closed Session.')
RuntimeError: Attempted to use a closed Session.



KeyboardInterrupt: 

Exception in thread Thread-32:
Traceback (most recent call last):
  File "/home/digitman/miniconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/digitman/miniconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-64-19e237ff1914>", line 30, in <lambda>
    worker_work = lambda: worker.work(gamma,sess,coord,saver)
  File "<ipython-input-62-43979b96175d>", line 159, in work
    self.local_AC.features_input: [features]})
  File "/home/digitman/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 778, in run
    run_metadata_ptr)
  File "/home/digitman/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 914, in _run
    raise RuntimeError('Attempted to use a closed Session.')
RuntimeError: Attempted to use a closed Session.

Exception in thread Thread-33:
Traceback (most recent call last):
  File "/home/digitman/miniconda3/