In [1]:
import os 
import numpy as np
import gym

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp

import matplotlib.pyplot as plt 

import gym
import minihack
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from nle import nethack
import time

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
class ActorCriticNetwork(keras.Model):
    """Actor-critic model with a shared two-layer trunk and two output heads.

    The trunk is two ReLU ``Dense`` layers. On top of it sit a single linear
    unit (``v``) and an ``n_actions``-wide softmax layer (``pi``).

    Args:
        n_actions: Number of units in the softmax head.
        fc1_dims: Number of units in the first dense layer.
        fc2_dims: Number of units in the second dense layer.
        name: Stored as ``model_name``; also the stem of the checkpoint file.
        chkpt_dir: Directory part of ``checkpoint_file``.
    """

    def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512,
                 name='actor_crtic', chkpt_dir='tmp/actor_critic'):
        super().__init__()

        # Checkpoint bookkeeping: weights go to <chkpt_dir>/<name>_ac.
        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(chkpt_dir, name + '_ac')

        # Architecture hyper-parameters, kept for later inspection.
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        # Shared trunk.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        # Output heads: linear scalar and softmax over actions.
        self.v = Dense(1, activation=None)
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Run ``state`` through the trunk and return ``(v, pi)``."""
        features = self.fc2(self.fc1(state))
        return self.v(features), self.pi(features)

In [3]:
class Agent:
    """One-step actor-critic agent backed by a single ``ActorCriticNetwork``.

    The agent samples an action from the network's softmax output, remembers
    it, and on ``learn`` performs one gradient step on the sum of the
    policy-gradient loss and the squared TD error.

    Args:
        alpha: Learning rate passed to the Adam optimizer.
        gamma: Discount factor applied to the next-state value.
        n_actions: Number of discrete actions (width of the policy head).
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=2):
        self.gamma = gamma
        self.n_actions = n_actions
        # Last sampled action (a tensor); consumed by ``learn``.
        self.action = None
        self.action_space = list(range(self.n_actions))

        self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))

    def choose_action(self, observation):
        """Sample an action for ``observation`` and remember it for ``learn``.

        Args:
            observation: A single (un-batched) observation; it is wrapped in a
                list to add a leading batch axis.

        Returns:
            The first element of the sampled action tensor as a NumPy scalar.
        """
        # Use the same dtype as ``learn`` so both paths feed identical inputs.
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        _, probs = self.actor_critic(state)

        action_probabilities = tfp.distributions.Categorical(probs=probs)
        action = action_probabilities.sample()
        self.action = action

        return action.numpy()[0]

    def save_models(self):
        """Write the network weights to ``actor_critic.checkpoint_file``."""
        print('...saving models...')
        self.actor_critic.save_weights(self.actor_critic.checkpoint_file)

    def load_models(self):
        """Load the network weights from ``actor_critic.checkpoint_file``."""
        print('...loading models...')
        self.actor_critic.load_weights(self.actor_critic.checkpoint_file)

    def learn(self, state, reward, state_, done):
        """Apply one TD(0) actor-critic update for a single transition.

        Uses the action stored by the most recent ``choose_action`` call.

        Args:
            state: Observation the stored action was sampled for.
            reward: Scalar reward received after taking the action.
            state_: Observation that followed the action.
            done: Whether the transition ended the episode; when true the
                bootstrapped next-state value is zeroed out.

        Raises:
            RuntimeError: If no action has been sampled yet.
        """
        if self.action is None:
            raise RuntimeError('choose_action() must be called before learn()')

        state = tf.convert_to_tensor([state], dtype=tf.float32)
        state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)

        # Only one gradient() call is made, so a non-persistent tape suffices
        # and its resources are released as soon as the gradient is computed.
        with tf.GradientTape() as tape:
            state_value, probs = self.actor_critic(state)
            state_value_, _ = self.actor_critic(state_)
            state_value = tf.squeeze(state_value)
            state_value_ = tf.squeeze(state_value_)

            action_probs = tfp.distributions.Categorical(probs=probs)
            log_prob = action_probs.log_prob(self.action)

            # TD error: r + gamma * V(s') * (1 - done) - V(s).
            delta = reward + self.gamma*state_value_*(1-int(done)) - state_value
            # The TD error only weights the policy gradient; it must act as a
            # constant here, otherwise the actor term also back-propagates
            # into the value head through ``delta``.
            actor_loss = -log_prob*tf.stop_gradient(delta)
            critic_loss = delta**2

            total_loss = actor_loss + critic_loss

        trainable = self.actor_critic.trainable_variables
        gradient = tape.gradient(total_loss, trainable)
        self.actor_critic.optimizer.apply_gradients(zip(gradient, trainable))

In [4]:
# Create the MiniHack 5x5 room task; observations are dicts restricted to the
# "glyphs" and "pixel" entries.
env = gym.make(
    "MiniHack-Room-5x5-v0",
    observation_keys=["glyphs", "pixel"],
)

In [5]:
def glyph_array(state, max_glyph=2383):
    """Flatten the ``'glyphs'`` observation and scale it by ``max_glyph``.

    Args:
        state: Mapping with a ``'glyphs'`` entry that supports ``.flatten()``
            (e.g. a NumPy array).
        max_glyph: Divisor used for normalisation. Defaults to 2383, the value
            this notebook has always used.
            NOTE(review): presumably the largest glyph id expected in this
            environment -- confirm before reusing on other tasks.

    Returns:
        A 1-D array holding every glyph id divided by ``max_glyph``.
    """
    return state['glyphs'].flatten() / max_glyph

In [None]:
# Train the actor-critic agent online (one update per environment step) and
# plot the per-episode score afterwards.
agent = Agent(alpha=1e-5, n_actions=env.action_space.n)
n_games = 2000

# NOTE(review): unused in this cell and the name does not match the MiniHack
# task; kept in case a later cell reads it.
filename = 'cartpole.png'

best_score = env.reward_range[0]
score_history = []
# When True, weights are loaded and the agent is only evaluated (no learning,
# no checkpointing).
load_checkpoint = False

if load_checkpoint:
    agent.load_models()

for i in range(n_games):
    observation = glyph_array(env.reset())
    done = False
    score = 0

    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        observation_ = glyph_array(observation_)
        score += reward
        if not load_checkpoint:
            agent.learn(observation, reward, observation_, done)
        observation = observation_
    score_history.append(score)
    # Running mean over (at most) the last 100 episodes.
    avg_score = np.mean(score_history[-100:])

    # Checkpoint whenever the running mean reaches a new best.
    if avg_score > best_score:
        best_score = avg_score
        if not load_checkpoint:
            agent.save_models()

    print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score )

# Build the x-axis from the scores actually recorded so the two sequences
# always have matching lengths.
x = list(range(1, len(score_history) + 1))
plt.plot(x, score_history)
plt.xlabel('episode')
plt.ylabel('score')
plt.title('Actor-critic on MiniHack-Room-5x5-v0')
plt.show()

Metal device set to: Apple M1


2021-10-27 19:38:39.404396: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-10-27 19:38:39.404486: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


...saving models...
episode  0 score 0.9 avg_score 0.9
episode  1 score -0.3 avg_score 0.3
episode  2 score -0.4 avg_score 0.1
episode  3 score 0.9 avg_score 0.3
episode  4 score -0.4 avg_score 0.2
episode  5 score -0.6 avg_score 0.0
episode  6 score -0.7 avg_score -0.1
episode  7 score -0.6 avg_score -0.2
episode  8 score -0.6 avg_score -0.2
episode  9 score 1.0 avg_score -0.1
episode  10 score -0.3 avg_score -0.1
episode  11 score -0.8 avg_score -0.2
episode  12 score -0.8 avg_score -0.2
episode  13 score -0.7 avg_score -0.2
episode  14 score -0.7 avg_score -0.3
episode  15 score -0.5 avg_score -0.3
episode  16 score -0.5 avg_score -0.3
episode  17 score -0.7 avg_score -0.3
episode  18 score -0.7 avg_score -0.3
episode  19 score -0.7 avg_score -0.4
episode  20 score -0.4 avg_score -0.4
episode  21 score 0.7 avg_score -0.3
episode  22 score -0.3 avg_score -0.3
episode  23 score 0.7 avg_score -0.3
episode  24 score 1.0 avg_score -0.2
episode  25 score -0.3 avg_score -0.2
episode  26 sc