In [13]:
import gym
from PIL import Image
import numpy as np


from keras.models import Sequential
from keras.layers import Dense, Convolution2D , Activation , Flatten , Permute

from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy , EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor

# Size of each preprocessed frame; PacmanProcessor resizes observations to
# this shape and the network input is built from it.
INPUT_SHAPE = (84, 84)

# Number of consecutive frames stacked into a single network input; shared by
# the replay memory and the model's input shape.
WINDOW_LENGTH = 4

In [14]:
class PacmanProcessor(Processor):
    """Preprocessing hooks applied to observations, state batches and rewards.

    Observations are reduced to ``INPUT_SHAPE`` grayscale ``uint8`` frames,
    state batches are converted to ``float32`` and divided by 255, and rewards
    are clipped to ``[-1, 1]``.
    """

    def process_observation(self, observation):
        """Resize a raw frame to ``INPUT_SHAPE`` and convert it to grayscale.

        Args:
            observation: 3-D array laid out as (height, width, channel).

        Returns:
            A ``uint8`` array whose shape equals ``INPUT_SHAPE``.

        Raises:
            AssertionError: if ``observation`` is not 3-D, or the resized
                frame does not have shape ``INPUT_SHAPE``.
        """
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        # PIL's `resize` takes (width, height) while INPUT_SHAPE is compared
        # against a NumPy (rows, cols) shape below, so reverse it. This is a
        # no-op for square shapes but keeps non-square shapes consistent.
        img = img.resize(INPUT_SHAPE[::-1]).convert('L')  # resize and convert to grayscale
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_state_batch(self, batch):
        """Convert a batch of stored frames to ``float32`` and divide by 255.

        Args:
            batch: array of stacked frames (anything exposing ``astype``).

        Returns:
            The batch as ``float32`` divided by 255.
        """
        # We could perform this processing step in `process_observation`. In this case, however,
        # we would need to store a `float32` array instead, which is 4x more memory intensive than
        # an `uint8` array. This matters if we store 1M observations.
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """Clip ``reward`` into the closed interval ``[-1, 1]``."""
        return np.clip(reward, -1., 1.)

In [15]:
class Agent:
    """Pairs an environment with a convolutional Q-network sized for it."""

    def __init__(self, env):
        """Store ``env`` and read the action count from its action space.

        Args:
            env: object exposing ``action_space.n``.
        """
        self.env = env
        self.nb_actions = self.env.action_space.n

    def create_model(self):
        """Build the (uncompiled) convolutional network.

        The input shape is ``(WINDOW_LENGTH,) + INPUT_SHAPE`` and the final
        dense layer has ``self.nb_actions`` linear outputs.

        Returns:
            A ``Sequential`` model.
        """
        stacked_frames_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

        layers = [
            # Moves the frame-stack axis last: (window, rows, cols) becomes
            # (cols, rows, window). Note this pattern also swaps the two
            # spatial axes.
            Permute((3, 2, 1), input_shape=stacked_frames_shape),
            Convolution2D(32, (8, 8), strides=(4, 4)),
            Activation('relu'),
            Convolution2D(64, (4, 4), strides=(2, 2)),
            Activation('relu'),
            Convolution2D(64, (3, 3), strides=(1, 1)),
            Activation('relu'),
            Flatten(),
            Dense(512),
            Activation('relu'),
            # One linear output per available action.
            Dense(self.nb_actions),
            Activation('linear'),
        ]

        return Sequential(layers)

In [16]:
# Create the environment and the Q-network sized for its action space.
env = gym.make('MsPacman-v0')
bot = Agent(env=env)
cov_model = bot.create_model()
# The network is deliberately not compiled here: `DQNAgent.compile` compiles
# it with the optimizer passed there, so a standalone compile (and its
# learning rate / 'accuracy' metric) would be silently discarded.
print(cov_model.output.shape)

(None, 9)


In [17]:
# Replay buffer holding up to 10000 entries, sampled in windows of
# WINDOW_LENGTH frames.
memory = SequentialMemory(limit=10000, window_length=WINDOW_LENGTH)

# Epsilon-greedy exploration with `eps` annealed linearly between value_max
# and value_min over nb_steps; value_test is the setting intended for testing
# (per the keras-rl policy API -- confirm against the installed version).
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=5000,
)

In [18]:
print(bot.nb_actions)

# Frame / reward preprocessing hooks used by the agent.
processor = PacmanProcessor()

# Wire the network, replay memory, exploration policy and processor together.
dqn = DQNAgent(
    model=cov_model,
    memory=memory,
    nb_actions=bot.nb_actions,
    policy=policy,
    processor=processor,
)

# Compile the agent with Adam and track mean absolute error during training.
dqn.compile(Adam(lr=.00025), metrics=['mae'])

9


In [19]:
# Show the agent's repr to confirm it was constructed.
dqn

<rl.agents.dqn.DQNAgent at 0x62a3f5710>

In [20]:
# Train the agent on the environment for 50000 steps (long-running; the
# recorded run below took over an hour).
dqn.fit(env, nb_steps=50000)

Training for 50000 steps ...
Interval 1 (0 steps performed)
13 episodes - episode_reward: 34.538 [15.000, 71.000] - loss: 0.008 - mae: 0.049 - mean_q: 0.127 - mean_eps: 0.260 - ale.lives: 2.022

Interval 2 (10000 steps performed)
11 episodes - episode_reward: 45.182 [19.000, 78.000] - loss: 0.008 - mae: 0.078 - mean_q: 0.187 - mean_eps: 0.100 - ale.lives: 2.024

Interval 3 (20000 steps performed)
12 episodes - episode_reward: 43.083 [29.000, 62.000] - loss: 0.007 - mae: 0.099 - mean_q: 0.230 - mean_eps: 0.100 - ale.lives: 2.102

Interval 4 (30000 steps performed)
12 episodes - episode_reward: 46.417 [27.000, 71.000] - loss: 0.006 - mae: 0.123 - mean_q: 0.266 - mean_eps: 0.100 - ale.lives: 1.991

Interval 5 (40000 steps performed)
done, took 4351.239 seconds


<keras.callbacks.callbacks.History at 0x62c15b240>

In [21]:
# Evaluate the trained agent over 10 episodes.
dqn.test(env, nb_episodes=10)

Testing for 10 episodes ...
Episode 1: reward: 34.000, steps: 671
Episode 2: reward: 42.000, steps: 1048
Episode 3: reward: 39.000, steps: 1020
Episode 4: reward: 42.000, steps: 956
Episode 5: reward: 63.000, steps: 1126
Episode 6: reward: 34.000, steps: 530
Episode 7: reward: 34.000, steps: 713
Episode 8: reward: 45.000, steps: 889
Episode 9: reward: 35.000, steps: 615
Episode 10: reward: 21.000, steps: 583


<keras.callbacks.callbacks.History at 0x62b4fb9e8>

In [None]:
# Persist the trained weights. `save_weights` requires a target file path;
# overwrite=True replaces the file from any previous run instead of failing.
dqn.save_weights('dqn_MsPacman-v0_weights.h5f', overwrite=True)