In [65]:
import gym
from PIL import Image
import numpy as np


from keras.models import Sequential
from keras.layers import Dense, Convolution2D , Activation , Flatten , Permute

from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy , EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor

# Spatial size every observation is resized to before it is stored / fed to the network.
INPUT_SHAPE = (84, 84)
# Number of consecutive observations stacked into one network input (and one replay-memory window).
WINDOW_LENGTH = 4

In [66]:
class PacmanProcessor(Processor):
    """Observation, state-batch and reward hooks handed to the DQN agent.

    Observations are reduced to grayscale ``uint8`` images of shape ``INPUT_SHAPE``,
    state batches are converted to ``float32`` and divided by 255 just before use,
    and rewards are clipped to ``[-1, 1]``.
    """

    def process_observation(self, observation):
        """Resize one raw frame to ``INPUT_SHAPE`` and convert it to grayscale.

        Args:
            observation: 3-dimensional array, (height, width, channel).

        Returns:
            ``uint8`` numpy array whose shape equals ``INPUT_SHAPE``.

        Raises:
            AssertionError: if ``observation`` is not 3-dimensional or the processed
                image does not end up with shape ``INPUT_SHAPE``.
        """
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        # PIL's `resize` expects (width, height) whereas numpy shapes are (rows, cols),
        # so the size is passed reversed. For a square INPUT_SHAPE nothing changes; for a
        # non-square one this keeps the shape assertion below satisfiable.
        img = img.resize(INPUT_SHAPE[::-1]).convert('L')  # resize and convert to grayscale
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_state_batch(self, batch):
        """Return ``batch`` as ``float32`` divided by 255.

        The scaling is done here rather than in `process_observation` because that would
        require storing `float32` arrays, which are 4x more memory intensive than `uint8`
        ones. This matters if we store 1M observations.
        """
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        """Clip ``reward`` to the closed interval ``[-1, 1]``."""
        return np.clip(reward, -1., 1.)

In [67]:
class Agent:
    """Holds an environment and builds the convolutional Q-network for it.

    ``__init__`` stores the environment as ``env`` and reads the number of
    actions from ``env.action_space.n`` into ``nb_actions``.
    """

    def __init__(self, env):
        """Keep a reference to ``env`` and record the size of its action space."""
        self.env = env
        self.nb_actions = env.action_space.n

    def create_model(self):
        """Build (without compiling) the Q-network.

        Returns:
            A ``Sequential`` model whose input shape is ``(WINDOW_LENGTH,) + INPUT_SHAPE``
            and whose last layers are ``Dense(self.nb_actions)`` followed by a linear
            activation.
        """
        stacked_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
        # (filters, square kernel size, square stride) for each convolution, in order.
        conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

        model = Sequential()

        # Reverse the three non-batch axes so the frame-stack axis ends up last.
        # NOTE(review): (3, 2, 1) also swaps the two spatial axes (each frame is
        # transposed); (2, 3, 1) would move the stack axis without that swap - confirm
        # which is intended.
        model.add(Permute((3, 2, 1), input_shape=stacked_shape))

        for filters, kernel, stride in conv_specs:
            model.add(Convolution2D(filters, (kernel, kernel), strides=(stride, stride)))
            model.add(Activation('relu'))

        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        # One linear output unit per action.
        model.add(Dense(self.nb_actions))
        model.add(Activation('linear'))

        return model

In [68]:
# Create the environment, wrap it in an Agent and build the Q-network.
env = gym.make('MsPacman-v0')
bot = Agent(env=env)
cov_model = bot.create_model()
# The network is intentionally NOT compiled here: DQNAgent.compile() recompiles it with
# its own settings, so a standalone compile (and its learning rate) would be discarded,
# and an 'accuracy' metric is meaningless for Q-value regression anyway.
print(cov_model.output.shape)

(None, 9)


In [69]:
# Replay memory holding at most 10000 entries, sampled in windows of WINDOW_LENGTH.
memory = SequentialMemory(limit=10000, window_length=WINDOW_LENGTH)

# Epsilon-greedy policy whose `eps` attribute is annealed linearly from 1.0 to 0.1 over
# 100000 steps; 0.05 is the value used at test time.
# NOTE(review): the fit call later in this notebook trains for 10000 steps, so epsilon
# would only come down to about 0.91 during training - confirm this is intended.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=100000,
)

In [70]:
print(bot.nb_actions)

# Assemble the agent from the network, replay memory, exploration policy and the
# observation/reward processor.
processor = PacmanProcessor()
dqn = DQNAgent(
    model=cov_model,
    nb_actions=bot.nb_actions,
    memory=memory,
    policy=policy,
    processor=processor,
)

# Compile the agent with Adam (lr 0.00025), tracking mean absolute error as a metric.
dqn.compile(Adam(lr=.00025), metrics=['mae'])

9


In [71]:
# Bare expression: shows the agent's repr as the cell output.
dqn

<rl.agents.dqn.DQNAgent at 0x633cdc828>

In [73]:
# Train the agent on the environment for 10000 steps; the returned History object is
# displayed as the cell output.
dqn.fit(env, nb_steps=10000)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 763.668 seconds


<keras.callbacks.callbacks.History at 0x634b274a8>

In [None]:
# Evaluate the trained agent. `test` requires the environment to run episodes in as its
# first argument; calling it with no arguments raises TypeError.
dqn.test(env)