In [1]:
import random
import numpy as np
from collections import deque
from keras.models import Sequential, Model
import keras.models
from keras.layers import Dense, Input, Lambda
from keras.layers.convolutional import Conv2D
from keras import backend as K
from keras.engine.topology import Layer
from keras.layers.merge import _Merge,Multiply
from keras.optimizers import Adam
import tensorflow as tf

class QLayer(_Merge):
    '''Merge layer that combines a value stream and an advantage stream into Q-values.

    The output is ``value + (advantage - mean(advantage))``, where the mean is
    taken over the last axis (the per-action axis produced by the preceding
    ``Dense`` layers) and kept as a size-1 axis so it broadcasts back.
    '''
    def _merge_function(self, inputs):
        '''Combine ``[value, advantage]`` (in that order) into a Q tensor.

        The advantage is centred over the last axis rather than axis 1: for
        rank-2 ``(batch, actions)`` inputs the two are the same axis, but for
        higher-rank inputs axis 1 is not the action axis, and centring over a
        size-1 axis would cancel the advantage term entirely.
        '''
        value, advantage = inputs[0], inputs[1]
        centred_advantage = advantage - K.mean(advantage, axis=-1, keepdims=True)
        return value + centred_advantage
    
class DQNSolver():
    """Q-learning agent with a convolutional value/advantage network.

    Holds an online Keras model, a target copy of it, a bounded replay memory
    and an epsilon-greedy exploration rate that decays on every replay step.
    """

    # Discount factor applied to the bootstrapped next-state value.
    GAMMA = 0.95
    # Learning rate handed to the Adam optimizer.
    LEARNING_RATE = 0.001

    # Maximum number of transitions kept in the replay memory.
    MEMORY_SIZE = 1000000
    # Number of transitions sampled per call to experience_replay().
    BATCH_SIZE = 20

    # Exploration rate: initial value, lower bound, and per-replay multiplier.
    EXPLORATION_MAX = 1.0
    EXPLORATION_MIN = 0.01
    EXPLORATION_DECAY = 0.995

    def __init__(self, observation_space, action_space,height,width):
        """Build the online network, its target copy and the replay memory.

        Args:
            observation_space: Not used by this implementation; kept so that
                existing callers keep working.
            action_space: Number of discrete actions (width of the Q output).
            height: Height of the single-channel input image.
            width: Width of the single-channel input image.
        """
        self.exploration_rate = self.EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=self.MEMORY_SIZE)

        self.inputs=Input(shape=(height,width,1))
        self.actions = Input(shape=(1,), dtype='int32')
        self.actions_onehot = Lambda(K.one_hot, arguments={'num_classes':self.action_space}, output_shape=(None, self.action_space))(self.actions)

        x = Conv2D(filters=32, kernel_size=[2,2], strides=[4,4], input_shape=(height,width,1),activation="elu")(self.inputs)
        x = Conv2D(filters=64, kernel_size=[2,2],strides=[2,2],activation="elu")(x)
        x = Conv2D(filters=512, kernel_size=[1,1],strides=[1,1],activation="elu")(x)

        # Split the channels of the last conv layer in half: the first half
        # feeds the value stream, the second half the advantage stream.
        x_value = Lambda(lambda x: x[:,:,:,:512//2])(x)
        x_advantage = Lambda(lambda x: x[:,:,:,512//2:])(x)

        # Project each stream onto one output per action.
        value = Dense(self.action_space, activation="linear")(x_value)
        advantage = Dense(self.action_space, activation="linear")(x_advantage)

        # Recombine the value and advantage streams into the Q output, then
        # mask it with the one-hot action input for the second output.
        q = QLayer()([value, advantage])
        self.q_out = Multiply()([q, self.actions_onehot])
        self.q_out = Lambda(lambda x: K.cumsum(x, axis=3), output_shape=(1,))(self.q_out)

        # NOTE(review): the original author left a note that the way actions
        # are represented during training is still unresolved.
        self.model = Model(inputs=[self.inputs, self.actions], outputs=[q, self.q_out])
        self.model.compile(optimizer=Adam(lr=self.LEARNING_RATE), loss="mean_squared_error")

        self.target_model=self.copy_model()

    @staticmethod
    def _as_batch(state):
        """Return `state` as an array with a leading batch axis of size 1."""
        arr = np.array(state)
        return arr.reshape((1, arr.shape[0], arr.shape[1], arr.shape[2]))

    @staticmethod
    def _placeholder_actions():
        """Return the constant action-input batch fed to every predict/fit call."""
        # NOTE(review): the action input is always the constant 1, never the
        # action actually taken -- confirm this is intended.
        return np.asarray([1])

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def copy_model(self):
        """Returns a copy of a keras model."""
        # Round-trips the online model through the file 'tmp_model' in the
        # current working directory.
        self.model.save('tmp_model')
        return keras.models.load_model('tmp_model',custom_objects={'QLayer':QLayer, 'tf':tf})

    def act(self, state):
        """Pick an action epsilon-greedily.

        With probability `exploration_rate` a uniformly random action index is
        returned; otherwise the argmax of the online model's Q output.

        Returns:
            int: action index in ``range(self.action_space)``.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q = self.model.predict([self._as_batch(state), self._placeholder_actions()])[0]
        # Index down to the per-action vector, exactly as experience_replay()
        # does. Taking the argmax one level higher mixes spatial positions
        # into the index and previously required clamping the result to 4.
        return int(np.argmax(q[0][0][0]))

    def experience_replay(self):
        """Fit the online model on a random minibatch from the replay memory.

        Does nothing until at least BATCH_SIZE transitions are stored. For
        each sampled non-terminal transition the next action is chosen by the
        online model and valued by the target model. Afterwards the
        exploration rate is decayed (bounded below by EXPLORATION_MIN) and the
        online weights are copied into the target model.
        """
        if len(self.memory) < self.BATCH_SIZE:
            return
        batch = random.sample(self.memory, self.BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                next_batch = self._as_batch(state_next)
                online_q = self.model.predict([next_batch, self._placeholder_actions()])[0]
                a_dash = np.argmax(online_q[0][0][0])
                target_q = self.target_model.predict([next_batch, self._placeholder_actions()])[0]
                q_update = reward + self.GAMMA * target_q[0][0][0][a_dash]

            fit_in = [self._as_batch(state), self._placeholder_actions()]
            _, q_values = self.target_model.predict(fit_in)
            q_values[0][0][0][action] = q_update
            # NOTE(review): the target for the first (Q) output is built from
            # the model's second output, and the target for the second output
            # is the constant 1. This looks unfinished -- confirm the intended
            # training targets before relying on it.
            fit_out = [q_values, np.array([1])]
            self.model.fit(fit_in, fit_out, verbose=0)

        self.exploration_rate *= self.EXPLORATION_DECAY
        self.exploration_rate = max(self.EXPLORATION_MIN, self.exploration_rate)
        self.target_model.set_weights(self.model.get_weights())
        
        

Using TensorFlow backend.


In [2]:
import keras
# Record the installed Keras version alongside the notebook output.
print(keras.__version__)

2.2.4


In [10]:
import numpy as np
# Sanity check: a (1, 1, 1, 5) array reshapes cleanly into a (5, 1) column.
s = np.asarray([[[[1, 2, 3, 4, 5]]]])
print(np.shape(s))
print(s.reshape(5, 1))

(1, 1, 1, 5)
[[1]
 [2]
 [3]
 [4]
 [5]]
