In [1]:
import numpy as np
import tensorflow as tf
from datetime import datetime
import matplotlib.pyplot as plt

2023-12-09 13:15:49.422403: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-09 13:15:49.424007: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-09 13:15:49.460485: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-09 13:15:49.461617: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Definition of Hyperparameters

In [2]:
# Timestamp taken when this cell runs.
start_script_time = datetime.now()

# Training hyperparameters.
actions_n, memory_steps = 4, 512   # number of actions; rows held by the agent's Memory
steps = 6_000                      # iterations of the training loop
gamma = 0.75                       # weight applied to the best next-state value in the target

### Definition of the Memory Buffer for Training the Model

In [3]:
class Memory:
    """Fixed-capacity circular store of (x, y) rows used as training data.

    Rows are written one at a time. Once the last slot has been filled the
    write position wraps back to the first slot and `ready` is switched on.
    """

    def __init__(self, memory_length, actions_n, state_features):
        """Allocate zero-filled buffers.

        memory_length:  number of rows each buffer holds.
        actions_n:      width of a `buffer_y` row.
        state_features: width of a `buffer_x` row.
        """
        self.memory_length = memory_length
        self.index = 0        # slot the next add() call writes to
        self.ready = False    # turns True the first time the write position wraps
        self.buffer_x = np.zeros((memory_length, state_features))
        self.buffer_y = np.zeros((memory_length, actions_n))

    def add(self, x, y):
        """Copy `x` and `y` into the current slot, then advance the write position."""
        slot = self.index
        self.buffer_x[slot] = x
        self.buffer_y[slot] = y

        following = slot + 1
        if following < self.memory_length:
            self.index = following
            return

        # The final slot was just written: wrap around and flag the buffer as full.
        self.index = 0
        self.ready = True
            

### Agent Implementation

In [4]:
class Agent:
    """Package-delivery agent with a two-flag state and two Q-value networks.

    The state is a float32 vector of length 2:
        state[0] -- location (0 = Warehouse 1, 1 = Warehouse 2)
        state[1] -- package flag (0 = not carrying, 1 = carrying)
    """

    def __init__(self, memory_length, actions_n):
        """Create the initial state, the training memory and both networks.

        memory_length: number of rows kept by the training Memory.
        actions_n:     number of actions, i.e. the width of each network's output.
        """
        self.state = None
        self.reset_state()
        self.exploration_rate = 1
        self.memory = Memory(memory_length = memory_length, actions_n = actions_n, state_features = len(self.state))
        self.model = self.get_model(actions_n = actions_n, state_features = len(self.state))
        self.model_2 = self.get_model(actions_n = actions_n, state_features = len(self.state))

    def reset_state(self):
        """Put the agent back in Warehouse 1 without a package."""
        self.state = np.asarray([0, 0], dtype = np.float32)

    def execute(self, action):
        """Apply `action` to the state in place and return the reward.

        Actions:
            0 -- take the package (only has an effect in Warehouse 1)
            1 -- leave the package (only has an effect while carrying one)
            2 -- go to Warehouse 2
            3 -- go to Warehouse 1
        Any other value leaves the state untouched.

        Returns 5 when a carried package is left in Warehouse 2, otherwise -1.
        """
        # state[0] = Warehouse 1 or 2
        # state[1] = Package received yes/no
        reward = -1
        if action == 0 and self.state[0] == 0: # Take the package
            self.state[1] = 1  # Package taken
        elif action == 1:   # Leave the package
            if self.state[1]:
                self.state[1] = 0    # No package
                if self.state[0] == 1:
                    reward = 5
        elif action == 2:       # Go to Warehouse 2
            self.state[0] = 1   # I am in Warehouse 2
        elif action == 3:       # Go to warehouse 1
            self.state[0] = 0
        return reward

    def get_model(self, actions_n, state_features):
        """Build, compile and return a small dense network.

        state_features: length of the input state vector.
        actions_n:      number of linear outputs (one value per action).

        The model summary is printed as a side effect.
        """
        # `shape` must be a tuple: `(state_features)` is just an int, whereas
        # `(state_features,)` is the one-element tuple Keras documents.
        inputs = tf.keras.layers.Input(shape=(state_features,))
        dense = tf.keras.layers.Dense(8, activation= 'swish')(inputs)
        dense = tf.keras.layers.Dense(8, activation='swish')(dense)
        outputs = tf.keras.layers.Dense(actions_n, activation='linear')(dense)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.MeanSquaredError())
        model.summary()

        return model

### Implementation of the Double Deep Q-Network Model

In [6]:
# Schedule and bookkeeping constants for the training loop below.
REWARD_WINDOW = 250       # number of most recent rewards kept in cum_rewards
TRAIN_EVERY = 5           # fit both networks every N-th step once the memory is full
TEST_EVERY = 100          # run a greedy test rollout every N-th step
TEST_STEPS = 64           # length of each greedy test rollout
MIN_EXPLORATION = .05     # the exploration rate stops decaying once it is at or below this
EXPLORATION_DECAY = .001  # amount subtracted from the exploration rate per step

a = Agent(memory_length = memory_steps, actions_n=actions_n)
q2 = np.zeros(actions_n, dtype = np.float32)
__s = np.asarray([0, 0], dtype = np.float32)
cum_rewards = []
dqn_cum_rewards_log = []
dqn_tests_log = []

def max_min(v1, v2):
    """Return, element by element, `v1` where it is smaller than `v2`, else `v2`."""
    return np.where(v1 < v2, v1, v2)

# Value estimate for the starting state: the element-wise smaller prediction
# of the two networks.
p = a.model(np.expand_dims(a.state, axis=0), training= False)[0]
p2 = a.model_2(np.expand_dims(a.state, axis=0), training = False)[0]
q = max_min(p, p2)

for i in range(steps):
    # Keep a copy of the state the action is taken from; execute() mutates
    # a.state in place.
    __s[0] = a.state[0]
    __s[1] = a.state[1]

    # Epsilon-greedy choice: act greedily on q, otherwise pick a random action.
    if np.random.random() > a.exploration_rate:
        action = np.argmax(q)

    else:
        # Sample from the configured action count rather than a hard-coded 4.
        action = np.random.randint(actions_n)

    r = a.execute(action)
    cum_rewards.append(r)
    if len(cum_rewards) > REWARD_WINDOW:
        cum_rewards = cum_rewards[1:]


    # Estimate for the state reached after the action.
    p = a.model(np.expand_dims(a.state, axis= 0), training = False)[0]
    p2 = a.model_2(np.expand_dims(a.state, axis = 0), training = False)[0]
    q2 = max_min(p, p2)
    # Blend the old value of the chosen action (10%) with the new target
    # r + gamma * max(q2) (90%), then store the pair (previous state, q).
    q[action] = q[action] * .1 + (r + np.max(q2) * gamma) * .9
    a.memory.add(__s, q)
    q = q2

    if i % TRAIN_EVERY == 0 and a.memory.ready:
        # Both networks are fitted on the same buffer contents.
        h = a.model.fit(a.memory.buffer_x, a.memory.buffer_y, verbose = 0, batch_size = 128)
        h2 = a.model_2.fit(a.memory.buffer_x, a.memory.buffer_y, verbose = 0, batch_size= 128)

    if i % TEST_EVERY == 0 and i >a.memory.memory_length:
        # The test rollout resets and mutates a.state. Save the training state
        # and restore it afterwards; otherwise the next training step would
        # pair the rollout's final state with `q`, which was computed for the
        # state the training run was actually in.
        training_state = a.state.copy()
        a.reset_state()
        rewards = 0
        for _ in range(TEST_STEPS):
            state_batch = np.expand_dims(a.state, axis= 0)
            # Greedy action on the sum of both networks' predictions.
            next_action = np.argmax(a.model(state_batch, training=False)[0] + a.model_2(state_batch, training=False)[0])
            rewards += a.execute(next_action)
        a.state = training_state
        print('#', i, '\t{:.3}'.format(a.exploration_rate), '\t', np.sum(cum_rewards), '\t', h.history['loss'], '\t',h2.history['loss'], '\t', rewards)
        dqn_tests_log.append(rewards)

    dqn_cum_rewards_log.append(np.sum(cum_rewards))

    if a.exploration_rate > MIN_EXPLORATION:
        a.exploration_rate -= EXPLORATION_DECAY
            
    

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2)]               0         
                                                                 
 dense (Dense)               (None, 8)                 24        
                                                                 
 dense_1 (Dense)             (None, 8)                 72        
                                                                 
 dense_2 (Dense)             (None, 4)                 36        
                                                                 
Total params: 132 (528.00 Byte)
Trainable params: 132 (528.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input

### Model Predictions

In [8]:
# Walk the agent through one take -> move -> drop -> return cycle. Before each
# action, print the first network's prediction for the current state; after
# it, print the reward and the resulting state.
a.reset_state()
for chosen_action, label in (
    (0, 'exec 0 ->'),
    (2, 'exec 2 ->'),
    (1, 'exec 1 ->'),
    (3, 'exec 3 ->'),
):
    state_batch = np.expand_dims(a.state, axis=0)
    print(a.state, '->', a.model.predict(state_batch)[0])
    print(a.execute(chosen_action))
    print(label, a.state)


[0. 0.] -> [ 0.736439  -1.3480597 -1.4356552 -1.2563772]
-1
exec 0 -> [0. 1.]
[0. 1.] -> [-0.8751782  -1.3507099   2.4682293  -0.96553946]
-1
exec 2 -> [1. 1.]
[1. 1.] -> [-7.6798165e-01  4.6491847e+00 -3.3885241e-05 -7.6573628e-01]
5
exec 1 -> [1. 0.]
[1. 0.] -> [-1.8110086  -1.8265507  -1.7182546  -0.44987726]
-1
exec 3 -> [0. 0.]
