In [31]:
import gym
import numpy as np
from scipy.spatial.distance import cdist


# Define the environment
class TSPEnv(gym.Env):
  def __init__(self, n_cities=100, show_debug_data = False):
    self.n_cities = n_cities
    self.xy = (np.random.rand(self.n_cities,2)*100).round(2)
    self.x=self.xy[:,0]
    self.y=self.xy[:,1]
    self.step_counter = 0
    self.show_debug_data = show_debug_data

    #print(f'genrated stops xy: {self.xy}')
    self.distance_matrix = cdist(self.xy,self.xy,'euclidean').round(0)

    # self.distance_matrix = np.array([[0, 2448, 1434, 1260, 2045],
    #                   [2448, 0, 2546, 959, 2367],
    #                   [1434, 2546, 0, 2408, 1745],
    #                   [1260, 959, 2408, 0, 2295],
    #                   [2045, 2367, 1745, 2295, 0]])
    self.current_city = np.random.randint(n_cities)
    self.visited_cities = [self.current_city]
    self.remaining_cities = [i for i in range(n_cities) if i != self.current_city]
    # Define the action space
    self.action_space = gym.spaces.Discrete(n_cities)

    # Define the observation space
    self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_cities,), dtype=np.float32)

    print(f'Start in Init: {self.current_city}')

  def reset(self):
    self.step_counter = 0
    self.current_city = np.random.randint(self.n_cities)
    self.visited_cities = [self.current_city]
    self.remaining_cities = [i for i in range(self.n_cities) if i != self.current_city]
    return self._get_observation()

  def step(self, action):
    self.step_counter += 1
    if action < len(self.remaining_cities):
      next_city = self.remaining_cities[action]
      reward = -self.distance_matrix[self.current_city][next_city]
      self.remaining_cities.remove(next_city)
      self.visited_cities.append(next_city)
      self.current_city = next_city
      done = len(self.remaining_cities) == 0
      
      if(self.show_debug_data):
        print(f'Action in step: {action}')
        print(f'Reward in step: {reward}')
        print(f'Current city in step: {self.current_city}')
        print(f'Remaining city in step: {self.remaining_cities}')
        print(f'Visited city in step: {self.visited_cities}')
        print(f'Stepcounter in step: {self.step_counter}')
      return self._get_observation(), reward, done, {}
    else:
      return self._get_observation(), 0, False, {}


  def _get_observation(self):
    observation = np.zeros(self.n_cities)
    observation[self.current_city] = 1
    return observation

  
  def _test_distance(self,CurrentCity, NextCity):
    return -self.distance_matrix[CurrentCity][NextCity]


env = TSPEnv() 

Start in Init: 36


In [33]:
env.reset()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [30]:
import tensorflow as tf

class DQNAgent:

  def __init__(self, n_actions, learning_rate=0.01, discount_factor=0.95, epsilon=0.1, batch_size=64):
    self.n_actions = n_actions
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor
    self.epsilon = epsilon
    self.replay_memory = []
    self.model = self._build_model(env.observation_space.shape[0], env.action_space.n)
    self.batch_size = batch_size

  def _build_model(self, input_dim, action_size):
    model = tf.keras.Sequential(
      [
        tf.keras.layers.Dense(units=32, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.Dense(units=10, activation='softmax'),
        tf.keras.layers.Dense(action_size, activation='linear'),
      ])
    
    model(tf.ones(1,3))
    return model

  def act(self, state):
    if np.random.rand() < self.epsilon:
      # Explore: choose a random action
      return np.random.randint(self.n_actions)
    else:
      # Exploit: choose the action with the highest Q value
      q_values = self.model.predict(state)
      return np.argmax(q_values[0])

  def remember(self, state, action, reward, next_state, done):
    self.replay_memory.append((state, action, reward, next_state, done))

  def train(self):
    # Sample a minibatch from the replay memory
    minibatch = np.random.choice(self.replay_memory, self.batch_size)
    for state, action, reward, next_state, done in minibatch:
      q_update = reward
      if not done:
        q_update = (reward + self.discount_factor * np.amax(self.model.predict(next_state)[0]))
      q_values = self.model.predict(state)
      q_values[0][action] = q_update
      self.model.fit(state, q_values, verbose=0)

# Initialize the agent
agent = DQNAgent(n_actions=env.action_space.n)

# Train the agent
while True:
  # Reset the environment
  state = env.reset()
  # Run the episode
  while True:
    # Choose an action
    action = agent.act(state)
    # Take a step in the environment
    next_state, reward, done, _ = env.step(action)
    # Remember the experience
    agent.remember(state, action, reward, next_state, done)
    # Update the state
    state = next_state
    # If the episode is done, exit the loop
    if done:
      break
  # Train the agent
  agent.train()


ValueError: Exception encountered when calling layer "sequential_9" "                 f"(type Sequential).

Input 0 of layer "dense_40" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (1,)

Call arguments received by layer "sequential_9" "                 f"(type Sequential):
  • inputs=tf.Tensor(shape=(1,), dtype=int32)
  • training=None
  • mask=None

In [None]:
import tensorflow as tf
import numpy as np
import random

# Define the DQN model
class DQN(tf.keras.Model):
  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.fc1 = tf.keras.layers.Dense(32, input_shape=(input_dim,), activation="relu")
    self.fc2 = tf.keras.layers.Dense(output_dim)

  # def call(self, x):
  #   x = self.fc1(x)
  #   x = self.fc2(x)
  #   return x

  def call(self, inputs):
    # Concatenate the inputs into a single tensor
    x = tf.concat(inputs, axis=-1)
    # Pass the tensor through the model
    return self.model(x)

# Define the DQNAgent
class DQNAgent:
  def __init__(self, env, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, 
               alpha=1e-3, alpha_decay=0.01, gamma=0.99, memory_size=10000, 
               batch_size=64):
    self.env = env
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.alpha = alpha
    self.alpha_decay = alpha_decay
    self.gamma = gamma
    self.memory = []
    self.batch_size = batch_size

    # Define the model and the target model
    self.model = DQN(env.observation_space.shape[0], env.action_space.n)
    self.target_model = DQN(env.observation_space.shape[0], env.action_space.n)
    self.target_model.set_weights(self.model.get_weights())

    # Define the optimizer
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=alpha)

  def remember(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    if np.random.rand() <= self.epsilon:
      return self.env.action_space.sample()
    else:
      state = np.expand_dims(state, axis=0)
      q_values = self.model(state)
      return np.argmax(q_values[0]).numpy()

  def update(self):
    # Don't update if there are not enough samples in the memory
    if len(self.memory) < self.batch_size:
      return

    # Sample a batch from the memory
    samples = random.sample(self.memory, self.batch_size)

    # Split the batch into separate variables
    states, actions, rewards, next_states, dones = zip(*samples)

    # Convert variables to arrays
    states = np.vstack(states)
    actions = np.vstack(actions)
    rewards = np.vstack(rewards)

        # Calculate the Q values for the current states
    q_values = self.model(np.array(states))
    q_values = tf.gather(q_values, actions, axis=1)

    # Calculate the Q values for the next states
    next_q_values = self.target_model(next_states).max(1)[0]

    # Calculate the target Q values
    target_q_values = rewards + (self.gamma * next_q_values * (1 - dones))

    # Calculate the loss
    loss = tf.losses.mean_squared_error(target_q_values, q_values)

    # Perform backpropagation
    with tf.GradientTape() as tape:
      grads = tape.gradient(loss, self.model.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

    # Update the target model
    self.target_model.set_weights(self.model.get_weights())

    # Update the epsilon value
    self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)



In [None]:
import tensorflow as tf
import numpy as np
import random

# Instantiate the environment
env = TSPEnv()

# Instantiate the agent
agent = DQNAgent(env)

# Set the number of episodes to run
n_episodes = 10

# Set the initial reward to 0
total_reward = 0

# Run the episodes
for episode in range(n_episodes):
  # Reset the environment and get the initial state
  state = env.reset()

  while True:
    # Take an action
    action = agent.act(state)

    # Step the environment
    next_state, reward, done, _ = env.step(action)

    # Remember the experience
    agent.remember(state, action, reward, next_state, done)

    # Update the state and the reward
    state = next_state
    total_reward += reward

    # Update the agent
    agent.update()

    # If the episode is done, break the loop
    if done:
      break

  # Print the total reward for the episode
  print(f"Episode: {episode+1}, Reward: {total_reward}")
