In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist



# Define the environment
class TSPEnv(gym.Env):
  def __init__(self, n_cities=100, show_debug_data = False):
    self.n_cities = n_cities
    self.xy = (np.random.rand(self.n_cities,2)*100).round(2)
    self.x=self.xy[:,0]
    self.y=self.xy[:,1]
    self.step_counter = 0
    self.show_debug_data = show_debug_data
    self._array_visited = np.zeros(self.n_cities)
    self._total_distance = 0

    #print(f'genrated stops xy: {self.xy}')
    self.distance_matrix = cdist(self.xy,self.xy,'euclidean').round(0)


    self.current_city = np.random.randint(n_cities)
    self.visited_cities = [self.current_city]
    self._array_visited[self.current_city] = 1
    self.remaining_cities = [i for i in range(n_cities)]
    self.cities_list = [i for i in range(n_cities)]
    self.remaining_cities.remove(self.current_city)
    # Define the action space
    self.action_space = gym.spaces.Discrete(n_cities)

    # Define the observation space
    #self.observation_space = gym.spaces.Box(low=0, high=1, shape=(n_cities,), dtype=np.int32)
    self.observation_space = gym.spaces.MultiBinary(n_cities)

    if(self.show_debug_data):
        print(f'Current city in init: {self.current_city}')
        print(f'Remaining city in init: {self.remaining_cities}')
        print(f'Visited city in init: {self.visited_cities}')
        print(f'Stepcounter in init: {self.step_counter}')
        print(f'Observation in init: {self._array_visited}')

  def reset(self):
    self.step_counter = 0
    self._total_distance = 0
    self._array_visited = np.zeros(self.n_cities)
    self.current_city = np.random.randint(self.n_cities)
    self._array_visited[self.current_city] = 1
    self.visited_cities = [self.current_city]
    self.remaining_cities = [i for i in range(self.n_cities)]
    self.remaining_cities.remove(self.current_city)
    
    if(self.show_debug_data):
      print(f'Observation in Reset: {self._array_visited}')
    return self._get_observation()

  def step(self, action):
    done = False
    self.step_counter += 1
    reward = -999999999

    if(self.show_debug_data):
      print(f'Action in Step(top): {action}')
      print(f'Remaining-City Step(top): {self.remaining_cities}')
      print(f'Observation in step: {self._array_visited}')
    
    if (action in self.remaining_cities):
      if(self.show_debug_data):
        print('Action True')
      
      reward = -self.distance_matrix[self.current_city][action]
      self._total_distance += reward

      self.remaining_cities.remove(action)
      self.visited_cities.append(action)
      self._array_visited[action] = 1
      self.current_city = action

      if (len(self.remaining_cities) == 0):
        startingpoint = self.visited_cities[0]
        self.visited_cities.append(startingpoint)
        reward += -self.distance_matrix[self.current_city][startingpoint]
        done = True
      
      if(self.show_debug_data):
        print(f'Action in step: {action}')
        print(f'Reward in step: {reward}')
        print(f'Current city in step: {self.current_city}')
        print(f'Remaining city in step: {self.remaining_cities}')
        print(f'Visited city in step: {self.visited_cities}')
        print(f'Stepcounter in step: {self.step_counter}')
        print(f'Observation in step: {self._array_visited}')
      
      return self._get_observation(), reward, done, {}
    else:
        return self._get_observation(), reward, False, {}



  def _get_observation(self):
    # observation = np.zeros(self.n_cities)
    # observation[self.current_city] = 1
    # return observation
    return self._array_visited

  
  def _test_distance(self,CurrentCity, NextCity):
    return -self.distance_matrix[CurrentCity][NextCity]
    
  def plotCities(self):
    fig, ax = plt.subplots(1, figsize=(7,7))
    fig.suptitle = "Delivery Stops"
    plt.scatter(self.x,self.y)
    xcoord = []
    ycoord = []
    for i in range(0,len(self.visited_cities)):
      xcoord.append(self.x[self.visited_cities[i]])
      ycoord.append(self.y[self.visited_cities[i]])
      if(i == 0):
        ax.annotate("Anfang", xy=(xcoord[i], ycoord[i]), xytext=(xcoord[i]+0.5, ycoord[i]))
      ax.annotate(str(i), xy=(xcoord[i], ycoord[i]), xytext=(xcoord[i]+0.5, ycoord[i]))

    plt.plot(xcoord, ycoord)


In [None]:
env = TSPEnv(10,False)
for i in range(0,env.n_cities):
    env.step(i)
print(env.visited_cities)    
env.plotCities()

In [2]:
import random
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the DQN model
class DQN(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()
    self.fc1 = nn.Linear(input_dim, 512)
    self.relu1 = nn.ReLU()
    self.fc2 = nn.Linear(512, 256)
    self.relu2 = nn.ReLU()
    self.fc3 = nn.Linear(256, output_dim)

  def forward(self, x):
    x = self.fc1(x)
    x = self.relu1(x)
    x = self.fc2(x)
    x = self.relu2(x)
    x = self.fc3(x)
    return x


# Define the DQNAgent
class DQNAgent:
  def __init__(self, env, epsilon=0.01, epsilon_decay=0.995, epsilon_min=0.01, 
               alpha=1e-3, alpha_decay=0.01, gamma=0.99, memory_size=10000, 
               batch_size=64):
    self.env = env
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.alpha = alpha
    self.alpha_decay = alpha_decay
    self.gamma = gamma
    self.memory = deque(maxlen=memory_size)
    self.batch_size = batch_size

    # Define the model and the target model
    self.model = DQN(env.observation_space.shape[0], env.action_space.n).to(device)
    self.target_model = DQN(env.observation_space.shape[0], env.action_space.n).to(device)
    self.target_model.load_state_dict(self.model.state_dict())

    # Define the optimizer
    self.optimizer = optim.Adam(self.model.parameters(), lr=alpha)

    # Define the loss function
    self.loss_fn = nn.MSELoss()

  def remember(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    if np.random.rand() <= self.epsilon:
      return self.env.action_space.sample()
    else:
      state = torch.from_numpy(state).float().to(device)
      q_values = self.model(state)
      return q_values.argmax().item()

  def update(self):
    # Don't update if there are not enough samples in the memory
    if len(self.memory) < self.batch_size:
      return

    # Sample a batch from the memory
    samples = random.sample(self.memory, self.batch_size)

    # Split the batch into separate variables
    states, actions, rewards, next_states, dones = zip(*samples)

    # Convert variables to tensors and move them to the device
    states = torch.from_numpy(np.vstack(states)).float().to(device)
    actions = torch.from_numpy(np.vstack(actions)).long().to(device)
    rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
    next_states = torch.from_numpy(np.vstack(next_states)).float().to(device)
    dones = torch.from_numpy(np.vstack(dones)).float().to(device)

    # Calculate the Q values for the current states
    q_values = self.model(states)
    q_values = q_values.gather(1, actions)

    # Calculate the Q values for the next states
    next_q_values = self.target_model(next_states).max(1)[0].unsqueeze(1)

    # Calculate the target Q values
    target_q_values = rewards + (self.gamma * next_q_values * (1 - dones))

    # Calculate the loss
    loss = self.loss_fn(q_values, target_q_values)

    # Perform backpropagation
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # Update the target model
    for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
      target_param.data.copy_(param.data * (1 - self.alpha_decay) + target_param.data * self.alpha_decay)

    # Update the epsilon value
    self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
  
  def SaveAgent(self):
    torch.save(self.model.state_dict(),'model.pt')
    print('Model saved')

In [14]:
import gym
import numpy as np


# Instantiate the environment
env = TSPEnv(10)

# Instantiate the agent
agent = DQNAgent(env)

# Set the number of episodes to run
n_episodes = 1000

#lists for learning evaluation
scores = []


# Run the episodes
for episode in range(n_episodes):
  # Reset the environment and get the initial state
  state = env.reset()

  # Set the initial reward to 0
  total_reward = 0
  total_steps = 0
  Route = []

  while True:
    # Take an action
    action = agent.act(state)

    # Step the environment
    next_state, reward, done, _ = env.step(action)

    # Remember the experience
    agent.remember(state, action, reward, next_state, done)

    # Update the state and the reward
    state = next_state
    total_reward += reward
    total_steps += 1

    # Update the agent
    agent.update()

    # If the episode is done, break the loop
    if done:
      break
  
  scores.append((episode, total_reward, total_steps,env.visited_cities, env._total_distance, env.xy))
  #env.plotCities()
  # Print the total reward for the episode
  print(f"Episode: {episode+1}, Reward: {total_reward}, Steps needed: {total_steps}, Total Distance: {env._total_distance}, Visited Cities: {env.visited_cities}")

agent.SaveAgent()

# Open a file handle in write mode
with open('my_file.txt', 'w') as f:
    # Write the list to the file, one item per line
    f.writelines([f"{item}\n" for item in scores])



Episode: 1, Reward: -81000000322.0, Steps needed: 90, Total Distance: -392.0, Visited Cities: [1, 5, 9, 4, 6, 2, 0, 8, 3, 7, 1]
Episode: 2, Reward: -351999999996.0, Steps needed: 361, Total Distance: -309.0, Visited Cities: [3, 7, 2, 9, 4, 0, 8, 6, 1, 5, 3]
Episode: 3, Reward: -275000000275.0, Steps needed: 284, Total Distance: -491.0, Visited Cities: [9, 8, 3, 5, 1, 0, 6, 7, 4, 2, 9]
Episode: 4, Reward: -338000000214.0, Steps needed: 347, Total Distance: -499.0, Visited Cities: [5, 2, 1, 8, 9, 7, 6, 0, 3, 4, 5]
Episode: 5, Reward: -767999999909.0, Steps needed: 777, Total Distance: -613.0, Visited Cities: [9, 4, 7, 0, 3, 8, 2, 5, 6, 1, 9]
Episode: 6, Reward: -753999999691.0, Steps needed: 763, Total Distance: -412.0, Visited Cities: [4, 1, 7, 5, 3, 6, 9, 0, 2, 8, 4]
Episode: 7, Reward: -622999999863.0, Steps needed: 632, Total Distance: -442.0, Visited Cities: [3, 6, 8, 0, 2, 7, 5, 9, 1, 4, 3]
Episode: 8, Reward: -1578999998912.0, Steps needed: 1588, Total Distance: -453.0, Visited Ci