In [16]:
## Create the TSP Environment
import numpy as np
import gym
from gym import spaces
import pygame

from scipy.spatial.distance import cdist
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
import matplotlib.pyplot as plt
plt.style.use("seaborn-dark")

class TSPEnvironment(gym.Env):
    def __init__ (self, n_stops = 100, version=True, debugInfo=False):
        print(f"TSP-Environment initialized with {n_stops} random stops")

        #True (V1) = Discrete Space / False = Array space
        self.version = version
        self._debugInfo = debugInfo

        # Initialization
        #Number of stops
        self.n_stops = n_stops
        #Coordinates of stops
        self.xy = []
        self._visitedStops = []
        self._notVisitedStops = list(range(0,self.n_stops))
        
        #if(self.version):
        
        #self.action_space = self._notVisitedStops
        self.episode_length = 0
        self.step_count = 0
        self.distances = np.array

        #set starting point (state)
        
        self.array_visitedStops = np.zeros(n_stops)

        #Generate stops
        self._generate_stops()
        #self._generate_q_values()

        self.action_space = spaces.Discrete(n_stops)
        #else:
            #self.action_space = spaces.Box(np.array(range(0,self.n_stops))
            
        
        self.array1 = np.ones(n_stops)


        high = np.array(
            [
                np.zeros(4),
                np.zeros(4),
            ],
            dtype=np.int32,
        )
        
        if(self.version):
            self.observation_space = spaces.Box(-high, high, dtype=np.int32)
        else:
            self.observation_space = spaces.MultiBinary(self.n_stops)


    def _generate_stops(self):
        self.xy = (np.random.rand(self.n_stops,2)*100).round(2)
        self.x=self.xy[:,0]
        self.y=self.xy[:,1]

        #print(f'genrated stops xy: {self.xy}')
        self.distances = cdist(self.xy,self.xy,'euclidean').round(0)
        
        #pick random StartPoint
        self._visitedStops.append(np.random.randint(0,self.n_stops))
        self._debugInfo
        print(f'Starting Point: {self._visitedStops}')


    #return stops[-1]
    #Gibt die aktuelle Position des Agenten zurück
    def _get_state(self):
        if( len(self._visitedStops)>0):
            return self._visitedStops[-1]
        else:
            return 0 


    def render(self):
        pass

    #Resets StartingPoint
    def reset(self):
        self._visitedStops.clear()
        self.array_visitedStops = np.zeros(self.n_stops)
        #self._notVisitedStops = list(range(0,self.n_stops))

        first_stop = np.random.randint(self.n_stops)
        #self._notVisitedStops.remove(first_stop)
        self._visitedStops.append(first_stop)
        self.array_visitedStops[first_stop] = True
        self.step_count = 1

        if(self.version):
            return self._get_state()
        else:
            return np.array(self.xy), {}

    def step(self,destination):
        done = False
        self.step_count +=1
        reward = -self.n_stops*1000

        self.episode_length += 1
        if(self.episode_length < 1000):
            if(np.random.rand(1,1) < 0.1):
                destination = np.random.randint(0,self.n_stops)

        #Validize Step
        if(self._get_state() != destination & destination not in self._visitedStops):
            #Get reward for such a move
            reward = -self.distances[self._get_state(), destination]
            
            # Append state (new position)
            self._visitedStops.append(destination)
            self.array_visitedStops[destination] = True

        if(self._debugInfo):
            print(f'State in step: {self._get_state()}')
            print(f'Destination in step: {destination}')
            print(f'Length visited stops: {len(self._visitedStops)}')
            print(f'Visited Stops in step: {self._visitedStops}')
            print(f'Reward in step: {reward}')
            print(f'Stepcounter: {self.step_count}')
        
        #exploration = np.random.random_sample()
        #print(f'Exploration: {exploration}')
        #if(exploration >= 0.05):
            #destination = np.random.randint(self.action_space.n)
            #print(f'Random Destination: {destination}')

        # if(self.step_count >= self.n_stops*5):
        #     done = True
        #     reward = -2000000
        #     if(self._debugInfo):
        #         print('Too much steps')

        if(len(self._visitedStops) == self.n_stops):
            if(len(self._visitedStops) <= self.step_count*2):
                reward += 500
            reward += 200    
            done = True
            print(f'Done = True')
            print(f'Length visited stops: {len(self._visitedStops)}')
            print(f'Visited Stops in step: {self._visitedStops}')
            
        info = {}

        #print(f'Agent position: {self._get_state()}')

        if(self.version):
            return self._get_state(), reward, done, {}
        else:
            return np.array(self.array_visitedStops, reward, done, {})

  plt.style.use("seaborn-dark")


In [18]:
env = TSPEnvironment(4, False)
print(env.reset())
# env.step(2)
# print(env.observation_space.shape[0])

TSP-Environment initialized with 4 random stops
Starting Point: [2]
(array([[54.57, 41.98],
       [ 5.33, 86.36],
       [64.55, 61.67],
       [77.52, 62.32]]), {})


In [None]:
episodes = 10 

env = TSPEnvironment(episodes, False)
print(env.array_visitedStops)
print(f'Step: {env.step(2)}')
print(f'Step: {env.step(3)}')
print(env.array_visitedStops.shape)
print(f'Reset: {env.reset()}')




In [19]:
import keras
import tensorflow as tf

model = tf.keras.Sequential(
        [
    #model.add(Input(shape=(3,)))
    
            keras.layers.Dense(512 , activation='relu', kernel_initializer='he_uniform'),
            keras.layers.Dense(256 ,activation='relu', kernel_initializer='he_uniform'),
            keras.layers.Dense(1, activation='linear', kernel_initializer='he_uniform'),
        ]
)
    
x = tf.ones((1,3))
y = model(x)
model.summary()

KeyboardInterrupt: 

In [None]:
episodes = 10 

env = TSPEnvironment(episodes)


for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    Loops = 0
    

    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
        Loops += 1
    print('Epsiode: {} Score: {} Episodes: {}'.format(episode,score.round(2),Loops))

In [None]:
import tensorflow as tf
import keras

class TSPAgent:
  def __init__(self, n_stops):
    self.n_stops = n_stops
    self.model = self._build_model()
    
  # def _build_model(self):
  #   model = tf.keras.Sequential()
  #   # Add layers to your model
  #   model.add(tf.keras.layers.Dense(units=64, activation='relu', input_shape=(self.n_stops,), kernel_initializer='he_uniform'))
  #   model.add(tf.keras.layers.Dense(units=32, activation='relu', kernel_initializer='he_uniform'))
  #   model.add(tf.keras.layers.Dense(units=1, activation='linear', kernel_initializer='he_uniform'))
  #   return model

  

    x = tf.ones((1,3))
    model(x)
    model.summary()
    return model

    
  def act(self, state):
    # Convert the state from an integer to an array
    state = np.array([state])
    
    # Reshape the state array to have the correct shape
    # state = state.reshape(-1, self.n_stops)
    
    # Use the model to predict the action given the state
    action = self.model.predict(state)
    return action

    
  def train(self, state, action, reward, done):
    # Use the reward and next state to update the model
    self.model.fit(state, action, reward, done)


In [None]:
import numpy as np

# Set hyperparameters
learning_rate = 0.001
n_episodes = 1000

# Create the TSP environment and agent
env = TSPEnvironment(50)
agent = TSPAgent(n_stops=100)

# Set the optimizer for the agent
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Training loop
for episode in range(n_episodes):
  # Reset the environment at the beginning of each episode
  state = env.reset()
  done = False
  
  while not done:
    # Have the agent act on the current state
    #action = agent.act(state)
    
    # Take a step in the environment
    #next_state, reward, done, _ = env.step(action)
    
    # Train the agent
    agent.train(state, action, reward, done)
    
    # Update the current state
    #state = next_state


In [20]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent, DDPGAgent, ContinuousDQNAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.layers import Lambda

# Learning factors...
GeneratedStops = 10
learning_rate = 0.001

# create replay memory using deque
#memory = deque(maxlen=2000)

# Create gym environment
env = TSPEnvironment(GeneratedStops, False)


action_size = env.action_space.n
state_size = env.observation_space.shape


# The following function creates a neural network which is used as an 
# approximate Q function
# Input: state 
# Output: Q Value of each action
def build_model(state_size, action_size):
    model = tf.keras.Sequential(
        [
    #model.add(Input(shape=(3,)))
    
            keras.layers.Dense(8 , input_dim=4, activation='relu', kernel_initializer='he_uniform'),
            keras.layers.Dense(8 ,activation='relu', kernel_initializer='he_uniform'),
            keras.layers.Dense(action_size, activation='linear', kernel_initializer='he_uniform'),
        ]
    )

    #x = tf.ones((1,3))
    #model(x)
    model.summary()
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model
    #input_shape=(input_dim,)

# Create the TSP agent
def build_agent(model, action_size):
    # Use Epsilon-Greedy policy for exploration
    policy = BoltzmannQPolicy()
    # Create memory for storing transitions
    memory = SequentialMemory(limit=50000, window_length=1)
    # Create the DQN agent
    agent = DQNAgent(model, memory=memory, policy=policy, nb_actions=action_size, nb_steps_warmup=100, target_model_update=1e-2)
    
    agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return agent


# Create the TSP model
model = build_model(state_size, action_size)
# Create the TSP agent
agent = build_agent(model, action_size)
#agent = DeliveryQAgent(env.observation_space.n,env.action_space.n)
# Train the agent



#print(model.output)
print(env.observation_space.shape[0])

#agent.fit(env, nb_steps=5000, visualize=False, verbose=0)


TSP-Environment initialized with 10 random stops
Starting Point: [1]


ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 4), found shape=(1, 3)

In [None]:
scores=agent.test(env, nb_episodes=1000, visualize=False)
print(np.mean(scores.history['episode_reward']))

In [None]:
env = TSPEnvironment(GeneratedStops, False)


action_size = env.action_space.n
state_size = env.observation_space.shape

print(action_size)
print(state_size)