In [None]:
from gym import Env
from gym.spaces import Discrete, box
import numpy as np
import random

In [107]:
## Create the TSP Environment
import numpy as np
import gym
from gym import spaces
import pygame

from scipy.spatial.distance import cdist
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# deprecated the old names, so prefer the new name and fall back to the old one
# on releases that do not ship it yet.
plt.style.use(
    "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
)

class TSPEnvironment(gym.Env):
    """Gym environment for a randomly generated Travelling Salesman Problem.

    ``n_stops`` points are drawn at random (coordinates scaled by 100 and
    rounded to two decimals) and the pairwise Euclidean distances between them
    are precomputed once at construction time.

    * Observation: the index of the stop the agent is currently at, returned
      as a plain integer (see ``_get_state``).
    * Action: the index of the stop to travel to next.

    Reward scheme, as implemented in ``step``:

    * valid move (destination differs from the current stop and has not been
      visited yet): minus the distance travelled;
    * invalid move: ``INVALID_MOVE_REWARD``; the agent stays where it is;
    * step counter reaches ``MAX_STEPS_FACTOR * n_stops``: the reward is
      replaced by ``TIMEOUT_REWARD`` and the episode ends;
    * every stop visited: ``EFFICIENCY_BONUS + COMPLETION_BONUS`` is added and
      the episode ends.
    """

    INVALID_MOVE_REWARD = -300
    TIMEOUT_REWARD = -600
    EFFICIENCY_BONUS = 500
    COMPLETION_BONUS = 200
    MAX_STEPS_FACTOR = 5

    def __init__(self, n_stops=100, verbose=True):
        """Create the environment and generate its random stops.

        Args:
            n_stops: Number of stops to generate; must be at least 1.
            verbose: When True (the default, matching the historical
                behaviour) progress information is printed on every step.

        Raises:
            ValueError: If ``n_stops`` is smaller than 1.
        """
        if n_stops < 1:
            raise ValueError(f"n_stops must be at least 1, got {n_stops}")

        self.verbose = verbose
        self._log(f"TSP-Environment initialized with {n_stops} random stops")

        # Number of stops
        self.n_stops = n_stops
        # Typed placeholders; both are filled in by _generate_stops() below.
        self.xy = np.empty((0, 2))
        self.distances = np.empty((0, 0))
        # Indices of the stops visited so far, in visiting order.
        self._visitedStops = []

        self.action_space = spaces.Discrete(n_stops)
        self.observation_space = spaces.Box(low=0, high=n_stops, shape=(1,))
        self.episode_length = n_stops

        self.step_count = 0

        # Generate stops and pick the initial starting point
        self._generate_stops()

    def _log(self, message):
        """Print ``message`` only when the environment is in verbose mode."""
        if self.verbose:
            print(message)

    def _generate_stops(self):
        """Draw random coordinates, precompute distances and pick a start stop."""
        self.xy = (np.random.rand(self.n_stops, 2) * 100).round(2)
        self.x = self.xy[:, 0]
        self.y = self.xy[:, 1]

        self.distances = cdist(self.xy, self.xy, 'euclidean').round(2)

        # Pick a random starting point
        self._visitedStops.append(np.random.randint(0, self.n_stops))
        self._log(f'Starting Point: {self._visitedStops}')

    def _get_state(self):
        """Return the agent's current position (the most recently visited stop).

        Falls back to 0 when no stop has been visited yet.
        """
        if len(self._visitedStops) > 0:
            return self._visitedStops[-1]
        return 0

    def render(self):
        """Rendering is not implemented; this is a no-op."""
        pass

    def reset(self):
        """Start a new episode from a fresh random stop and return that stop."""
        self._visitedStops.clear()

        first_stop = np.random.randint(self.n_stops)
        self._visitedStops.append(first_stop)
        self.step_count = 1
        return self._get_state()

    def step(self, destination):
        """Try to move the agent to ``destination``.

        Args:
            destination: Index of the stop to travel to.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the stop
            the agent is at after the move and ``info`` is an empty dict.
        """
        done = False
        self.step_count += 1
        current = self._get_state()

        # Logical `and` (not bitwise `&`): the move is valid only if it goes
        # somewhere new.
        if current != destination and destination not in self._visitedStops:
            # A valid move costs the distance travelled.
            reward = -self.distances[current, destination]
            self._visitedStops.append(destination)
        else:
            reward = self.INVALID_MOVE_REWARD

        self._log(f'State in step: {self._get_state()}')
        self._log(f'Destination in step: {destination}')
        self._log(f'Length visited stops: {len(self._visitedStops)}')
        self._log(f'Visited Stops in step: {self._visitedStops}')
        self._log(f'Reward in step: {reward}')
        self._log(f'Stepcounter: {self.step_count}')

        # Abort episodes that take far too long.
        if self.step_count >= self.n_stops * self.MAX_STEPS_FACTOR:
            done = True
            reward = self.TIMEOUT_REWARD
            self._log('Too much steps')

        # Tour complete: every stop has been visited.
        if len(self._visitedStops) == self.n_stops:
            # NOTE(review): the number of visited stops can never exceed twice
            # the step counter, so this condition looks like it is always true
            # and the bonus is always granted. Possibly the operands were meant
            # to be swapped; confirm the intended reward before changing it.
            if len(self._visitedStops) <= self.step_count * 2:
                reward += self.EFFICIENCY_BONUS
            reward += self.COMPLETION_BONUS
            done = True
            self._log(f'Done = True')
            self._log(f'Length visited stops: {len(self._visitedStops)}')
            self._log(f'Visited Stops in step: {self._visitedStops}')

        info = {}
        return self._get_state(), reward, done, info


   



  plt.style.use("seaborn-dark")


In [None]:
# Smoke test: build a small environment and take one step towards stop 3.
# Note that `episodes` is used here as the number of stops.
episodes = 10
env = TSPEnvironment(n_stops=episodes)

env.step(3)



In [71]:
# Random-policy baseline: sample actions uniformly until each episode ends.
n_stops = 10   # size of the TSP instance
episodes = 10  # number of rollouts to run

env = TSPEnvironment(n_stops)

for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0
    steps = 0

    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    # Built-in round() also works when the score is still a plain int (an
    # episode made up only of integer penalty/bonus rewards), where
    # `score.round(2)` would raise AttributeError.
    print('Episode: {} Score: {} Steps: {}'.format(episode, round(score, 2), steps))

TSP-Environment initialized with 10 random stops
Starting Point: [8]
State in step: 8
Destination in step: 8
Length visited stops: 2
Visited Stops in step: [1, 8]
Reward in step: -29.08
Stepcounter: 2
State in step: 9
Destination in step: 9
Length visited stops: 3
Visited Stops in step: [1, 8, 9]
Reward in step: -30.29
Stepcounter: 3
State in step: 2
Destination in step: 2
Length visited stops: 4
Visited Stops in step: [1, 8, 9, 2]
Reward in step: -18.87
Stepcounter: 4
State in step: 6
Destination in step: 6
Length visited stops: 5
Visited Stops in step: [1, 8, 9, 2, 6]
Reward in step: -29.53
Stepcounter: 5
State in step: 7
Destination in step: 7
Length visited stops: 6
Visited Stops in step: [1, 8, 9, 2, 6, 7]
Reward in step: -56.26
Stepcounter: 6
State in step: 7
Destination in step: 9
Length visited stops: 6
Visited Stops in step: [1, 8, 9, 2, 6, 7]
Reward in step: -300
Stepcounter: 7
State in step: 7
Destination in step: 7
Length visited stops: 6
Visited Stops in step: [1, 8, 9, 2,

In [108]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent, DDPGAgent, ContinuousDQNAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.layers import Lambda


# Hyperparameters.
# NOTE: despite its name, EPISODES is passed to TSPEnvironment as the number
# of stops of the TSP instance.
EPISODES = 10
learning_rate = 0.001

# Create gym environment
env = TSPEnvironment(n_stops=EPISODES)

# One Q-value per stop; the observation is the single index of the current stop.
action_size = env.action_space.n
state_size = 1

# Define the optimizer and loss function (single source of truth for the
# learning rate instead of repeating the literal).
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.mean_squared_error



def build_model(state_size, action_size):
    """Build and compile the network used as an approximate Q function.

    The network maps a state to one Q-value per action: two hidden ReLU layers
    of 512 units followed by a linear output layer. It prints the model summary
    and compiles with MSE loss and an Adam optimizer that reads the
    module-level ``learning_rate``.

    Args:
        state_size: Dimension of the input (state) vector.
        action_size: Number of actions, i.e. the number of output units.

    Returns:
        The compiled Keras model.
    """
    q_network = tf.keras.Sequential([
        Dense(512, input_dim=state_size, activation='relu'),
        Dense(512, activation='relu'),
        Dense(action_size, activation='linear'),
    ])
    q_network.summary()
    q_network.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return q_network

def build_agent(model, action_size):
    """Create the DQN agent that wraps ``model``.

    Args:
        model: Keras model producing one Q-value per action.
        action_size: Number of discrete actions.

    Returns:
        A ``DQNAgent`` (not yet compiled) using a Boltzmann exploration policy
        and a sequential replay memory.
    """
    # Replay buffer for the transitions the agent collects
    replay_memory = SequentialMemory(limit=50000, window_length=1)
    # Boltzmann (softmax over Q-values) policy for exploration
    exploration_policy = BoltzmannQPolicy()

    return DQNAgent(
        model,
        nb_actions=action_size,
        memory=replay_memory,
        policy=exploration_policy,
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )



# Create the TSP model
model = build_model(state_size, action_size)
# Create the TSP agent
agent = build_agent(model, action_size)
# Train the agent. `learning_rate=` replaces the deprecated `lr=` keyword and
# matches the optimizer construction in build_model.
agent.compile(Adam(learning_rate=learning_rate), metrics=['mae'])
agent.fit(env, nb_steps=5000, visualize=False, verbose=1)


TSP-Environment initialized with 10 random stops
Starting Point: [5]
Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_114 (Dense)           (None, 512)               1024      
                                                                 
 dense_115 (Dense)           (None, 512)               262656    
                                                                 
 dense_116 (Dense)           (None, 10)                5130      
                                                                 
Total params: 268,810
Trainable params: 268,810
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


Training for 5000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,


State in step: 6
Destination in step: 6
Length visited stops: 2
Visited Stops in step: [0, 6]
Reward in step: -53.95
Stepcounter: 2
    1/10000 [..............................] - ETA: 2:42:53 - reward: -53.9500State in step: 8
Destination in step: 8
Length visited stops: 3
Visited Stops in step: [0, 6, 8]
Reward in step: -43.94
Stepcounter: 3
State in step: 5
Destination in step: 5
Length visited stops: 4
Visited Stops in step: [0, 6, 8, 5]
Reward in step: -87.63
Stepcounter: 4
State in step: 1
Destination in step: 1
Length visited stops: 5
Visited Stops in step: [0, 6, 8, 5, 1]
Reward in step: -75.34
Stepcounter: 5
State in step: 1
Destination in step: 6
Length visited stops: 5
Visited Stops in step: [0, 6, 8, 5, 1]
Reward in step: -300
Stepcounter: 6
State in step: 1
Destination in step: 0
Length visited stops: 5
Visited Stops in step: [0, 6, 8, 5, 1]
Reward in step: -300
Stepcounter: 7
State in step: 2
Destination in step: 2
Length visited stops: 6
Visited Stops in step: [0, 6, 8, 5

<keras.callbacks.History at 0x1c80390da50>

In [109]:
# Evaluate the trained agent over 100 episodes and report the mean episode reward.
scores = agent.test(env, nb_episodes=100, visualize=False)
print(np.asarray(scores.history['episode_reward']).mean())

Testing for 100 episodes ...
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -78.72
Stepcounter: 2
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 3
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 4
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 5
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 6
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 7
State in step: 7
Destination in step: 7
Length visited stops: 2
Visited Stops in step: [6, 7]
Reward in step: -300
Stepcounter: 8
State in step: 7
Destination in step: 7
Length visited stop