In [125]:
## Create the TSP Environment
import numpy as np
import gym
from gym import spaces
import pygame

from scipy.spatial.distance import cdist
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
import matplotlib.pyplot as plt
plt.style.use("seaborn-dark")

class TSPEnvironment(gym.Env):
    def __init__ (self, n_stops = 100, version=True):
        print(f"TSP-Environment initialized with {n_stops} random stops")

        #True (V1) = Discrete Space / False = Array space
        self.version = version

        # Initialization
        #Number of stops
        self.n_stops = n_stops
        #Coordinates of stops
        self.xy = []
        self._visitedStops = []
        self._notVisitedStops = list(range(0,self.n_stops))
        
        #if(self.version):
        self.action_space = spaces.Discrete(n_stops)
        #else:
            #self.action_space = spaces.Box(np.array(range(0,self.n_stops)))
        
        if(self.version):
            self.observation_space = spaces.Box(low= 0, high = self.n_stops)
        else:
            self.observation_space = spaces.Box(low=np.zeros(self.n_stops), high=np.ones(self.n_stops))
        #self.action_space = self._notVisitedStops
        self.episode_length = 0
        self.step_count = 0
        self.distances = np.array

        self.array_visitedStops = np.zeros(n_stops)
        print(f'Shape Array:{self.array_visitedStops.shape}')

        

        #set starting point (state)
        
        #Generate stops
        self._generate_stops()
        #self._generate_q_values()


    def _generate_stops(self):
        self.xy = (np.random.rand(self.n_stops,2)*100).round(2)
        self.x=self.xy[:,0]
        self.y=self.xy[:,1]

        #print(f'genrated stops xy: {self.xy}')
        self.distances = cdist(self.xy,self.xy,'euclidean').round(0)
        
        #pick random StartPoint
        self._visitedStops.append(np.random.randint(0,self.n_stops))
        print(f'Starting Point: {self._visitedStops}')


    #return stops[-1]
    #Gibt die aktuelle Position des Agenten zurück
    def _get_state(self):
        if( len(self._visitedStops)>0):
            return self._visitedStops[-1]
        else:
            return 0 


    def render(self):
        pass

    #Resets StartingPoint
    def reset(self):
        self._visitedStops.clear()
        self.array_visitedStops = np.zeros(self.n_stops)
        #self._notVisitedStops = list(range(0,self.n_stops))

        first_stop = np.random.randint(self.n_stops)
        #self._notVisitedStops.remove(first_stop)
        self._visitedStops.append(first_stop)
        self.array_visitedStops[first_stop] = True
        self.step_count = 1

        if(self.version):
            return self._get_state()
        else:
            return self.array_visitedStops

    def step(self,destination):
        done = False
        self.step_count +=1
        reward = -self.n_stops*1000

        self.episode_length += 1
        if(self.episode_length < 1000):
            if(np.random.rand(1,1) < 0.1):
                destination = np.random.randint(0,self.n_stops)

        #Validize Step
        if(self._get_state() != destination & destination not in self._visitedStops):
            #Get reward for such a move
            reward = -self.distances[self._get_state(), destination]
            
            # Append state (new position)
            self._visitedStops.append(destination)
            self.array_visitedStops[destination] = True

        print(f'State in step: {self._get_state()}')
        print(f'Destination in step: {destination}')
        print(f'Length visited stops: {len(self._visitedStops)}')
        print(f'Visited Stops in step: {self._visitedStops}')
        print(f'Reward in step: {reward}')
        print(f'Stepcounter: {self.step_count}')
        
        #exploration = np.random.random_sample()
        #print(f'Exploration: {exploration}')
        #if(exploration >= 0.05):
            #destination = np.random.randint(self.action_space.n)
            #print(f'Random Destination: {destination}')

        if(self.step_count >= self.n_stops*5):
            done = True
            reward = -2000000
            print('Too much steps')

        if(len(self._visitedStops) == self.n_stops):
            if(len(self._visitedStops) <= self.step_count*2):
                reward += 500
            reward += 200    
            done = True
            print(f'Done = True')
            print(f'Length visited stops: {len(self._visitedStops)}')
            print(f'Visited Stops in step: {self._visitedStops}')
            
        info = {}

        #print(f'Agent position: {self._get_state()}')

        if(self.version):
            return self._get_state(), reward, done, {}
        else:
            return self.array_visitedStops, reward, done, {}


   



  plt.style.use("seaborn-dark")


In [126]:
episodes = 10 

env = TSPEnvironment(episodes, True)
print(env.array_visitedStops)
print(f'Step: {env.step(2)}')
print(f'Step: {env.step(3)}')
print(env.array_visitedStops.shape)
print(f'Reset: {env.reset()}')




TSP-Environment initialized with 10 random stops
Shape Array:(10,)
Starting Point: [1]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
State in step: 2
Destination in step: 2
Length visited stops: 2
Visited Stops in step: [1, 2]
Reward in step: -10.0
Stepcounter: 1
Step: (2, -10.0, False, {})
State in step: 3
Destination in step: 3
Length visited stops: 3
Visited Stops in step: [1, 2, 3]
Reward in step: -81.0
Stepcounter: 2
Step: (3, -81.0, False, {})
(10,)
Reset: 3


In [None]:
episodes = 10 

env = TSPEnvironment(episodes)


for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    Loops = 0
    

    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
        Loops += 1
    print('Epsiode: {} Score: {} Episodes: {}'.format(episode,score.round(2),Loops))

In [128]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent, DDPGAgent, ContinuousDQNAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.layers import Lambda

# Learning factors...
GeneratedStops = 10
learning_rate = 0.001

# create replay memory using deque
#memory = deque(maxlen=2000)

# Create gym environment
env = TSPEnvironment(GeneratedStops)


action_size = env.action_space.n
state_size = env.observation_space.shape[0]


# The following function creates a neural network which is used as an 
# approximate Q function
# Input: state 
# Output: Q Value of each action
def build_model(state_size, action_size):
    model = tf.keras.Sequential()
    #model.add(Input(shape=(3,)))
    model.add(Dense(512 ,input_dim=state_size, activation='relu'))
    model.add(Dense(512 ,activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.summary()
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

# Create the TSP agent
def build_agent(model, action_size):
    # Use Epsilon-Greedy policy for exploration
    policy = BoltzmannQPolicy()
    # Create memory for storing transitions
    memory = SequentialMemory(limit=50000, window_length=1)
    # Create the DQN agent
    agent = DQNAgent(model, memory=memory, policy=policy, nb_actions=action_size, nb_steps_warmup=100, target_model_update=1e-2)
    
    return agent



# Create the TSP model
model = build_model(state_size, action_size)
model.output
# Create the TSP agent
agent = build_agent(model, action_size)
# Train the agent
agent.compile(Adam(lr=1e-3), metrics=['mae'])

agent.fit(env, nb_steps=5000, visualize=False, verbose=0)


TSP-Environment initialized with 10 random stops
Shape Array:(10,)
Starting Point: [4]
Model: "sequential_42"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_114 (Dense)           (None, 512)               1024      
                                                                 
 dense_115 (Dense)           (None, 512)               262656    
                                                                 
 dense_116 (Dense)           (None, 10)                5130      
                                                                 
Total params: 268,810
Trainable params: 268,810
Non-trainable params: 0
_________________________________________________________________


  updates=self.state_updates,


State in step: 1
Destination in step: 1
Length visited stops: 2
Visited Stops in step: [4, 1]
Reward in step: -77.0
Stepcounter: 2
State in step: 0
Destination in step: 0
Length visited stops: 3
Visited Stops in step: [4, 1, 0]
Reward in step: -80.0
Stepcounter: 3
State in step: 2
Destination in step: 2
Length visited stops: 4
Visited Stops in step: [4, 1, 0, 2]
Reward in step: -25.0
Stepcounter: 4
State in step: 5
Destination in step: 5
Length visited stops: 5
Visited Stops in step: [4, 1, 0, 2, 5]
Reward in step: -35.0
Stepcounter: 5
State in step: 3
Destination in step: 3
Length visited stops: 6
Visited Stops in step: [4, 1, 0, 2, 5, 3]
Reward in step: -27.0
Stepcounter: 6
State in step: 3
Destination in step: 5
Length visited stops: 6
Visited Stops in step: [4, 1, 0, 2, 5, 3]
Reward in step: -10000
Stepcounter: 7
State in step: 3
Destination in step: 2
Length visited stops: 6
Visited Stops in step: [4, 1, 0, 2, 5, 3]
Reward in step: -10000
Stepcounter: 8
State in step: 3
Destinatio

<keras.callbacks.History at 0x271e9900370>

In [None]:
scores=agent.test(env, nb_episodes=1000, visualize=False)
print(np.mean(scores.history['episode_reward']))

In [111]:
xy = (np.random.randint(0,100)*100)
xy


2700