In [36]:
#!/usr/bin/env python
# -*- coding: utf-8 -*- 


"""--------------------------------------------------------------------
REINFORCEMENT LEARNING

Started on the 25/08/2017


theo.alves.da.costa@gmail.com
https://github.com/theolvs
------------------------------------------------------------------------
"""

import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
import sys
import random
import time
import random
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from rl.memory import Memory

class Agent(object):
    def __init__(self):
        pass


    def expand_state_vector(self,state):
        if len(state.shape) == 1 or len(state.shape)==3:
            return np.expand_dims(state,axis = 0)
        else:
            return state



    def remember(self,*args):
        self.memory.save(args)

class QAgent(Agent):
    def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8):
        self.states_size = states_size
        self.actions_size = actions_size
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.lr = lr
        self.Q = self.build_model(states_size,actions_size)


    def build_model(self,states_size,actions_size):
        Q = np.zeros([states_size,actions_size])
        return Q


    def train(self,s,a,r,s_next):
        self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*np.max(self.Q[s_next,a]) - self.Q[s,a])

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


    def act(self,s):

        q = self.Q[s,:]

        if np.random.rand() > self.epsilon:
            a = np.argmax(q)
        else:
            a = np.random.randint(self.actions_size)

        return a

class DQNAgentTSP(Agent):
    def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,low = 0,high = 1,max_memory = 2000,observation_type = "discrete"):
        assert observation_type in ["discrete","continuous"]
        self.states_size = states_size
        self.actions_size = actions_size
        self.memory = Memory(max_memory = max_memory)
        self.epsilon = epsilon
        self.low = low
        self.high = high
        self.observation_type = observation_type
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.lr = lr
        self.model = self.build_model(states_size,actions_size)

    def build_model(self,states_size,actions_size):
        model = Sequential()
        model.add(Dense(24,input_dim = states_size,activation = "relu"))
        model.add(Dense(24,activation = "relu"))
        model.add(Dense(actions_size,activation = "linear"))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.lr))
        return model

    def train(self,batch_size = 32):
        if len(self.memory.cache) > batch_size:
            batch = random.sample(self.memory.cache, batch_size)
        else:
            batch = self.memory.cache

        for state,action,reward,next_state,done in batch:
            state = self.expand_state_vector(state)
            next_state = self.expand_state_vector(next_state)


            targets = self.model.predict(state)

            if not done:
                target = reward + self.gamma * np.max(self.model.predict(next_state))
            else:
                target = reward

            targets[0][action] = target

            self.model.fit(state,targets,epochs = 1,verbose = 0)


        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self,state):
        state = self.expand_state_vector(state)


        if np.random.rand() > self.epsilon:
            q = self.model.predict(state)

            if self.observation_type == "discrete":
                a = np.argmax(q[0])
            elif self.observation_type == "continuous":
                a = np.squeeze(np.clip(q,self.low,self.high))

        else:
            if self.observation_type == "discrete":
                a = np.random.randint(self.actions_size)
            elif self.observation_type == "continuous":
                a = np.random.uniform(self.low,self.high,self.actions_size)
        return a 

class DeliveryQAgent(QAgent):

    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.reset_memory()

    def act(self,s):

        # Get Q Vector
        q = np.copy(self.Q[s,:])

        # Avoid already visited states
        q[self.states_memory] = -np.inf

        if np.random.rand() > self.epsilon:
            a = np.argmax(q)
        else:
            a = np.random.choice([x for x in range(self.actions_size) if x not in self.states_memory])

        return a


    def remember_state(self,s):
        self.states_memory.append(s)

    def reset_memory(self):
        self.states_memory = []



def run_n_episodes(env,agent,name="training.gif",n_episodes=1000,render_each=10,fps=10):

    # Store the rewards
    rewards = []
    imgs = []

    # Experience replay
    for i in tqdm_notebook(range(n_episodes)):

        # Run the episode
        env,agent,episode_reward = run_episode(env,agent,verbose = 0)
        rewards.append(episode_reward)
        
        if i % render_each == 0:
            img = env.render(return_img = True)
            imgs.append(img)

    # Show rewards
    plt.figure(figsize = (15,3))
    plt.title("Rewards over training")
    plt.plot(rewards)
    plt.show()

    # Save imgs as gif
    imageio.mimsave(name,imgs,fps = fps)

    return env,agent


class DeliveryQAgent(QAgent):

    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.reset_memory()

    def act(self,s):

        # Get Q Vector
        q = np.copy(self.Q[s,:])

        # Avoid already visited states
        q[self.states_memory] = -np.inf

        if np.random.rand() > self.epsilon:
            a = np.argmax(q)
        else:
            a = np.random.choice([x for x in range(self.actions_size) if x not in self.states_memory])

        return a


    def remember_state(self,s):
        self.states_memory.append(s)

    def reset_memory(self):
        self.states_memory = []


In [25]:
## Create the TSP Environment
import numpy as np
import gym
from gym import spaces
import pygame

from scipy.spatial.distance import cdist
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
import matplotlib.pyplot as plt
plt.style.use("seaborn-dark")

class TSPEnvironment(gym.Env):
    def __init__ (self, n_stops = 100, version=True):
        print(f"TSP-Environment initialized with {n_stops} random stops")

        #True (V1) = Discrete Space / False = Array space
        self.version = version

        # Initialization
        #Number of stops
        self.n_stops = n_stops
        #Coordinates of stops
        self.xy = []
        self._visitedStops = []
        self._notVisitedStops = list(range(0,self.n_stops))
        
        #if(self.version):
        self.action_space = spaces.Discrete(n_stops)
        #else:
            #self.action_space = spaces.Box(np.array(range(0,self.n_stops)))
        
        if(self.version):
            self.observation_space = spaces.Box(low= 0, high = self.n_stops)
        else:
            self.observation_space = spaces.Box(low=np.zeros(self.n_stops), high=np.ones(self.n_stops))
        #self.action_space = self._notVisitedStops
        self.episode_length = 0
        self.step_count = 0
        self.distances = np.array

        self.array_visitedStops = np.zeros(n_stops)
        print(f'Shape Array:{self.array_visitedStops.shape}')

        

        #set starting point (state)
        
        #Generate stops
        self._generate_stops()
        #self._generate_q_values()


    def _generate_stops(self):
        self.xy = (np.random.rand(self.n_stops,2)*100).round(2)
        self.x=self.xy[:,0]
        self.y=self.xy[:,1]

        #print(f'genrated stops xy: {self.xy}')
        self.distances = cdist(self.xy,self.xy,'euclidean').round(0)
        
        #pick random StartPoint
        self._visitedStops.append(np.random.randint(0,self.n_stops))
        print(f'Starting Point: {self._visitedStops}')


    #return stops[-1]
    #Gibt die aktuelle Position des Agenten zurück
    def _get_state(self):
        if( len(self._visitedStops)>0):
            return self._visitedStops[-1]
        else:
            return 0 


    def render(self):
        pass

    #Resets StartingPoint
    def reset(self):
        self._visitedStops.clear()
        self.array_visitedStops = np.zeros(self.n_stops)
        #self._notVisitedStops = list(range(0,self.n_stops))

        first_stop = np.random.randint(self.n_stops)
        #self._notVisitedStops.remove(first_stop)
        self._visitedStops.append(first_stop)
        self.array_visitedStops[first_stop] = True
        self.step_count = 1

        if(self.version):
            return self._get_state()
        else:
            return self.array_visitedStops

    def step(self,destination):
        done = False
        self.step_count +=1
        reward = -self.n_stops*1000

        self.episode_length += 1
        if(self.episode_length < 1000):
            if(np.random.rand(1,1) < 0.1):
                destination = np.random.randint(0,self.n_stops)

        #Validize Step
        if(self._get_state() != destination & destination not in self._visitedStops):
            #Get reward for such a move
            reward = -self.distances[self._get_state(), destination]
            
            # Append state (new position)
            self._visitedStops.append(destination)
            self.array_visitedStops[destination] = True

        print(f'State in step: {self._get_state()}')
        print(f'Destination in step: {destination}')
        print(f'Length visited stops: {len(self._visitedStops)}')
        print(f'Visited Stops in step: {self._visitedStops}')
        print(f'Reward in step: {reward}')
        print(f'Stepcounter: {self.step_count}')
        
        #exploration = np.random.random_sample()
        #print(f'Exploration: {exploration}')
        #if(exploration >= 0.05):
            #destination = np.random.randint(self.action_space.n)
            #print(f'Random Destination: {destination}')

        if(self.step_count >= self.n_stops*5):
            done = True
            reward = -2000000
            print('Too much steps')

        if(len(self._visitedStops) == self.n_stops):
            if(len(self._visitedStops) <= self.step_count*2):
                reward += 500
            reward += 200    
            done = True
            print(f'Done = True')
            print(f'Length visited stops: {len(self._visitedStops)}')
            print(f'Visited Stops in step: {self._visitedStops}')
            
        info = {}

        #print(f'Agent position: {self._get_state()}')

        if(self.version):
            return self._get_state(), reward, done, {}
        else:
            return self.array_visitedStops, reward, done, {}


   



  plt.style.use("seaborn-dark")


In [26]:
episodes = 10 

env = TSPEnvironment(episodes, False)
print(env.array_visitedStops)
print(f'Step: {env.step(2)}')
print(f'Step: {env.step(3)}')
print(env.array_visitedStops.shape)
print(f'Reset: {env.reset()}')




TSP-Environment initialized with 10 random stops
Shape Array:(10,)
Starting Point: [2]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
State in step: 2
Destination in step: 2
Length visited stops: 1
Visited Stops in step: [2]
Reward in step: -10000
Stepcounter: 1
Step: (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), -10000, False, {})
State in step: 5
Destination in step: 5
Length visited stops: 2
Visited Stops in step: [2, 5]
Reward in step: -72.0
Stepcounter: 2
Step: (array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), -72.0, False, {})
(10,)
Reset: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [None]:
episodes = 10 

env = TSPEnvironment(episodes)


for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    Loops = 0
    

    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
        Loops += 1
    print('Epsiode: {} Score: {} Episodes: {}'.format(episode,score.round(2),Loops))

In [39]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent, DDPGAgent, ContinuousDQNAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.layers import Lambda

# Learning factors...
GeneratedStops = 10
learning_rate = 0.001

# create replay memory using deque
#memory = deque(maxlen=2000)

# Create gym environment
env = TSPEnvironment(GeneratedStops, False)


action_size = env.action_space.n
state_size = env.observation_space.shape[0]


# The following function creates a neural network which is used as an 
# approximate Q function
# Input: state 
# Output: Q Value of each action
def build_model(self, state_size, action_size):
    model = tf.keras.Sequential()
    #model.add(Input(shape=(3,)))
    model.add(Dense(512 ,input_shape=(state_size,), activation='relu'))
    # model.add(Dense(512 ,activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.summary()
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model


    #input_shape=(input_dim,)

# Create the TSP agent
def build_agent(model, action_size):
    # Use Epsilon-Greedy policy for exploration
    policy = BoltzmannQPolicy()
    # Create memory for storing transitions
    memory = SequentialMemory(limit=50000, window_length=1)
    # Create the DQN agent
    #agent = DQNAgentTSP(model, memory=memory, policy=policy, nb_actions=action_size, nb_steps_warmup=100, target_model_update=1e-2)
    
    #agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return agent


# Create the TSP model
model = build_model(True,state_size, action_size)
model.output
# Create the TSP agent
#agent = build_agent(model, action_size)
agent = DeliveryQAgent(env.observation_space,env.action_space)
# Train the agent



print(model.output)

#agent.fit(env, nb_steps=5000, visualize=False, verbose=0)


TSP-Environment initialized with 10 random stops
Shape Array:(10,)
Starting Point: [4]
Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 512)               5632      
                                                                 
 dense_43 (Dense)            (None, 512)               262656    
                                                                 
 dense_44 (Dense)            (None, 10)                5130      
                                                                 
Total params: 273,418
Trainable params: 273,418
Non-trainable params: 0
_________________________________________________________________


TypeError: 'Box' object cannot be interpreted as an integer

In [None]:
scores=agent.test(env, nb_episodes=1000, visualize=False)
print(np.mean(scores.history['episode_reward']))

In [None]:
env = TSPEnvironment(GeneratedStops, False)


action_size = env.action_space.n
state_size = env.observation_space.shape

print(action_size)
print(state_size)