In [1]:
from elevatorSystemEnv import *
import numpy as np
from stable_baselines3 import A2C, PPO, DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

import gym

In [2]:
# Problem size handed to the environment constructor (elevator count, floor count).
n_elevators = 2
n_floors = 10
# ElevatorSystemEnv is provided by the wildcard import from elevatorSystemEnv.
env = ElevatorSystemEnv(n_elevators,n_floors)
# Handle on the action space, used below to draw random actions with .sample().
plays = env.action_space

In [3]:
# Random-policy baseline: play four episodes with uniformly sampled actions
# and report the accumulated reward of each one.
for episode in range(1, 5):
    obs = env.reset()
    score, done = 0, False
    while not done:
        # Old gym step API: (observation, reward, done flag, info dict).
        step_result = env.step(plays.sample())
        obs, reward, done, info = step_result
        score += reward

    print('Episode:{} Score: {}'.format(episode, score))

Episode:1 Score: 14
Episode:2 Score: 40
Episode:3 Score: 12
Episode:4 Score: 12


In [4]:
# Length of the first axis of the observation space.
states = env.observation_space.shape[0]
# Size of the action space (`.n` — presumably a gym Discrete space; confirm in elevatorSystemEnv).
actions = env.action_space.n

In [5]:
# Show the observation length read from env.observation_space.shape[0].
states

24

In [6]:
# Show the action count read from env.action_space.n.
actions

10

In [7]:
def train(env,model,name,timesteps,eval_env=None,eval_freq=10000):
    """Train ``model`` with periodic evaluation, checkpointing the best policy.

    Parameters
    ----------
    env
        Environment the model was created with. It is also used for the
        periodic evaluation when ``eval_env`` is not supplied.
    model
        Agent exposing ``learn(total_timesteps=..., progress_bar=..., callback=...)``.
    name : str
        Tag used to build the output directories, so that several models can
        be trained in one session without sharing files:
        ``./logs/<name>_best_model`` for the best checkpoint and
        ``./logs/<name>_results`` for the evaluation log.
    timesteps
        Training budget; cast to ``int`` so values such as ``5e5`` are accepted.
    eval_env : optional
        Separate environment instance to evaluate on. Defaults to ``env``.
        NOTE(review): evaluating on the very instance that is being trained on
        means evaluation episodes reset it mid-training; passing a fresh
        instance here is presumably preferable -- confirm against the
        EvalCallback documentation.
    eval_freq : int, optional
        Number of callback steps between two evaluations (default 10000).

    Returns
    -------
    None
        The model is trained in place.
    """
    if eval_env is None:
        eval_env = env

    eval_callback = EvalCallback(
        Monitor(eval_env),
        best_model_save_path='./logs/{}_best_model'.format(name),
        # Per-model directory: a shared './logs/results' made every run write
        # its evaluation log to the same place as the previous model's.
        log_path='./logs/{}_results'.format(name),
        eval_freq=eval_freq,
        verbose=0,
    )
    model.learn(total_timesteps=int(timesteps),progress_bar=True,callback=eval_callback)

In [8]:
def test(model):
    res = np.empty(10)
    for episode in range(1,11):
        obs = env.reset()
        done = False
        score = 0    
        while not done:        
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            score += reward
        res[episode-1] = score

        print('Episode:{} Score: {}'.format(episode,score))
    print('Mean:{}'.format(np.mean(res)))

In [9]:
# A2C agent with the built-in 'MlpPolicy' on the elevator environment; verbose=0 is passed to suppress log output.
model = A2C('MlpPolicy', env, verbose=0)

In [10]:
# Train A2C for 5e5 timesteps; the evaluation callback saves its best checkpoint under ./logs/a2c_best_model.
train(env,model,'a2c',5e5)

Output()

In [11]:
# Evaluate the best A2C checkpoint saved during training. The path uses the same
# './logs/...' form as train() and the PPO/DQN cells (it names the same file).
test(A2C.load('./logs/a2c_best_model/best_model.zip'))

Episode:1 Score: 603
Episode:2 Score: 713
Episode:3 Score: 481
Episode:4 Score: 660
Episode:5 Score: 667
Episode:6 Score: 506
Episode:7 Score: 542
Episode:8 Score: 571
Episode:9 Score: 554
Episode:10 Score: 662
Mean:595.9


In [12]:
# PPO agent with the same policy type and environment as the A2C run, for comparison.
model2 = PPO('MlpPolicy', env, verbose=0)


In [13]:
# Train PPO for 5e5 timesteps; the evaluation callback saves its best checkpoint under ./logs/ppo_best_model.
train(env,model2,'ppo',5e5)

Output()

In [14]:
# Evaluate the best PPO checkpoint saved during training (10 episodes, printed below).
test(PPO.load('./logs/ppo_best_model/best_model.zip'))

Episode:1 Score: 904
Episode:2 Score: 988
Episode:3 Score: 811
Episode:4 Score: 951
Episode:5 Score: 970
Episode:6 Score: 941
Episode:7 Score: 961
Episode:8 Score: 913
Episode:9 Score: 971
Episode:10 Score: 915
Mean:932.5


In [15]:
# DQN agent with the same policy type and environment as the A2C and PPO runs.
model3 = DQN('MlpPolicy', env, verbose=0)


In [16]:
# Train DQN for 5e5 timesteps; the evaluation callback saves its best checkpoint under ./logs/dqn_best_model.
train(env,model3,'dqn',5e5)

Output()

In [17]:
# Evaluate the best DQN checkpoint saved during training.
# NOTE(review): the recorded mean (28.2) lies within the range of the random-policy
# scores printed earlier (12-40), so DQN with default hyperparameters apparently did
# not learn here -- TODO confirm and tune before drawing conclusions.
test(DQN.load('./logs/dqn_best_model/best_model.zip'))

Episode:1 Score: 17
Episode:2 Score: 21
Episode:3 Score: 40
Episode:4 Score: 12
Episode:5 Score: 35
Episode:6 Score: 29
Episode:7 Score: 13
Episode:8 Score: 58
Episode:9 Score: 34
Episode:10 Score: 23
Mean:28.2
