main.py
import getopt
import sys
import pickle
import gymnasium as gym
from agents.dqn import DQN
from agents.a2c import A2C
from baselines.a2c_baseline import A2CBaseline
from baselines.dqn_baseline import DQNBaseline
from helpers.evaluation import prediction
from helpers.plotting import Plotting
from tensorflow.keras import models
from stable_baselines3 import A2C as A2C_Baseline
from stable_baselines3 import DQN as DQN_Baseline


def main(argv):
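    """Train, test, or plot A2C and (PER-)DDQN agents on highway-env.

    The solver (-S a2c|ddqn) and mode (-M train|test|plot_both) flags are
    required; the remaining flags override the default hyperparameters below.
    """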
    # Hyperparameters
    solver = None
    mode = None
    training_steps = 20000
    testing_episodes = 100
    testing_steps = 1000
    neurons = 2048
    lr = 0.0001
    gamma = 0.99
    duration = 90
    epsilon = 0.9
    replay_memory_size = 1000
    update_target_every = 100
    batch_size = 32
    per_alpha = 0.0
    num_layers = 6

    # Environment configuration parameters
    env = gym.make('highway-v0', render_mode='human')
    env.config["duration"] = duration
    env.config["right_lane_reward"] = 0.05
    env.config["collision_reward"] = -5
    env.config["high_speed_reward"] = 0.8
    env.config["reward_speed_range"] = [30, 40]
    env.config["observation"] = {
        "type": "Kinematics",
        "vehicles_count": 15,
        "features": ["presence", "x", "y", "vx", "vy", "heading", "cos_h", "sin_h"],
        "order": "sorted",
        "normalize": True
    }
    env.reset()
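    # With this config the Kinematics observation should be a 15x8 array:
    # one row per observed vehicle, one column per listed feature.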
    try:
        opts, _ = getopt.getopt(argv, "he:s:t:n:l:g:d:E:m:N:B:a:L:S:M:",
                                ["help", "episodes=", "training_steps=", "testing_steps=", "neurons=",
                                 "lr=", "gamma=", "duration=", "epsilon=",
                                 "replay_memory_size=", "update_target_every=",
                                 "batch_size=", "per_alpha=", "num_layers=", "solver=", "mode="])
    except getopt.GetoptError:
        print('Usage: main.py [-h <help>] [-e <episodes>] [-s <training steps>] [-t <testing steps>] [-n <neurons>] [-l <learning rate>] [-g <gamma>] [-d <duration>] [-E <epsilon>] [-m <replay memory size>] [-N <target update interval>] [-B <batch size>] [-a <per alpha>] [-L <num layers>] [-S <solver>] [-M <mode>]')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('Usage: main.py [-e <episodes>] [-s <training steps>] [-t <testing steps>] [-n <neurons>] [-l <lr>] [-g <gamma>] [-d <duration>] [-E <epsilon>] [-m <replay memory size>] [-N <target update interval>] [-B <batch size>] [-a <per alpha>] [-L <num layers>] [-S <solver>] [-M <mode>]')
            sys.exit()
        elif opt in ("-e", "--episodes"):
            testing_episodes = int(arg)
        elif opt in ("-s", "--training_steps"):
            training_steps = int(arg)
        elif opt in ("-t", "--testing_steps"):
            testing_steps = int(arg)
        elif opt in ("-n", "--neurons"):
            neurons = int(arg)
        elif opt in ("-l", "--lr"):
            lr = float(arg)
        elif opt in ("-g", "--gamma"):
            gamma = float(arg)
        elif opt in ("-d", "--duration"):
            duration = float(arg)
            env.config["duration"] = duration
        elif opt in ("-E", "--epsilon"):
            epsilon = float(arg)
        elif opt in ("-m", "--replay_memory_size"):
            replay_memory_size = int(arg)
        elif opt in ("-N", "--update_target_every"):
            update_target_every = int(arg)
        elif opt in ("-B", "--batch_size"):
            batch_size = int(arg)
        elif opt in ("-a", "--per_alpha"):
            per_alpha = float(arg)
        elif opt in ("-L", "--num_layers"):
            num_layers = int(arg)
        elif opt in ("-S", "--solver"):
            solver = arg
        elif opt in ("-M", "--mode"):
            mode = arg
    if solver is None or mode is None:
        print("Error: missing required args -S <solver> and -M <mode>")
        sys.exit(2)
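    # Example invocation (illustrative values):
    #   python main.py -S a2c -M train -s 20000 -n 2048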
    actor_critic = A2C(env, training_steps, testing_steps, neurons, lr, gamma)
    actor_critic_baseline = A2CBaseline(env, training_steps, testing_steps, lr, gamma)
    plotter = Plotting()
    ddqn_baseline = DQNBaseline(env, training_steps, testing_steps)
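    # Agents and their stable-baselines3 counterparts are built up front;
    # the chosen solver/mode decides which of them actually run below.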
    if mode == 'train':
        # Training A2C
        if solver == 'a2c':
            print("A2C Training")
            a2c_training_action_distribution, a2c_training_rewards, a2c_training_max_reward = actor_critic.train_episode()
            env.reset()
            # Training A2C baseline
            print("Baseline Training")
            actor_critic_baseline.train_model()
            env.reset()
            # Training max reward
            print("Max A2C training reward achieved:", a2c_training_max_reward)
            pickle.dump(a2c_training_action_distribution, open("pickle files/a2c_action_dist20k", 'wb'))
            pickle.dump(a2c_training_rewards, open("pickle files/a2c_training_rewards20k", 'wb'))
        elif solver == 'ddqn':
            print("DDQN Training")
            for alpha in [1.0, 0.5, 0.0]:
                ddqn = DQN(env, training_steps, testing_steps, neurons, lr, gamma, epsilon, replay_memory_size, batch_size, update_target_every, alpha, num_layers)
                ddqn_training_action_distribution, ddqn_training_rewards = ddqn.train_episode()
                pickle.dump(ddqn_training_rewards, open(f"pickle files/dqn_rewards_{alpha}", 'wb'))
                pickle.dump(ddqn_training_action_distribution, open(f"pickle files/dqn_action_dist_{alpha}", 'wb'))
                env.reset()
            print("Baseline Training")
            ddqn_baseline.train_model()
            env.reset()
    elif mode == 'test':
        if solver == 'a2c':
            # Testing A2C
            print("A2C Prediction")
            a2c_model = models.load_model("saved models/a2c_model5k.h5")
            a2c_average_reward = prediction(episodes=testing_episodes, agent=actor_critic, model=a2c_model)
            # Testing baseline A2C
            print("A2C Baseline Prediction")
            a2c_baseline_model = A2C_Baseline.load("saved models/a2c_baseline5k")
            a2c_baseline_average_reward = prediction(episodes=testing_episodes, agent=actor_critic_baseline, model=a2c_baseline_model)
            # Evaluation average reward
            print("Average A2C reward achieved:", a2c_average_reward)
            print("Average A2C baseline reward achieved:", a2c_baseline_average_reward)
            pickle.dump(a2c_average_reward, open("pickle files/A2C_Testing_Rewards", "wb"))
            pickle.dump(a2c_baseline_average_reward, open("pickle files/A2C_Baseline_Testing_Rewards", "wb"))
        elif solver == 'ddqn':
            dqn_average_rewards = []
            # Testing DQN
            print("DQN Prediction")
            for alpha in [1.0, 0.5, 0.0]:
                ddqn = DQN(env, training_steps, testing_steps, neurons, lr, gamma, epsilon, replay_memory_size, batch_size, update_target_every, alpha, num_layers)
                dqn_model = models.load_model(f"saved models/dqn_model_{alpha}.h5")
                dqn_average_rewards.append(prediction(episodes=testing_episodes, agent=ddqn, model=dqn_model))
            # Testing baseline DQN
            print("DQN Baseline Prediction")
            dqn_baseline_model = DQN_Baseline.load("saved models/dqn_baseline")
            dqn_baseline_average_reward = prediction(episodes=testing_episodes, agent=ddqn_baseline, model=dqn_baseline_model)
            # Evaluation average reward
            print("Average DQN rewards achieved:", dqn_average_rewards)
            print("Average DQN baseline reward achieved:", dqn_baseline_average_reward)
            pickle.dump(dqn_average_rewards, open("pickle files/DQN_Testing_Rewards", "wb"))
            pickle.dump(dqn_baseline_average_reward, open("pickle files/DQN_Baseline_Testing_Rewards", "wb"))
    elif mode == 'plot_both':
        # Print DQN testing rewards
        average_dqn_test = pickle.load(open("pickle files/DQN_Testing_Rewards", 'rb'))
        baseline_dqn_test = pickle.load(open("pickle files/DQN_Baseline_Testing_Rewards", 'rb'))
        print("Average DQN Test", average_dqn_test)
        print("Baseline DQN Test", baseline_dqn_test)
        # Load pickle data
        ddqn_training_rewards00 = pickle.load(open("pickle files/dqn_rewards_0.0", 'rb'))
        ddqn_training_rewards05 = pickle.load(open("pickle files/dqn_rewards_0.5", 'rb'))
        ddqn_training_rewards10 = pickle.load(open("pickle files/dqn_rewards_1.0", 'rb'))
        a2c_training_rewards20k = pickle.load(open("pickle files/a2c_training_rewards20k", 'rb'))
        a2c_training_rewards5k = pickle.load(open("pickle files/a2c_training_rewards5k", 'rb'))
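        # Note: this script's train mode only writes the 20k-step A2C files;
        # a2c_training_rewards5k is assumed to come from an earlier 5k-step run.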
        # Plot all metrics together
        plotter.average_episodic_plot_all(a2c_training_rewards5k, a2c_training_rewards20k, ddqn_training_rewards00, ddqn_training_rewards05, ddqn_training_rewards10, "Reward", "A2C (5k 1024n)", "A2C (20k 2048n)", "DQN (0.0)", "DQN (0.5)", "DQN (1.0)")
        plotter.episodic_plot_all(a2c_training_rewards5k, a2c_training_rewards20k, ddqn_training_rewards00, ddqn_training_rewards05, ddqn_training_rewards10, "Reward", "A2C (5k 1024n)", "A2C (20k 2048n)", "DQN (0.0)", "DQN (0.5)", "DQN (1.0)")
        # Plot metrics individually
        for ddqn_training_reward, alpha in zip([ddqn_training_rewards00, ddqn_training_rewards05, ddqn_training_rewards10], [0.0, 0.5, 1.0]):
            plotter.average_episodic_plot(a2c_training_rewards5k, ddqn_training_reward, "Reward", "A2C (5k 1024n)", f"DQN ({alpha})")
            plotter.episodic_plot(a2c_training_rewards5k, ddqn_training_reward, "Reward", "A2C (5k 1024n)", f"DQN ({alpha})")
            plotter.average_episodic_plot(a2c_training_rewards20k, ddqn_training_reward, "Reward", "A2C (20k 2048n)", f"DQN ({alpha})")
            plotter.episodic_plot(a2c_training_rewards20k, ddqn_training_reward, "Reward", "A2C (20k 2048n)", f"DQN ({alpha})")

if __name__ == "__main__":
    main(sys.argv[1:])