# IntrcptNN using tensorflow
import tensorflow as tf
import random
import numpy as np
from tensorflow.keras.callbacks import TensorBoard  # public Keras path instead of tensorflow.python
from time import time
from Interceptor_V2 import Init, Draw, Game_step

tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
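# Note: the callback above only has an effect if it is passed to fit(); a
# minimal sketch (x and y stand for whatever training data is used):
#   network.fit(x, y, callbacks=[tensorboard])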

# Replay memory: stores (state, action, reward, next_state, discount) transitions
replay_memory = []

# Hyperparameters
BATCH_SIZE = 10
EPSILON = 0.8              # initial exploration rate for epsilon-greedy
LEARNING_RATE = 0.08
NUMBER_OF_ACTIONS = 4
EPISODES = 1000
DISCOUNT = 0.99            # gamma: discount factor for future rewards
SIZE_RM = 100              # maximum size of the replay memory
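# With the values above, EPSILON decays linearly from 0.8 by 0.0093 per
# episode (see the update in the training loop) until it reaches the 0.1
# floor, i.e. after roughly (0.8 - 0.1) / 0.0093 ~ 75 episodes the agent
# keeps exploring on about 10% of the steps.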

# Building the Q-network
def build_network():
    """Convolutional Q-network: maps a 32x32x3 state image to one Q-value
    per action."""
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    # One linear Q-value per action; a softmax over 10 units would neither
    # match NUMBER_OF_ACTIONS nor suit Q-value regression.
    model.add(tf.keras.layers.Dense(NUMBER_OF_ACTIONS, activation='linear'))
    model.summary()
    return model
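

# How the game observation (r_locs, i_locs, c_locs, ang) maps onto the
# network's 32x32x3 input is not defined anywhere in this file, so the
# encoder below is only a minimal sketch: the function name encode_state,
# the per-channel layout and the assumed +/-5000 world bounds are all
# illustrative guesses, not part of the original code.
def encode_state(r_locs, i_locs, c_locs, ang):
    """Rasterize rockets (channel 0), interceptors (channel 1) and cities
    (channel 2) onto a 32x32 grid; the turret angle goes into a corner cell."""
    state = np.zeros((1, 32, 32, 3), dtype=np.float32)
    for channel, locs in enumerate((r_locs, i_locs, c_locs)):
        for x, y in np.asarray(locs, dtype=np.float32).reshape(-1, 2):
            # Assumed world bounds of +/-5000 on each axis, scaled to the grid
            col = int(np.clip((x + 5000.0) / 10000.0 * 31, 0, 31))
            row = int(np.clip((y + 5000.0) / 10000.0 * 31, 0, 31))
            state[0, row, col, channel] = 1.0
    state[0, 0, 0, :] = ang / 90.0  # normalized turret angle
    return state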


def optimize(network, states, targets):
    """Run one gradient-descent update of the Q-network toward the TD
    targets and return the resulting loss value."""
    history = network.fit(states, targets, epochs=1, verbose=0)
    return history.history['loss'][0]


def predict(state, target, network):
    """Return Q-values for `state` from the target network when one is
    given, otherwise from the online network; no explicit session is
    needed with the Keras API."""
    if target is not None:
        return target.predict(state, verbose=0)
    return network.predict(state, verbose=0)
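

# The replay memory defined above is filled in the training loop but never
# sampled there. The helper below is a minimal sketch of how the stored
# (state, action, reward, next_state, discount) tuples would typically be
# replayed; the name replay_train and its use of BATCH_SIZE are assumptions,
# not part of the original file.
def replay_train(network, target_network):
    if len(replay_memory) < BATCH_SIZE:
        return
    batch = random.sample(replay_memory, BATCH_SIZE)
    states = np.vstack([s for s, a, r, ns, d in batch])
    targets = network.predict(states, verbose=0)
    for i, (s, a, r, ns, d) in enumerate(batch):
        # Bellman update for the taken action only
        targets[i, a] = r + d * np.max(target_network.predict(ns, verbose=0)[0])
    network.fit(states, targets, epochs=1, verbose=0)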


# Training of the self-learning agent
def exec_process():
    # Setting counters
    global EPSILON
    average_loss = 0.0
    average_reward = 0.0
    # Build the online and the target network; both are compiled once here,
    # with mean squared error as the regression loss on Q-values
    network = build_network()
    target_network = build_network()
    network.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE), loss='mse')
    target_network.set_weights(network.get_weights())
    # Take one random action to obtain an initial observation from the game
    action = random.randint(0, NUMBER_OF_ACTIONS - 1)
    r_locs, i_locs, c_locs, ang, score = Game_step(action)
    state = encode_state(r_locs, i_locs, c_locs, ang)
    reward = 0
    previous_score = 0
    for episode in range(EPISODES):
        # Build the next state and derive the step reward from the score delta
        newState = encode_state(r_locs, i_locs, c_locs, ang)
        reward = score - previous_score
        previous_score = score
        average_reward = average_reward + reward
        # Q-values of the current state from the online network; the best
        # action is the argmax over the action dimension
        qvals = predict(state, None, network)[0]
        bestAction = int(np.argmax(qvals))
        qval = qvals[action]  # Q-value of the action actually taken
        # Maximum Q-value of the next state, estimated by the target network
        next_q_vals = predict(newState, target_network, network)[0]
        maxNextValue = float(np.max(next_q_vals))
        # Q-learning target: reward + DISCOUNT * max_a' Q_target(next_state, a');
        # only the entry of the action actually taken is moved toward the target
        target_qvals = qvals.copy()
        target_qvals[action] = reward + DISCOUNT * maxNextValue
        loss = optimize(network, state, target_qvals[np.newaxis, :])
        average_loss = average_loss + loss
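        # Example: with reward = 2, maxNextValue = 5 and DISCOUNT = 0.99 the
        # updated entry becomes 2 + 0.99 * 5 = 6.95, while the Q-values of
        # the other three actions are left unchanged.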
        # Store the transition in the replay memory
        replay_memory.append((state, action, reward, newState, DISCOUNT))
        # Drop the oldest transition when the memory is full
        if len(replay_memory) > SIZE_RM:
            replay_memory.pop(0)
        # Print progress information (episode is zero-based, hence the + 1)
        print("Average loss: ", average_loss / (episode + 1))
        print("Average Reward: ", average_reward / (episode + 1))
        print("States : ", newState)
        # Shift the exploration-exploitation ratio toward exploitation
        if EPSILON > 0.1:
            EPSILON = EPSILON - 0.0093
        # Every 32 episodes, copy the online network's weights into the
        # target network (a hard update, tau = 1.0)
        if episode % 32 == 0:
            target_network.set_weights(network.get_weights())
            print("\nCopied model parameters to target network.")
        Draw()
        # Epsilon-greedy: with probability EPSILON take a random action
        # (explore the environment), otherwise take the best known action
        # (exploit it)
        rand = random.random()
        if rand < EPSILON:
            action = random.randint(0, NUMBER_OF_ACTIONS - 1)
        else:
            action = bestAction
        state = newState
        r_locs, i_locs, c_locs, ang, score = Game_step(action)
    # Saving the parameters of the network
    keras_model_path = "/tmp/keras_save"
    network.save(keras_model_path)


Init()
exec_process()
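
# The saved model can be restored later for evaluation; a minimal sketch,
# assuming the same path that was used above:
#   restored = tf.keras.models.load_model("/tmp/keras_save")
#   q_values = restored.predict(np.zeros((1, 32, 32, 3), dtype=np.float32))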