# IntrcptNN using tensorflow
import tensorflow as tf
import random
import numpy as np
from tensorflow.keras.callbacks import TensorBoard  # public Keras path instead of tensorflow.python
from time import time
from Interceptor_V2 import Init, Draw, Game_step

tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
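# Note: the callback above only has an effect if it is passed to fit(); a
# minimal sketch (x and y stand for whatever training data is used):
#   network.fit(x, y, callbacks=[tensorboard])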

# Replay memory: stores (state, action, reward, next_state, discount) transitions
replay_memory = []

# Hyperparameters
BATCH_SIZE = 10
EPSILON = 0.8              # initial exploration rate for epsilon-greedy
LEARNING_RATE = 0.08
NUMBER_OF_ACTIONS = 4
EPISODES = 1000
DISCOUNT = 0.99            # gamma: discount factor for future rewards
SIZE_RM = 100              # maximum size of the replay memory
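# With the values above, EPSILON decays linearly from 0.8 by 0.0093 per
# episode (see the update in the training loop) until it reaches the 0.1
# floor, i.e. after roughly (0.8 - 0.1) / 0.0093 ~ 75 episodes the agent
# keeps exploring on about 10% of the steps.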

# Building the Q-network
def build_network():
    """Convolutional Q-network: maps a 32x32x3 state image to one Q-value
    per action."""
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    # One linear Q-value per action; a softmax over 10 units would neither
    # match NUMBER_OF_ACTIONS nor suit Q-value regression.
    model.add(tf.keras.layers.Dense(NUMBER_OF_ACTIONS, activation='linear'))
    model.summary()
    return model
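

# How the game observation (r_locs, i_locs, c_locs, ang) maps onto the
# network's 32x32x3 input is not defined anywhere in this file, so the
# encoder below is only a minimal sketch: the function name encode_state,
# the per-channel layout and the assumed +/-5000 world bounds are all
# illustrative guesses, not part of the original code.
def encode_state(r_locs, i_locs, c_locs, ang):
    """Rasterize rockets (channel 0), interceptors (channel 1) and cities
    (channel 2) onto a 32x32 grid; the turret angle goes into a corner cell."""
    state = np.zeros((1, 32, 32, 3), dtype=np.float32)
    for channel, locs in enumerate((r_locs, i_locs, c_locs)):
        for x, y in np.asarray(locs, dtype=np.float32).reshape(-1, 2):
            # Assumed world bounds of +/-5000 on each axis, scaled to the grid
            col = int(np.clip((x + 5000.0) / 10000.0 * 31, 0, 31))
            row = int(np.clip((y + 5000.0) / 10000.0 * 31, 0, 31))
            state[0, row, col, channel] = 1.0
    state[0, 0, 0, :] = ang / 90.0  # normalized turret angle
    return state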


def optimize(network, states, targets):
    """Run one gradient-descent update of the Q-network toward the TD
    targets and return the resulting loss value."""
    history = network.fit(states, targets, epochs=1, verbose=0)
    return history.history['loss'][0]


def predict(state, target, network):
    """Return Q-values for `state` from the target network when one is
    given, otherwise from the online network; no explicit session is
    needed with the Keras API."""
    if target is not None:
        return target.predict(state, verbose=0)
    return network.predict(state, verbose=0)
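

# The replay memory defined above is filled in the training loop but never
# sampled there. The helper below is a minimal sketch of how the stored
# (state, action, reward, next_state, discount) tuples would typically be
# replayed; the name replay_train and its use of BATCH_SIZE are assumptions,
# not part of the original file.
def replay_train(network, target_network):
    if len(replay_memory) < BATCH_SIZE:
        return
    batch = random.sample(replay_memory, BATCH_SIZE)
    states = np.vstack([s for s, a, r, ns, d in batch])
    targets = network.predict(states, verbose=0)
    for i, (s, a, r, ns, d) in enumerate(batch):
        # Bellman update for the taken action only
        targets[i, a] = r + d * np.max(target_network.predict(ns, verbose=0)[0])
    network.fit(states, targets, epochs=1, verbose=0)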


# Training of the self-learning agent
def exec_process():
    # Setting counters
    global EPSILON
    average_loss = 0.0
    average_reward = 0.0
    # Build the online and the target network; both are compiled once here,
    # with mean squared error as the regression loss on Q-values
    network = build_network()
    target_network = build_network()
    network.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE), loss='mse')
    target_network.set_weights(network.get_weights())
    # Take one random action to obtain an initial observation from the game
    action = random.randint(0, NUMBER_OF_ACTIONS - 1)
    r_locs, i_locs, c_locs, ang, score = Game_step(action)
    state = encode_state(r_locs, i_locs, c_locs, ang)
    reward = 0
    previous_score = 0
    for episode in range(EPISODES):
        # Build the next state and derive the step reward from the score delta
        newState = encode_state(r_locs, i_locs, c_locs, ang)
        reward = score - previous_score
        previous_score = score
        average_reward = average_reward + reward
        # Q-values of the current state from the online network; the best
        # action is the argmax over the action dimension
        qvals = predict(state, None, network)[0]
        bestAction = int(np.argmax(qvals))
        qval = qvals[action]  # Q-value of the action actually taken
        # Maximum Q-value of the next state, estimated by the target network
        next_q_vals = predict(newState, target_network, network)[0]
        maxNextValue = float(np.max(next_q_vals))
        # Q-learning target: reward + DISCOUNT * max_a' Q_target(next_state, a');
        # only the entry of the action actually taken is moved toward the target
        target_qvals = qvals.copy()
        target_qvals[action] = reward + DISCOUNT * maxNextValue
        loss = optimize(network, state, target_qvals[np.newaxis, :])
        average_loss = average_loss + loss
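        # Example: with reward = 2, maxNextValue = 5 and DISCOUNT = 0.99 the
        # updated entry becomes 2 + 0.99 * 5 = 6.95, while the Q-values of
        # the other three actions are left unchanged.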
        # Store the transition in the replay memory
        replay_memory.append((state, action, reward, newState, DISCOUNT))
        # Drop the oldest transition when the memory is full
        if len(replay_memory) > SIZE_RM:
            replay_memory.pop(0)
        # Print progress information (episode is zero-based, hence the + 1)
        print("Average loss: ", average_loss / (episode + 1))
        print("Average Reward: ", average_reward / (episode + 1))
        print("States : ", newState)
        # Shift the exploration-exploitation ratio toward exploitation
        if EPSILON > 0.1:
            EPSILON = EPSILON - 0.0093
        # Every 32 episodes, copy the online network's weights into the
        # target network (a hard update, tau = 1.0)
        if episode % 32 == 0:
            target_network.set_weights(network.get_weights())
            print("\nCopied model parameters to target network.")
        Draw()
        # Epsilon-greedy: with probability EPSILON take a random action
        # (explore the environment), otherwise take the best known action
        # (exploit it)
        rand = random.random()
        if rand < EPSILON:
            action = random.randint(0, NUMBER_OF_ACTIONS - 1)
        else:
            action = bestAction
        state = newState
        r_locs, i_locs, c_locs, ang, score = Game_step(action)
    # Saving the parameters of the network
    keras_model_path = "/tmp/keras_save"
    network.save(keras_model_path)


Init()
exec_process()
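
# The saved model can be restored later for evaluation; a minimal sketch,
# assuming the same path that was used above:
#   restored = tf.keras.models.load_model("/tmp/keras_save")
#   q_values = restored.predict(np.zeros((1, 32, 32, 3), dtype=np.float32))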