# In [None]:
import keras
from keras import layers
import gymnasium as gym
from gymnasium.wrappers.frame_stack import FrameStack

from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing  
import numpy as np  
import tensorflow as tf  
import ale_py  

# Both the online network and the target network are restored from the same
# saved checkpoint file.
checkpoint_path = "models/breakout_qmodel_547.keras"
model = keras.models.load_model(checkpoint_path)
model_target = keras.models.load_model(checkpoint_path)

# --- Hyperparameters ---------------------------------------------------------
seed = 42
gamma = 0.99  # discount factor applied to the bootstrapped future reward

# Epsilon-greedy exploration schedule: epsilon is decayed step by step in the
# training loop and never drops below epsilon_min.
epsilon_min = 0.1
epsilon_max = 1.0
epsilon = 1.0
epsilon_interval = epsilon_max - epsilon_min  # total amount epsilon may decay

batch_size = 32  # transitions sampled per gradient update
max_steps_per_episode = 10000
max_episodes = 0  # the episode limit is only enforced when this is > 0
max_frames = 1e7  # training stops once frame_count reaches this value

# Register the ALE environments with Gymnasium, then build the environment:
# raw game -> Atari preprocessing -> stack of 4 consecutive observations.
gym.register_envs(ale_py)
env = FrameStack(
    AtariPreprocessing(
        gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array")
    ),
    4,
)


def trigger(t):
    """Return True when episode index ``t`` is a multiple of 500."""
    return t % 500 == 0


# Record a video (into the "videos" folder) for the episodes selected by
# ``trigger``.
env = gym.wrappers.RecordVideo(
    env,
    video_folder="videos",
    episode_trigger=trigger,
    disable_logger=True,
)

# Size of the discrete action space and the human-readable action names.
num_actions = env.action_space.n
action_meanings = env.unwrapped.get_action_meanings()
print(f"{num_actions} {action_meanings}")  # 'NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'

def preprocess_input(data):
    """Move the leading axis of a 3-D array to the last position.

    The axes are reordered from (0, 1, 2) to (1, 2, 0), so an input of shape
    (A, B, C) comes back with shape (B, C, A).

    Args:
        data: A 3-D array-like object.

    Returns:
        The transposed array.
    """
    leading_axis_last = (1, 2, 0)
    return np.transpose(data, axes=leading_axis_last)
  
# Create the primary model and a target model (for stability during updates).
# model = create_q_model()
# model_target = create_q_model()

# Optimizer (Adam with gradient-norm clipping) and loss used for the Q-network
# updates.
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = keras.losses.Huber()

# Experience replay memory, kept as parallel lists: index i in every list
# refers to the same stored transition.
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []

# Total reward of the most recent episodes and their running mean.
episode_reward_history = []
running_reward = 0

# Episode and frame counters start from non-zero values.
# NOTE(review): presumably these continue an earlier training run -- confirm.
episode_count = 550
frame_count = 280000

# Exploration schedule lengths, measured in frames.
epsilon_random_frames = 50_000  # below this frame count actions are always random
epsilon_greedy_frames = 1_000_000.0  # frames over which epsilon decays

max_memory_length = 1_000_000  # maximum number of stored transitions
update_after_actions = 4  # run a gradient update every 4th frame
update_target_network = 10_000  # copy weights to the target network this often

# Main DQN training loop. Each pass of the outer loop plays one episode; the
# loop exits when the running reward, the episode limit or the frame limit is
# reached.
while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection: act randomly during the initial
        # random-frame phase or with probability epsilon, greedily otherwise.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            # Exploit: let the online network pick the action with the
            # highest predicted Q-value for the current state.
            state_processed = preprocess_input(state)
            state_tensor = keras.ops.convert_to_tensor(state_processed)
            state_tensor = keras.ops.expand_dims(state_tensor, 0)  # add batch axis
            action_probs = model(state_tensor, training=False)
            # This assignment is required: without it the greedy branch would
            # silently reuse the action chosen on the previous step.
            action = keras.ops.argmax(action_probs[0]).numpy()

        # Decay the exploration probability, bounded below by epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # `done` is the environment's "terminated" flag and `truncated`
        # signals a time-limit style cut-off. Only `done` is stored in the
        # replay memory; both end the episode (see the break below).
        state_next, reward, done, truncated, _ = env.step(action)
        state_next = np.array(state_next)

        episode_reward += reward

        # Store the transition in the replay memory.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)

        state = state_next

        # Run a gradient update every `update_after_actions` frames once more
        # than one batch of transitions is available.
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Passing the population size as an int draws the same kind of
            # sample as passing range(n) without materialising an n-element
            # array on every update.
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = keras.ops.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Q-values of the next states according to the target network.
            state_next_sample_processed = np.array(
                [preprocess_input(s) for s in state_next_sample]
            )
            future_rewards = model_target.predict(state_next_sample_processed, verbose=0)

            # Target: reward + gamma * max_a' Q_target(s', a').
            updated_q_values = rewards_sample + gamma * keras.ops.amax(future_rewards, axis=1)

            # For transitions flagged as done the target is replaced by -1.
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask so the loss only involves the action actually taken.
            masks = keras.ops.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Predict Q-values for the sampled states with the online network.
                state_sample_processed = np.array(
                    [preprocess_input(s) for s in state_sample]
                )
                q_values = model(state_sample_processed)

                # Keep only the Q-value of the taken action for each sample.
                q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1)

                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Periodically sync the target network, report progress and checkpoint.
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            # np.max raises on an empty sequence, so report NaN when no
            # episode has finished yet.
            if episode_reward_history:
                best_recent = np.max(episode_reward_history)
            else:
                best_recent = float("nan")
            print(f"Best score of last 100: {best_recent}, running reward: {running_reward:.2f} at episode {episode_count}, frame count {frame_count}")
            model.save(f"models/breakout_qmodel_{episode_count}.keras")

        # Drop the oldest transition once the replay memory is full.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # End the episode on termination or truncation; the environment is
        # reset at the top of the outer loop.
        if done or truncated:
            break

    # Track the rewards of the most recent 100 episodes and their mean.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1
    print(f"Episode {episode_count-1}: {episode_reward}")

    # Stopping conditions.
    if running_reward > 800:
        print(f"Solved at episode {episode_count}!")
        break

    if max_episodes > 0 and episode_count >= max_episodes:
        print(f"Stopped at episode {episode_count}!")
        break
    if max_frames <= frame_count:
        print(f"Stopped at frame {frame_count}!")
        break