In [3]:
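# NOTE: the author found this cell has to be run twice (presumably the first run fails on an import).
# TODO: confirm the cause and fix it so a single "Restart Kernel -> Run All" works.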
import tensorflow as tf
import random
import gym
from keras import __version__
tf.keras.__version__ = __version__
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
import numpy as np

In [5]:
# Create the CartPole-v1 environment; render_mode="human" asks gym to draw the
# simulation on screen. This `env` object is shared by the cells below.
env = gym.make("CartPole-v1", render_mode="human")

In [9]:
# Random-action baseline: play a handful of episodes, picking one of the two
# actions (0 or 1) uniformly at random each step, and print the total reward
# per episode. Gives a reference score to compare the trained agent against.
#
# NOTE(review): this cell closes the shared `env`; cells further down reuse
# `env` for training/testing - confirm it is still usable (or is recreated)
# when the notebook is executed top to bottom.
episodes = 10
try:
    for episode in range(1, episodes + 1):
        # The initial observation is irrelevant to a random policy, so it is
        # not kept.
        env.reset()
        done = False
        score = 0

        while not done:
            action = random.choice([0, 1])
            # This code unpacks a 4-tuple from step():
            # (observation, reward, done, info). Only reward/done are used.
            _, reward, done, _ = env.step(action)
            score += reward
            env.render()

        print(f"Episode {episode}, Score: {score}")
finally:
    # Release the environment (and its render window) even when the loop is
    # interrupted or raises part-way through.
    env.close()

Episode 1, Score: 32.0
Episode 2, Score: 11.0
Episode 3, Score: 15.0
Episode 4, Score: 14.0
Episode 5, Score: 33.0
Episode 6, Score: 18.0
Episode 7, Score: 20.0
Episode 8, Score: 19.0
Episode 9, Score: 13.0
Episode 10, Score: 27.0


In [6]:
def build_model(states, actions):
    """Assemble the feed-forward network handed to the DQN agent.

    Args:
        states: Width of a single observation; the network input has shape
            ``(1, states)`` and is flattened before the dense layers.
        actions: Number of output units (one linear output per action).

    Returns:
        An uncompiled ``Sequential`` model: Flatten -> Dense(24, relu) ->
        Dense(24, relu) -> Dense(actions, linear).
    """
    # Leading axis of size 1 in the input shape; presumably this mirrors the
    # agent memory's window_length - verify if that value is ever changed.
    layers = [Flatten(input_shape=(1, states))]
    # Two identical hidden layers.
    layers.extend(Dense(24, activation="relu") for _ in range(2))
    layers.append(Dense(actions, activation="linear"))
    return Sequential(layers)

In [7]:
# Length of the first axis of the observation space; passed to build_model as
# the network's input width.
states = env.observation_space.shape[0]
# Size of the discrete action space; passed to build_model / build_agent as
# the number of outputs / actions.
actions = env.action_space.n

In [9]:
# Instantiate the network sized for this environment's observations and actions.
model = build_model(states, actions)

In [10]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 4)                 0         
                                                                 
 dense_3 (Dense)             (None, 24)                120       
                                                                 
 dense_4 (Dense)             (None, 24)                600       
                                                                 
 dense_5 (Dense)             (None, 2)                 50        
                                                                 
Total params: 770 (3.01 KB)
Trainable params: 770 (3.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl ``DQNAgent`` with this notebook's settings.

    Args:
        model: The Keras model the agent should use.
        actions: Number of actions, forwarded as ``nb_actions``.

    Returns:
        A ``DQNAgent`` that is not yet compiled. It is configured with a
        ``SequentialMemory`` (limit 50000, window_length 1), a
        ``BoltzmannQPolicy``, ``nb_steps_warmup=10`` and
        ``target_model_update=0.01``.
    """
    replay_memory = SequentialMemory(limit=50000, window_length=1)
    action_policy = BoltzmannQPolicy()
    return DQNAgent(
        model=model,
        nb_actions=actions,
        memory=replay_memory,
        policy=action_policy,
        nb_steps_warmup=10,
        target_model_update=0.01,
    )

In [28]:
# Build the agent around the current model, compile it with the legacy Adam
# optimizer (tracking mean absolute error), then train on `env`.
agent = build_agent(model, actions)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
agent.compile(optimizer, metrics=["mae"])
# Long-running: 100k environment steps with on-agent visualisation disabled.
agent.fit(env, nb_steps=100_000, visualize=False, verbose=1)

Training for 100000 steps ...
Interval 1 (0 steps performed)
94 episodes - episode_reward: 106.298 [9.000, 432.000] - loss: 2.390 - mae: 20.638 - mean_q: 41.726

Interval 2 (10000 steps performed)
36 episodes - episode_reward: 267.194 [207.000, 500.000] - loss: 3.502 - mae: 44.643 - mean_q: 90.025

Interval 3 (20000 steps performed)
35 episodes - episode_reward: 295.686 [227.000, 476.000] - loss: 2.668 - mae: 48.810 - mean_q: 98.210

Interval 4 (30000 steps performed)
28 episodes - episode_reward: 350.571 [229.000, 500.000] - loss: 1.693 - mae: 47.339 - mean_q: 95.157

Interval 5 (40000 steps performed)
32 episodes - episode_reward: 306.656 [239.000, 500.000] - loss: 1.126 - mae: 44.318 - mean_q: 88.991

Interval 6 (50000 steps performed)
34 episodes - episode_reward: 301.029 [36.000, 500.000] - loss: 0.593 - mae: 42.854 - mean_q: 85.960

Interval 7 (60000 steps performed)
24 episodes - episode_reward: 411.083 [93.000, 500.000] - loss: 1.837 - mae: 44.309 - mean_q: 89.042

Interval 8 (

<keras.src.callbacks.History at 0x1884bd76fd0>

In [17]:
# Evaluate the trained agent for 10 rendered episodes and report the mean
# episode reward, then release the environment.
results = agent.test(env, nb_episodes=10, visualize=True)
episode_rewards = results.history["episode_reward"]
print(np.mean(episode_rewards))
env.close()

Testing for 10 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
500.0


In [34]:
# Persist the agent's weights to 'dqn_weights.h5f', replacing any existing
# file of that name (overwrite=True).
agent.save_weights('dqn_weights.h5f', overwrite=True)

In [20]:
# Recreate the environment, network and agent from scratch so that saved
# weights can be loaded into a fresh (untrained) agent.
env = gym.make("CartPole-v1", render_mode="human")
states = env.observation_space.shape[0]
actions = env.action_space.n
model = build_model(states, actions)
agent = build_agent(model, actions)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
agent.compile(optimizer, metrics=['mae'])

In [21]:
# Load weights from 'dqn_weights.h5f' (the file name used by the save cell)
# into the current agent.
agent.load_weights('dqn_weights.h5f')

In [23]:
# Run the agent for 10 rendered test episodes, then release the environment.
agent.test(env, nb_episodes=10, visualize=True)
env.close()

Testing for 10 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
