In [1]:
import gymnasium as gym
import random
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, Input
from collections import deque

In [2]:
env = gym.make('MountainCar-v0', render_mode="human")

In [3]:
env.reset()

  from pkg_resources import resource_stream, resource_exists


(array([-0.5752154,  0.       ], dtype=float32), {})

In [13]:
for _ in range(1000):
    action = env.action_space.sample()  # یک اکشن تصادفی
    obs, reward, done, truncated, info = env.step(action)
    if done or truncated:
        env.reset()


KeyboardInterrupt: 

In [5]:
env.close()

In [6]:
env.observation_space.shape[0]

2

In [7]:
env.action_space.n

np.int64(3)

In [4]:
env.action_space.sample()

np.int64(0)

In [5]:
class DQN:
    def __init__ (self, env):
        self.env = env
        self.memory = deque(maxlen=2000)
        self.gamma = 0.03
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.005
        self.tau = 0.125
        self.model = self.create_model()
        self.target_model = self.create_model()
    
    def create_model(self):
        model = Sequential()
        model.add(Input(shape=(self.env.observation_space.shape[0],)))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.env.action_space.n))
        model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=self.learning_rate))
        return model
    
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.model.predict(state, verbose=0)[0])
    
    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state, verbose=0)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state, verbose=0)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=1)
    
    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])
        
    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)
    
    def save_model(self, fn):
        self.model.save(fn)

In [6]:
env = gym.make('MountainCar-v0', render_mode="human")
gamma = 0.9
epsilon = 0.95

In [7]:
trails = 1000
trail_len = 500
dqn_agnet = DQN(env=env)

In [8]:
obs, _ = env.reset()
current_state = obs.reshape(1, 2)
current_state

array([[-0.41156262,  0.        ]], dtype=float32)

In [9]:
dqn_agnet.act(current_state)

np.int64(1)

In [None]:
steps = []
for trail in range(trails):
    obs, info = env.reset()
    current_state = obs.reshape(1, 2)

    for step in range(trail_len):
        print('#', step)
        action = dqn_agnet.act(current_state)

        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # همون مفهوم done قدیمی

        new_state = new_state.reshape(1, 2)
        dqn_agnet.remember(current_state, action, reward, new_state, done)
        dqn_agnet.replay()
        dqn_agnet.target_train()

        current_state = new_state
        if done:
            break

    if step >= 199:
        print('Failed')
    else:
        print('Success')
        dqn_agnet.save_model('masoud')
        break


# 0
# 1
# 2
# 3
# 4
# 5
# 6
# 7
# 8
# 9
# 10
# 11
# 12
# 13
# 14
# 15
# 16
# 17
# 18
# 19
# 20
# 21
# 22
# 23
# 24
# 25
# 26
# 27
# 28
# 29
# 30
# 31
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637ms/step - loss: 0.3872
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.3334
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.3430
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3516
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3318
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.3063
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3051
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.2893
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3024
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000001E0109E8400>
Traceback (most recent call last):
  File "C:\Python313\Lib\weakref.py", line 369, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt: 


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 3.0150e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 2.1093e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 5.9280e-06
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 4.1016e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 1.2701e-06
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 1.0662e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 1.2853e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 2.6345e-06
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 2.7103e-06
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 1.3550e-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 1.1898e-06