In [1]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib

import os
from keras.optimizers import Adam
from keras.layers import Dense
from keras.models import Sequential
from collections import deque

from tensorflow.keras.layers import Dropout

In [2]:
# Environment: FrozenLake-v1 created with gym's default options.
env = gym.make("FrozenLake-v1")

# Sizes of the discrete observation and action spaces, read from the environment.
state_size, action_size = env.observation_space.n, env.action_space.n

# Number of episodes used for training and for evaluation.
train_episodes, test_episodes = 400, 100

# Upper bound on env.step() calls within a single episode.
max_steps = 300

# Number of stored transitions sampled on each replay call.
batch_size = 256

In [3]:
class Agent:
    """DQN-style agent: a small dense Q-network, a replay buffer and epsilon-greedy exploration.

    Attributes:
        memory: bounded deque of (new_state, reward, done, state, action) tuples.
        epsilon: current probability of taking a random action.
        epsilon_lst: value of ``epsilon`` recorded after every ``replay`` call.
        replay_count: number of ``replay`` calls that have trained on a batch.
        model: compiled Keras model mapping a state vector to one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_size: length of the state vector fed to the network.
            action_size: number of discrete actions (network outputs).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2500)
        # NOTE(review): this value is handed to Adam as its step size (see
        # buildmodel); 0.7 is far above Adam's usual 1e-3 default -- confirm
        # that it is intended.
        self.learning_rate = 0.7
        self.epsilon = 1
        self.max_eps = 1
        self.min_eps = 0.01
        self.eps_decay = 0.5
        self.gamma = 0.95
        self.epsilon_lst = []
        self.replay_count = 0
        self.model = self.buildmodel()

    def buildmodel(self):
        """Build, compile and return the Q-network (its summary is printed as a side effect)."""
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()
        return model

    def add_memory(self, new_state, reward, done, state, action):
        """Append one transition to the replay buffer (oldest entries are evicted when full)."""
        self.memory.append((new_state, reward, done, state, action))

    def action(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index in
        ``[0, action_size)`` is returned; otherwise the action with the largest
        predicted Q-value for ``state`` is returned.
        """
        # Explore when the draw falls BELOW epsilon, so epsilon=1 means "always
        # random" and epsilon=0 means "always greedy".
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        return self.pred(state)

    def pred(self, state):
        """Return the greedy action: the index of the largest predicted Q-value for ``state``."""
        return np.argmax(self.model.predict(state, verbose=0))

    def replay(self, batch_size, episode=None):
        """Train on a random minibatch of stored transitions, then decay epsilon.

        Args:
            batch_size: number of transitions to sample; capped at the number
                currently stored in ``memory``.
            episode: optional step index used in the exponential epsilon decay.
                When omitted, the number of previous ``replay`` calls is used,
                so the method does not depend on any variable outside the class.

        Does nothing when the buffer is empty.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size == 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        for new_state, reward, done, state, action in minibatch:
            # Q-learning target: the reward alone for a terminal transition,
            # otherwise reward plus the discounted best next-state Q-value.
            target = reward
            if not done:
                next_q = self.model.predict(new_state, verbose=0)
                target = reward + self.gamma * np.amax(next_q)
            # Only the taken action's output is moved toward the target; the
            # other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        decay_step = self.replay_count if episode is None else episode
        self.replay_count += 1
        if self.epsilon > self.min_eps:
            self.epsilon = (
                (self.max_eps - self.min_eps) * np.exp(-self.eps_decay * decay_step)
                + self.min_eps
            )

        self.epsilon_lst.append(self.epsilon)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

# Build the agent for this environment; constructing it also prints the
# Keras model summary.
agent = Agent(state_size=state_size, action_size=action_size)

# Report the problem dimensions: number of states first, then number of actions.
for dimension in (state_size, action_size):
    print(dimension)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 4)                 132       
                                                                 
Total params: 1,732
Trainable params: 1,732
Non-trainable params: 0
_________________________________________________________________
16
4


  super().__init__(name, **kwargs)


In [4]:
def encode_state(index):
    """Return the one-hot row vector of shape (1, state_size) for a discrete state index."""
    encoded = np.zeros(state_size)
    encoded[int(index)] = 1
    return np.reshape(encoded, [1, state_size])


# Training: play `train_episodes` episodes, store every transition in the
# agent's replay buffer, and run one replay (learning) pass after each episode
# once the buffer holds more than `batch_size` transitions.
reward_lst = []
for episode in range(train_episodes):
    # reset() returns (observation, info); only the observation is needed.
    observation, _ = env.reset()
    state = encode_state(observation)
    reward = 0
    done = False
    for t in range(max_steps):
        # env.render()
        action = agent.action(state)
        new_observation, reward, terminated, truncated, _ = env.step(action)
        new_state = encode_state(new_observation)
        # Store `terminated` (not `truncated`) as the terminal flag, so a
        # time-limit cut-off is not learned as a true terminal state.
        agent.add_memory(new_state, reward, terminated, state, action)
        state = new_state

        # The episode is over when the env terminates OR hits its time limit;
        # stepping past either signal would act on a finished episode.
        done = terminated or truncated
        if done:
            break

    # Record every episode, including ones cut off by the time limit or max_steps.
    print(f'Episode: {episode:4}/{train_episodes} . Eps: {float(agent.epsilon):.2}, reward {reward}')
    reward_lst.append(reward)  # Add this episode's final reward to the list

    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

print('Train mean % score=', round(100 * np.mean(reward_lst), 1))

mean_reward = np.mean(reward_lst)
std_reward = np.std(reward_lst)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")



  if not isinstance(terminated, (bool, np.bool8)):


Episode:    0/400 . Eps: 1.0, reward 0.0
Episode:    1/400 . Eps: 1.0, reward 0.0
Episode:    2/400 . Eps: 1.0, reward 0.0
Episode:    3/400 . Eps: 1.0, reward 0.0
Episode:    4/400 . Eps: 1.0, reward 1.0
Episode:    5/400 . Eps: 1.0, reward 0.0
Episode:    6/400 . Eps: 1.0, reward 0.0
Episode:    7/400 . Eps: 1.0, reward 0.0
Episode:    8/400 . Eps: 1.0, reward 0.0
Episode:    9/400 . Eps: 1.0, reward 0.0
Episode:   10/400 . Eps: 1.0, reward 0.0
Episode:   11/400 . Eps: 1.0, reward 0.0
Episode:   12/400 . Eps: 1.0, reward 0.0
Episode:   13/400 . Eps: 1.0, reward 0.0
Episode:   14/400 . Eps: 1.0, reward 0.0
Episode:   15/400 . Eps: 1.0, reward 0.0
Episode:   16/400 . Eps: 1.0, reward 0.0
Episode:   17/400 . Eps: 1.0, reward 0.0
Episode:   18/400 . Eps: 1.0, reward 0.0
Episode:   19/400 . Eps: 1.0, reward 0.0
Episode:   20/400 . Eps: 1.0, reward 0.0
Episode:   21/400 . Eps: 1.0, reward 0.0
Episode:   22/400 . Eps: 1.0, reward 0.0
Episode:   23/400 . Eps: 1.0, reward 0.0
Episode:   24/40

In [None]:
# Evaluation: run `test_episodes` episodes with the greedy policy (no
# exploration, no learning) and record the final reward of each episode.
test_wins = []
for episode in range(test_episodes):
    # reset() returns (observation, info); only the observation is needed.
    observation, _ = env.reset()
    state_arr = np.zeros(state_size)
    state_arr[int(observation)] = 1
    state = np.reshape(state_arr, [1, state_size])
    done = False
    reward = 0
    # One-hot states visited during this episode, starting state included.
    state_lst = [state]
    print('******* EPISODE ', episode, ' *******')

    for step in range(max_steps):
        action = agent.pred(state)
        new_observation, reward, terminated, truncated, _ = env.step(action)
        new_state_arr = np.zeros(state_size)
        new_state_arr[new_observation] = 1
        state = np.reshape(new_state_arr, [1, state_size])
        state_lst.append(state)

        # Stop on termination OR time-limit truncation; otherwise steps taken
        # after the time limit could still be counted as a win.
        done = terminated or truncated
        if done:
            print(reward)
            # env.render()
            break

    test_wins.append(reward)
env.close()

print(' Test mean % score= ', int(100*np.mean(test_wins)))

mean_reward = np.mean(test_wins)
std_reward = np.std(test_wins)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Summarise the run in three stacked panels: training rewards, the epsilon
# values recorded by the agent, and evaluation rewards.
fig = plt.figure(figsize=(10, 12))
# NOTE(review): presumably meant to reset styling before setting the font
# size; confirm rcParams.clear() does what is intended on the installed
# matplotlib version.
matplotlib.rcParams.clear()
matplotlib.rcParams.update({'font.size': 16})

# Derive the map side from the number of states so the title matches the
# environment actually used (16 states -> 4x4).
grid_side = int(round(np.sqrt(state_size)))

plt.subplot(311)
plt.scatter(range(len(reward_lst)), reward_lst)
plt.title(f'{grid_side}x{grid_side} Frozen Lake Result(DQN) \n \nTrain Score')
plt.ylabel('Score')
plt.xlabel('Episode')

plt.subplot(312)
# agent.epsilon_lst gets one entry per replay() call, not one per episode.
plt.scatter(range(len(agent.epsilon_lst)), agent.epsilon_lst)
plt.title('Epsilon')
plt.ylabel('Epsilon')
plt.xlabel('Replay call')

plt.subplot(313)
plt.scatter(range(len(test_wins)), test_wins)
plt.title('Test Score')
plt.ylabel('Score')
plt.xlabel('Episode')
plt.ylim((0, 1.1))

# Output file name kept unchanged so existing references to it keep working.
plt.savefig('5x5resultdqn.png', dpi=300)
plt.show()