In [1]:
import gym
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.losses import mean_squared_error
import keras.backend as K

from collections import deque

import matplotlib.pyplot as plt

import os
import datetime
import pickle

Using TensorFlow backend.


In [10]:
class ProgressViewer():
    """Collect per-episode training statistics, spill them to disk and plot them.

    Rewards, epsilons and losses are buffered in three parallel lists. When a
    log directory is in use, every ``_FLUSH_SIZE`` episodes the buffers are
    pickled into a new file in that directory and emptied, so memory use
    stays bounded. Without a log directory
    (``create_new_directory=False``) everything simply stays in memory.
    """

    # Extension of the pickle files written by ``__save_array``.
    _LOG_SUFFIX = '.visualiser'
    # Number of buffered episodes that triggers a flush to disk.
    _FLUSH_SIZE = 100

    def __init__(self, create_new_directory = True):
        """
        create_new_directory: when true, a fresh log directory is created in
        the current working directory and used for flushing the buffers.
        """
        self.losses = []
        self.rewards = []
        self.epsilons = []

        # Running index of the next chunk file; keeps file names unique and
        # sortable in the order they were written.
        self.__chunks_saved = 0
        # ``None`` means "no log directory": nothing is ever flushed.
        self.__path = self.__make_dir() if create_new_directory else None

    def add(self, reward, epsilon, loss):
        """Record one episode and flush the buffers to disk once they are full."""
        self.add_reward(reward)
        self.add_epsilon(epsilon)
        self.add_loss(loss)

        # Only flush when there is a directory to flush into; otherwise the
        # data would be lost (or, previously, an AttributeError was raised).
        if self.__path is not None and len(self.rewards) >= self._FLUSH_SIZE:
            self.__save_array()
            # Rebind instead of clearing so lists handed out earlier by the
            # getters are not emptied behind the caller's back.
            self.rewards = []
            self.epsilons = []
            self.losses = []

    def add_reward(self, value):
        """Append a single reward to the in-memory buffer."""
        self.rewards.append(value)

    def add_loss(self, value):
        """Append a single loss value to the in-memory buffer."""
        self.losses.append(value)

    def add_epsilon(self, value):
        """Append a single epsilon value to the in-memory buffer."""
        self.epsilons.append(value)

    def get_rewards(self):
        """Return the rewards currently held in memory (not the flushed ones)."""
        return self.rewards

    def get_epsilons(self):
        """Return the epsilons currently held in memory (not the flushed ones)."""
        return self.epsilons

    @staticmethod
    def _window(values, begin, end):
        """Return ``values[begin:end]``; ``end == -1`` means "up to the very end"."""
        stop = None if end == -1 else end
        return values[begin:stop]

    @staticmethod
    def _normalise(values):
        """Scale ``values`` by their maximum; empty or zero-peak input is returned unscaled."""
        arr = np.asarray(values, dtype=float)
        if arr.size == 0:
            return arr
        peak = np.max(arr)
        return arr / peak if peak != 0 else arr

    def plot_rewards(self, begin = 0, end = -1):
        """Plot the buffered rewards in the index window ``[begin, end)``."""
        plt.figure(figsize=(20,10))
        plt.plot(self._window(self.rewards, begin, end))

    def plot_epsilons(self, begin = 0, end = -1):
        """Plot the buffered epsilons in the index window ``[begin, end)``."""
        plt.figure(figsize=(20,10))
        plt.plot(self._window(self.epsilons, begin, end))

    def plot_lossess(self, begin = 0, end = -1):
        """Plot the buffered losses in the index window ``[begin, end)``."""
        plt.figure(figsize=(20,10))
        plt.plot(self._window(self.losses, begin, end))

    def mean_rewards(self, mean = 10):
        """Return the averages of consecutive, non-overlapping windows of
        ``mean`` buffered rewards. A trailing incomplete window is dropped.

        Raises ValueError if ``mean`` is smaller than 1.
        """
        if mean < 1:
            raise ValueError('mean must be a positive integer')
        # ``start`` already advances by ``mean``, so it is used directly as
        # the slice offset.
        return [float(np.mean(self.rewards[start:start + mean]))
                for start in range(0, len(self.rewards) - mean + 1, mean)]

    def plot_mean_rewards(self, mean = 10):
        """Plot the window averages computed by ``mean_rewards``."""
        plt.figure(figsize=(20,10))
        plt.plot(self.mean_rewards(mean))

    def plot(self, path = None):
        """
        Draw the reward, loss and epsilon curves in separate panels and all
        three, each scaled by its own maximum, in one shared panel.

        path: log directory to read. When omitted, this viewer's own log
        directory (if any) is read and the episodes still buffered in memory
        are appended, so the most recent data is shown as well.
        """
        # Files store [rewards, epsilons, losses] -- keep that order here.
        rewards, epsilons, losses = self.__load_array(path)
        if path is None:
            rewards = rewards + self.rewards
            epsilons = epsilons + self.epsilons
            losses = losses + self.losses

        grid = plt.GridSpec(2, 3)

        plt.figure(figsize=(20,10))

        q1 = plt.subplot(grid[0,0])
        q2 = plt.subplot(grid[0,1])
        q3 = plt.subplot(grid[0,2])
        q4 = plt.subplot(grid[1,:3])

        q1.plot(rewards, color = 'coral', label = 'reward')
        q1.legend()

        q2.plot(losses, color = 'coral', label = 'loss')
        q2.legend()

        q3.plot(epsilons, color = 'coral', label = 'epsilon')
        q3.legend()

        q4.plot(self._normalise(rewards), color = 'coral', label = 'reward')
        q4.plot(self._normalise(losses), color = 'skyblue', label = 'loss')
        q4.plot(self._normalise(epsilons), color = 'grey', label = 'epsilon')
        q4.legend()

    def __save_array(self):
        """
        Pickle the three buffers as ``[rewards, epsilons, losses]`` into a
        new file inside the log directory so they can be dropped from memory.
        """
        stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        # The zero-padded counter comes first so that sorting by name yields
        # the order of writing, independent of the wall clock.
        filename = '{:08d}_{}{}'.format(self.__chunks_saved, stamp, self._LOG_SUFFIX)
        with open(os.path.join(self.__path, filename), 'wb') as file:
            pickle.dump([self.rewards,
                         self.epsilons,
                         self.losses
                         ],
                        file)
        self.__chunks_saved += 1

    def __make_dir(self):
        """
        Create the directory that receives all log files and return its name.

        The name carries the current date and time to the minute; if that
        directory already exists (e.g. a second viewer created within the
        same minute) a numeric suffix is appended.
        """
        base = 'viewer_log_'+str(datetime.datetime.now())[:16].replace(':','-')
        path = base
        attempt = 0
        while True:
            try:
                os.mkdir(path)
                return path
            except FileExistsError:
                attempt += 1
                path = '{}_{}'.format(base, attempt)

    def __load_array(self, path = None):
        """
        Concatenate every log file of a directory into single reward,
        epsilon and loss lists (used for plotting).

        path: directory to read; defaults to this viewer's own log directory.
        Returns ``(rewards, epsilons, losses)``; all three are empty when
        there is no directory or it holds no log files.

        Files are read in name order. Files written by older versions of
        this class are named by time of day only, so their order is only
        chronological within one day.
        """
        path_to_log = self.__path if path is None else path

        rewards, epsilons, losses = [], [], []
        if path_to_log is None:
            return rewards, epsilons, losses

        for name in sorted(os.listdir(path_to_log)):
            if not name.endswith(self._LOG_SUFFIX):
                continue
            with open(os.path.join(path_to_log, name), 'rb') as f:
                # NOTE: pickle executes arbitrary code on load -- only point
                # this at log directories you created yourself.
                chunk = pickle.load(f)
            rewards.extend(chunk[0])
            epsilons.extend(chunk[1])
            losses.extend(chunk[2])

        return rewards, epsilons, losses

In [17]:
class DQN:
    """Q-learning agent with a replay memory, an online network and a target network.

    The online network (``model``) is fitted on replayed transitions; the
    target network (``target_model``) supplies the value estimates used to
    build the training targets and is refreshed by ``target_train``.
    """

    def __init__(self, env, gamma=0.85, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.9995, learning_rate=0.001, tau=0.95,
                 memory_size=2000):
        """
        env: environment exposing ``observation_space.shape`` and
            ``action_space.n`` / ``action_space.sample()``.
        gamma: discount applied to the estimated future value.
        epsilon: initial probability of taking a random action.
        epsilon_min: lower bound for ``epsilon``.
        epsilon_decay: factor ``epsilon`` is multiplied by on every ``act`` call.
        learning_rate: learning rate handed to the Adam optimizer.
        tau: share of the online weights blended into the target network by
            ``target_train`` (1 copies them completely, 0 copies nothing).
        memory_size: maximum number of transitions kept for replay.
        """
        self.env     = env
        self.memory  = deque(maxlen=memory_size)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.tau = tau

        # Network that learns the value function from interaction with the environment.
        self.model        = self.create_model()
        # Network that predicts the value function when building targets.
        self.target_model = self.create_model()

        # Loss of the most recent fit, kept for monitoring (0 until the first fit).
        self.history = 0

    def create_model(self):
        """Build and compile a dense network mapping a state to one value per action."""
        model   = Sequential()
        state_shape  = self.env.observation_space.shape
        model.add(Dense(24, input_dim=state_shape[0], activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        # Linear output layer: one unit per discrete action.
        model.add(Dense(self.env.action_space.n))
        model.compile(loss=mean_squared_error,
            optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        """Decay epsilon, then return a random action with probability epsilon
        and otherwise the action the online network rates highest for ``state``."""
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay memory (oldest entries fall out when full)."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size=32):
        """Fit the online network on ``batch_size`` randomly drawn transitions.

        Does nothing while the memory holds fewer than ``batch_size``
        transitions. Each transition is fitted on its own; ``self.history``
        ends up holding the loss of the last fit.
        """
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        for state, action, reward, new_state, done in samples:
            # Start from the target network's estimates so only the entry of
            # the action actually taken is changed.
            target = self.target_model.predict(state)
            if done:
                # No future value after a terminal transition.
                target[0][action] = reward
            else:
                Q_future = np.max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            history = self.model.fit(state, target, epochs=1, verbose=0)
            self.history = history.history['loss'][0]

    def target_train(self):
        """Blend the online weights into the target network:
        ``target = tau * online + (1 - tau) * target`` for every weight array."""
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [online * self.tau + target * (1 - self.tau)
                   for online, target in zip(weights, target_weights)]
        self.target_model.set_weights(blended)

In [18]:
# Experiment setup: environment, loop limits, agent and statistics logger.
env     = gym.make("CartPole-v1")
# NOTE(review): `gamma` and `epsilon` below are not read by any code visible
# in this file -- DQN sets its own self.gamma / self.epsilon. Confirm they
# are not used by another cell before removing them.
gamma   = 0.9
epsilon = .95

trials  = 1000000   # number of episodes the training loop iterates over
trial_len = 500     # upper bound on environment steps within one episode

# Number of environment steps between two dqn_agent.target_train() calls.
updateTargetNetwork = 500
dqn_agent = DQN(env=env)
# True -> the viewer creates a fresh log directory in the working directory.
viewer = ProgressViewer(True)

In [None]:
# Training loop: play episodes, learn from replayed transitions after every
# step and refresh the target network every `updateTargetNetwork` steps.
update_step_counter = 0
# Rolling window for the progress print-out. The viewer empties its own
# reward list every time it flushes to disk, so it cannot be used to average
# over the last ten episodes.
recent_rewards = deque(maxlen=10)
for trial in range(trials):
    # reshape(1, -1) adds the batch axis the network expects without
    # hard-coding the observation size.
    cur_state = env.reset().reshape(1, -1)
    total_episode_reward = 0

    for step in range(trial_len):
        action = dqn_agent.act(cur_state)

        new_state, reward, done, _ = env.step(action)
        # Accumulate the environment's own reward, before shaping.
        total_episode_reward += reward

        # Penalise the transition that ends the episode.
        # NOTE(review): this also penalises episodes that end only because
        # the environment's step limit was reached -- confirm that is intended.
        reward = reward if not done else -20

        new_state = new_state.reshape(1, -1)
        dqn_agent.remember(cur_state, action, reward, new_state, done)

        dqn_agent.replay()       # fits the online model on replayed transitions
        update_step_counter += 1
        if update_step_counter == updateTargetNetwork:
            dqn_agent.target_train()   # blends online weights into the target model
            update_step_counter = 0

        cur_state = new_state
        if done:
            break

    # Log every episode, including those that ran out of steps without
    # `done`, using the accumulated reward rather than the zero-based step
    # index (which is one less than the number of steps played).
    viewer.add(total_episode_reward, dqn_agent.epsilon, dqn_agent.history)
    recent_rewards.append(total_episode_reward)
    if trial % 10 == 0:
        print('trial:{}, mean_reward:{}'.format(trial, np.mean(recent_rewards)))

trial:0, mean_reward:24.0
trial:10, mean_reward:22.3
trial:20, mean_reward:21.8
trial:30, mean_reward:13.5
trial:40, mean_reward:23.0
trial:50, mean_reward:19.7
trial:60, mean_reward:13.1
trial:70, mean_reward:17.9
trial:80, mean_reward:21.4
trial:90, mean_reward:15.9
trial:100, mean_reward:12.0
trial:110, mean_reward:25.9
trial:120, mean_reward:22.6
trial:130, mean_reward:30.9
trial:140, mean_reward:29.3
trial:150, mean_reward:19.4
trial:160, mean_reward:12.6
trial:170, mean_reward:26.0
trial:180, mean_reward:44.6
trial:190, mean_reward:51.2
trial:200, mean_reward:28.0
trial:210, mean_reward:112.3
trial:220, mean_reward:138.2
trial:230, mean_reward:181.1
trial:240, mean_reward:158.3
trial:250, mean_reward:193.3
trial:260, mean_reward:173.2
trial:270, mean_reward:159.8
trial:280, mean_reward:117.8
trial:290, mean_reward:117.1
trial:300, mean_reward:42.0
trial:310, mean_reward:119.4
trial:320, mean_reward:110.3
trial:330, mean_reward:64.9
trial:340, mean_reward:81.1
trial:350, mean_rewa