In [None]:
import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Sequential
import pandas as pd
import os

# Number of training episodes the main loop iterates over.
EPISODES = 30000


# DQN agent (structure taken from the CartPole DQN example)
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a target network.

    The online network (``self.model``) is trained on mini-batches drawn
    from ``self.memory``; ``self.target_model`` supplies the bootstrap
    values and is refreshed via :meth:`update_target_model`.
    """

    def __init__(self, state_size, action_size):
        """Build both networks and, if ``load_model`` is set, restore weights.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions (width of the Q output).
        """
        self.render = False
        self.load_model = True

        # Dimensions of the state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # DQN hyper-parameters
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 128
        self.train_start = 200

        # Replay memory; the deque drops the oldest sample beyond 300 entries
        self.memory = deque(maxlen=300)

        # Online network and target network share the same architecture
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Restore saved weights BEFORE syncing the target network; otherwise
        # the target network would keep its random initial weights until the
        # next explicit update_target_model() call.
        if self.load_model:
            self.model.load_weights("./save_model/4action_dqn_rainyday_best.h5")

        # Initialise the target network from the online network
        self.update_target_model()

    def build_model(self):
        """Create the Q-network: state in, one Q-value per action out.

        Returns:
            A compiled Keras ``Sequential`` model with seven 24-unit ReLU
            hidden layers and a linear output layer of ``action_size`` units.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        # Six further identical hidden layers
        for _ in range(6):
            model.add(Dense(24, activation='relu',
                            kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate`; `lr` is kept for the Keras version in use here.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: Input passed to ``self.model.predict``; the first row of
                the prediction is used.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition <s, a, r, s', done> in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self):
        """Train the online network on one random mini-batch from memory.

        Decays ``epsilon`` (down to ``epsilon_min``) once per call. Does
        nothing when the memory holds fewer than ``batch_size`` samples,
        because a full batch cannot be drawn yet.
        """
        if len(self.memory) < self.batch_size:
            return

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Draw batch_size transitions uniformly at random
        mini_batch = random.sample(self.memory, self.batch_size)

        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        actions, rewards, dones = [], [], []

        for row, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[row] = state
            actions.append(action)
            rewards.append(reward)
            next_states[row] = next_state
            dones.append(done)

        # Online network's Q-values for the current states,
        # target network's Q-values for the next states
        target = self.model.predict(states)
        target_val = self.target_model.predict(next_states)

        # Bellman optimality target: only the taken action's entry changes
        for row, (action, reward, done) in enumerate(zip(actions, rewards, dones)):
            if done:
                target[row][action] = reward
            else:
                target[row][action] = reward + self.discount_factor * (
                    np.amax(target_val[row]))

        self.model.fit(states, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)


def apply_trade_action(position, temp_reward, action, step_return):
    """Advance the position/reward bookkeeping by one time step.

    Actions: 0 = hold, 1 = sell, 2 = buy, anything else = clear.
    Positions: 0 = flat, 1 = long, any other value is treated as short.

    Args:
        position: Position held before the action.
        temp_reward: Return accumulated by the currently open position.
        action: Action chosen for this step.
        step_return: The 'return' value of the current data row.

    Returns:
        ``(position, temp_reward, reward)`` after the action. ``reward`` is
        the accumulated return released when a position is closed or
        reversed, and 0 otherwise.
    """
    reward = 0
    if position == 0:
        if action == 2:
            # Open a long position
            position = 1
            temp_reward = step_return
        elif action == 1:
            # Open a short position
            position = -1
            temp_reward = step_return * -1
        # hold / clear while flat: nothing changes
    elif position == 1:
        if action == 2 or action == 0:
            # Stay long and keep accumulating
            temp_reward += step_return
        elif action == 1:
            # Reverse long -> short: release what was accumulated
            position = -1
            reward = temp_reward
            temp_reward = step_return * -1
        else:
            # Close the long position
            position = 0
            reward = temp_reward
            temp_reward = 0
    else:
        if action == 2:
            # Reverse short -> long: release what was accumulated
            position = 1
            reward = temp_reward
            temp_reward = step_return
        elif action == 1 or action == 0:
            # Stay short and keep accumulating the negated return
            temp_reward += step_return * -1
        else:
            # Close the short position
            position = 0
            reward = temp_reward
            temp_reward = 0
    return position, temp_reward, reward


if __name__ == "__main__":
    state_size = 5
    action_size = 4

    # Create the DQN agent
    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    df_train_data = pd.read_csv("eBestEnvTrain-03.csv", names=['f_net', 'i_net', 'ff_net', 'if_net', 'return', 'action'])
    count = len(df_train_data)
    df_train_data[['f_net', 'i_net', 'ff_net', 'if_net', 'return']] = df_train_data[['f_net', 'i_net', 'ff_net', 'if_net', 'return']].apply(pd.to_numeric)

    # Make sure the output directories exist before anything is saved
    os.makedirs("./save_graph", exist_ok=True)
    os.makedirs("./save_model", exist_ok=True)

    for e in range(EPISODES):
        done = False
        score = 0
        # Reset the environment: all-zero state, flat position, first row
        state = np.reshape([0, 0, 0, 0, 0], [1, state_size])
        i = 0
        temp_reward = 0
        position = 0
        buy = 0
        sell = 0
        hold = 0
        clear = 0

        while not done:
            # Choose an action for the current state
            action = agent.get_action(state)

            # Record the chosen action in the dataframe
            df_train_data.at[i, 'action'] = action

            # Per-episode action statistics
            if action == 0:
                hold += 1
            elif action == 1:
                sell += 1
            elif action == 2:
                buy += 1
            else:
                clear += 1

            # Read the current row's features to build the next state
            f_net = df_train_data.iat[i, 0]
            i_net = df_train_data.iat[i, 1]
            ff_net = df_train_data.iat[i, 2]
            if_net = df_train_data.iat[i, 3]
            step_return = df_train_data.iat[i, 4]

            # NOTE(review): the position stored in next_state is the one held
            # BEFORE this step's action is applied - confirm that this is the
            # intended observation.
            next_state = (f_net, i_net, ff_net, if_net, position)

            position, temp_reward, reward = apply_trade_action(
                position, temp_reward, action, step_return)

            i += 1

            # The episode ends when the score drops below -200 or the data
            # runs out
            done = bool(score < -200 or i >= count)

            next_state = np.reshape(next_state, [1, state_size])

            # Penalise episodes that end before the last data row
            if done and i != count:
                reward = -1000

            # Store the sample <s, a, r, s'> in the replay memory
            agent.append_sample(state, action, reward, next_state, done)

            # Train on every time step once enough samples are collected
            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            score += reward
            state = next_state

            if done:
                # Sync the target network with the online network after
                # every episode
                agent.update_target_model()

                score = float(round(score, 1))

                # Plot the score history; clear the figure first so lines
                # from previous episodes do not pile up in memory
                scores.append(score)
                episodes.append(e)
                pylab.clf()
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/4action_dqn.png")

                # Only report episodes that scored above 1,000
                if score > 1000:
                    print("episode:", e, " step:", i, "  score:", score, " Buy:", buy, " Sell:", sell, " Hold:", hold, " Clear:", clear, " memory length:",
                          len(agent.memory), "  epsilon:", agent.epsilon)

                # Save the weights whenever the mean of the last (up to) 10
                # scores exceeds 100,000; training continues afterwards
                if np.mean(scores[-10:]) > 100000:
                    agent.model.save_weights("./save_model/4action_dqn_rainyday.h5")

    # Save the model once all episodes have run
    agent.model.save_weights("./save_model/4action_dqn_rainyday.h5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_145 (Dense)            (None, 24)                144       
_________________________________________________________________
dense_146 (Dense)            (None, 24)                600       
_________________________________________________________________
dense_147 (Dense)            (None, 24)                600       
_________________________________________________________________
dense_148 (Dense)            (None, 24)                600       
_________________________________________________________________
dense_149 (Dense)            (None, 24)                600       
_________________________________________________________________
dense_150 (Dense)            (None, 24)                600       
_________________________________________________________________
dense_151 (Dense)            (None, 24)                600       
__________

In [None]:
    # Write agent.model's current weights to the HDF5 file below.
    agent.model.save_weights("./save_model/4action_dqn_rainyday.h5")