# Lecture 7. DQN

뉴럴넷을 쓰는 아이디어는 Converge 하지 못 했기 때문에, 주목 받지 못 했었다.

하지만, DeepMind에서 두 가지 문제를 해결했다.
- Correlations between samples 
- Non-stationary targets : 타겟이 흔들린다는 의미.


__Correlations between samples__
받아오는 샘플들이 연관성이 있다는 문제.

__Non-stationary targets__  
타겟이 움직인다.  
Y^(hat) = Q pred과 Y target을 같게 만들려고 함.   
근데 이때 사용되는 theta가 Q pred과 Y target에 동일한 값이 사용됨.  
따라서 theta를 업데이트를 하게되면 Q pred이 변하고 이때 Y target도 움직이게 된다.  


## DQN's three solutions

__Go deep__  
깊은 네트워크를 사용한다.  
Layer를 늘릴수록 학습이 잘 될 것이다.  

__Capture and replay__  
- Correlations between samples
중요한 솔루션, correlation을 해결하기 위해서 experience replay를 하게됨.  
action에 따른 state를 바로 학습하지 말고 버퍼에 저장을 함. -> Store transition(pi_t, a_t, r_t, pi_t+1) in *D*  
시간이 지난 후에 버퍼에서 랜덤하게 가져와서 학습을 함. -> Sample random minibatch of transitions (pi_j, a_j, r_j, pi_j+1) from *D*  
간단한 알고리즘 이지만 학습에 매우 효과적이다.

__Separate networks: create a target network__  
- Non-stationary targets
네트워크를 하나 더 만들어서 theta와 theta_bar를 만들어서 사용함.  
pred하는 네트워크와 target에서 사용하는 네트워크를 다르게 사용함.  
그리고 Every C steps에 Q^(hat)에 Q를 복사함.

Algorithm 1: deep Q-learning with experience replay.
Initialize replay memory *D* to capacity *N*  
Initialize action-value function *Q* with random weights theta  
Initialize target action-value function *Q*^(hat) with weights theta^- = theta  
For episode = 1, *M* do  
    Initialize sequence s_1 = {x_1} and preprocessed sequence pi_1 = pi(s_1)  
    For t = 1, T do  
        With probability epsilon select a random action a_t  
        otherwise select a_t = argmax_aQ(pi(s_t),a; theta)
        Execute action a_t in emulator and observe reward r_t and image x_t+1  
        Set s_t+1 = s_t, a_t, x_t+1 and preprocess pi_t+1 = pi(s_t+1)  
        Store transition (pi_t, a_t, r_t, pi_t+1) in *D*  
        Sample random minibatch of transitions (pi_j, a_j, r_j, pi_j+1) from *D*  
        
        Set y_j = r_j                                       if episode terminates at step j+1
                  r_j + gamma\*max_a'Q^(hat)(pi_j+1, a'; theta^-)         otherwise
                  
        Perform a gradient descent step on (y_j - Q(pi_j, a_j; theta))^2 with respect to the network parameters theta
        Every C steps reset Q^(hat) = Q
    End For
End For

In [3]:
class DQN:
    """Small two-layer Q-network built on a TensorFlow 1.x graph.

    The network maps a state vector of length ``input_size`` to one Q-value
    per action (``output_size`` values) through a single tanh hidden layer.
    Neither layer has a bias term.

    Args:
        session: TensorFlow session used to run the prediction and training
            ops.
        input_size: Length of the state vector fed to the network.
        output_size: Number of Q-values (actions) the network outputs.
        name: Variable scope under which the weights are created.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        self._build_network()

    def _build_network(self, h_size=10, l_rate=1e-1):
        """Create the placeholders, weights, loss and training op.

        Args:
            h_size: Number of units in the hidden layer.
            l_rate: Learning rate passed to the Adam optimizer.
        """
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(
                tf.float32, [None, self.input_size], name="input_x")

            # First layer of weights
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))

            # Second layer of weights
            W2 = tf.get_variable("W2", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())

            # Q prediction
            self._Qpred = tf.matmul(layer1, W2)

        # We need to define the parts of the network needed for learning a
        # policy
        self._Y = tf.placeholder(
            shape=[None, self.output_size], dtype=tf.float32)

        # Loss function: mean squared error between target and prediction
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))

        # Learning
        self._train = tf.train.AdamOptimizer(
            learning_rate=l_rate).minimize(self._loss)

    def predict(self, state):
        """Return the network's Q-values for a single state.

        Args:
            state: Array-like with ``input_size`` elements.

        Returns:
            The result of running the Q-prediction op on ``state`` reshaped
            to a batch of one, i.e. shape ``[1, input_size]``.
        """
        # The placeholder expects a 2-D batch, so wrap the state in one row.
        x = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X: x})

    def update(self, x_stack, y_stack):
        """Run one training step on a batch.

        Args:
            x_stack: Batch of input states fed to the input placeholder.
            y_stack: Batch of target Q-values fed to the target placeholder.

        Returns:
            The session result for ``[loss, train_op]``.
        """
        return self.session.run([self._loss, self._train], feed_dict={
            self._X: x_stack, self._Y: y_stack})
        
        

## 2. Replay memory

In [5]:

# Illustrative fragment: `state`, `action`, `reward`, `next_state`, `done`,
# `episode`, `mainDQN` and `REPLAY_MEMORY` are expected to be defined by the
# surrounding training loop.

# Store the previous observations in replay memory
replay_buffer = deque()

# Save the experience to our buffer
replay_buffer.append((state, action, reward, next_state, done))
# Drop the oldest entry once the buffer exceeds REPLAY_MEMORY items
if len(replay_buffer) > REPLAY_MEMORY:
    replay_buffer.popleft()
    
if episode % 10 == 1: # train every 10 episodes
    # Get a random batch of experiences.
    # 50 training passes, each on 10 transitions sampled from the buffer
    for _ in range(50):
        # Minibatch works better
        minibatch = random.sample(replay_buffer, 10)
        loss, _ = simple_replay_train(mainDQN, minibatch)

ModuleNotFoundError: No module named 'collections.deque'

## 3. Train from replay memory

In [1]:
def simple_replay_train(DQN, train_batch):
    """Fit the network on one minibatch of stored transitions.

    For every ``(state, action, reward, next_state, done)`` tuple the
    network's current Q-values for ``state`` are taken as the target row,
    and only the entry of the chosen action is overwritten: with ``reward``
    for a terminal transition, otherwise with ``reward`` plus the discounted
    maximum Q-value of ``next_state``.

    Args:
        DQN: Network object providing ``input_size``, ``output_size``,
            ``predict(state)`` and ``update(x_stack, y_stack)``.
        train_batch: Iterable of transition tuples.

    Returns:
        Whatever ``DQN.update`` returns for the stacked states and targets.
    """
    inputs = np.zeros((0, DQN.input_size))
    targets = np.zeros((0, DQN.output_size))

    # Walk through the sampled transitions and build one target row each
    for state, action, reward, next_state, done in train_batch:
        q_row = DQN.predict(state)

        if not done:
            # Bootstrap from the best Q-value the network gives next_state
            bootstrap = dis * np.max(DQN.predict(next_state))
            q_row[0, action] = reward + bootstrap
        else:
            # Terminal transition: the target is just the reward
            q_row[0, action] = reward

        inputs = np.vstack((inputs, state))
        targets = np.vstack((targets, q_row))

    # One training step on the whole stacked minibatch
    return DQN.update(inputs, targets)

## Recap

1. Net-Build-init  
2. Env  
    a = ?  
    env.step(a)  
    학습을 시키지 않고 버퍼에 저장을 함.  
    random.sample() <- minibatch로 학습을 시킴.  

In [7]:
import numpy as np
import tensorflow as tf
import random
from collections import deque

import gym
# Shared Gym environment used by the training loop and by bot_play
env = gym.make('CartPole-v0')

# Constants defining our neural network
# Network input width: length of the environment's observation vector
input_size = env.observation_space.shape[0]
# Network output width: number of discrete actions
output_size = env.action_space.n

# Discount factor applied to the next state's max Q-value in
# simple_replay_train
dis = 0.9
# Maximum number of transitions kept in the replay buffer
REPLAY_MEMORY = 50000

# Code 4. bot play

In [None]:
def bot_play(mainDQN):
    """Play one rendered episode greedily with the given network.

    Resets the module-level ``env``, then repeatedly renders it, picks the
    action with the highest predicted Q-value and steps the environment
    until it reports ``done``. The accumulated reward is printed at the end.

    Args:
        mainDQN: Network object whose ``predict(state)`` returns Q-values.
    """
    observation = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        q_values = mainDQN.predict(observation)
        chosen_action = np.argmax(q_values)
        observation, reward, finished, _ = env.step(chosen_action)
        total_reward += reward

    print("Total score: {}".format(total_reward))

In [2]:
def main():
    """Train a DQN on the module-level ``env`` and then watch it play.

    Runs ``max_episodes`` episodes with an epsilon-greedy policy whose
    epsilon decays as ``1 / (episode / 10 + 1)``. Every transition is
    appended to a replay buffer capped at ``REPLAY_MEMORY`` entries. On
    episodes where ``episode % 10 == 1`` the network is trained on 50
    random minibatches of 10 transitions. After training, ``bot_play`` is
    called while the session is still open.
    """
    max_episodes = 5000

    # Store the previous observations in replay memory
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            # Exploration rate: starts at 1 and shrinks as episodes go on
            e = 1 / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # big penalty
                    reward = -100

                # Save the experience to our buffer, dropping the oldest
                # entry once the buffer exceeds REPLAY_MEMORY items
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                # Cut off episodes that run longer than 10000 steps
                if step_count > 10000:
                    break

            print("Episode: {} steps: {}".format(episode, step_count))

            if episode % 10 == 1:  # train every 10 episodes
                # Get a random batch of experiences.
                for _ in range(50):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss: ", loss)

        # Must run inside the `with` block: the network needs the open session
        bot_play(mainDQN)