In [1]:
import pandas as pd
import numpy as np
import random
import sys
import os
import pdb
import tensorflow as tf
import matplotlib.pyplot as plt
from copy import deepcopy
import time

from keras.models import Sequential, Model
import keras.layers as layers
from keras.optimizers import Adam
from keras import backend as K
from keras.layers.merge import _Merge, Multiply
import ast
import gym

# Seed both Python's and NumPy's global generators with the same value so
# that code relying on either of them is repeatable after a kernel restart.
random_state = 0
random.seed(random_state)
np.random.seed(random_state)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Agent

In [7]:
class QLayer(_Merge):
    """
    Merge layer that combines a value stream and an advantage stream into
    Q-values. Only used by the dueling DQN model.
    """
    def _merge_function(self, inputs):
        """Expects `inputs` ordered as [value, advantage]."""
        value = inputs[0]
        advantage = inputs[1]
        # Centre the advantages around their mean along axis 1 before adding
        # the value term.
        centred_advantage = advantage - K.mean(advantage, axis=1, keepdims=True)
        return value + centred_advantage

class DQNAgent:
    """
    DQN agent trained from a fixed DataFrame of logged transitions.

    Two networks are kept: `model_1` is the online network and `model_2` the
    target network (its weights are refreshed from `model_1` every
    `copy_online_to_target_ep` epochs). Actions are chosen with `model_2`.

    The batch DataFrame is read through the columns 'state', 'action',
    'reward', 'next_state', 'done' and 'transition_id', and is annotated in
    place with 'weight', 'priority', 'pred_action' and 'pred_reward'.
    """
    def __init__(self, df_batch, state_size, action_size, 
                 minibatch_size=64, gamma=.9, lr=0.0001, units=128, hidden_layers=1,
                 dueling=False, double_param=0, priority_alpha=0,
                 copy_online_to_target_ep=100, eval_after=100):
        """
        Parameters
        ----------
        df_batch : pandas.DataFrame
            Logged transitions. Mutated in place (columns are added).
        state_size, action_size : int
            Input and output width of the Q-networks.
        minibatch_size : int
            Number of transitions sampled per learning epoch.
        gamma : float
            Discount factor applied to the bootstrapped next-state value.
        lr : float
            Adam learning rate.
        units, hidden_layers : int
            Width and number of hidden Dense layers.
        dueling : bool
            Build the dueling architecture instead of the plain one.
        double_param : float
            Per transition, a uniform draw in [0, 1) is compared with this
            value: if the draw is >= double_param, model_1 is updated (with
            model_2 evaluating the chosen action), otherwise the roles swap.
        priority_alpha : float
            Exponent applied to (|TD error| + noise) to get sampling priority.
        copy_online_to_target_ep : int
            Copy model_1 weights into model_2 every this many epochs.
        eval_after : int
            Evaluate (environment return and ECR) every this many epochs.
        """
        # Initial sampling state for prioritised replay. `.at` only supports
        # scalar access, so the columns are assigned as a whole. One uniform
        # draw per row is taken, exactly as a row-by-row loop would do.
        df_batch['weight'] = 0.0
        df_batch['priority'] = np.random.uniform(0, 0.001, size=len(df_batch)) ** priority_alpha

        # setting parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch = df_batch

        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.learning_rate = lr
        self.units = units
        self.hidden_layers = hidden_layers

        self.dueling = dueling
        self.double_param = double_param
        self.priority_alpha = priority_alpha

        self.copy_online_to_target_ep = copy_online_to_target_ep
        self.eval_after = eval_after

        # setting up the models (model_1: online, model_2: target)
        if self.dueling:
            self.model_1 = self._build_model_dueling()
            self.model_2 = self._build_model_dueling()
        else:
            self.model_1 = self._build_model()
            self.model_2 = self._build_model()

        # evaluation history, one entry per evaluation
        self.R = []
        self.ecrs = []

    def _build_model_dueling(self):
        """
        Dueling DQN model: a shared Dense trunk feeding a scalar value head
        and an `action_size`-wide advantage head, merged by `QLayer`.
        """
        inputs = layers.Input(shape=(self.state_size,))
        z = layers.Dense(self.units, kernel_initializer='glorot_normal', activation='relu')(inputs)
        for _ in range(self.hidden_layers-1):
            z = layers.Dense(self.units, kernel_initializer='glorot_normal', activation='relu')(z)

        value = layers.Dense(1, kernel_initializer='glorot_normal', activation='linear')(z)
        adv = layers.Dense(self.action_size, kernel_initializer='glorot_normal', activation='linear')(z)

        q = QLayer()([value, adv])

        model = Model(inputs=inputs, outputs=q)
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def _build_model(self):
        """
        Standard DQN model: `hidden_layers` ReLU Dense layers followed by a
        linear output with one unit per action.
        """
        model = Sequential()

        model.add(layers.Dense(self.units, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        for _ in range(self.hidden_layers-1):
            model.add(layers.Dense(self.units, activation='relu', kernel_initializer='glorot_normal'))

        model.add(layers.Dense(self.action_size, activation='linear', kernel_initializer='glorot_normal'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate), metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])
        return model

    def act(self, state):
        """
        Greedy action for a single state according to the target network.

        Returns
        -------
        (action index, Q-value of that action)
        """
        state_array = np.asarray(state).reshape(1, self.state_size)
        act_values = self.model_2.predict(state_array)
        return np.argmax(act_values[0]), np.max(act_values[0])

    def learn(self, epoch, env=None):
        """
        Run `epoch` minibatch updates, periodically syncing the target
        network and evaluating, then do a final sync + evaluation.

        Parameters
        ----------
        epoch : int
            Number of minibatch updates to run (0 is allowed).
        env : optional
            Environment passed to `run_env`; when None the return is
            recorded as 0.
        """
        for i in range(epoch):
            self._learn_minibatch()

            if (i+1)%self.copy_online_to_target_ep==0:
                self.model_2.set_weights(self.model_1.get_weights())

            if (i+1)%self.eval_after==0:
                r = self.run_env(env)
                self.R.append(r)
                ecr = self.ecr_reward()
                print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(i+1, epoch, ecr, r))

        print("--final run--")
        # Copy the weights rather than rebinding the attribute: rebinding
        # would make both names point at one network, so a later call to
        # learn() would no longer have two independent models.
        self.model_2.set_weights(self.model_1.get_weights())
        r = self.run_env(env)
        self.R.append(r)
        self.predict()
        ecr = self.ecr_reward()
        # Report `epoch` directly; the loop variable is undefined when epoch == 0.
        print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(epoch, epoch, ecr, r))

    def _q_update(self, row, online, target):
        """
        Apply one double-Q style update for a single transition.

        `online` selects the next action and is the network that gets fitted;
        `target` supplies the value of that selected action.
        """
        i = row.name
        action, reward, done = row['action'], row['reward'], row['done']
        state = row['state'].reshape(1, self.state_size)

        target_q = reward
        if not done:
            next_state = row['next_state'].reshape(1, self.state_size)
            a_prime = np.argmax(online.predict(next_state)[0])
            target_q = reward + self.gamma*target.predict(next_state)[0][a_prime]

            self.batch.at[i, 'pred_action'] = a_prime
            self.batch.at[i, 'pred_reward'] = target_q

        target_f = online.predict(state)
        # Prioritized experience replay: |TD error| plus a little noise,
        # raised to priority_alpha.
        td_error = abs(target_q - target_f[0][action])
        self.batch.loc[i, 'priority'] = (td_error + np.random.uniform(0, 0.001))**self.priority_alpha

        # Only the taken action's output is moved towards the target.
        target_f[0][action] = target_q
        online.fit(state, target_f, epochs=1, verbose=0)

    def _learn_row(self, row):
        """
        Update from one transition (a row of the batch, `row.name` being its
        index label), choosing at random which network is trained.
        """
        # One draw per transition decides which network plays the online role.
        rand = random.random()

        if rand >= self.double_param:
            self._q_update(row, online=self.model_1, target=self.model_2)
        else:
            self._q_update(row, online=self.model_2, target=self.model_1)

    def _learn_minibatch(self):
        """Sample a priority-weighted minibatch and learn from each row."""
        # For PER: normalise priorities into sampling weights.
        priority_sum = self.batch['priority'].sum()
        self.batch['weight'] = self.batch['priority']/priority_sum

        # Never ask for more rows than the batch holds (sampling is without
        # replacement).
        n_rows = min(self.minibatch_size, len(self.batch))
        minibatch = self.batch.sample(n_rows, weights=self.batch['weight'])
        # _learn_row is called purely for its side effects, so iterate
        # explicitly: exactly one call per sampled row.
        for _, row in minibatch.iterrows():
            self._learn_row(row)

    def ecr_reward(self):
        """
        Mean greedy Q-value (per the target network) over all rows whose
        'transition_id' equals 1. The value is appended to `self.ecrs` and
        returned; it is NaN when there are no such rows.
        """
        initial_states = self.batch.loc[self.batch['transition_id']==1, 'state']
        if len(initial_states) == 0:
            ecr = float('nan')
        else:
            # One batched forward pass instead of one predict() per row.
            states = np.stack([np.asarray(s).reshape(self.state_size) for s in initial_states])
            q_values = self.model_2.predict(states)
            ecr = float(np.mean(np.max(q_values, axis=1), dtype=np.float64))

        self.ecrs.append(ecr)
        return ecr

    def _predict_row(self, row):
        """
        Write the greedy action and its Q-value for one row into the batch.
        Kept for callers that annotate single rows; `predict` uses a batched
        forward pass instead.
        """
        i = row.name
        act, q = self.act(row['state'])
        self.batch.loc[i, 'pred_action'] = act
        self.batch.loc[i, 'pred_reward'] = q

    def predict(self):
        """
        Fill 'pred_action' / 'pred_reward' for every row of the batch with
        the target network's greedy action and its Q-value; returns the batch.
        """
        if len(self.batch) == 0:
            return self.batch

        states = np.stack([np.asarray(s).reshape(self.state_size) for s in self.batch['state']])
        q_values = self.model_2.predict(states)
        # Both columns are stored as float64 so they have the same dtype
        # whether or not training created them (NaN-filled) beforehand.
        self.batch['pred_action'] = np.argmax(q_values, axis=1).astype(np.float64)
        self.batch['pred_reward'] = np.max(q_values, axis=1).astype(np.float64)

        return self.batch

    def run_env(self, env):
        """
        Play one greedy episode in `env` and return the summed reward.
        Returns 0 when `env` is None. Expects `env.step` to return a
        4-tuple (next_state, reward, done, info).
        """
        if env is None:
            return 0
        state = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = self.act(state)[0]
            state, reward, done, info = env.step(action)
            total_reward += reward
        # Leave the environment reset for the next caller.
        env.reset()
        return total_reward

    def get_all_eval_df(self):
        """Return the evaluation history as a DataFrame with columns 'ECR' and 'R'."""
        return pd.DataFrame({'ECR': self.ecrs, 'R': self.R}, columns=['ECR', 'R'])


# RUN

In [3]:
# The CSV stores every state vector as the text of a Python list, so both
# state columns have to be parsed back into NumPy arrays after loading.
def _parse_state(text):
    """Turn the literal text of a list (e.g. "[0.1, 0.2]") into a NumPy array."""
    return np.array(ast.literal_eval(text))

df = pd.read_csv('../data/CartPole-v1_10k.csv')
df['state'] = df['state'].apply(_parse_state)
df['next_state'] = df['next_state'].apply(_parse_state)

# Drop terminal transitions whose delayed reward is zero.
# `dlt_lst` keeps the index labels of the dropped rows for inspection.
drop_mask = (df['done'] == True) & (df['delayed_reward'] == 0)
dlt_lst = df.index[drop_mask].tolist()

df = (
    df.drop(dlt_lst)
      .sort_values(by=['episode_id', 'transition_id'])
      .reset_index(drop=True)
)

print("Cartpole", "| Total transitions:", len(df), " | Total episodes:", len(df['episode_id'].unique()))
# Pristine copy used as the starting point of every experiment below.
org_df = df.copy()



Cartpole | Total transitions: 222307  | Total episodes: 10000


In [8]:
# Experiment sweep: for every (dataset size, reward signal, agent variant,
# seed) combination, train an agent on the logged transitions and save both
# the annotated batch and the evaluation history.
result_dir = '../results/'
env = gym.make("CartPole-v1")
epoch = 300
action_size = 2
env_name = 'cartpole'
prefix = ''

# (dueling, double_param, priority_alpha) for each agent variant.
agent_variants = [(False, 0, 0), (False, 0, 0.05), (False, 0.5, 0.05), (True, 0.5, 0.05)]

# Create the output directory up front so a missing folder is not discovered
# only after the first training run has finished.
os.makedirs(result_dir, exist_ok=True)

for ep_size in [1000, 2000, 5000, 10000]:
    df_run = org_df.copy()
    episode_ids = df_run['episode_id'].unique()
    if ep_size < len(episode_ids):
        # replace=False: np.random.choice samples WITH replacement by
        # default, which would keep fewer than `ep_size` distinct episodes.
        eps = np.random.choice(episode_ids, ep_size, replace=False)
        # reset_index returns a new frame, so the column assignment below
        # does not write into a slice of the copy.
        df_run = df_run.loc[df_run['episode_id'].isin(eps)].reset_index(drop=True)

    for reward_type in ['immediate_reward', 'delayed_reward']:
        df_run['reward'] = df_run[reward_type]

        for dueling, double_param, priority_alpha in agent_variants:
            for random_state in [0, 1, 2]:
                # The agent annotates its batch in place, so give each run
                # its own copy.
                df = df_run.copy()

                prefix = '{}_ep_size_{}_{}_dueling_{}_double_{}_priority_{}_rs_{}_'.format(
                    env_name, ep_size, reward_type, dueling, double_param,
                    priority_alpha, random_state)

                np.random.seed(random_state)
                random.seed(random_state)

                print("==" + prefix + "==")
                agent = DQNAgent(df_batch=df, state_size=len(df.iloc[0]['state']), action_size=action_size, 
                                 dueling=dueling, double_param=double_param, priority_alpha=priority_alpha,
                                 copy_online_to_target_ep=100, eval_after=100)

                agent.learn(epoch, env)

                result = agent.batch
                eval_df = agent.get_all_eval_df()
                eval_df.to_pickle(result_dir + prefix +'eval.pkl')
                result.to_pickle(result_dir + prefix +'result.pkl')
                eval_df.to_csv(result_dir + prefix +'eval.csv')
                result.to_csv(result_dir + prefix +'result.csv')

                print('==run ends==')


==cartpole_ep_size_1000_immediate_reward_dueling_False_double_0_priority_0_rs_0_==
> <ipython-input-7-1d3aa81a157e>(97)learn()
-> self.model_2.set_weights(self.model_1.get_weights())
(Pdb) c
--epoch: 100/300 | ECR: 0.94674 | R: 17.00 --
> <ipython-input-7-1d3aa81a157e>(96)learn()
-> pdb.set_trace()
(Pdb) self.model_2.get_weights()
[array([[ 0.06242941, -0.07041056,  0.09408107,  0.02832429,  0.18802254,
        -0.13183612,  0.17227326, -0.19316578, -0.17014559,  0.00067618,
         0.02535787,  0.0420009 ,  0.21917357, -0.03776241,  0.131136  ,
         0.04914583, -0.05416485,  0.17947206, -0.13702536,  0.22960597,
         0.04593443, -0.04717644, -0.20502818, -0.00510414,  0.11278215,
         0.02680336, -0.24530782, -0.01075317, -0.08669105,  0.06679098,
         0.0755036 ,  0.11978197, -0.16965832, -0.06621797, -0.13838491,
         0.02194542,  0.22545326,  0.00518569,  0.16058098, -0.04515255,
         0.0935569 , -0.1608985 , -0.04173336, -0.01755823, -0.08978215,
        -

(Pdb) self.model_1.get_weights()
[array([[ 6.38095289e-02, -4.67789173e-02,  1.41406164e-01,
         2.00513005e-02,  1.85843095e-01, -1.00220487e-01,
         1.93379626e-01, -2.41595164e-01, -2.04115510e-01,
        -4.88339039e-03,  3.56676914e-02,  7.36401752e-02,
         2.59743333e-01, -3.21201831e-02,  1.43806517e-01,
         4.85068075e-02, -4.32455018e-02,  1.85324937e-01,
        -1.50241286e-01,  2.29799956e-01,  7.21978098e-02,
        -4.93025109e-02, -2.34105051e-01, -2.17211898e-02,
         1.09122522e-01, -1.07996250e-02, -2.69993603e-01,
        -4.32920158e-02, -1.01039737e-01,  5.99408075e-02,
         7.09064230e-02,  1.25633106e-01, -1.74224287e-01,
        -1.03273809e-01, -1.45518616e-01,  5.74354082e-03,
         2.23868892e-01,  1.80591289e-02,  1.82354465e-01,
        -7.70910382e-02,  7.32620135e-02, -1.45805135e-01,
        -6.90246001e-02, -2.72831228e-02, -8.93522650e-02,
        -2.15811580e-02,  2.02729553e-01,  8.27821568e-02,
        -1.60786688e-0

(Pdb) n
> <ipython-input-7-1d3aa81a157e>(97)learn()
-> self.model_2.set_weights(self.model_1.get_weights())
(Pdb) self.model_2.get_weights()
[array([[ 0.06242941, -0.07041056,  0.09408107,  0.02832429,  0.18802254,
        -0.13183612,  0.17227326, -0.19316578, -0.17014559,  0.00067618,
         0.02535787,  0.0420009 ,  0.21917357, -0.03776241,  0.131136  ,
         0.04914583, -0.05416485,  0.17947206, -0.13702536,  0.22960597,
         0.04593443, -0.04717644, -0.20502818, -0.00510414,  0.11278215,
         0.02680336, -0.24530782, -0.01075317, -0.08669105,  0.06679098,
         0.0755036 ,  0.11978197, -0.16965832, -0.06621797, -0.13838491,
         0.02194542,  0.22545326,  0.00518569,  0.16058098, -0.04515255,
         0.0935569 , -0.1608985 , -0.04173336, -0.01755823, -0.08978215,
        -0.02531181,  0.17709911,  0.11231965, -0.14047797, -0.06091133,
        -0.02706337, -0.05170104, -0.0659349 , -0.16034363,  0.20743665,
         0.09069033,  0.1381247 ,  0.17650159, -0.08504

(Pdb) c
--epoch: 200/300 | ECR: 1.83881 | R: 40.00 --
> <ipython-input-7-1d3aa81a157e>(96)learn()
-> pdb.set_trace()
(Pdb) exit


BdbQuit: 