In [3]:
import pandas as pd
import numpy as np
import random
import sys
import os
import pdb
import tensorflow as tf
import matplotlib.pyplot as plt
from copy import deepcopy
import time

from keras.models import Sequential, Model
import keras.layers as layers
from keras.optimizers import Adam
from keras import backend as K
import ast
import gym

# Seed both NumPy's legacy global RNG and the stdlib `random` module with the
# same value so that sampling done through either one is repeatable.
random_state = 0
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(random_state)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Agent

In [102]:
class DQNAgent:
    """Batch (offline) DQN agent trained from a DataFrame of logged transitions.

    Two networks are kept: ``model_1`` (online network, updated by ``fit``)
    and ``model_2`` (target network, used by ``act`` and for bootstrapped
    targets). The transition frame must provide the columns ``state``,
    ``action``, ``reward``, ``next_state`` and ``done``; ``ecr_reward``
    additionally reads ``transition_id``.

    NOTE: ``df_batch`` is stored by reference and mutated in place (columns
    ``weight``, ``priority``, ``pred_action`` and ``pred_reward`` are written).
    """

    def __init__(self, df_batch, state_size, action_size, 
                 minibatch_size=64, gamma=.9, lr=0.0001, units=128,
                 vanilla=True, dueling=False, double_param=0, priority_alpha = 0,
                 copy_online_to_target_ep=100, eval_after=100):
        """Store hyper-parameters, initialise priorities and build both networks.

        Args:
            df_batch: DataFrame of transitions (mutated in place, see class doc).
            state_size: length of one state vector.
            action_size: number of discrete actions (width of the output layer).
            minibatch_size: rows sampled per call to ``_learn_minibatch``.
            gamma: discount factor applied to the bootstrapped next-state value.
            lr: Adam learning rate.
            units: width of each hidden Dense layer.
            vanilla: when False, ``_learn_minibatch`` performs no update.
            dueling: request the dueling architecture (not implemented).
            double_param: stored but not used by this class.
            priority_alpha: exponent applied to priorities; 0 gives uniform sampling.
            copy_online_to_target_ep: sync the target network every this many epochs.
            eval_after: evaluate in the environment every this many epochs.
        """
        # Initial sampling priority: |reward| plus a small noise term, raised to
        # priority_alpha. Computed in one vectorised step; the absolute value
        # keeps a negative reward with a fractional exponent from yielding NaN.
        df_batch['weight'] = 0.0
        noise = np.random.uniform(0, 0.001, size=len(df_batch))
        rewards = np.abs(df_batch['reward'].to_numpy(dtype=float))
        df_batch['priority'] = (rewards + noise) ** priority_alpha

        # setting parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch = df_batch

        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.learning_rate = lr
        self.units = units

        self.vanilla = vanilla
        self.dueling = dueling
        self.double_param = double_param
        self.priority_alpha = priority_alpha

        self.copy_online_to_target_ep = copy_online_to_target_ep
        self.eval_after = eval_after

        # setting up the models
        build = self._build_model_dueling if self.dueling else self._build_model
        self.model_1 = build()
        self.model_2 = build()
        # Start with the target network equal to the online network.
        self._sync_target_network()

        # evaluation variables
        self.R = []      # environment returns collected by learn()
        self.ecrs = []   # values returned by ecr_reward()

    def _build_model(self):
        """
        Standard DQN model: two hidden ReLU layers and a linear output layer
        with one Q-value per action.
        """
        model = Sequential()
        # input_shape makes Keras build the weights immediately, which is
        # required for get_weights()/set_weights() before any fit/predict call.
        model.add(layers.Dense(self.units, activation='relu', kernel_initializer='glorot_normal',
                               input_shape=(self.state_size,)))
        model.add(layers.Dense(self.units, activation='relu', kernel_initializer='glorot_normal'))
        model.add(layers.Dense(self.action_size, activation='linear', kernel_initializer='glorot_normal'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate), metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])
        return model

    def _build_model_dueling(self):
        """Placeholder for the dueling architecture (TODO).

        Raises:
            NotImplementedError: always; fails loudly instead of with an
                AttributeError on a missing method.
        """
        raise NotImplementedError("dueling=True is not supported yet: the dueling model is not implemented")

    def _sync_target_network(self):
        """Copy the online network's weights into the target network.

        The weights are copied rather than rebinding ``model_2`` to ``model_1``,
        so the two networks stay distinct objects after a sync.
        """
        self.model_2.set_weights(self.model_1.get_weights())

    def act(self, state):
        """Return ``(greedy_action, q_value)`` for one state using the target network."""
        state_array = np.asarray(state).reshape(1, self.state_size)
        act_values = self.model_2.predict(state_array)
        return np.argmax(act_values[0]), np.max(act_values[0])

    def learn(self, epoch, env=None):
        """Run ``epoch`` minibatch updates with periodic target syncs and evaluation.

        Args:
            epoch: number of minibatch updates to perform.
            env: optional environment passed to ``run_env``; when None the
                reported return is 0.
        """
        for i in range(epoch):
            self._learn_minibatch()

            if (i+1)%self.copy_online_to_target_ep==0:
                self._sync_target_network()

            if (i+1)%self.eval_after==0:
                r = self.run_env(env)
                self.R.append(r)
                # ECR is only computed after the final epoch; 0 is a placeholder here.
                ecr = 0
                print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(i+1, epoch, ecr, r))

        self._sync_target_network()
        r = self.run_env(env)
        self.R.append(r)

        ecr = self.ecr_reward()
        print("--final run--")
        # Use `epoch` rather than the loop variable, which is undefined when epoch == 0.
        print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(epoch, epoch, ecr, r))

    def _learn_minibatch(self):
        """Sample one priority-weighted minibatch and fit the online network row by row.

        Rows are processed sequentially because each ``fit`` call changes
        ``model_1``, which the next row's predictions depend on.
        """
        priority_sum = self.batch['priority'].sum()
        self.batch['weight'] = self.batch['priority']/priority_sum

        minibatch = self.batch.sample(self.minibatch_size, weights=self.batch['weight'])
        for i, row in minibatch.iterrows():
            if not self.vanilla:
                # No update rule is defined for the non-vanilla setting.
                continue

            state, reward, next_state, done = row['state'], row['reward'], row['next_state'], row['done']
            action = int(row['action'])  # used as an index into the Q-value vector
            state_array = state.reshape(1, self.state_size)

            target_q = reward
            if not done:
                next_state_array = next_state.reshape(1, self.state_size)
                # Double-DQN style target: the online network picks the action,
                # the target network supplies its value.
                ns_act_values = self.model_1.predict(next_state_array)[0]
                a_prime = np.argmax(ns_act_values)

                target_ns_act_values = self.model_2.predict(next_state_array)[0]
                target_ns_q = target_ns_act_values[a_prime]

                target_q = reward + self.gamma*target_ns_q

                self.batch.at[i, 'pred_action'] = a_prime
                self.batch.at[i, 'pred_reward'] = target_q

            target_f = self.model_1.predict(state_array)
            # Prioritized experience replay: |TD error| plus noise, raised to alpha.
            self.batch.loc[i, 'priority'] = (abs(target_q - target_f[0][action]) + np.random.uniform(0, 0.001))**self.priority_alpha

            target_f[0][action] = target_q
            # Terminal transitions get 10 fitting epochs, all others 1.
            fit_epochs = 10 if done else 1
            self.model_1.fit(state_array, target_f, epochs=fit_epochs, verbose=0)

    def volatile(self, random_batch):
        """Mean change between the online network's current Q(s, a) and stored ``pred_reward``.

        NOTE: the ``random_batch`` argument is ignored (kept for call
        compatibility); 100 rows with a non-null ``pred_reward`` are always
        drawn with replacement from ``self.batch``. Returns 0.0 when no such
        row exists.
        """
        candidates = self.batch.loc[~self.batch['pred_reward'].isnull()]
        if len(candidates) == 0:
            return 0.0
        random_batch = candidates.sample(100, replace=True)

        # One batched forward pass instead of one predict call per row.
        states = np.stack(random_batch['state'].to_numpy()).reshape(len(random_batch), self.state_size)
        q_values = np.asarray(self.model_1.predict(states))
        actions = random_batch['action'].to_numpy(dtype=int)
        pred_q = q_values[np.arange(len(random_batch)), actions]
        old_q = random_batch['pred_reward'].to_numpy(dtype=float)
        return float(np.sum(pred_q - old_q) / 100)

    def ecr_reward(self, first_transition_id=1):
        """Average greedy Q-value over rows whose ``transition_id`` matches.

        Refreshes ``pred_action``/``pred_reward`` for the whole batch via
        ``predict``, appends the result to ``self.ecrs`` and returns it.

        Args:
            first_transition_id: ``transition_id`` value that selects the rows
                to average over. Defaults to 1 (the original behaviour).
                NOTE(review): if the data numbers transitions from 0, pass 0
                to evaluate episode-initial states - confirm against the dataset.

        Returns:
            The mean value, or NaN when no row matches.
        """
        self.predict(self.batch)
        selected = self.batch.loc[self.batch['transition_id'] == first_transition_id, 'pred_reward']
        ecr = float(selected.mean()) if len(selected) else float('nan')
        self.ecrs.append(ecr)
        return ecr

    def predict(self, df):
        """Write the target network's greedy action and Q-value into ``df``.

        Sets the columns ``pred_action`` and ``pred_reward`` in place and
        returns the same frame. All states go through a single batched
        forward pass.
        """
        if len(df) == 0:
            df['pred_action'] = -1
            df['pred_reward'] = -1
            return df

        states = np.stack(df['state'].to_numpy()).reshape(len(df), self.state_size)
        q_values = np.asarray(self.model_2.predict(states))
        df['pred_action'] = q_values.argmax(axis=1)
        df['pred_reward'] = q_values.max(axis=1)
        return df

    def run_env(self, env):
        """Play one greedy episode in ``env`` and return the summed reward.

        Returns 0 when ``env`` is None. Uses the classic gym API in which
        ``reset()`` returns the observation and ``step()`` returns a 4-tuple.
        """
        if env is None:
            return 0
        state = env.reset()
        total_reward = 0
        while True:
            action = self.act(state)[0]
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            state = next_state
            if done:
                # Leave the environment reset, as the original implementation did.
                state = env.reset()
                return total_reward


# RUN

In [93]:
# The CSV stores each state vector as a stringified Python list, so the text
# has to be parsed back into NumPy arrays before it can be fed to the network.
def _parse_vector(text):
    """Convert a stringified list such as "[0.1, 0.2]" into a NumPy array."""
    return np.array(ast.literal_eval(text))


df = pd.read_csv('../data/CartPole-v1_10k.csv')
df['state'] = df['state'].map(_parse_vector)
df['next_state'] = df['next_state'].map(_parse_vector)

# Drop terminal transitions that carry no delayed reward, using one boolean
# mask instead of a Python-level pass over every row.
is_unrewarded_terminal = (df['done'] == True) & (df['delayed_reward'] == 0)
df = df.loc[~is_unrewarded_terminal]
df = df.sort_values(by=['episode_id', 'transition_id'])
df = df.reset_index(drop=True)

print("Cartpole", "| Total transitions:", len(df), " | Total episodes:", len(df['episode_id'].unique()))
df



Cartpole | Total transitions: 222307  | Total episodes: 10000


Unnamed: 0,episode_id,transition_id,state,action,immediate_reward,delayed_reward,done,next_state,info
0,1,0,"[-0.022543404416445246, 0.03232884247292103, 0...",0,1.0,0.0,False,"[-0.021896827566986826, -0.16306397071080267, ...",{}
1,1,1,"[-0.021896827566986826, -0.16306397071080267, ...",1,1.0,0.0,False,"[-0.02515810698120288, 0.03176435710102907, 0....",{}
2,1,2,"[-0.02515810698120288, 0.03176435710102907, 0....",1,1.0,0.0,False,"[-0.024522819839182298, 0.2264861821961012, 0....",{}
3,1,3,"[-0.024522819839182298, 0.2264861821961012, 0....",1,1.0,0.0,False,"[-0.019993096195260275, 0.42119080133853226, 0...",{}
4,1,4,"[-0.019993096195260275, 0.42119080133853226, 0...",1,1.0,0.0,False,"[-0.01156928016848963, 0.615967181288758, 0.01...",{}
...,...,...,...,...,...,...,...,...,...
222302,10000,3,"[-0.029725992124671002, 0.6073723062537161, -0...",1,1.0,0.0,False,"[-0.01757854599959668, 0.8032876496052292, -0....",{}
222303,10000,4,"[-0.01757854599959668, 0.8032876496052292, -0....",1,1.0,0.0,False,"[-0.0015127930074920956, 0.9993581291554666, -...",{}
222304,10000,5,"[-0.0015127930074920956, 0.9993581291554666, -...",1,1.0,0.0,False,"[0.01847436957561724, 1.195582202625824, -0.13...",{}
222305,10000,6,"[0.01847436957561724, 1.195582202625824, -0.13...",1,1.0,0.0,False,"[0.04238601362813372, 1.3919096434474165, -0.1...",{}


In [103]:
# Train on the delayed reward signal: expose it under the `reward` column name
# that DQNAgent reads.
df['reward'] = df['delayed_reward']
epoch = 5000

# Agent options collected in one place; the values match the keyword
# arguments of the DQNAgent constructor.
agent_options = dict(
    action_size=2,
    vanilla=True,
    dueling=False,
    double_param=0,
    priority_alpha=0.05,
    copy_online_to_target_ep=100,
    eval_after=100,
)
first_state = df.iloc[0]['state']
agent = DQNAgent(df_batch=df, state_size=len(first_state), **agent_options)

# The live environment is only used for the periodic evaluation inside learn().
env = gym.make("CartPole-v1")
agent.learn(epoch, env)

--epoch: 100/5000 | ECR: 0.00000 | R: 9.00 --
--epoch: 200/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 300/5000 | ECR: 0.00000 | R: 15.00 --
--epoch: 400/5000 | ECR: 0.00000 | R: 13.00 --
--epoch: 500/5000 | ECR: 0.00000 | R: 66.00 --
--epoch: 600/5000 | ECR: 0.00000 | R: 195.00 --
--epoch: 700/5000 | ECR: 0.00000 | R: 9.00 --
--epoch: 800/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 900/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1000/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1100/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1200/5000 | ECR: 0.00000 | R: 93.00 --
--epoch: 1300/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1400/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1500/5000 | ECR: 0.00000 | R: 9.00 --
--epoch: 1600/5000 | ECR: 0.00000 | R: 8.00 --
--epoch: 1700/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 1800/5000 | ECR: 0.00000 | R: 9.00 --
--epoch: 1900/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 2000/5000 | ECR: 0.00000 | R: 10.00 --
--epoch: 2100/5000 | ECR: 0.00000 | R: 9.00 --
--epoc