In [3]:
import pandas as pd
import numpy as np
import random
import sys
import os
import pdb

import matplotlib.pyplot as plt
from copy import deepcopy
from datetime import datetime

from keras.models import Sequential, Model
import keras.layers as layers
from keras.optimizers import Adam
from keras import backend as K

# Reproducibility: seed Python's and NumPy's global generators with one shared value.
random_state = 0
random.seed(random_state)
np.random.seed(random_state)

# Load the recorded CartPole transitions and report how much data there is.
df = pd.read_csv('../data/CartPole-v1_5k.csv')
print(
    "Cartpole",
    "| Total transitions:", df.shape[0],
    " | Total episodes:", len(df['episode_id'].unique()),
)
df


Cartpole | Total transitions: 116155  | Total episodes: 5001


Unnamed: 0,episode_id,transition_id,current state,action,reward,delayed_reward,done,next_state,info
0,1,1,[-0.04257099 -0.02952321 0.01411527 -0.03864165],1,1.0,0.0,False,[-0.04316146 0.16539351 0.01334243 -0.32683786],{}
1,1,2,[-0.04316146 0.16539351 0.01334243 -0.32683786],0,1.0,0.0,False,[-0.03985359 -0.02991583 0.00680568 -0.02997736],{}
2,1,3,[-0.03985359 -0.02991583 0.00680568 -0.02997736],1,1.0,0.0,False,[-0.0404519 0.16510786 0.00620613 -0.32050528],{}
3,1,4,[-0.0404519 0.16510786 0.00620613 -0.32050528],1,1.0,0.0,False,[-3.71497462e-02 3.60140882e-01 -2.03977347e-...,{}
4,1,5,[-3.71497462e-02 3.60140882e-01 -2.03977347e-...,1,1.0,0.0,False,[-0.02994693 0.55526568 -0.01242847 -0.90397175],{}
...,...,...,...,...,...,...,...,...,...
116150,5001,19,[ 0.0678786 0.05469085 -0.17322402 -0.5183969 ],0,1.0,0.0,False,[ 0.06897241 -0.13762336 -0.18359195 -0.28491633],{}
116151,5001,20,[ 0.06897241 -0.13762336 -0.18359195 -0.28491633],1,1.0,0.0,False,[ 0.06621995 0.05957742 -0.18929028 -0.62941968],{}
116152,5001,21,[ 0.06621995 0.05957742 -0.18929028 -0.62941968],0,1.0,0.0,False,[ 0.06741149 -0.13246942 -0.20187867 -0.40181452],{}
116153,5001,22,[ 0.06741149 -0.13246942 -0.20187867 -0.40181452],0,-1.0,0.0,True,[ 0.06476211 -0.32424159 -0.20991496 -0.17894814],{}


In [4]:
class DQNAgent:
    def __init__(self, df_batch, state_size, action_size, 
                 minibatch_size=32, gamma=.9, lr=0.0001, units=128,
                 dueling=False, double_param=0, priority_aplha = 0,
                 copy_online_to_target_ep=100, eval_after=100):
        
        # setting parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch = df_batch
        
        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.learning_rate = lr
        self.units = units
        
        self.dueling = dueling
        self.double_param = double_param
        self.priority_aplha = priority_aplha
        
        self.copy_online_to_target_ep = copy_online_to_target_ep
        self.eval_after = eval_after
        
        
        # setting up the models
        if self.dueling:
            # TODO
            self.model_1 = self._build_model_dueling()
            self.model_2 = self._build_model_dueling()
        else:
            self.model_1 = self._build_model()
            self.model_2 = self._build_model()
        
        # evaluation variables
        self.R = []
        self.ecrs = []
        
    def _build_model(self):
        """
        Standard DQN model

        Two hidden ReLU Dense layers of `self.units` units followed by a
        linear layer with `self.action_size` outputs, compiled with an MSE
        loss, the Adam optimizer at `self.learning_rate`, and RMSE / MAE
        metrics.

        Returns
        -------
        A freshly initialised, compiled Keras `Sequential` model.
        """
        # Imported locally because the notebook never binds the name `tf`; the
        # old `tf.keras.metrics...` reference raised NameError. Using the metric
        # from the same `keras` package as the layers also avoids mixing two
        # Keras implementations in one model.
        from keras.metrics import RootMeanSquaredError

        model = Sequential()
        
        model.add(layers.Dense(self.units, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        model.add(layers.Dense(self.units, activation='relu', kernel_initializer='glorot_normal'))
        
        model.add(layers.Dense(self.action_size, activation='linear', kernel_initializer='glorot_normal'))
        # `learning_rate` replaces the deprecated `lr` keyword, which newer
        # Keras releases reject.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate), metrics=[RootMeanSquaredError(), 'mae'])
        return model
    
    def learn(self, epoch, env=None):
        for i in range(epoch):
            self._learn_minibatch()
            
            if (i+1)%self.copy_online_to_target_ep==0:
                self.model_2 = self.model_1
            
            if (i+1)%self.eval_after==0:
                r = self.run_env(env)
                self.R.append(r)
                
                ecr = self.ecr_reward()
                print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(i+1, epoch, ecr, r))
        
        self.model_2 = self.model_1
        r = self.run_env(env)
        self.R.append(r)

        ecr = self.ecr_reward()
        print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(i+1, epoch, ecr, r))
    
    
    def _learn_minibatch(self):
        priority_sum = self.batch['priority'].sum()
        self.batch['weight'] = self.batch['priority']/priority_sum
        
        minibatch = self.batch.sample(self.minibatch_size, weights=self.batch['weight'])
        
        for i, row in minibatch.iterrows():
            
            state, action, reward, next_state, done = row['state'], row['action'], row['reward'], row['next_state'], row['done']
            
            target_q = reward
            
            state_array = row['state_array']
            next_state_array = row['next_state_array']
            
            model_1 = True
            # double_param==0 means Regular DQN with model_2 as Target
            if np.random.rand() < self.double_param:
                model_1 = False
                
            if not done:    
                # double Q learning
                if model_1:
                    if self.mode=="normal":
                        ns_act_values = self.model_1.predict(next_state_array)[0]
                        a_prime = np.argmax(ns_act_values)
                    elif self.mode=="random":
                        a_prime = np.random.choice(range(self.action_size))
                    else:
                        a_prime = int(self.mode)
                    a_prime = self._filter_bcq(row, a_prime)
                    target_ns_act_values = self.model_2.predict(next_state_array)[0]
                    target_ns_q = target_ns_act_values[a_prime]
                else:
                    if self.mode=="normal":
                        ns_act_values = self.model_1.predict(next_state_array)[0]
                        a_prime = np.argmax(ns_act_values)
                    elif self.mode=="random":
                        a_prime = np.random.choice(range(self.action_size))
                    else:
                        a_prime = int(self.mode)
                    a_prime = self._filter_bcq(row, a_prime)
                    target_ns_act_values = self.model_1.predict(next_state_array)[0]
                    target_ns_q = target_ns_act_values[a_prime]                
                
                target_q = reward + self.gamma*target_ns_q
                
                self.batch.at[i, 'pred_action'] = a_prime
                self.batch.at[i, 'pred_reward'] = target_q
                
            if model_1:
                target_f = self.model_1.predict(state_array)
                
                # Prioritized Experience Reply with noise
                self.batch.loc[i, 'priority'] = (abs(target_q - target_f[0][action]) + np.random.uniform(0, 0.001))**self.priority_aplha

                target_f[0][action] = target_q
                self.model_1.fit(state_array, target_f, epochs=1, verbose=0)
            else:
                target_f = self.model_2.predict(state_array)
                
                # Prioritized Experience Reply with noise
                self.batch.loc[i, 'priority'] = (abs(target_q - target_f[0][action]) + np.random.uniform(0, 0.001))**self.priority_aplha

                target_f[0][action] = target_q
                self.model_2.fit(state_array, target_f, epochs=1, verbose=0) 
    
    
    
    # TODO also, implement a predict loop first
    def ecr_reward(self):
        reward = 0.0
        count = 0
        for i, row in df_test.loc[df_test['transition_number']==0].iterrows():
            state_array = row['state_array']
            next_state_array = row['next_state_array']
                
            reward += self.act(state_array)[1]
            count += 1
            
        ecr = reward/count
        self.ecrs.append(ecr)
        return ecr
    
    # TODO
    def run_env(self, env):
        if env is None:
            return 0
        state = env.reset()
        while True:
            state_array = np.array(state).reshape(1, self.state_size)
            
            action = self.act(state_array)[0]
            next_state, reward, done, info = env.step(action)
            state = next_state
            if done:
                state = env.reset()
                prevs = []
                for j in range(self.lookback):
                    prevs.append((0,) * (self.state_size))
                return reward
