In [None]:
import os
import copy
import math
import numbers
import numpy as np
from datetime import datetime
import warnings
from rl.random import OrnsteinUhlenbeckProcess
# OpenAI
import gym
import pybulletgym
#PyTorch
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import distributions as pyd
from torch.distributions.normal import Normal

# Fix the NumPy global RNG seed (np.random.* calls draw from this stream)
np.random.seed(0)
# Fix the Torch seed
T.manual_seed(0)
# Run on the first CUDA device when one is available, otherwise on the CPU
device = T.device("cuda:0" if T.cuda.is_available() else "cpu")
# NOTE(review): anomaly detection is a debugging aid and, per the PyTorch docs,
# slows execution - confirm it should stay enabled for full training runs
T.autograd.set_detect_anomaly(True)

In [None]:
class FIFO_Buffer:
    """Fixed-capacity transition buffer that overwrites its oldest entry once full.

    Transitions are kept in pre-allocated NumPy arrays. `cnt` counts every
    transition ever stored; the write slot is `cnt % size`, so after `size`
    stores the buffer wraps around and starts replacing the oldest entries.
    """

    def __init__(self, max_size, state_shape, actions_shape):
        """Allocate storage for `max_size` transitions.

        Args:
            max_size: Maximum number of transitions held at once.
            state_shape: Number of values per state. It is used directly as
                the second array dimension, so it must be an int.
            actions_shape: Number of values per action (an int, same reason).
        """
        # Internal management variables
        self.size = max_size
        self.cnt = 0
        # Variables to store
        self.state = np.zeros((max_size, state_shape))
        self.action = np.zeros((max_size, actions_shape))
        self.reward = np.zeros((max_size, 1))
        self.next_state = np.zeros((max_size, state_shape))
        self.not_done = np.zeros((max_size, 1))

    def store(self, state, action, reward, next_state, done):
        """Write one transition, replacing the oldest one when the buffer is full.

        `done` is stored inverted (`1 - done`) in `not_done`.
        """
        # Keep index within bounds
        index = self.cnt % self.size
        # Store transition
        self.state[index] = state
        self.reward[index] = reward
        self.action[index] = action
        self.next_state[index] = next_state
        self.not_done[index] = 1. - done
        # Update counter
        self.cnt += 1

    def get_samples(self, batch_size):
        """Draw `batch_size` stored transitions uniformly, with replacement.

        Only slots that have been written are eligible. Sampling uses the
        global NumPy RNG.

        Returns:
            Tuple `(states, actions, rewards, next_states, not_dones)` of
            NumPy arrays, each with `batch_size` rows.

        Raises:
            ValueError: Raised by `np.random.choice` when nothing has been
                stored yet and `batch_size` is non-zero.
        """
        # Filter out empty memory locations
        filled = min(self.cnt, self.size)
        batch = np.random.choice(filled, batch_size, replace=True)
        return (
            self.state[batch],
            self.action[batch],
            self.reward[batch],
            self.next_state[batch],
            self.not_done[batch],
        )

    def get_sameples_tensor(self, batch_size):
        """Same as `get_samples`, with every array converted to a float tensor on `device`.

        The misspelled name is kept for existing callers; `get_samples_tensor`
        is the correctly spelled alias.
        """
        return tuple(
            T.tensor(array, dtype=T.float).to(device)
            for array in self.get_samples(batch_size)
        )

    # Correctly spelled alias for new code; the original name stays available.
    get_samples_tensor = get_sameples_tensor

In [None]:
def default_network_initialization(layers, output_bound=0.003):
    """Re-initialise weights and biases of linear layers from a uniform distribution.

    Layers with a single output unit are drawn from
    `[-output_bound, output_bound]`; every other layer is drawn from
    `[-1/sqrt(out_features), 1/sqrt(out_features)]`.

    The single-output bound used to be `1 / sqrt(0.003)` (about 18.3), which
    produces huge initial weights; the small final-layer range of 0.003 is
    now applied directly.

    Args:
        layers: Iterable of modules exposing `out_features`, `weight` and
            `bias` (e.g. `nn.Linear`). Modified in place.
        output_bound: Half-width of the uniform range for single-output layers.
    """
    for layer in layers:
        if layer.out_features == 1:
            bound = output_bound
        else:
            # NOTE(review): the DDPG paper scales hidden layers by fan-in;
            # the existing fan-out scaling is kept here - confirm intent.
            bound = 1 / np.sqrt(layer.out_features)
        T.nn.init.uniform_(layer.weight.data, a=-bound, b=bound)
        T.nn.init.uniform_(layer.bias.data, a=-bound, b=bound)

In [None]:
class Network(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output.

    The three `nn.Linear` layers live in `self.layers` (input, hidden,
    output). The module is moved to the global `device` on construction and
    can save/restore its parameters to `<chpt_dir>/<name>.h5`.
    """

    def __init__(self, input_dims, fc1_dims, fc2_dims, output_dims, name='network', chpt_dir='unknwon'):
        """Build the layers and prepare the checkpoint location.

        Args:
            input_dims: Size of the input feature vector.
            fc1_dims: Width of the first hidden layer.
            fc2_dims: Width of the second hidden layer.
            output_dims: Size of the output vector.
            name: Base name of the checkpoint file (".h5" is appended).
            chpt_dir: Directory for the checkpoint file. It is created if
                missing; an empty string means the current directory.
        """
        super(Network, self).__init__()
        # Checkpoint system
        self.checkpoint_file = os.path.join(chpt_dir, name + '.h5')
        # os.makedirs('') raises, so an empty directory is treated as "cwd".
        if chpt_dir and not os.path.exists(chpt_dir):
            os.makedirs(chpt_dir, exist_ok=True)
        # Input layer, dense layer, output layer
        self.layers = nn.ModuleList([
            nn.Linear(input_dims, fc1_dims),
            nn.Linear(fc1_dims, fc2_dims),
            nn.Linear(fc2_dims, output_dims),
        ])

        self.to(device)

    def forward(self, input_data):
        """Return the raw (un-activated) output for `input_data`."""
        val = F.relu(self.layers[0](input_data))
        val = F.relu(self.layers[1](val))
        val = self.layers[2](val)
        return val

    def save(self):
        """Write the parameters to `self.checkpoint_file`."""
        T.save(self.state_dict(), self.checkpoint_file)

    def load(self):
        """Restore the parameters from `self.checkpoint_file`.

        Tensors are mapped onto the current `device`, so a checkpoint written
        on a GPU can be loaded on a CPU-only machine.
        """
        self.load_state_dict(T.load(self.checkpoint_file, map_location=device))

In [None]:
class Critic_Default(Network):
    """Critic network: runs the base network on the state joined with the actions."""

    def forward(self, state, actions):
        """Concatenate `state` and `actions` along dim 1 and return the base network output."""
        joint_input = T.cat((state, actions), dim=1)
        return super().forward(joint_input)

In [None]:
class Actor_Default(Network):
    """Actor network whose output is squashed by tanh and scaled by `max_action`."""

    def __init__(self, state_dims, action_dims, fc1_dims, fc2_dims, max_action, name='', chpt_dir=''):
        """Build a base network mapping `state_dims` inputs to `action_dims` outputs.

        `max_action` is the factor applied to the tanh output in `forward`.
        """
        super().__init__(
            input_dims=state_dims,
            fc1_dims=fc1_dims,
            fc2_dims=fc2_dims,
            output_dims=action_dims,
            name=name,
            chpt_dir=chpt_dir,
        )
        self.max_action = max_action

        self.to(device)

    def forward(self, input_data):
        """Return `tanh(base_network(input_data)) * max_action`."""
        squashed = T.tanh(super().forward(input_data))
        return squashed * self.max_action

In [None]:
class Agent:
    """Interface that concrete agents implement.

    Every method except `save_models` raises `NotImplementedError` until a
    subclass overrides it; `save_models` only prints a notice.
    """

    def remember(self, state, action, reward, next_state, done):
        """Record one transition. Must be overridden."""
        raise NotImplementedError()

    def select_action(self, state, eval=False):
        """Choose an action for `state`. Must be overridden."""
        raise NotImplementedError()

    def train(self):
        """Run one training update. Must be overridden."""
        raise NotImplementedError()

    def save_models(self):
        """Persist the agent's models; the base version just prints a notice."""
        print("Not Implemented")

    def load_models(self):
        """Restore the agent's models. Must be overridden."""
        raise NotImplementedError()

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

def test_agent(env, agent, n_timesteps, visualize = True, print_only = True):
    if visualize:
        env.render()
#     Plotting
    if not print_only:
        plt_timesteps = []
        plt_rewards = []

        fig, ax = plt.subplots(1)
        ax.set_xlabel('Training step')
        ax.set_ylabel('Reward')
        plt.ion()
        lp = ax.plot([],[])[0]
      
        fig.show()
        fig.canvas.draw()
#     Initialize
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    best_reward = -99999999

    f = open(agent.dir+"/performance.txt", "w+")
    f.write(str(f'Start_time {str(datetime.now())}\n'))
    f.close()
    for t in range(n_timesteps):
        episode_timesteps += 1

        action = agent.select_action(np.array(state))
        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, float(done))

        episode_reward += reward
        agent.train()
        state = next_state

        if done:
          log = f"{datetime.now()} Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
          # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
          if not print_only:
              plt_timesteps.append(t+1)
              plt_rewards.append(episode_reward)
              lp.set_xdata(plt_timesteps)
              lp.set_ydata(plt_rewards)
              ax.relim()
              ax.autoscale_view(True, True, True)
              fig.canvas.draw()
          else:
              print(log)
          f = open(agent.dir+"/performance.txt", "a")
          f.write(f'{log}\n')
          f.close()
          # Reset environment
          if reward > best_reward:
              best_reward = reward
              agent.save_models()
          
          state, done = env.reset(), False
          episode_reward = 0
          episode_timesteps = 0
          episode_num += 1
    agent.save_models()
    print(datetime.now())
    print("END")