In [1]:
# Mount Google Drive at ./mnt so notebooks stored in Drive can be copied into the working directory.
from google.colab import drive
drive.mount('mnt')
# Copy the installer notebook out of Drive and execute it inside this kernel.
# The captured output below shows it installing keras-rl and cloning pybullet-gym.
!cp "mnt/My Drive/Colab Notebooks/installer.ipynb" . 
%run 'installer.ipynb'

Mounted at mnt
Collecting keras-rl
[?25l  Downloading https://files.pythonhosted.org/packages/ab/87/4b57eff8e4bd834cea0a75cd6c58198c9e42be29b600db9c14fafa72ec07/keras-rl-0.4.2.tar.gz (40kB)
[K     |████████████████████████████████| 40kB 4.9MB/s 
Building wheels for collected packages: keras-rl
  Building wheel for keras-rl (setup.py) ... [?25l[?25hdone
  Created wheel for keras-rl: filename=keras_rl-0.4.2-cp36-none-any.whl size=48382 sha256=00a442e24cadfe464e2ad1946bce9e3fc84d3b73786f46ca13e921817b44153a
  Stored in directory: /root/.cache/pip/wheels/7d/4d/84/9254c9f2e8f51865cb0dac8e79da85330c735551d31f73c894
Successfully built keras-rl
Installing collected packages: keras-rl
Successfully installed keras-rl-0.4.2
Cloning into 'lib/pybullet-gym'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 746 (delta 1), reused 1 (delta 0), pack-reused 740[K
Receiving objects: 100% (746/746

In [None]:
# NOTE(review): restart_runtime is not defined in this notebook - presumably it is provided by
# installer.ipynb and restarts the Colab runtime so the newly installed packages load; confirm.
restart_runtime()

In [1]:
# Copy the shared setup notebook out of the Drive folder and execute it inside this kernel.
# The cells below use names this notebook never defines (Network, Agent, Actor_Default,
# FIFO_Buffer, test_agent, nn, F, T, np, os, gym, device, ...) - presumably they all come
# from base_setup.ipynb; confirm.
# NOTE(review): relies on the 'mnt' Drive mount from the first cell still being available.
!cp "mnt/My Drive/Colab Notebooks/base_setup.ipynb" . 
%run 'base_setup.ipynb'

In [2]:
class Critic_DDPG(Network):
    """DDPG critic: maps a batch of (state, action) pairs to one scalar Q-value each.

    Architecture (three linear layers stored in ``self.layers``):
        0. ``state_dims -> fc1_dim``, followed by ReLU
        1. ``fc1_dim + action_dims -> fc2_dim``, followed by ReLU; the action is
           concatenated to the first hidden activation before this layer
        2. ``fc2_dim -> 1``, no activation (the Q-value)

    Args:
        state_dims: Size of the state vector.
        action_dims: Size of the action vector.
        fc1_dim: Width of the first hidden layer.
        fc2_dim: Width of the second hidden layer.
        name: Base name of the checkpoint file (``<name>.h5``).
        chpt_dir: Directory holding the checkpoint file; created if missing.
    """

    def __init__(self, state_dims, action_dims, fc1_dim, fc2_dim, name='critic', chpt_dir='ddpg'):
        # NOTE(review): super(Network, self) skips Network.__init__ and runs the initialiser of
        # the class that follows Network in the MRO (presumably nn.Module). Left unchanged
        # because Network's own constructor is not visible here - confirm this is intended.
        super(Network, self).__init__()

        # Checkpoint system. exist_ok=True avoids the check-then-create race of
        # `if not os.path.exists(...): os.makedirs(...)`.
        os.makedirs(chpt_dir, exist_ok=True)
        self.checkpoint_file = os.path.join(chpt_dir, name + '.h5')

        # Neural network. Layers are created in the same order as before so that
        # parameter initialisation consumes the RNG identically.
        self.layers = nn.ModuleList([
            nn.Linear(state_dims, fc1_dim),
            nn.Linear(fc1_dim + action_dims, fc2_dim),
            # Q layer
            nn.Linear(fc2_dim, 1),
        ])

        # Moves every registered parameter (including self.layers) to the target device.
        self.to(device)

    def forward(self, state, action):
        """Return the Q-value for each (state, action) pair.

        ``state`` and ``action`` are concatenated along dim 1, so both must carry a
        leading batch dimension.
        """
        hidden = F.relu(self.layers[0](state))
        # The action joins the network only after the first hidden layer.
        hidden = F.relu(self.layers[1](T.cat([hidden, action], dim=1)))
        return self.layers[2](hidden)

In [21]:
class DDPG_Agent(Agent):
    """Deep Deterministic Policy Gradient agent with target networks and a replay buffer.

    Holds an actor/critic pair plus their target copies, an Ornstein-Uhlenbeck
    exploration-noise process and a FIFO replay buffer.

    Args:
        state_dims: Size of the state vector.
        action_dims: Size of the action vector.
        name: Prefix of the checkpoint directory (``<name>_ddpg``).
        alpha: Learning rate of the actor optimizer.
        beta: Learning rate of the critic optimizer.
        gamma: Discount factor applied to the bootstrapped target Q-value.
        tau: Interpolation factor for the soft target-network update.
        replay_size: Capacity passed to the replay buffer.
        fc1_dim: Width of the first hidden layer of every network.
        fc2_dim: Width of the second hidden layer of every network.
        batch_size: Number of transitions requested from the buffer per training step.
        theta: ``theta`` parameter of the Ornstein-Uhlenbeck noise process.
        sigma: ``sigma`` parameter of the Ornstein-Uhlenbeck noise process.
        env: Environment whose ``action_space`` provides the action bounds. Required
            despite the ``None`` default, which is kept for signature compatibility.

    Raises:
        ValueError: If ``env`` is not provided.
    """

    def __init__(self, state_dims, action_dims, name = "",
                 alpha = 1e-4, beta = 1e-3, gamma = 0.99, tau = 0.001, replay_size = pow(10, 6),
                 fc1_dim = 400, fc2_dim = 300, batch_size = 64,
                 theta = 0.15, sigma = 0.2,
                 env=None):
        # The action bounds are read from the environment, so fail early with a clear
        # message instead of an AttributeError on None further down.
        if env is None:
            raise ValueError(
                "DDPG_Agent requires `env`: the action bounds are read from env.action_space"
            )

        # Store parameters
        self.noise = OrnsteinUhlenbeckProcess(size = action_dims, theta=theta, mu=0, sigma=sigma)
        self.memory = FIFO_Buffer(replay_size, state_dims, action_dims)
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        # Only the first component of the bounds is used, i.e. every action dimension is
        # clipped to the same [min_action, max_action] interval.
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]
        self.action_dims = action_dims

        # Checkpoints folder (exist_ok avoids the exists()/makedirs() race)
        self.dir = name+"_ddpg"
        os.makedirs(self.dir, exist_ok=True)

        # Create networks
        self.actor = Actor_Default(
            state_dims, action_dims, fc1_dim, fc2_dim, self.max_action, name="actor", chpt_dir=self.dir
        )
        self.critic = Critic_DDPG(
            state_dims, action_dims, fc1_dim, fc2_dim, name="critic", chpt_dir=self.dir
        )

        self.target_actor = Actor_Default(
            state_dims, action_dims, fc1_dim, fc2_dim, self.max_action, name="target_actor", chpt_dir=self.dir
        )
        self.target_critic = Critic_DDPG(
            state_dims, action_dims, fc1_dim, fc2_dim, name="target_critic", chpt_dir=self.dir
        )

        default_network_initialization(self.actor.layers)
        default_network_initialization(self.critic.layers)
        default_network_initialization(self.target_actor.layers)
        default_network_initialization(self.target_critic.layers)

        # Create optimizers (only the critic is regularised with weight decay)
        self.actor_optimizer = T.optim.Adam(self.actor.parameters(), lr=alpha)
        self.critic_optimizer = T.optim.Adam(self.critic.parameters(), lr=beta, weight_decay=1e-2)

        # Init target networks: tau = 1 makes the targets exact copies of the online networks.
        self.update_target_networks(tau = 1)

    def update_target_networks(self, tau=None):
        """Blend the online weights into the target networks.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: Interpolation factor; falls back to ``self.tau`` when ``None``.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic, self.target_critic),
        )
        for online, target in network_pairs:
            for param, target_param in zip(online.parameters(), target.parameters()):
                target_param.data.copy_(tau * param.data + (1. - tau) * target_param.data)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def select_action(self, state):
        """Return the actor's action for ``state`` with exploration noise, clipped to bounds.

        Returns:
            A flat NumPy array of actions.
        """
        # Compute action from state. No gradient is needed for acting, so skip building
        # the autograd graph.
        state = T.tensor(state, dtype=T.float).to(device)
        with T.no_grad():
            actions = self.actor(state).detach().cpu().numpy().flatten()
        # Add noise
        actions += self.noise.sample()
        actions = np.clip(actions, self.min_action, self.max_action)

        return actions

    def train(self):
        """Run one critic update, one actor update and a soft target-network update."""
        # Sample from replay buffer.
        # NOTE(review): `not_done` is presumably (1 - done) as a float tensor - confirm in FIFO_Buffer.
        state, action, reward, next_state, not_done = self.memory.get_sameples_tensor(self.batch_size)

        # Critic update: the bootstrapped target is a constant w.r.t. the optimisation,
        # so it is computed without tracking gradients.
        # NOTE(review): assumes `reward` and `not_done` have the same shape as the critic
        # output (batch, 1); a (batch,) shape would silently broadcast to (batch, batch) - confirm.
        with T.no_grad():
            next_action = self.target_actor(next_state)
            target_Q = self.target_critic(next_state, next_action)
            target_Q = reward + self.gamma * target_Q * not_done
        current_Q = self.critic(state, action)

        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimization
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the critic's value of the actor's own actions.
        # The backward pass below also leaves gradients on the critic's parameters; they are
        # cleared by critic_optimizer.zero_grad() before the next critic step.
        actor_loss = -self.critic(state, self.actor(state)).mean()

        # Optimization
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update on target networks
        self.update_target_networks()

    def save_models(self):
        """Save all four networks via their own ``save`` methods."""
        self.actor.save()
        self.target_actor.save()
        self.critic.save()
        self.target_critic.save()

    def load_models(self):
        """Load all four networks via their own ``load`` methods."""
        self.actor.load()
        self.target_actor.load()
        self.critic.load()
        self.target_critic.load()

In [31]:
# Experiment configuration
ENV_NAME = 'InvertedPendulumPyBulletEnv-v0'
SEED = 0
TOTAL_TIMESTEPS = 100000

# Seed NumPy and PyTorch as well as the environment so the run is reproducible:
# the agent below initialises its networks with PyTorch and clips/perturbs actions with NumPy.
np.random.seed(SEED)
T.manual_seed(SEED)

# Environment
env = gym.make(ENV_NAME)
env.seed(SEED)
# Agent (smaller hidden layers than the class defaults of 400/300)
agent = DDPG_Agent(
    name = ENV_NAME,
    state_dims = env.observation_space.shape[0], env=env, action_dims=env.action_space.shape[0],
    fc1_dim = 64, fc2_dim = 64
)
# Run Test
# NOTE(review): test_agent is defined outside this notebook; the third argument is taken to
# be the total timestep budget and the meaning of the final False is not visible here - confirm.
test_agent(env, agent, TOTAL_TIMESTEPS, False)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2021-01-28 17:22:34.504340 Total T: 51134 Episode Num: 5105 Episode T: 10 Reward: 10.000
2021-01-28 17:22:35.027855 Total T: 51145 Episode Num: 5106 Episode T: 11 Reward: 11.000
2021-01-28 17:22:35.521666 Total T: 51156 Episode Num: 5107 Episode T: 11 Reward: 11.000
2021-01-28 17:22:35.880390 Total T: 51163 Episode Num: 5108 Episode T: 7 Reward: 7.000
2021-01-28 17:22:36.238897 Total T: 51171 Episode Num: 5109 Episode T: 8 Reward: 8.000
2021-01-28 17:22:36.690684 Total T: 51180 Episode Num: 5110 Episode T: 9 Reward: 9.000
2021-01-28 17:22:37.202404 Total T: 51191 Episode Num: 5111 Episode T: 11 Reward: 11.000
2021-01-28 17:22:37.729090 Total T: 51202 Episode Num: 5112 Episode T: 11 Reward: 11.000
2021-01-28 17:22:38.338967 Total T: 51215 Episode Num: 5113 Episode T: 13 Reward: 13.000
2021-01-28 17:22:38.764583 Total T: 51224 Episode Num: 5114 Episode T: 9 Reward: 9.000
2021-01-28 17:22:39.196831 Total T: 51233 Episode Num

In [32]:
from google.colab import files

# Fetch the performance log found in the agent's checkpoint directory to the local machine.
performance_log = agent.dir + '/performance.txt'
files.download(performance_log)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>