In [1]:
import os
import time
import gym
import argparse
from tensorboardX import SummaryWriter
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import experience, utils, common
from model import Actor_Discrete, Critic, AgentA2C_Discrete

In [2]:
# --- Task -------------------------------------------------------------------
# Environment id and the test reward above which training stops ("Solved!").
ENV_NAME, STOP_REWARD = "MountainCar-v0", -120
# ENV_NAME, STOP_REWARD = "CartPole-v0", 190
# ENV_NAME, STOP_REWARD = "LunarLander-v2", 190

# --- Optimisation and schedule ----------------------------------------------
GAMMA = 0.99            # discount factor
LR = 0.0005             # critic learning rate
LR_RATIO = 5            # crt_lr / act_lr: the actor is trained at LR / LR_RATIO
TEST_ITERS = 50_000     # evaluate the policy every this many environment steps
MAX_STEPS = 5_000_000   # hard cap on environment steps

# --- PPO --------------------------------------------------------------------
GAE_LAMBDA = 0.95       # lambda of the generalised advantage estimator
TRAJECTORY_SIZE = 256   # transitions collected before each round of updates
BATCH_SIZE = 64 // 2    # minibatch size used inside an update round
PPO_EPS = 0.2           # clipping range for the probability ratio
PPO_EPOCHES = 4         # passes over each collected trajectory
ENTROPY_BONUS = 0.001   # weight of the entropy term in the policy loss
HID_SIZE = 200          # hidden-layer width of the actor and critic networks

In [3]:
def calc_adv_ref(trajectory, net_crt, states_v, device="cpu", gamma=None, gae_lambda=None):
    """
    Compute GAE advantages and critic reference values for a trajectory.

    The last trajectory entry is only used to bootstrap the value of the
    step before it, so both returned tensors have ``len(trajectory) - 1``
    elements.

    :param trajectory: list of 1-tuples ``(exp,)``; each ``exp`` exposes
        ``reward`` and ``done``
    :param net_crt: critic network, called once on ``states_v``
    :param states_v: states tensor, one row per trajectory entry
    :param device: device the returned tensors are moved to
    :param gamma: discount factor; defaults to the module-level ``GAMMA``
    :param gae_lambda: GAE smoothing factor; defaults to the module-level
        ``GAE_LAMBDA``
    :return: tuple ``(adv_v, ref_v)`` of 1-D float tensors with the
        advantages and the reference values (advantage + value)
    """
    if gamma is None:
        gamma = GAMMA
    if gae_lambda is None:
        gae_lambda = GAE_LAMBDA

    # The values are only consumed as plain numbers, so skip building an
    # autograd graph. reshape(-1) keeps the array 1-D even for a single row
    # (a bare squeeze() would collapse it to 0-d and break the slicing below).
    with torch.no_grad():
        values = net_crt(states_v).reshape(-1).detach().cpu().numpy()

    last_gae = 0.0
    result_adv = []
    result_ref = []
    # Walk the trajectory backwards so each step can reuse the accumulated
    # advantage of the step that follows it.
    for val, next_val, (exp,) in zip(
        reversed(values[:-1]), reversed(values[1:]), reversed(trajectory[:-1])
    ):
        if exp.done:
            # Episode ended here: no bootstrapping, and the accumulator restarts.
            delta = exp.reward - val
            last_gae = delta
        else:
            delta = exp.reward + gamma * next_val - val
            last_gae = delta + gamma * gae_lambda * last_gae
        result_adv.append(last_gae)
        result_ref.append(last_gae + val)  # advantage + value = q_value

    adv_v = torch.FloatTensor(list(reversed(result_adv)))
    ref_v = torch.FloatTensor(list(reversed(result_ref)))
    return adv_v.to(device), ref_v.to(device)

In [4]:
# All tensors and networks in this notebook live on the CPU.
device = torch.device("cpu")

# Checkpoint directory: saves/ppo-<ENV_NAME>, created if it does not exist yet.
save_path = os.path.join("saves", f"ppo-{ENV_NAME}")
os.makedirs(save_path, exist_ok=True)

# Two independent instances of the same environment: `env` feeds the
# experience source, `test_env` is used for the periodic evaluation runs.
env, test_env = gym.make(ENV_NAME), gym.make(ENV_NAME)

# Network input/output sizes are read from the environment's spaces.
obs_size = test_env.observation_space.shape[0]
act_size = test_env.action_space.n

In [5]:
class Actor_Discrete(nn.Module):
    """
    Policy network for a discrete action space.

    One hidden ReLU layer followed by a softmax, so ``forward`` returns a
    probability for every action. Note that this notebook-local definition
    shadows the ``Actor_Discrete`` imported from ``model``.

    :param obs_size: number of observation features
    :param act_size: number of discrete actions
    :param hid_size: hidden-layer width; defaults to the module-level
        ``HID_SIZE``
    """

    def __init__(self, obs_size, act_size, hid_size=None):
        super().__init__()
        if hid_size is None:
            hid_size = HID_SIZE

        self.net = nn.Sequential(
            nn.Linear(obs_size, hid_size),
            nn.ReLU(),
            nn.Linear(hid_size, act_size),
            nn.Softmax(dim=-1),  # normalise over the action (last) axis
        )

    def forward(self, x):
        """Return action probabilities for ``x``; the last axis sums to 1."""
        return self.net(x)

    
class Critic(nn.Module):
    """
    State-value network: one hidden ReLU layer and a single linear output.

    Note that this notebook-local definition shadows the ``Critic`` imported
    from ``model``.

    :param obs_size: number of observation features
    :param hid_size: hidden-layer width; defaults to the module-level
        ``HID_SIZE``
    """

    def __init__(self, obs_size, hid_size=None):
        super().__init__()
        if hid_size is None:
            hid_size = HID_SIZE

        self.value = nn.Sequential(
            nn.Linear(obs_size, hid_size),
            nn.ReLU(),
            nn.Linear(hid_size, 1),
        )

    def forward(self, x):
        """Return the value estimate for ``x`` with a trailing axis of size 1."""
        return self.value(x)

In [6]:
# Scratch check: six rows of three uniform random weights in [0, 1).
# The rows are not normalised, i.e. they do not sum to 1.
probs = torch.rand(6, 3)
probs

tensor([[0.4447, 0.1329, 0.9013],
        [0.0490, 0.4662, 0.2285],
        [0.1484, 0.5669, 0.3991],
        [0.2055, 0.5994, 0.1501],
        [0.5368, 0.0106, 0.2218],
        [0.0261, 0.1805, 0.0350]])

In [7]:
# Scratch check: build one categorical distribution per row of `probs`
# (Categorical rescales each row to sum to 1) and draw one action index per row.
dist = Categorical(probs=probs)
dist.sample()

tensor([2, 2, 1, 2, 0, 1])

In [8]:
# Policy (actor) and value (critic) networks, created in this order.
net_act = Actor_Discrete(obs_size, act_size).to(device)
net_crt = Critic(obs_size).to(device)

# One Adam optimiser per network: the critic trains at LR, the actor at the
# smaller rate LR / LR_RATIO.
opt_act = optim.Adam(net_act.parameters(), lr=LR / LR_RATIO)
opt_crt = optim.Adam(net_crt.parameters(), lr=LR)

# The agent wraps the actor and drives `env` through the experience source.
# With steps_count=1 every yielded item holds a single step, which is why the
# training loop reads it as item[0].
agent = AgentA2C_Discrete(net_act, device=device)
exp_source = experience.ExperienceSource(env, agent, steps_count=1)

In [9]:
# TensorBoard writer; the run-name suffix records the main hyperparameters
# (learning rate, LR ratio, trajectory size, batch size, PPO epochs).
writer = SummaryWriter(
    comment=f"-ppo_{ENV_NAME}-L{LR}R{LR_RATIO}_T{TRAJECTORY_SIZE}B{BATCH_SIZE}_E{PPO_EPOCHES}"
)

# Mutable training-loop state: transitions collected so far and the best
# evaluation reward seen (None until the first evaluation has run).
trajectory, best_reward = [], None

In [10]:
# PPO training loop.
# Each environment step: log finished episodes, evaluate the policy every
# TEST_ITERS steps, and collect the transition. Once TRAJECTORY_SIZE + 1
# transitions are available, run PPO_EPOCHES passes of minibatch updates for
# the critic and the actor, then start a fresh trajectory.
with utils.RewardTracker(writer) as tracker:
    for step_idx, exp in enumerate(exp_source):
        if step_idx > MAX_STEPS:
            print(f"Training Stopped after {MAX_STEPS}!")
            break

        # Report the episodes that finished since the previous step.
        rewards_steps = exp_source.pop_rewards_steps()
        if rewards_steps:
            rewards, steps = zip(*rewards_steps)
            writer.add_scalar("episode_steps", np.mean(steps), step_idx)
            tracker.reward(np.mean(rewards), step_idx)

        # Periodic evaluation on the separate test environment.
        if step_idx % TEST_ITERS == 0:
            ts = time.time()
            with torch.no_grad():
                rewards, steps = common.test_net_discrete(
                    net_act, test_env, device=device
                )
            print(
                "Test done is %.2f sec, reward %.2f, steps %d"
                % (time.time() - ts, rewards, steps)
            )
            writer.add_scalar("test_reward", rewards, step_idx)
            writer.add_scalar("test_steps", steps, step_idx)
            if best_reward is None or best_reward < rewards:
                if best_reward is not None:
                    print(
                        "Best reward updated: %.2f -> %.2f" % (best_reward, rewards)
                    )
                    name = "best_%+.2f_%d.dat" % (rewards, step_idx)
                    fname = os.path.join(save_path, name)
                    # Checkpoint saving is disabled; uncomment to store the actor.
                    # torch.save(net_act.state_dict(), fname)
                best_reward = rewards
                if best_reward > STOP_REWARD:
                    print("Solved!")
                    break

        # Keep collecting until there is one extra transition, which is only
        # used to bootstrap the value of the step before it.
        trajectory.append(exp)
        if len(trajectory) < TRAJECTORY_SIZE + 1:
            continue

        traj_states = [t[0].state for t in trajectory]
        traj_actions = [t[0].action for t in trajectory]
        traj_states_v = torch.FloatTensor(np.array(traj_states)).to(device)
        # Discrete actions are category indices, so store them as integers.
        traj_actions_v = torch.as_tensor(
            np.array(traj_actions), dtype=torch.long, device=device
        )

        traj_adv_v, traj_ref_v = calc_adv_ref(
            trajectory, net_crt, traj_states_v, device=device
        )

        # Log-probabilities of the taken actions under the policy that
        # collected the data. They are constants for the update below, so no
        # autograd graph is needed.
        with torch.no_grad():
            action_probs = net_act(traj_states_v)
            dist = Categorical(action_probs)
            old_logprob_v = dist.log_prob(traj_actions_v)  # shape [T + 1]

        # Normalize advantages to zero mean and unit spread; the epsilon
        # guards against a zero standard deviation.
        traj_adv_v = traj_adv_v - torch.mean(traj_adv_v)
        traj_adv_v /= torch.std(traj_adv_v) + 1e-8

        # drop last entry from the trajectory, as our adv and ref value calculated without it
        trajectory = trajectory[:-1]
        old_logprob_v = old_logprob_v[:-1]

        for epoch in range(PPO_EPOCHES):
            for batch_ofs in range(0, len(trajectory), BATCH_SIZE):
                batch_l = batch_ofs + BATCH_SIZE
                states_v = traj_states_v[batch_ofs:batch_l]
                actions_v = traj_actions_v[batch_ofs:batch_l]
                # Advantages stay 1-D ([B]) so they line up element-wise with
                # the 1-D probability ratio. Adding a trailing axis here would
                # broadcast [B, 1] * [B] into a [B, B] matrix and pair every
                # advantage with every sample's ratio.
                batch_adv_v = traj_adv_v[batch_ofs:batch_l]
                batch_ref_v = traj_ref_v[batch_ofs:batch_l]
                batch_old_logprob_v = old_logprob_v[batch_ofs:batch_l]

                # Critic update: regress the value estimate onto the reference values.
                opt_crt.zero_grad()
                value_v = net_crt(states_v)
                loss_value = F.mse_loss(value_v.squeeze(-1), batch_ref_v)
                loss_value.backward()
                opt_crt.step()

                # Actor update: clipped PPO surrogate plus an entropy bonus.
                opt_act.zero_grad()
                action_probs = net_act(states_v)
                dist = Categorical(action_probs)
                logprob_pi_v = dist.log_prob(actions_v)  # shape [B]
                entropy = dist.entropy()

                ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v)
                surr_obj_v = batch_adv_v * ratio_v
                c_ratio_v = torch.clamp(ratio_v, 1.0 - PPO_EPS, 1.0 + PPO_EPS)
                clipped_surr_v = batch_adv_v * c_ratio_v

                loss_policy = (
                    -torch.min(surr_obj_v, clipped_surr_v).mean()
                    - ENTROPY_BONUS * entropy.mean()
                )
                loss_policy.backward()
                opt_act.step()

        trajectory.clear()

Test done is 0.15 sec, reward -200.00, steps 200
400: done 2 episodes, mean reward -200.000, speed 343.40 f/s
2200: done 11 episodes, mean reward -200.000, speed 1758.65 f/s
4200: done 21 episodes, mean reward -200.000, speed 1785.30 f/s
6200: done 31 episodes, mean reward -200.000, speed 1852.71 f/s
8200: done 41 episodes, mean reward -200.000, speed 1987.37 f/s
10200: done 51 episodes, mean reward -200.000, speed 1878.65 f/s
12200: done 61 episodes, mean reward -200.000, speed 1853.14 f/s
14200: done 71 episodes, mean reward -200.000, speed 1887.20 f/s
16200: done 81 episodes, mean reward -200.000, speed 1833.11 f/s
18200: done 91 episodes, mean reward -200.000, speed 1924.99 f/s
20200: done 101 episodes, mean reward -200.000, speed 1909.15 f/s
22200: done 111 episodes, mean reward -200.000, speed 1866.08 f/s
24200: done 121 episodes, mean reward -200.000, speed 1886.69 f/s
26200: done 131 episodes, mean reward -200.000, speed 1935.21 f/s
28200: done 141 episodes, mean reward -200.00

238600: done 1193 episodes, mean reward -200.000, speed 1820.24 f/s
240600: done 1203 episodes, mean reward -200.000, speed 1840.86 f/s
242400: done 1212 episodes, mean reward -200.000, speed 1774.82 f/s
244400: done 1222 episodes, mean reward -200.000, speed 1937.13 f/s
246400: done 1232 episodes, mean reward -200.000, speed 1815.79 f/s
248400: done 1242 episodes, mean reward -200.000, speed 1868.30 f/s
Test done is 0.16 sec, reward -200.00, steps 200
250200: done 1251 episodes, mean reward -200.000, speed 1629.14 f/s
252200: done 1261 episodes, mean reward -200.000, speed 1867.96 f/s
254200: done 1271 episodes, mean reward -200.000, speed 1847.84 f/s
256200: done 1281 episodes, mean reward -200.000, speed 1980.86 f/s
258200: done 1291 episodes, mean reward -200.000, speed 1860.62 f/s
260200: done 1301 episodes, mean reward -200.000, speed 1877.15 f/s
262200: done 1311 episodes, mean reward -200.000, speed 1846.17 f/s
264200: done 1321 episodes, mean reward -200.000, speed 1863.20 f/s

468400: done 2342 episodes, mean reward -200.000, speed 1873.66 f/s
470400: done 2352 episodes, mean reward -200.000, speed 1886.26 f/s
472400: done 2362 episodes, mean reward -200.000, speed 1838.57 f/s
474200: done 2371 episodes, mean reward -200.000, speed 1750.12 f/s
476000: done 2380 episodes, mean reward -200.000, speed 1761.57 f/s
478000: done 2390 episodes, mean reward -200.000, speed 1928.92 f/s
480000: done 2400 episodes, mean reward -200.000, speed 1807.30 f/s
482000: done 2410 episodes, mean reward -200.000, speed 1873.67 f/s
484000: done 2420 episodes, mean reward -200.000, speed 1815.17 f/s
485800: done 2429 episodes, mean reward -200.000, speed 1774.45 f/s
487800: done 2439 episodes, mean reward -200.000, speed 1819.52 f/s
489800: done 2449 episodes, mean reward -200.000, speed 1899.52 f/s
491800: done 2459 episodes, mean reward -200.000, speed 1816.94 f/s
493800: done 2469 episodes, mean reward -200.000, speed 1862.11 f/s
495800: done 2479 episodes, mean reward -200.000

Test done is 0.16 sec, reward -200.00, steps 200
701400: done 3507 episodes, mean reward -200.000, speed 1432.68 f/s
703400: done 3517 episodes, mean reward -200.000, speed 1945.95 f/s
705400: done 3527 episodes, mean reward -200.000, speed 1879.66 f/s
707400: done 3537 episodes, mean reward -200.000, speed 1873.82 f/s
709400: done 3547 episodes, mean reward -200.000, speed 1788.44 f/s
711200: done 3556 episodes, mean reward -200.000, speed 1799.91 f/s
713200: done 3566 episodes, mean reward -200.000, speed 1825.32 f/s
715200: done 3576 episodes, mean reward -200.000, speed 1900.96 f/s
717200: done 3586 episodes, mean reward -200.000, speed 1893.27 f/s
719200: done 3596 episodes, mean reward -200.000, speed 1876.22 f/s
721200: done 3606 episodes, mean reward -200.000, speed 1889.64 f/s
723200: done 3616 episodes, mean reward -200.000, speed 1900.10 f/s
725200: done 3626 episodes, mean reward -200.000, speed 1961.87 f/s
727200: done 3636 episodes, mean reward -200.000, speed 1870.84 f/s

931000: done 4655 episodes, mean reward -200.000, speed 1822.96 f/s
933000: done 4665 episodes, mean reward -200.000, speed 1839.03 f/s
935000: done 4675 episodes, mean reward -200.000, speed 1828.42 f/s
937000: done 4685 episodes, mean reward -200.000, speed 1934.08 f/s
939000: done 4695 episodes, mean reward -200.000, speed 1811.94 f/s
941000: done 4705 episodes, mean reward -200.000, speed 1926.54 f/s
943000: done 4715 episodes, mean reward -200.000, speed 1886.87 f/s
945000: done 4725 episodes, mean reward -200.000, speed 1877.28 f/s
947000: done 4735 episodes, mean reward -200.000, speed 1987.39 f/s
949000: done 4745 episodes, mean reward -200.000, speed 1912.23 f/s
Test done is 0.15 sec, reward -200.00, steps 200
950800: done 4754 episodes, mean reward -200.000, speed 1632.01 f/s
952800: done 4764 episodes, mean reward -200.000, speed 1906.60 f/s
954800: done 4774 episodes, mean reward -200.000, speed 1857.77 f/s
956800: done 4784 episodes, mean reward -200.000, speed 1957.92 f/s

1158400: done 5792 episodes, mean reward -200.000, speed 1905.18 f/s
1160400: done 5802 episodes, mean reward -200.000, speed 1910.78 f/s
1162400: done 5812 episodes, mean reward -200.000, speed 1960.69 f/s
1164400: done 5822 episodes, mean reward -200.000, speed 1858.59 f/s
1166400: done 5832 episodes, mean reward -200.000, speed 1805.54 f/s
1168400: done 5842 episodes, mean reward -200.000, speed 1804.05 f/s
1170400: done 5852 episodes, mean reward -200.000, speed 1790.83 f/s
1172400: done 5862 episodes, mean reward -200.000, speed 1915.02 f/s
1174200: done 5871 episodes, mean reward -200.000, speed 1779.76 f/s
1176200: done 5881 episodes, mean reward -200.000, speed 1796.42 f/s
1178200: done 5891 episodes, mean reward -200.000, speed 1844.15 f/s
1180200: done 5901 episodes, mean reward -200.000, speed 1818.61 f/s
1182200: done 5911 episodes, mean reward -200.000, speed 1785.33 f/s
1184000: done 5920 episodes, mean reward -200.000, speed 1760.47 f/s
1186000: done 5930 episodes, mean 

1386600: done 6933 episodes, mean reward -200.000, speed 1853.88 f/s
1388600: done 6943 episodes, mean reward -200.000, speed 1805.26 f/s
1390600: done 6953 episodes, mean reward -200.000, speed 1928.46 f/s
1392400: done 6962 episodes, mean reward -200.000, speed 1789.69 f/s
1394400: done 6972 episodes, mean reward -200.000, speed 1837.85 f/s
1396400: done 6982 episodes, mean reward -200.000, speed 1835.91 f/s
1398400: done 6992 episodes, mean reward -200.000, speed 1847.82 f/s
Test done is 0.16 sec, reward -200.00, steps 200
1400200: done 7001 episodes, mean reward -200.000, speed 1608.06 f/s
1402200: done 7011 episodes, mean reward -200.000, speed 1854.95 f/s
1404200: done 7021 episodes, mean reward -200.000, speed 1963.41 f/s
1406200: done 7031 episodes, mean reward -200.000, speed 1886.18 f/s
1408200: done 7041 episodes, mean reward -200.000, speed 1924.78 f/s
1410200: done 7051 episodes, mean reward -200.000, speed 1900.66 f/s
1412200: done 7061 episodes, mean reward -200.000, spe

1615400: done 8077 episodes, mean reward -200.000, speed 1827.87 f/s
1617400: done 8087 episodes, mean reward -200.000, speed 1892.90 f/s
1619400: done 8097 episodes, mean reward -200.000, speed 1907.64 f/s
1621400: done 8107 episodes, mean reward -200.000, speed 1993.36 f/s
1623400: done 8117 episodes, mean reward -200.000, speed 1856.39 f/s
1625400: done 8127 episodes, mean reward -200.000, speed 1854.48 f/s
1627400: done 8137 episodes, mean reward -200.000, speed 1875.49 f/s
1629400: done 8147 episodes, mean reward -200.000, speed 1833.86 f/s
1631200: done 8156 episodes, mean reward -200.000, speed 1677.58 f/s
1633200: done 8166 episodes, mean reward -200.000, speed 1917.99 f/s
1635200: done 8176 episodes, mean reward -200.000, speed 1825.92 f/s
1637200: done 8186 episodes, mean reward -200.000, speed 1833.74 f/s
1639000: done 8195 episodes, mean reward -200.000, speed 1799.88 f/s
1641000: done 8205 episodes, mean reward -200.000, speed 1799.92 f/s
1643000: done 8215 episodes, mean 

1844800: done 9224 episodes, mean reward -200.000, speed 1876.72 f/s
1847000: done 9235 episodes, mean reward -200.000, speed 1983.81 f/s
1849000: done 9245 episodes, mean reward -200.000, speed 1870.13 f/s
Test done is 0.14 sec, reward -200.00, steps 200
1850600: done 9253 episodes, mean reward -200.000, speed 1590.36 f/s
1852600: done 9263 episodes, mean reward -200.000, speed 1887.44 f/s
1854600: done 9273 episodes, mean reward -200.000, speed 1898.68 f/s
1856600: done 9283 episodes, mean reward -200.000, speed 1868.24 f/s
1858800: done 9294 episodes, mean reward -200.000, speed 2007.48 f/s
1860800: done 9304 episodes, mean reward -200.000, speed 1913.14 f/s
1862800: done 9314 episodes, mean reward -200.000, speed 1873.46 f/s
1864800: done 9324 episodes, mean reward -200.000, speed 1865.04 f/s
1866800: done 9334 episodes, mean reward -200.000, speed 1987.24 f/s
1868800: done 9344 episodes, mean reward -200.000, speed 1914.56 f/s
1870800: done 9354 episodes, mean reward -200.000, spe

KeyboardInterrupt: 

In [None]:
# Scratch check: a single row of three action probabilities (they sum to 1).
a = torch.tensor([[0.3721, 0.2557, 0.3722]])

In [None]:
# Entropy of the categorical distribution given by `a`, for comparison with
# the maximum for three outcomes, ln(3) ~= 1.0986.
Categorical(a).entropy()

In [None]:
# Inspect the per-sample policy entropy left over from the last minibatch
# processed by the training loop.
entropy

In [None]:
# Inspect the critic MSE loss left over from the last minibatch processed by
# the training loop.
loss_value