In [None]:
# %pip uninstall numpy -Y
# %pip install numpy==1.23.5


Usage:   
  pip3 uninstall [options] <package> ...
  pip3 uninstall [options] -r <requirements file> ...

no such option: -Y
Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is in

In [7]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.multiprocessing as mp
import time
import numpy as np

In [8]:
# Set Hyperparmeters
n_train_processes = 3
LR = 0.0002
update_intervals = 5
GAMMA = 0.98
max_train_ep = 300
max_test_ep = 400

In [9]:
class ActorCritic(nn.Module):
  def __init__(self):
    super(ActorCritic,self).__init__()
    self.fc1 = nn.Linear(4,256)
    self.fc_pi = nn.Linear(256,2)
    self.fc_v = nn.Linear(256,1)

  def pi(self,x,softmax_dim=0):
    x = F.relu(self.fc1(x))
    prob = F.softmax(self.fc_pi(x), dim=softmax_dim)
    return prob

  def v(self, x):
    x = F.relu(self.fc1(x))
    v = self.fc_v(x)
    return v

In [10]:
# Train
lock = mp.Lock()
def train(global_model, rank):
  local_model = ActorCritic()
  local_model.load_state_dict(global_model.state_dict())

  optimizer = optim.Adam(global_model.parameters(),lr=LR)

  env = gym.make('CartPole-v1', new_step_api=True)

  for n_epi in range(max_train_ep):
    done = False
    s = env.reset()

    while not done:
      s_lst, a_lst, r_lst = [],[],[]
      for i in range(update_intervals):
        prob = local_model.pi(torch.from_numpy(s).float())
        m = Categorical(prob)
        a = m.sample().item()
        s_prime,r,done,info, _ = env.step(a)

        s_lst.append(s)
        a_lst.append([a])
        r_lst.append(r/100.0)

        s = s_prime
        if done:
          break

      s_final = torch.tensor(np.array(s_prime), dtype=torch.float)
      R = 0.0 if done else local_model.v(s_final).item()
      td_target_lst = []

      # Get Advantage (using Returns G_t)
      for reward in r_lst[::-1]:
        R = reward + GAMMA * R
        td_target_lst.append([R])
      td_target_lst.reverse()

      # s_batch,a_batch,td_target = torch.tensor(np.array(s_lst), dtype=torch.float), torch.tensor(a_lst), torch.tensor(td_target_lst)
      s_batch,a_batch,td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), torch.tensor(td_target_lst)

      advantage = td_target - local_model.v(s_batch)

      pi = local_model.pi(s_batch, softmax_dim=1)
      pi_a = pi.gather(1,a_batch)
      loss = -torch.log(pi_a)*advantage.detach() + F.smooth_l1_loss(local_model.v(s_batch), td_target.detach())

      optimizer.zero_grad()
      loss.mean().backward()
      for global_param, local_param in zip(global_model.parameters(),local_model.parameters()):
        # global_param.grad = local_param.grad.clone()
        global_param._grad = local_param.grad
      optimizer.step()
      local_model.load_state_dict(global_model.state_dict())

  env.close()
  print("Traning_Process {} reached maximum episode.".format(rank))

In [11]:
def test(global_model):
  env = gym.make("CartPole-v1")
  score = 0.0
  print_interval = 20

  for n_epi in range(max_test_ep):
    done = False
    s = env.reset()

    while not done:
      prob = global_model.pi(torch.from_numpy(s).float())
      a = Categorical(prob).sample().item()
      s_prime, r, done,_ = env.step(a)
      s = s_prime
      score += r

    if n_epi % print_interval == 0 and n_epi != 0:
      print(f"[Episode]: {n_epi} [Avg Score]: {(score/print_interval):.2f}")
      score = 0.0
      time.sleep(1)

  env.close()

In [12]:
if __name__ == '__main__':
  global_model = ActorCritic()
  global_model.share_memory()

  processes = []
  for rank in range(n_train_processes + 1):
    if rank == 0:
      p = mp.Process(target = test, args=(global_model,))
    else:
      p = mp.Process(target = train, args=(global_model,rank,))

    p.start()
    processes.append(p)

  for p in processes:
    p.join()

  deprecation(
  deprecation(
Process Process-5:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-11-9549c789071b>", line 13, in test
    s_prime, r, done,_ = env.step(a)
                         ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gym/wrappers/time_limit.py", line 60, in step
    self.env.step(action),
    ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gym/wrappers/order_enforcing.py", line 37, in step
    return self.env.step(action)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gym/wrappers/step_api_compatibility.py", line 52, in step
    step_returns = self.env.step(action)
                   ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gym/wrappers/env_ch