In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[?25l[K     |█▉                              | 10 kB 22.6 MB/s eta 0:00:01[K     |███▊                            | 20 kB 23.8 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 28.8 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 31.7 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 31.5 MB/s eta 0:00:01[K     |███████████▎                    | 61 kB 29.3 MB/s eta 0:00:01[K     |█████████████▏                  | 71 kB 31.3 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 33.7 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 35.9 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 38.2 MB/s eta 0:00:01[K     |████████████████████▋           | 112 kB 38.2 MB/s eta 0:00:01[K     |██████████████████████▌         | 122 kB 38.2 MB/s eta 0:00:01[K     |████████████████████████▍       | 13

In [3]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [4]:
class ForagingReplenishingPatches(gym.Env):
    def __init__(self, block_type=1, manual_play=False):
        self.reset_flag = False
        self.action_space = spaces.Discrete(8)
        self.observation_space = spaces.Discrete(8)
        self.block_type = block_type
        self.HARVEST_ACTION_ID = 8

        if self.block_type == 1:
            self.rewards = np.asarray([0, 70, 70, 0, 70, 0, 70, 0])
        elif self.block_type == 2:
            self.rewards = np.asarray([0, 0, 70, 70, 0, 70, 0, 70])
        elif self.block_type == 3:
            self.rewards = np.asarray([70, 0, 0, 70, 70, 0, 70, 0])

        self.rewarding_sites = np.arange(8)[self.rewards > 0]
        self.current_state = 0
        self.time_elapsed = 1.307

        self.farmer_reward = 0
        self.init_env_variables()
        if manual_play:
            self.init_foraging_img()
            self.manual_play()

    def replenish_rewards(self):
        if self.block_type == 1:
            replenish_rates = np.asarray([0, 4, 4, 0, 4, 0, 4, 0])
        elif self.block_type == 2:
            replenish_rates = np.asarray([0, 0, 8, 2, 0, 5, 0, 8])
        elif self.block_type == 3:
            replenish_rates = np.asarray([2, 0, 0, 4, 8, 0, 16, 0])
        replenish_rates[self.current_state] = 0
        self.rewards += replenish_rates
        self.rewards = np.clip(self.rewards, 0, 200)

    def step(self, action):
        self.time_elapsed += self.time_dist[str(self.current_state) + "to" + str(action)]
        self.current_state = action
        if self.time_elapsed >= 300:
            self.reset_flag = True
            return (self.current_state, 0 , self.reset_flag, {})
        
        self.time_elapsed += 1
        reward_old = self.farmer_reward
        if self.current_state in self.rewarding_sites:
            self.replenish_rewards()
            self.farmer_reward += self.rewards[self.current_state] * 0.90
            self.rewards[self.current_state] = (self.rewards[self.current_state] * 0.9)

        if self.time_elapsed >= 300:
            self.reset_flag = True
        return (self.current_state, self.farmer_reward - reward_old, self.reset_flag, {})

    def reset(self):
        self.reset_flag = False
        if self.block_type == 1:
            self.rewards = np.asarray([0, 70, 70, 0, 70, 0, 70, 0])
        elif self.block_type == 2:
            self.rewards = np.asarray([0, 0, 70, 70, 0, 70, 0, 70])
        elif self.block_type == 3:
            self.rewards = np.asarray([70, 0, 0, 70, 70, 0, 70, 0])
        self.rewarding_sites = np.arange(8)[self.rewards > 0]
        self.current_state = 0
        self.time_elapsed = 2
        self.farmer_reward = 0
        return self.current_state

    def render(self, mode="human"):
        print("Current State:", self.current_state, "Current Total Reward:", self.farmer_reward)

    def close(self):
        cv2.destroyAllWindows()
        return None

    def init_env_variables(self, first_point_angle=0):
        a = 1 / (2 * np.sin(np.pi / 8))  # fix a (radius) for unit side octagon
        self.octagon_points = np.asarray(
            [
                (
                    a * np.sin(first_point_angle + n * np.pi / 4),
                    a * np.cos(first_point_angle + n * np.pi / 4),
                )
                for n in range(8)
            ]
        )
        self.time_dist = {}
        for i in range(8):
            for j in range(8):
                dist = np.linalg.norm(self.octagon_points[i] - self.octagon_points[j])
                self.time_dist.update({str(i) + "to" + str(j): dist})

In [5]:
from stable_baselines3.common.env_checker import check_env
env = ForagingReplenishingPatches(block_type=3)
check_env(env, warn=True)

In [None]:
env.reset()
for i in range(300):
    action = np.random.randint(8)
    state, reward, done, _ = env.step(action)
    print(action, state, reward, done)
    if done:
        break

In [6]:
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir a2c_forager_tensorboard
!ls

/content/drive/MyDrive/Sem 5/CS698
mkdir: cannot create directory ‘a2c_forager_tensorboard’: File exists
 190816_CS698_Assignment_1.ipynb   MDP_Foraging_A2C.ipynb
 a2c_forager_tensorboard	   MDP_Foraging_DQN.ipynb
 assignment_3.ipynb		   MDP_Foraging_PPO.ipynb
'CS698 Project Group 7'		   ppo_forager_tensorboard
 dqn_forager_tensorboard	   saved_models


In [7]:
from stable_baselines3 import A2C

env.reset()

model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./a2c_forager_tensorboard/")
model.learn(total_timesteps=10**6)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss         | 2.93e+04 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 108       |
|    ep_rew_mean        | 1.29e+04  |
| time/                 |           |
|    fps                | 672       |
|    iterations         | 170700    |
|    time_elapsed       | 1268      |
|    total_timesteps    | 853500    |
| train/                |           |
|    entropy_loss       | -2.78e-06 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 170699    |
|    policy_loss        | -0        |
|    value_loss         | 2.65e+04  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 108       |
|    ep_rew_mean        | 1.29e+04  |
| time/                 |           |
|    fps                |

<stable_baselines3.a2c.a2c.A2C at 0x7f238e688e10>

In [8]:
# save
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir saved_models
%cd saved_models

model.save("A2CmlpPolicy1M")

/content/drive/MyDrive/Sem 5/CS698
mkdir: cannot create directory ‘saved_models’: File exists
/content/drive/My Drive/Sem 5/CS698/saved_models


In [9]:
print(model.policy)

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=8, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


In [10]:
obs = env.reset()
while True:
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()
      break

Current State: 4 Current Total Reward: 63.0
Current State: 6 Current Total Reward: 140.4
Current State: 3 Current Total Reward: 210.60000000000002
Current State: 4 Current Total Reward: 281.70000000000005
Current State: 6 Current Total Reward: 379.80000000000007
Current State: 3 Current Total Reward: 450.00000000000006
Current State: 4 Current Total Reward: 528.3000000000001
Current State: 6 Current Total Reward: 645.3000000000001
Current State: 3 Current Total Reward: 715.5000000000001
Current State: 4 Current Total Reward: 800.1000000000001
Current State: 6 Current Total Reward: 934.2000000000002
Current State: 3 Current Total Reward: 1004.4000000000002
Current State: 4 Current Total Reward: 1094.4
Current State: 6 Current Total Reward: 1243.8000000000002
Current State: 3 Current Total Reward: 1314.0000000000002
Current State: 4 Current Total Reward: 1409.4000000000003
Current State: 6 Current Total Reward: 1572.3000000000004
Current State: 3 Current Total Reward: 1642.5000000000005


In [11]:
model.get_parameters()

{'policy': OrderedDict([('mlp_extractor.policy_net.0.weight',
               tensor([[-9.1432e-01, -2.1537e-01,  1.2031e-01, -9.4590e-01,  4.6043e-01,
                        -3.1834e-01, -1.7426e-01, -3.6149e-01],
                       [ 7.5066e-01,  5.3743e-02, -7.7319e-02,  1.0059e+00, -8.7626e-01,
                        -2.0594e-02, -1.3527e-01,  4.3355e-02],
                       [ 8.2763e-01,  1.4608e-01,  3.6795e-01,  9.1494e-01, -1.3388e-01,
                         1.5311e-01, -1.1129e-01, -3.0329e-01],
                       [ 1.2195e-01, -1.6966e-01,  1.7468e-02,  2.0041e-01, -9.8405e-01,
                        -1.9903e-01, -3.4175e-01, -9.7914e-02],
                       [-1.1470e-01, -1.4296e-01,  2.7748e-01, -4.0146e-02,  5.4066e-02,
                         1.6390e-01, -7.0100e-02,  6.2697e-02],
                       [ 6.9986e-02, -8.4267e-02,  3.7976e-01,  3.3204e-01,  1.7756e-01,
                        -7.1440e-02,  1.0114e-02, -1.1269e-01],
                    

In [None]:
!tensorboard --logdir ./a2c_forager_tensorboard/

2021-11-07 13:00:12.505587: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-11-07 13:00:12.505682: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (e94e3b02a9d3): /proc/driver/nvidia/version does not exist

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.6.0 at http://localhost:6006/ (Press CTRL+C to quit)
