In [6]:
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[?25l[K     |█▉                              | 10 kB 19.6 MB/s eta 0:00:01[K     |███▊                            | 20 kB 22.6 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 16.0 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 10.8 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 5.6 MB/s eta 0:00:01[K     |███████████▎                    | 61 kB 6.2 MB/s eta 0:00:01[K     |█████████████▏                  | 71 kB 5.9 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 6.6 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 4.9 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 5.3 MB/s eta 0:00:01[K     |████████████████████▋           | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████▌         | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████▍       | 133 kB 5.3

In [1]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [18]:
class ForagingReplenishingPatches(gym.Env):
    """Foraging task with eight sites placed on the vertices of a unit-side octagon.

    An action is the index of the site to move to; the observation is the index
    of the site the agent currently occupies. Moving costs the Euclidean
    distance between the two sites in time units and every completed step costs
    one additional time unit. An episode ends once ``time_elapsed`` reaches
    ``TIME_LIMIT``.

    Which sites carry reward, and how fast they replenish, is selected by
    ``block_type`` (1, 2 or 3).

    Parameters
    ----------
    block_type : int
        Key into the initial-reward / replenish-rate tables below.
    manual_play : bool
        When true, ``init_foraging_img()`` and ``manual_play()`` are called at
        the end of construction.
    """

    TIME_LIMIT = 300          # episode ends when time_elapsed reaches this value
    HARVEST_FRACTION = 0.9    # used both for the payout and for the patch decay
    MAX_PATCH_REWARD = 200    # upper clip applied after every replenishment

    # Per-site starting reward for each block type.
    _INITIAL_REWARDS = {
        1: (0, 70, 70, 0, 70, 0, 70, 0),
        2: (0, 0, 70, 70, 0, 70, 0, 70),
        3: (70, 0, 0, 70, 70, 0, 70, 0),
    }
    # Per-site amount added on each replenishment for each block type.
    _REPLENISH_RATES = {
        1: (0, 4, 4, 0, 4, 0, 4, 0),
        2: (0, 0, 8, 2, 0, 5, 0, 8),
        3: (2, 0, 0, 4, 8, 0, 16, 0),
    }

    def __init__(self, block_type=1, manual_play=False):
        # Fail early with a clear message; previously an unknown block type
        # surfaced later as an AttributeError on the missing `rewards` array.
        if block_type not in self._INITIAL_REWARDS:
            raise ValueError(
                "block_type must be one of %s, got %r"
                % (sorted(self._INITIAL_REWARDS), block_type)
            )

        self.reset_flag = False
        self.action_space = spaces.Discrete(8)
        self.observation_space = spaces.Discrete(8)
        self.block_type = block_type
        self.HARVEST_ACTION_ID = 8

        self.rewards = np.asarray(self._INITIAL_REWARDS[self.block_type])
        self.rewarding_sites = np.arange(8)[self.rewards > 0]
        self.current_state = 0
        # NOTE(review): 1.307 is approximately the octagon circumradius computed
        # in init_env_variables(), whereas reset() starts the clock at 2.
        # Confirm which starting value is intended.
        self.time_elapsed = 1.307

        self.farmer_reward = 0
        self.init_env_variables()
        if manual_play:
            # NOTE(review): neither helper is defined in this class, so
            # manual_play=True raises AttributeError unless a subclass
            # provides them.
            self.init_foraging_img()
            self.manual_play()

    def replenish_rewards(self):
        """Add the block's replenish rates to every site except the current one.

        The resulting rewards are clipped to ``[0, MAX_PATCH_REWARD]``.
        """
        # np.asarray on a tuple builds a fresh array, so zeroing one entry
        # never touches the class-level table.
        replenish_rates = np.asarray(self._REPLENISH_RATES[self.block_type])
        replenish_rates[self.current_state] = 0
        self.rewards += replenish_rates
        self.rewards = np.clip(self.rewards, 0, self.MAX_PATCH_REWARD)

    def step(self, action):
        """Move to site ``action`` and harvest it if it is a rewarding site.

        Returns
        -------
        tuple
            ``(observation, reward, done, info)`` where ``observation`` is the
            new site index, ``reward`` is the increase of ``farmer_reward``
            during this step and ``info`` is an empty dict.
        """
        # Policies may hand over a NumPy scalar / 0-d array; normalise it so the
        # observation we return is a plain int, as a Discrete space expects.
        action = int(action)

        self.time_elapsed += self.time_dist[f"{self.current_state}to{action}"]
        self.current_state = action
        if self.time_elapsed >= self.TIME_LIMIT:
            # Time ran out while travelling: nothing is harvested.
            self.reset_flag = True
            return (self.current_state, 0, self.reset_flag, {})

        self.time_elapsed += 1
        reward_old = self.farmer_reward
        if self.current_state in self.rewarding_sites:
            # Replenishment is applied only on steps that land on a rewarding site.
            self.replenish_rewards()
            self.farmer_reward += self.rewards[self.current_state] * self.HARVEST_FRACTION
            # NOTE(review): `rewards` is an integer array, so the decayed value
            # is truncated on assignment. Kept as-is to preserve existing results.
            self.rewards[self.current_state] = (
                self.rewards[self.current_state] * self.HARVEST_FRACTION
            )

        if self.time_elapsed >= self.TIME_LIMIT:
            self.reset_flag = True
        return (self.current_state, self.farmer_reward - reward_old, self.reset_flag, {})

    def reset(self):
        """Restore the initial rewards and counters and return the start site (0)."""
        self.reset_flag = False
        self.rewards = np.asarray(self._INITIAL_REWARDS[self.block_type])
        self.rewarding_sites = np.arange(8)[self.rewards > 0]
        self.current_state = 0
        self.time_elapsed = 2
        self.farmer_reward = 0
        return self.current_state

    def render(self, mode="human"):
        """Print the current site and the total reward collected so far."""
        print("Current State:", self.current_state, "Current Total Reward:", self.farmer_reward)

    def close(self):
        """Close any OpenCV windows; a no-op when OpenCV is not installed."""
        # cv2 is not imported at module level in this notebook, so the original
        # unconditional call raised NameError. Import it lazily instead.
        try:
            import cv2
        except ImportError:
            return None
        cv2.destroyAllWindows()
        return None

    def init_env_variables(self, first_point_angle=0):
        """Compute the octagon vertices and the pairwise travel times.

        ``time_dist`` maps the string ``"<i>to<j>"`` to the Euclidean distance
        between vertices ``i`` and ``j``.
        """
        a = 1 / (2 * np.sin(np.pi / 8))  # fix a (radius) for unit side octagon
        self.octagon_points = np.asarray(
            [
                (
                    a * np.sin(first_point_angle + n * np.pi / 4),
                    a * np.cos(first_point_angle + n * np.pi / 4),
                )
                for n in range(8)
            ]
        )
        self.time_dist = {
            f"{i}to{j}": np.linalg.norm(self.octagon_points[i] - self.octagon_points[j])
            for i in range(8)
            for j in range(8)
        }

In [19]:
# Build a block-type-3 environment and run Stable-Baselines3's environment
# checker on it. `env` is reused by the cells below.
from stable_baselines3.common.env_checker import check_env
env = ForagingReplenishingPatches(block_type=3)
check_env(env, warn=True)

In [None]:
# Smoke test: drive the environment with uniformly random actions for at most
# 300 steps, printing each transition, and stop as soon as the episode ends.
env.reset()
for i in range(300):
    action = np.random.randint(8)
    step_result = env.step(action)
    state, reward, done, _ = step_result
    print(action, state, reward, done)
    if done:
        break

In [25]:
# Train a PPO agent with an MLP policy on `env` for 100,000 timesteps.
from stable_baselines3 import PPO

env.reset()
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=100_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 113      |
|    ep_rew_mean     | 5.89e+03 |
| time/              |          |
|    fps             | 1402     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 113          |
|    ep_rew_mean          | 6.06e+03     |
| time/                   |              |
|    fps                  | 1051         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0005789475 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    en

<stable_baselines3.ppo.ppo.PPO at 0x7f905c7d79d0>

In [26]:
# Roll out one episode with the trained (stochastic) policy, rendering every
# step, then leave the environment freshly reset.
obs = env.reset()
done = False
while not done:
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    env.render()
obs = env.reset()

Current State: 6 Current Total Reward: 63.0
Current State: 3 Current Total Reward: 129.60000000000002
Current State: 3 Current Total Reward: 189.00000000000003
Current State: 6 Current Total Reward: 274.5
Current State: 3 Current Total Reward: 331.2
Current State: 6 Current Total Reward: 422.1
Current State: 4 Current Total Reward: 528.3000000000001
Current State: 6 Current Total Reward: 623.7
Current State: 6 Current Total Reward: 709.2
Current State: 4 Current Total Reward: 819.0
Current State: 1 Current Total Reward: 819.0
Current State: 6 Current Total Reward: 909.9
Current State: 4 Current Total Reward: 1015.1999999999999
Current State: 4 Current Total Reward: 1109.6999999999998
Current State: 6 Current Total Reward: 1219.4999999999998
Current State: 3 Current Total Reward: 1302.2999999999997
Current State: 6 Current Total Reward: 1414.7999999999997
Current State: 6 Current Total Reward: 1515.5999999999997
Current State: 3 Current Total Reward: 1596.5999999999997
Current State: 6 

In [27]:
# save
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir saved_models
%cd saved_models

model.save("PPOmlpPolicy")

/content/drive/MyDrive/Sem 5/CS698
/content/drive/MyDrive/Sem 5/CS698/saved_models


In [28]:
# Print the policy object of the trained model to inspect its layers.
print(model.policy)

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=8, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


In [29]:
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir ppo_forager_tensorboard
!ls

/content/drive/My Drive/Sem 5/CS698
190816_CS698_Assignment_1.ipynb  ppo_forager_tensorboard
MDP_Foraging.ipynb		 saved_models


In [30]:
# Train a fresh PPO agent for 1,000,000 timesteps, logging to TensorBoard.
from stable_baselines3 import PPO

env.reset()
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log="./ppo_forager_tensorboard/",
)
model.learn(total_timesteps=1_000_000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss           | 2.85e+06      |
-------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 125          |
|    ep_rew_mean          | 1.59e+04     |
| time/                   |              |
|    fps                  | 819          |
|    iterations           | 252          |
|    time_elapsed         | 629          |
|    total_timesteps      | 516096       |
| train/                  |              |
|    approx_kl            | 0.0006557259 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.22        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.41e+06     |
|    n_updates            | 2510         |
|    policy_gradient_loss | -0.000577    |
|    value_loss           | 2.

<stable_baselines3.ppo.ppo.PPO at 0x7f905c8418d0>

In [32]:
# save
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir saved_models
%cd saved_models

model.save("PPOmlpPolicy1M")

/content/drive/My Drive/Sem 5/CS698
mkdir: cannot create directory ‘saved_models’: File exists
/content/drive/MyDrive/Sem 5/CS698/saved_models


In [31]:
!tensorboard --logdir ./ppo_forager_tensorboard/

2021-11-07 13:00:12.505587: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-11-07 13:00:12.505682: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (e94e3b02a9d3): /proc/driver/nvidia/version does not exist

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.6.0 at http://localhost:6006/ (Press CTRL+C to quit)
