In [9]:
# Colab only: mount Google Drive at /content/drive. Later cells %cd into
# /content/drive/MyDrive/... to store TensorBoard logs and saved models.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[?25l[K     |█▉                              | 10 kB 21.5 MB/s eta 0:00:01[K     |███▊                            | 20 kB 26.8 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 12.5 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 9.4 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 5.2 MB/s eta 0:00:01[K     |███████████▎                    | 61 kB 5.8 MB/s eta 0:00:01[K     |█████████████▏                  | 71 kB 5.5 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 6.2 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 4.7 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 5.1 MB/s eta 0:00:01[K     |████████████████████▋           | 112 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████▌         | 122 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████▍       | 133 kB 5.1 

In [1]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
class ForagingReplenishingPatches(gym.Env):
    """Foraging task over eight sites with replenishing reward patches.

    The eight sites are the vertices of a regular octagon with unit side
    length; moving between two sites costs time equal to the Euclidean
    distance between them. An action is the index of the site to move to and
    the observation is the index of the site the agent is now at.

    On every step the agent travels to the chosen site and then spends one
    further time unit there. If that site is one of the block's rewarding
    patches, every *other* patch is replenished by its per-block rate (patch
    values are clipped to ``[0, MAX_PATCH_REWARD]``) and the agent collects
    ``HARVEST_FRACTION`` of the current patch's value. The patch is left
    holding that same figure, truncated to an integer because ``rewards`` is
    an integer array. Sites that are not rewarding pay nothing and do not
    trigger replenishment.

    The episode ends once ``time_elapsed`` reaches ``TIME_LIMIT``.

    Args:
        block_type: 1, 2 or 3; selects the initial patch values and the
            replenish rates.
        manual_play: If true, the constructor calls ``init_foraging_img()``
            and ``manual_play()``. Neither is defined on this class, so they
            have to be supplied by a subclass.

    Raises:
        ValueError: If ``block_type`` is not one of the supported blocks.
        NotImplementedError: If ``manual_play`` is requested but the two
            hooks above are not available.
    """

    N_SITES = 8
    # Time budget of one episode.
    TIME_LIMIT = 300
    # Share of a patch's value paid out on a visit (and left in the patch).
    HARVEST_FRACTION = 0.9
    # Upper bound applied to patch values after replenishment.
    MAX_PATCH_REWARD = 200

    # Per-block tables, indexed by site. Kept in one place so that __init__,
    # reset() and replenish_rewards() cannot drift apart.
    _INITIAL_REWARDS = {
        1: (0, 70, 70, 0, 70, 0, 70, 0),
        2: (0, 0, 70, 70, 0, 70, 0, 70),
        3: (70, 0, 0, 70, 70, 0, 70, 0),
    }
    _REPLENISH_RATES = {
        1: (0, 4, 4, 0, 4, 0, 4, 0),
        2: (0, 0, 8, 2, 0, 5, 0, 8),
        3: (2, 0, 0, 4, 8, 0, 16, 0),
    }

    def __init__(self, block_type=1, manual_play=False):
        # Fail early: an unknown block used to leave ``self.rewards`` unset and
        # surface later as an unrelated AttributeError / UnboundLocalError.
        if block_type not in self._INITIAL_REWARDS:
            raise ValueError(
                "block_type must be one of %s, got %r"
                % (sorted(self._INITIAL_REWARDS), block_type)
            )

        self.reset_flag = False
        self.action_space = spaces.Discrete(self.N_SITES)
        self.observation_space = spaces.Discrete(self.N_SITES)
        self.block_type = block_type
        # Not read anywhere in this class; kept so existing attribute access
        # keeps working.
        self.HARVEST_ACTION_ID = 8

        self.rewards = self._initial_rewards()
        self.rewarding_sites = np.flatnonzero(self.rewards > 0)
        self.current_state = 0
        # NOTE(review): 1.307 is approximately the octagon circumradius computed
        # in init_env_variables (1 / (2 sin(pi/8)) ~= 1.3066), presumably the
        # travel time from the centre to a site. reset() starts the clock at 2
        # instead, so this value only applies if step() is called before the
        # first reset(). Confirm which start time is intended.
        self.time_elapsed = 1.307

        self.farmer_reward = 0
        self.init_env_variables()
        if manual_play:
            # The two hooks are not defined on this class; report that clearly
            # instead of failing with an opaque AttributeError.
            init_img = getattr(self, "init_foraging_img", None)
            play = getattr(self, "manual_play", None)
            if not (callable(init_img) and callable(play)):
                raise NotImplementedError(
                    "manual_play=True requires init_foraging_img() and "
                    "manual_play() methods, which this class does not define"
                )
            init_img()
            play()

    def _initial_rewards(self):
        """Return a fresh array of the block's starting patch values."""
        return np.asarray(self._INITIAL_REWARDS[self.block_type])

    def replenish_rewards(self):
        """Add the block's replenish rates to every patch except the current one.

        The resulting patch values are clipped to ``[0, MAX_PATCH_REWARD]``.
        """
        replenish_rates = np.asarray(self._REPLENISH_RATES[self.block_type])
        # The patch being visited does not grow on this step.
        replenish_rates[self.current_state] = 0
        self.rewards = np.clip(
            self.rewards + replenish_rates, 0, self.MAX_PATCH_REWARD
        )

    def step(self, action):
        """Move to site ``action``, harvest it if rewarding, and advance time.

        Args:
            action: Index of the destination site. NumPy integer scalars are
                accepted as well as plain ints.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            new site index, ``reward`` is the amount added to
            ``farmer_reward`` on this step, ``done`` is true once the time
            budget is used up, and ``info`` is an empty dict.
        """
        # Normalise so the "<i>to<j>" lookup key is well-formed regardless of
        # the integer type the caller passes.
        action = int(action)
        self.time_elapsed += self.time_dist[str(self.current_state) + "to" + str(action)]
        self.current_state = action
        if self.time_elapsed >= self.TIME_LIMIT:
            # Time ran out while travelling: the episode ends with no harvest.
            self.reset_flag = True
            return (self.current_state, 0, self.reset_flag, {})

        # One time unit is spent at the site after arriving.
        self.time_elapsed += 1
        reward_old = self.farmer_reward
        if self.current_state in self.rewarding_sites:
            self.replenish_rewards()
            harvested = self.rewards[self.current_state] * self.HARVEST_FRACTION
            self.farmer_reward += harvested
            # ``rewards`` has an integer dtype, so this assignment drops the
            # fractional part of what remains in the patch.
            # NOTE(review): confirm the truncation is intended.
            self.rewards[self.current_state] = harvested

        if self.time_elapsed >= self.TIME_LIMIT:
            self.reset_flag = True
        return (self.current_state, self.farmer_reward - reward_old, self.reset_flag, {})

    def reset(self):
        """Start a new episode at site 0 and return that site index.

        Patch values go back to the block's starting values, the accumulated
        reward is zeroed and the clock is set to 2.
        """
        self.reset_flag = False
        self.rewards = self._initial_rewards()
        self.rewarding_sites = np.flatnonzero(self.rewards > 0)
        self.current_state = 0
        self.time_elapsed = 2
        self.farmer_reward = 0
        return self.current_state

    def render(self, mode="human"):
        """Print the current site and the reward accumulated so far.

        ``mode`` is accepted for API compatibility and is not used.
        """
        print("Current State:", self.current_state, "Current Total Reward:", self.farmer_reward)

    def close(self):
        """Close OpenCV windows if OpenCV has been loaded in this process."""
        import sys

        # ``cv2`` is not imported in the notebook's import cell, so referring
        # to it directly raised NameError. Nothing in this class opens a
        # window, so only clean up when some other code already loaded OpenCV.
        cv2_module = sys.modules.get("cv2")
        if cv2_module is not None:
            cv2_module.destroyAllWindows()
        return None

    def init_env_variables(self, first_point_angle=0):
        """Compute the site coordinates and the pairwise travel times.

        Sets ``octagon_points`` (one ``(x, y)`` row per site) and
        ``time_dist``, a dict mapping ``"<i>to<j>"`` to the Euclidean distance
        between sites ``i`` and ``j``.

        Args:
            first_point_angle: Angle in radians added to every site's angular
                position.
        """
        a = 1 / (2 * np.sin(np.pi / 8))  # circumradius of a unit-side octagon
        self.octagon_points = np.asarray(
            [
                (
                    a * np.sin(first_point_angle + n * np.pi / 4),
                    a * np.cos(first_point_angle + n * np.pi / 4),
                )
                for n in range(self.N_SITES)
            ]
        )
        self.time_dist = {}
        for i in range(self.N_SITES):
            for j in range(self.N_SITES):
                dist = np.linalg.norm(self.octagon_points[i] - self.octagon_points[j])
                self.time_dist[str(i) + "to" + str(j)] = dist

In [3]:
# Build the block-3 environment and run Stable-Baselines3's environment
# checker on it (with warnings enabled). `env` is reused by the rollout,
# training and evaluation cells below.
from stable_baselines3.common.env_checker import check_env
env = ForagingReplenishingPatches(block_type=3)
check_env(env, warn=True)

In [None]:
# Smoke test: play one episode with uniformly random actions and print
# (action, state, reward, done) for every step.
env.reset()
# 300 is only a safety cap; the loop normally ends when the env reports done.
for _step in range(300):
    # Sample from the env's own action space instead of hard-coding 8 actions.
    action = env.action_space.sample()
    # `info` rather than `_`: assigning to `_` shadows IPython's last-output variable.
    state, reward, done, info = env.step(action)
    print(action, state, reward, done)
    if done:
        break

In [10]:
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir dqn_forager_tensorboard
!ls

/content/drive/MyDrive/Sem 5/CS698
 190816_CS698_Assignment_1.ipynb   MDP_Foraging_DQN.ipynb
 assignment_3.ipynb		   MDP_Foraging_PPO.ipynb
'CS698 Project Group 7'		   ppo_forager_tensorboard
 dqn_forager_tensorboard	   saved_models


In [11]:
from stable_baselines3 import DQN

# Fixed seed so that a fresh "Restart & Run All" trains a comparable agent.
SEED = 0

env.reset()
model = DQN(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="./dqn_forager_tensorboard/",
    seed=SEED,
)
# Trailing ';' keeps the returned model's repr out of the cell output.
model.learn(total_timesteps=10**6);

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 10.9     |
|    n_updates        | 202920   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 104      |
|    ep_rew_mean      | 1.01e+04 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 7932     |
|    fps              | 950      |
|    time_elapsed     | 906      |
|    total_timesteps  | 862100   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 14.1     |
|    n_updates        | 203024   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 104      |
|    ep_rew_mean      | 1.01e+04 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 7

<stable_baselines3.dqn.dqn.DQN at 0x7fa509c71b10>

In [12]:
# save
%cd /content/drive/MyDrive/Sem 5/CS698
!mkdir saved_models
%cd saved_models

model.save("DQNmlpPolicy1M")

/content/drive/MyDrive/Sem 5/CS698
mkdir: cannot create directory ‘saved_models’: File exists
/content/drive/My Drive/Sem 5/CS698/saved_models


In [13]:
# Print the policy's text representation; the recorded output lists the
# layers of its q_net and q_net_target networks.
print(model.policy)

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=8, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=8, bias=True)
    )
  )
)


In [14]:
# Roll out a single episode with the trained model, rendering after every
# step, then leave the environment freshly reset.
obs = env.reset()
done = False
while not done:
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    env.render()
obs = env.reset()

Current State: 3 Current Total Reward: 63.0
Current State: 6 Current Total Reward: 140.4
Current State: 4 Current Total Reward: 217.8
Current State: 6 Current Total Reward: 301.5
Current State: 4 Current Total Reward: 378.0
Current State: 6 Current Total Reward: 467.1
Current State: 4 Current Total Reward: 542.7
Current State: 6 Current Total Reward: 637.2
Current State: 4 Current Total Reward: 711.9000000000001
Current State: 6 Current Total Reward: 810.9000000000001
Current State: 4 Current Total Reward: 884.7
Current State: 6 Current Total Reward: 988.2
Current State: 4 Current Total Reward: 1061.1000000000001
Current State: 6 Current Total Reward: 1168.2
Current State: 4 Current Total Reward: 1240.2
Current State: 6 Current Total Reward: 1350.9
Current State: 4 Current Total Reward: 1422.9
Current State: 6 Current Total Reward: 1536.3000000000002
Current State: 4 Current Total Reward: 1608.3000000000002
Current State: 6 Current Total Reward: 1724.4
Current State: 4 Current Total Re

In [15]:
# Display the model's parameters (as the cell's last expression the whole
# result is rendered).
# NOTE(review): the recorded output is a full dump of weight tensors; consider
# showing only parameter names and shapes to keep the notebook readable.
model.get_parameters()

{'policy': OrderedDict([('q_net.q_net.0.weight',
               tensor([[ 7.2962e-01,  1.0741e+00,  1.0223e+00,  6.7102e-01,  1.0572e+00,
                         3.4556e-01,  1.0934e+00,  2.8047e-01],
                       [ 6.3311e-01,  7.8729e-01,  8.8087e-01,  4.7836e-01,  1.1168e+00,
                         8.0082e-01,  5.6212e-01,  5.2122e-01],
                       [ 8.9332e-01,  9.7489e-01,  5.1817e-01,  1.3802e+00,  2.2610e-01,
                         9.2036e-01,  1.6358e-01,  3.1591e-01],
                       [ 8.1712e-01,  4.0101e-01,  8.6494e-01,  9.6137e-01,  1.1411e+00,
                         4.3268e-01,  5.8420e-01,  9.1820e-01],
                       [ 5.7680e-01,  8.8425e-01,  6.0767e-01,  8.1532e-01,  2.5864e-01,
                         6.3461e-01,  7.7287e-01,  2.0983e-01],
                       [ 4.0546e-01,  1.1411e+00,  8.1375e-01,  6.5960e-01,  6.2882e-01,
                         9.9988e-01,  1.1415e+00,  8.5557e-01],
                       [ 2.8968e-

In [None]:
!tensorboard --logdir ./dqn_forager_tensorboard/

2021-11-07 13:00:12.505587: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-11-07 13:00:12.505682: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (e94e3b02a9d3): /proc/driver/nvidia/version does not exist

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.6.0 at http://localhost:6006/ (Press CTRL+C to quit)
