In [1]:
from gym import Env
from gym.envs.registration import EnvSpec
import numpy as np
from gym.spaces import MultiDiscrete,Box
from graph_tool.all import *
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from makegraph import *
import matplotlib.pyplot as plt

In [2]:
def simulatepandemic(self,actions):
    action = np.divide(actions,np.sum(actions)) #relative availability of vaccine is calculated
    update_state(self.g,action)
    erate = extract_economy(self.g)
    self.state = graph_to_matrix(self.g)
    self.timestep += 1
    return self.state,erate

def initializepandemic(self):
    self.g = make_graph(self.size,self.distribution)
    self.state = graph_to_matrix(self.g)
    return self.state

In [3]:

class PanEnv(Env):
    def __init__(self,size,distribution):
        self.size = size #population size
        self.spec = EnvSpec('GymEnv-v0')
        self.action_space = MultiDiscrete(nvec=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10])
        self.observation_space = Box(low=-np.inf,high=np.inf,shape=[self.size,7])
        
        
        self.distribution = distribution #age distribution, country name as string
        self.g = make_graph(self.size,self.distribution)
                
        #state observation as matrix
        self.state = graph_to_matrix(self.g)
        
        self.timestep = 1
    def step(self,actions):
        result = simulatepandemic(self,actions)
        state = result[0]
        E_t = result[1]
        #observation (object): agent's observation of the current environment
        #r ans s taken from nature paper
        r = 8
        s = 5
        
        total_population = np.sum(state[:,0])+np.sum(state[:,1])+np.sum(state[:,2])+np.sum(state[:,3])+np.sum(state[:,4])+np.sum(state[:,5])
        
        active_cases = np.sum(state[:,1])+np.sum(state[:,4])
        #A_t and D_t from nature paper
        
        A_t = (active_cases/total_population) * 100
        
        D_t = np.sum(state[:,5])/total_population
        self.reward_1 =  E_t * np.exp(-r * A_t) - s * D_t
        reward = self.reward_1
        
       
        #print(reward)
        #reward (float) : amount of reward returned after previous action
        #negative reward: punishment > change weights a lot, push away from causing weights, positive rewards pull
        #do reward compared to reward from previous step
        #naive example: reward = -sum(infected) > we want a reward where the cumulative sum of infections until end
        #is minimized
        #exp. solution: store information in self, summed infections, normalized by time
        done = False
        if((self.timestep > 19) or np.sum(self.state[:,1])==0):
            done = True
        #done (bool): episode is done after 20 timesteps or when there are no longer infected agents
        info = {}
        #info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)'''
        return state, reward, done, info
    def reset(self):
        #returns initial state
        self.state = initializepandemic(self)
        return self.state

In [4]:
from stable_baselines3.common.env_checker import check_env
#check_env(Env)

In [5]:
#matrix representation:
#[S, I, R, Sv, Iv, D]

In [6]:
env = PanEnv(size=1000,distribution='Japan')
env.reset()

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [7]:
actions = np.full(20,1)

In [8]:
env.step(actions)

(array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 1.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 -0.004264393394687684,
 False,
 {})

In [9]:
print(np.sum(env.state[:,0]),np.sum(env.state[:,1]),np.sum(env.state[:,2]),np.sum(env.state[:,3]),np.sum(env.state[:,4]),)

949.0 9.0 0.0 41.0 0.0


In [10]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import BaseCallback
#tensorflow for graphs
import tensorflow as tf
from stable_baselines3.common.evaluation import evaluate_policy
env = PanEnv(size=100,distribution='Japan')
log_dir = "./gym/"
env = Monitor(env,log_dir)
model = PPO("MlpPolicy",env,verbose=1)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

2021-12-03 12:11:01.738919: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-03 12:11:01.738945: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Using cpu device
Wrapping the env in a DummyVecEnv.


In [11]:
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:0.40 +/- 0.73


In [12]:

# Parallel environments

env = DummyVecEnv([lambda: PanEnv(size=1000,distribution='Japan')])

In [13]:
#the class is needed if we want to sample more frequently
'''
https://github.com/DLR-RM/stable-baselines3/issues/309

^^^^^^ this was very helpful ^^^^^^
'''
class TensorboardCallback(BaseCallback):
    """
    Custom callback for plotting additional values in tensorboard.
    """

    def __init__(self, verbose=1):
        super(TensorboardCallback, self).__init__(verbose)
        self.cum_rew_1 = 0
        #self.rew_1 = 0
        
    def _on_rollout_end(self) -> None:
        self.logger.record("rollout/cum_rew_1", self.cum_rew_1)
        #self.logger.record("rollout/rew_1", self.rew_1)
        # reset var once recorded
        self.cum_rew_1 = 0
        self.rew_1 = 0
        

    def _on_step(self) -> bool:
        # Log scalar value (here a random variable)
        #value = np.random.random()
        #self.logger.record('random_value', value)
        
        #self.logger.dump(self.num_timesteps)
        self.cum_rew_1 += self.training_env.get_attr("reward_1")[0]
        #self.rew_1 = self.training_env.get_attr("reward_1")[0]
        return True

In [14]:
model = PPO("MlpPolicy", env, verbose=1,tensorboard_log="./ppo_name_tensorboard/") #multilayer
rewards_callback = TensorboardCallback()
model.learn(total_timesteps=25000,tb_log_name="first_run",callback=rewards_callback) #training loop
#model.save("ppo_cartpole")

#del model # remove to demonstrate saving and loading

#model = PPO.load("ppo_cartpole")
#https://stable-baselines.readthedocs.io/en/master/guide/tensorboard.html?highlight=tensorboard
#^^^ tensorboard for displaying things, but I am stuck, upd: i guess it works now ^^^

Using cpu device
Logging to ./ppo_name_tensorboard/first_run_46


KeyboardInterrupt: 

In [None]:
#after training, reward should have improved
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
model.save("ppo_1")

In [None]:
gym.spaces??

In [None]:





mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
env = PanEnv(size=1000,distribution='Japan')
obs = env.reset()
done = False
obs_storage = []
rewards_storage = []
action_storage = []
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    obs_storage.append(obs)
    rewards_storage.append(rewards)
    action_storage.append(action)

In [None]:
def get_sums(obs,index):
    sums = []
    for i in np.arange(0,len(obs)):
        sums.append(np.sum(obs_storage[i][:,index]))
    return sums
def show_pandemic(obs):
    all_sums = []
    for i in np.arange(0,7):
        all_sums.append(get_sums(obs,i))
    sval,ival,rval,vsval,vival,dval,economy_list = all_sums[0],all_sums[1],all_sums[2],all_sums[3],all_sums[4],all_sums[5],all_sums[6]
    tlist = np.arange(0,len(obs))
    plt.figure()
    plt.plot(tlist, sval, label = "S")
    plt.plot(tlist, rval, label = "R")
    plt.plot(tlist, vsval, label = "Sv")
    plt.plot(tlist, vival, label = "Iv")
    plt.plot(tlist, ival, label = "I")
    plt.plot(tlist, dval, label = "D")
    plt.plot(tlist,economy_list, label = "Economy")
    plt.legend()
    plt.show()
def show_actions(actions):
    fo i in np.arange(0,len(obs)):
        
    n, x, _ = plt.hist(action_storage[0], bins=np.linspace(1, 21, 20), 
                   histtype=u'step', density=True)  
    plt.show()

In [None]:
print(action_storage[0])

In [None]:
show_pandemic(obs_storage)
n, x, _ = plt.hist(action_storage[0], bins=np.linspace(1, 21, 20), 
                   histtype=u'step', density=True)  
plt.show()

In [None]:
#split into learning and testing
model.learn(total_timesteps = 5000)
# store/accumulate rewards