In [None]:
conda install tensorflow=2.0 python=3.7

In [None]:
import random
from typing import List
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
#np.random.seed(42)
#tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

from IPython.display import clear_output, display
from time import sleep



class Person:
    """Mutable record describing one simulated inhabitant.

    Every field starts at 0; the environment that creates the person is
    expected to fill in the real values afterwards.
    """

    def __init__(self):
        # Location ids of the places this person moves between.
        self.work_loc = self.home_loc = self.store_loc = 0
        # 0 means never infected; otherwise a counter that callers advance.
        self.has_virus_num_steps = 0
        self.age = 0
        # 0/1 flags.
        self.is_dead = self.is_contagious = 0
        # NOTE(review): presumably symptom severity ("synonyms" looks like a
        # typo for "symptoms"); the name is kept because other code reads it.
        self.degree_of_synonyms = 0
        self.day_of_week_go_shopping = 0
        self.does_follow_lockdown = 0
        # 0/1 flag.
        self.went_to_store = 0
        self.will_go_to_hospital = 0
        
class Environment:
    """Epidemic simulation on a grid of locations, steered by lockdown actions.

    Locations are integer ids in ``range(NUM_LOCATIONS)``.  ``reset`` draws
    homes from 0-399, workplaces from 400-438 and stores from 439-440;
    location 400 is also used as the hospital.  ``people_current_location``
    maps every id to the list of people currently there.

    Actions accepted by ``step`` / ``move_people``:
        0 -- no restriction, everybody follows the home/work/store routine.
        1 -- everybody who is not hospitalised is sent home.
        2 -- people with ``does_follow_lockdown < 8`` go home, the others
             keep their routine.
        3 -- like 2, but compliant people make one store trip on days that
             are a multiple of their ``day_of_week_go_shopping``.
    """

    NUM_LOCATIONS = 441  # 21 x 21, which is how locations are laid out when plotted
    NUM_PEOPLE = 1200
    NUM_INITIALLY_INFECTED = 2
    GRID_WIDTH = 21
    STEPS_PER_DAY = 3
    MAX_STEPS = 10000

    def __init__(self):
        self.reset()

    def reset(self) -> np.ndarray:
        """Rebuild the population from scratch and return the first observation."""
        self.current_steps = 0
        self.current_day = 0
        self.people = []
        self.people_current_location = {
            loc: [] for loc in range(self.NUM_LOCATIONS)
        }
        self.hospital_loc = 400

        for _ in range(self.NUM_PEOPLE):
            person = Person()
            person.home_loc = random.randint(0, 399)
            person.work_loc = random.randint(400, 438)
            person.store_loc = random.randint(439, 440)
            person.age = random.randint(0, 100)
            person.day_of_week_go_shopping = random.randint(1, 7)
            person.does_follow_lockdown = random.randint(0, 9)
            person.will_go_to_hospital = random.randint(0, 9)
            self.people.append(person)

            # 0-7 -> starts at home, 8 -> at work, 9-10 -> at the store.
            starting_loc = random.randint(0, 10)
            if starting_loc < 8:
                start = person.home_loc
            elif starting_loc < 9:
                start = person.work_loc
            else:
                person.went_to_store = 1
                start = person.store_loc
            self.people_current_location[start].append(person)

        # Seed the outbreak with the first few people created.
        for carrier in self.people[:self.NUM_INITIALLY_INFECTED]:
            carrier.has_virus_num_steps = 1
            carrier.is_contagious = 1
            carrier.degree_of_synonyms = random.randint(1, 10)

        # Per-step counts and their running totals; index 0 is the initial state.
        self.num_people_infected = [self.NUM_INITIALLY_INFECTED]
        self.total_num_people_infected = [self.NUM_INITIALLY_INFECTED]
        self.num_people_died = [0]
        self.total_num_people_died = [0]
        self.num_people_recovered = [0]
        self.total_num_people_recovered = [0]

        return self.get_observation()

    def get_observation(self) -> np.ndarray:
        """Return ``[current_steps, is_dead, is_contagious, ...]`` as a 1-D array.

        People are sorted by ``(is_contagious, is_dead)`` first, so the vector
        does not identify individuals.
        """
        observations = [self.current_steps]
        ordered = sorted(self.people, key=lambda p: (p.is_contagious, p.is_dead))
        for person in ordered:
            observations.append(person.is_dead)
            observations.append(person.is_contagious)
        return np.array(observations)

    def get_actions(self) -> List[int]:
        """Return the list of valid action ids."""
        return [0, 1, 2, 3]

    def is_done(self) -> bool:
        """Return True once more than ``MAX_STEPS`` steps have been taken."""
        return self.current_steps > self.MAX_STEPS

    def step(self, action: int, iteration: int = 0, episode: int = 0) -> tuple:
        """Advance the simulation by one step.

        Args:
            action: one of ``get_actions()``.
            iteration: training iteration, used only for progress output.
            episode: episode index; progress is printed only when it is 9.

        Returns:
            ``(observation, reward, done, info)``.  The reward is +1 for every
            person with ``has_virus_num_steps == 0`` and -1 for everybody
            else (currently infected, recovered or dead).

        Raises:
            Exception: if the step limit has been exceeded.
        """
        self.current_steps += 1
        self.current_day = self.current_steps // self.STEPS_PER_DAY + 1

        if self.is_done():
            raise Exception("Game is over")

        self.move_people(action)

        reward = sum(
            1 if person.has_virus_num_steps == 0 else -1
            for person in self.people
        )

        obs = self.get_observation()
        done = self.is_done()
        info = {}

        if episode == 9:
            self._report_progress(iteration)

        return obs, reward, done, info

    def move_people(self, action):
        """Progress every infection, relocate everybody, then spread the virus.

        Appends this step's infected/died/recovered counts (and the updated
        running totals) to the history lists.
        """
        num_people_died = 0
        num_people_recovered = 0
        next_locations = {loc: [] for loc in range(self.NUM_LOCATIONS)}

        for current_loc, occupants in self.people_current_location.items():
            for person in occupants:
                if person.has_virus_num_steps > 0:
                    person.has_virus_num_steps += 1

                if person.has_virus_num_steps > 42 and person.degree_of_synonyms > 8:
                    # Dead people are not carried over to the new location map.
                    num_people_died += 1
                    person.is_dead = 1
                    continue

                if person.has_virus_num_steps > 42 and person.is_contagious == 1:
                    num_people_recovered += 1
                    person.is_contagious = 0
                elif (person.is_contagious == 1
                        and person.has_virus_num_steps > 15
                        and person.degree_of_synonyms > 6
                        and person.will_go_to_hospital > 4):
                    # Only people who are still sick are hospitalised.  Without
                    # the is_contagious check, recovered people matched this
                    # branch on every later step and never left the hospital.
                    next_locations[self.hospital_loc].append(person)
                    continue

                destination = self._next_location(person, current_loc, action)
                next_locations[destination].append(person)

        num_people_infected = self._spread_infection(next_locations)
        self.people_current_location = next_locations
        self._record_step(num_people_infected, num_people_died, num_people_recovered)

    def _next_location(self, person, current_loc, action):
        """Return where ``person`` goes under ``action``; updates ``went_to_store``."""
        if action == 1:
            return person.home_loc

        complies = person.does_follow_lockdown < 8
        if action == 2 and complies:
            return person.home_loc
        if action == 3 and complies:
            if self.current_day % person.day_of_week_go_shopping != 0:
                person.went_to_store = 0
                return person.home_loc
            if person.went_to_store == 0:
                person.went_to_store = 1
                return person.store_loc
            return person.home_loc

        # Unrestricted routine: home -> work -> (store on the shopping day) -> home.
        if person.home_loc == current_loc:
            return person.work_loc
        # NOTE(review): current_day keeps growing past 7, so this equality can
        # only hold during the first week -- confirm whether a modulo was meant.
        if (person.work_loc == current_loc
                and person.day_of_week_go_shopping == self.current_day):
            return person.store_loc
        return person.home_loc

    def _spread_infection(self, locations):
        """Infect at most one susceptible person per location that holds a carrier.

        Returns the number of new infections.
        """
        newly_infected = 0
        for loc, occupants in locations.items():
            if not any(p.is_contagious == 1 for p in occupants):
                continue
            for person in occupants:
                if person.has_virus_num_steps == 0 and self._catches_virus(person, loc):
                    person.has_virus_num_steps = 1
                    person.is_contagious = 1
                    person.degree_of_synonyms = random.randint(1, 10)
                    newly_infected += 1
                    break
        return newly_infected

    def _catches_virus(self, person, loc):
        """Return True if a susceptible ``person`` at ``loc`` gets infected.

        The random draws stay first in each clause so the sequence of RNG
        calls is the same as before this logic was extracted.
        """
        # NOTE(review): `randint(0, 1000) >= 1` is true ~99.9% of the time for
        # people who work at the hospital; `< 1` may have been intended.
        return (
            (random.randint(0, 9) >= 8 and person.store_loc == loc)
            or (random.randint(0, 9) >= 5 and person.work_loc == loc
                and loc != self.hospital_loc)
            or (random.randint(0, 1000) >= 1 and person.work_loc == loc
                and loc == self.hospital_loc)
            or person.home_loc == loc
        )

    def _record_step(self, infected, died, recovered):
        """Append one step's counts and the updated running totals."""
        self.num_people_infected.append(infected)
        self.total_num_people_infected.append(
            infected + self.total_num_people_infected[-1])
        self.num_people_died.append(died)
        self.total_num_people_died.append(died + self.total_num_people_died[-1])
        self.num_people_recovered.append(recovered)
        self.total_num_people_recovered.append(
            recovered + self.total_num_people_recovered[-1])

    def _report_progress(self, iteration):
        """Print the latest counters; on iterations 0 and 10 also plot the grid."""
        print("iteration", iteration, "Steps", self.current_steps,
              "infected", self.num_people_infected[-1],
              "died", self.num_people_died[-1],
              "recovered", self.num_people_recovered[-1],
              "total infected", self.total_num_people_infected[-1],
              "total died", self.total_num_people_died[-1],
              "total recovered", self.total_num_people_recovered[-1])

        if iteration not in (0, 10):
            return

        xs, ys, colors = [], [], []
        for loc, occupants in self.people_current_location.items():
            row, col = divmod(loc, self.GRID_WIDTH)
            for person in occupants:
                xs.append(row + 1)
                ys.append(col + 1)
                if person.has_virus_num_steps == 0:
                    colors.append('g')   # never infected
                elif person.is_contagious == 1:
                    colors.append('r')   # currently contagious
                else:
                    colors.append('y')   # infected earlier, no longer contagious

        plt.scatter(xs, ys, s=75, c=colors, alpha=.5)
        plt.xlim(0, self.GRID_WIDTH + 1)
        plt.xticks([])
        plt.ylim(0, self.GRID_WIDTH + 1)
        plt.yticks([])
        plt.show()
                        

            
            
            
            
            
            
            
            
class Agent:
    """Baseline agent that plays uniformly random actions and tallies the reward."""

    def __init__(self):
        self.total_reward = 0.0
        # NOTE(review): not read anywhere in this class; the original comment
        # equated it with env.observation_space.shape[0].
        self.n_inputs = 4

    def step(self, env: "Environment", iteration: int = 0, episode: int = 0):
        """Play one random action in ``env`` and add its reward to the total.

        Args:
            env: environment exposing ``get_actions()`` and
                ``step(action, iteration, episode)``.
            iteration: forwarded to ``env.step``.
            episode: forwarded to ``env.step``.

        ``env.step`` requires ``iteration`` and ``episode``; they used to be
        omitted here, which raised ``TypeError`` on the first call.
        """
        actions = env.get_actions()
        obs, reward, done, info = env.step(random.choice(actions), iteration, episode)
        self.total_reward += reward

        print(reward, self.total_reward)

"""
if __name__ == "__main__":
    env = Environment()
    agent = Agent()

    while not env.is_done():
        agent.step(env)

    print("Total reward got: %.4f" % agent.total_reward)
"""


def play_one_step(env, obs, model, loss_fn, iteration, episode):
    """Choose one action, take it in ``env`` and return the loss gradients.

    A uniform random vector of shape (1, 4) competes with the model output:
      * while ``(10 - iteration) > episode`` the random argmax is always used;
      * afterwards the model's argmax is used only when its probability is
        strictly greater than the largest random value.
    Whichever vector wins becomes the target of ``loss_fn``; the prediction
    is always the model output.  The module-level counters ``random_picks``
    and ``model_picks`` record which source supplied the action.

    Returns:
        ``(obs, reward, done, grads, action)`` with ``action`` as a plain int
        and ``grads`` taken w.r.t. ``model.trainable_variables``.
    """
    global random_picks
    global model_picks

    with tf.GradientTape() as tape:
        model_probas = model(obs[np.newaxis])
        model_class = tf.math.argmax(
            model_probas, axis=1, output_type=tf.dtypes.int32)[0]

        rand_probas = tf.random.uniform([1, 4])
        rand_class = tf.math.argmax(
            rand_probas, axis=1, output_type=tf.dtypes.int32)[0]

        # Forced exploration first; otherwise the random draw still wins
        # unless the model's best probability beats it.
        use_random = (10 - (iteration)) > episode
        if not use_random:
            use_random = not (
                model_probas[0, model_class] > rand_probas[0, rand_class])

        if use_random:
            action, y_target = rand_class, rand_probas
            random_picks += 1
        else:
            action, y_target = model_class, model_probas
            model_picks += 1

        pred = model_probas
        tape.watch(y_target)
        tape.watch(pred)
        loss = tf.reduce_mean(loss_fn(y_target, pred))

    grads = tape.gradient(loss, model.trainable_variables)
    next_obs, reward, done, info = env.step(int(action), iteration, episode)
    return next_obs, reward, done, grads, int(action)

def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn, iteration):
    """Run ``n_episodes`` episodes of at most ``n_max_steps`` steps each.

    The environment is reset before every episode.  How often each of the
    four actions was taken is tallied in the module-level
    ``action_history_map`` under the key ``iteration``.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one inner list per
        episode, holding the per-step rewards and per-step gradients.
    """
    all_rewards, all_grads = [], []

    # Fresh tally for this iteration, one slot per action id.
    action_counts = {action_id: 0 for action_id in range(4)}
    action_history_map[iteration] = action_counts

    for episode in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(n_max_steps):
            obs, reward, done, grads, action = play_one_step(
                env, obs, model, loss_fn, iteration, episode)
            action_counts[action] += 1
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)

    return all_rewards, all_grads

def discount_rewards(rewards, discount_rate):
    """Return the discounted reward-to-go for every step of one episode.

    ``result[t] = rewards[t] + discount_rate * result[t + 1]``, computed from
    the last step backwards.  The accumulation used to be commented out, so
    ``discount_rate`` was silently ignored and the rewards came back as-is.

    Args:
        rewards: sequence of per-step rewards (may be empty).
        discount_rate: multiplicative discount applied per step.

    Returns:
        1-D float ``np.ndarray`` with the same length as ``rewards``.
    """
    # Float dtype: integer rewards with a fractional discount_rate would
    # otherwise fail on the in-place addition below.
    discounted = np.array(rewards, dtype=float)
    for step in range(len(discounted) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Standardise rewards across all episodes to zero mean and unit variance.

    Each episode's rewards are first passed through ``discount_rewards``;
    the mean and standard deviation are then taken over all steps of all
    episodes together.

    Args:
        all_rewards: list of per-episode reward sequences.
        discount_rate: forwarded to ``discount_rewards``.

    Returns:
        List with one normalised array per episode; ``[]`` when
        ``all_rewards`` is empty.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    if not all_discounted_rewards:
        # np.concatenate raises ValueError on an empty list.
        return []

    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All rewards identical: dividing by zero would yield NaN/inf, so
        # skip the scaling and return the (all-zero) centred rewards.
        reward_std = 1.0

    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]


for z in range(5):
    # Five independent training runs.  Every run rebuilds the environment,
    # the model, the optimizer and the bookkeeping globals that
    # play_one_step / play_multiple_episodes update.
    action_history_map = {}
    random_picks = 0
    model_picks = 0

    n_iterations = 11
    n_episodes_per_update = 15
    n_max_steps = 90

    discount_rate = 1  # 0.95

    # `learning_rate` is the documented argument name; `lr` is a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate=0.01)
    loss_fn = keras.losses.categorical_crossentropy

    env = Environment()
    rewards = []

    # Dense network from the flat observation vector to 4 action probabilities.
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation="relu",
                           input_shape=[len(env.get_observation())]),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(4, activation="softmax"),
    ])

    action_history = []

    for iteration in range(n_iterations):
        all_rewards, all_grads = play_multiple_episodes(
            env, n_episodes_per_update, n_max_steps, model, loss_fn, iteration)
        total_rewards = sum(map(sum, all_rewards))

        # Mean total reward per episode for this iteration.
        rewards.append(total_rewards / n_episodes_per_update)

        # Raw rewards are used as gradient weights; no discounting or
        # normalisation is applied here.
        all_final_rewards = all_rewards

        # For every trainable variable, average the reward-weighted gradient
        # over all steps of all episodes, then apply a single update.
        all_mean_grads = []
        for var_index in range(len(model.trainable_variables)):
            mean_grads = tf.reduce_mean(
                [final_reward * all_grads[episode_index][step][var_index]
                 for episode_index, final_rewards in enumerate(all_final_rewards)
                     for step, final_reward in enumerate(final_rewards)], axis=0)
            all_mean_grads.append(mean_grads)
        optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
        print(action_history_map, "R", random_picks, "M", model_picks)

    print(action_history_map)
    plt.figure(figsize=(8, 4))
    plt.plot(rewards)
    plt.xlabel("Iteration", fontsize=14)
    plt.ylabel("Sum of rewards", fontsize=14)
    plt.show()