In [None]:
import numpy as np
import math
import helicopter
import tqdm
import random
import matplotlib.pyplot as plt
import statistics as st
import time
import copy 

In [None]:
class ITERABLE_STEPS():
    """Single-pass iterator producing one simulation argument tuple per action.

    Each item is ``(Copy(env), action, H)``: a fresh copy of the environment,
    the candidate action to try first, and the heuristic to follow afterwards.
    The object is its own iterator, so it can only be consumed once.
    """

    def __init__(self,env, H, actions):
        # Attribute names (including the ``enviroment`` spelling) are kept
        # because they are part of the object's visible state.
        self.enviroment = env
        self.H_ref = H
        self.actions = actions
        self.index = 0               # position of the next action to hand out
        self.TOP = len(actions)      # one past the last valid index

    def __iter__(self):
        return self

    def __next__(self):
        current = self.index
        if current == self.TOP:
            raise StopIteration
        # Advance before building the tuple, exactly like the original.
        self.index = current + 1
        return (Copy(self.enviroment), self.actions[current], self.H_ref)

    def __del__(self):
        # Nothing to release; kept so the class surface stays the same.
        pass

In [None]:
class Policy():
    """Table of the controls recorded for every encoded state.

    ``self.policy`` maps a state key to the 4-tuple
    ``(controls, q_values, action_freqs, total_freq)`` where the three lists
    are index-aligned and ``total_freq`` counts every update of that state.
    """

    def __init__(self, ):
        self.policy = dict()

    def new(self,key,control,value):
        """Record that ``control`` was chosen in state ``key`` with q-value ``value``.

        * Unknown state: a new entry is created with frequency 1.
        * Known state, known control: the stored q-value becomes the average
          of the stored value and ``value`` (a running half/half average),
          and the control's frequency is incremented.
        * Known state, new control: the control is appended with frequency 1.

        The state lookup is an explicit membership test. The previous bare
        ``except`` treated *any* error raised while updating an existing
        state as "new state" and silently overwrote the accumulated entry.
        """
        print(key)
        if key not in self.policy:
            # The key state is new, so its values are created:
            # set of controls, set of q_values, frequency of each control,
            # total frequency of being in this state.
            print("Nuevo estado agregado")
            self.policy[key] = ([control],[value],[1],1)
            print(self.policy[key])
            return

        # The key state has already been seen.
        controls, values, actions_freq, total_freq = self.policy[key]
        if control in controls:
            print("Estado con acción ya escogida")
            print(self.policy[key])
            i = controls.index(control)
            values[i] = (values[i] + value) / 2
            actions_freq[i] += 1
        else:
            # The state has not seen this control yet: add it with the current value.
            print("Estado con acción aún no vista")
            print(self.policy[key])
            controls.append(control)
            values.append(value)
            actions_freq.append(1)
        total_freq += 1
        self.policy[key] = (controls, values, actions_freq, total_freq)
        print(self.policy[key])

    def __repr__(self):
        s = "Q-Table with {0} states".format(len(self.policy.keys()))
        for key in self.policy.keys():
            s+="\nState {0} controls:{1}".format(key,self.policy[key][0])
        return s

    def call(self, key,mode):
        """Return a control for state ``key``, or ``False`` when none is available.

        ``mode == "stochastic"`` samples a recorded control with probability
        equal to its relative frequency; ``mode == "deterministic"`` returns
        the control with the largest stored q-value. An unknown state or an
        unknown mode yields ``False``.

        Only the state lookup is guarded; errors in the selection itself are
        no longer swallowed.
        """
        print("Modo:",mode)
        print("State:",key)
        try:
            entry = self.policy[key]
        except (KeyError, TypeError):
            # KeyError: state never recorded. TypeError: unhashable key.
            return False
        (controls,q_values,freqs,total_freq) = entry
        print("Encontre el estado:",entry)
        if mode=="stochastic":
            print("Seleccion estocastica de controles")
            freq_r = np.array(freqs) / total_freq
            return np.random.choice(controls, 1, p=freq_r)[0]
        if mode=="deterministic":
            print("selección determinista de controles")
            return controls[q_values.index(max(q_values))]
        return False
    

In [None]:
def rollout(env, H,observation,K,A,N_SAMPLES,vision,epsilon):
    """Pick the next action by one-step lookahead plus simulated heuristic play.

    Every action returned by ``env.available_actions(pos)`` is evaluated with
    ``Simulation`` on its own copy of ``env``. With probability of roughly
    ``1 - epsilon`` the best-valued action is returned (exploitation);
    otherwise one of the evaluated actions is drawn uniformly (exploration).

    Returns:
        tuple: ``(action, q_value)`` for the selected action.
    """
    _, pos, _ = observation
    q_values = {}
    for step_args in ITERABLE_STEPS(env, H, env.available_actions(pos)):
        candidate, estimate = Simulation(step_args, K, A, N_SAMPLES, vision)
        q_values[candidate] = estimate

    # The random draw happens after all simulations, as in the original.
    if random.random() > epsilon:
        print("Explotation")
        # Maximization over all possible action controls (max u_xk)
        action_max = max(q_values, key=q_values.get)
    else:
        print("Exploration")
        action_max = random.choice(list(q_values))
    return (action_max, q_values[action_max])

In [None]:
def Simulation(simulation_args,K,A,N_SAMPLES,vision):
    """Estimate the q-value of one candidate action by sampled heuristic play.

    Args:
        simulation_args: ``(env_c, action, H)`` - an environment copy that
            this function is free to step, the action to apply first, and
            the heuristic ``H(observation, vision)`` followed afterwards.
        K: number of heuristic steps simulated after the first action.
        A: discount factor; the i-th heuristic step (1-based) is weighted
            by ``A ** i``.
        N_SAMPLES: number of sampled trajectories to average over.
        vision: forwarded unchanged to ``H``.

    Returns:
        tuple: ``(action, mean_q_value)`` where each sample's q-value is the
        undiscounted cost of the first step plus the discounted heuristic costs.

    Every sample restarts from the same checkpoint, so the discount and the
    observation are reset per sample. Previously ``alpha`` kept shrinking
    across samples and the heuristic's first move of samples 2..N used the
    last observation of the previous sample.
    """
    env_c, action, H = simulation_args
    # Make first step (1-step-lookahead, deterministic)
    first_observation, g_k, done, info = env_c.step(action)
    checkpoint = env_c.make_checkpoint()
    total_q_value = 0
    # Follow the heuristic for K steps, N_SAMPLES times
    for n_samples in range(N_SAMPLES):
        env_c.load_checkpoint(checkpoint)
        # NOTE(review): assumes the observation returned by step() is not
        # mutated in place by later steps - confirm against the environment.
        observation = first_observation
        alpha = A
        t_cost = 0
        for r_iter in range(K):
            action_H = H(observation, vision)
            observation, cost, done, info = env_c.step(action_H)
            t_cost = t_cost + cost * alpha
            alpha = alpha * A
        # q-value for one sample, added to the running total
        total_q_value = total_q_value + (g_k + t_cost)
    # Average q-value over all samples
    total_q_value = total_q_value / N_SAMPLES
    return (action, total_q_value)

In [None]:
def Heuristic(observation,vision):
    """Greedy base heuristic: move towards the zone with the most fire cells.

    The grid is converted and padded with ``ExpandGrid``, the
    ``(2*vision+1)``-sided window around the agent is cut out with
    ``get_neighborhood``, and fire cells are counted in eight zones of that
    window. The side zones ("up", "down", "left", "right") span the full
    width/height of the window and therefore overlap the corner zones.

    Args:
        observation: ``(grid, pos, remain_steps)``; only ``grid`` and ``pos``
            are used.
        vision: how many cells the agent sees in each direction.

    Returns:
        int: action from the 3x3 layout ``((1,2,3),(4,5,6),(7,8,9))`` for
        the zone with the most fire. Ties are broken with ``random.choice``.
        Action 5 (the centre) is never returned.

    Note:
        The fire symbol is read from the module-level ``env`` (``env.fire``),
        which must exist before this function is called.
    """
    grid, pos, remain_steps = observation

    # Add borders around the grid according to the vision range, so the
    # window never falls outside the array (e.g. vision 2 adds 2 cells per side).
    Pad_grid = ExpandGrid(grid, vision)

    # Window centred on the agent's current position.
    neighborhood = get_neighborhood(Pad_grid, pos, vision)

    side = neighborhood.shape[0]
    before = slice(0, side - (vision + 1))   # rows above / columns left of the agent
    after = slice(side - vision, side)       # rows below / columns right of the agent
    everything = slice(None)

    # (zone name, row slice, column slice, action). The order is the
    # original dictionary insertion order, which fixes how ties are listed.
    zones = (
        ("up",         before,     everything, 2),
        ("up_left",    before,     before,     1),
        ("up_right",   before,     after,      3),
        ("down",       after,      everything, 8),
        ("down_left",  after,      before,     7),
        ("down_right", after,      after,      9),
        ("left",       everything, before,     4),
        ("right",      everything, after,      6),
    )

    burned_densities = {}
    zone_action = {}
    for name, rows, cols, zone_move in zones:
        # Each zone is counted on its own slice. "up_left" used to be
        # counted on the "up" slice by mistake.
        burned_densities[name] = Count_Burned_Trees(env, neighborhood[rows, cols])
        zone_action[name] = zone_move

    most_burned = max(burned_densities.values())
    max_list = [name for name, burned in burned_densities.items() if burned == most_burned]

    # Only draw a random number when there is an actual tie, so the random
    # stream is consumed exactly as before.
    chosen = random.choice(max_list) if len(max_list) > 1 else max_list[0]
    return zone_action[chosen]

In [None]:
def Count_Burned_Trees(env,zone):
    """Return how many cells of the 2-D ``zone`` equal ``env.fire``."""
    n_rows, n_cols = zone.shape[0], zone.shape[1]
    return sum(
        1
        for r in range(n_rows)
        for c in range(n_cols)
        if zone[r][c] == env.fire
    )

In [None]:
def get_neighborhood(grid,pos,vision):
    """Cut the window of side ``2*vision + 1`` whose top-left corner is ``pos``.

    ``grid`` is expected to be already padded by ``vision`` cells per side,
    so that ``pos`` (given in unpadded coordinates) ends up at the centre
    of the returned window.
    """
    top, left = pos[0], pos[1]
    span = 1 + vision * 2
    return grid[top:top + span, left:left + span]

In [None]:
def ExpandGrid(grid,vision):
    """Collapse a channel-encoded grid to integer codes and pad it by ``vision``.

    Args:
        grid: array indexed as ``grid[row][col][channel]`` with at least
            two channels.
        vision: number of padding cells added on every side (may be 0).

    Returns:
        numpy.ndarray: ``int16`` array of shape
        ``(rows + 2*vision, cols + 2*vision)``. A cell is coded 0 when
        channel 0 equals 1, else 1 when channel 1 equals 1, else 2.
        Padding cells are 0, i.e. indistinguishable from code 0.
    """
    rows, cols = grid.shape[0], grid.shape[1]

    # Default code 2, then overwrite so that channel 0 wins over channel 1,
    # matching the original if/elif priority.
    codes = np.full((rows, cols), 2, dtype=np.int16)
    codes[grid[..., 1] == 1] = 1
    codes[grid[..., 0] == 1] = 0

    padded = np.zeros((rows + 2 * vision, cols + 2 * vision), dtype=np.int16)
    # Explicit end indices: ``vision:-vision`` is an empty slice when
    # vision == 0, which made the assignment fail.
    padded[vision:vision + rows, vision:vision + cols] = codes
    return padded

In [None]:
def Copy(env):
    """Build an independent environment that mirrors ``env``.

    A new ``helicopter.EnvMakerForestFire`` is constructed from the source
    environment's parameters, then the mutable run state (grid, reward and
    hit totals, remaining moves) is deep-copied onto it.

    NOTE(review): the cell symbols (tree/empty/fire) are not forwarded to
    the constructor here - confirm the copy ends up with the same symbols.
    """
    # Constructor keywords whose value lives in an attribute of the same name.
    mirrored = (
        "n_row", "n_col",
        "p_tree", "p_fire", "moves_before_updating",
        "reward_type", "reward_tree", "reward_fire",
        "reward_empty", "reward_hit", "sub_tree",
        "sub_empty", "sub_fire", "sub_rock", "sub_lake",
        "ip_tree", "ip_empty", "ip_fire", "ip_rock",
        "ip_lake",
    )
    params = {name: getattr(env, name) for name in mirrored}
    n_env = helicopter.EnvMakerForestFire(init_pos_row=env.pos_row,
                                          init_pos_col=env.pos_col,
                                          **params)

    # Run state that must not be shared between the two environments.
    for attr in ("grid", "total_reward", "total_hits", "remaining_moves"):
        setattr(n_env, attr, copy.deepcopy(getattr(env, attr)))

    return n_env

In [None]:
# Environment parameters
N_ROW = N_COL = 16          # passed as n_row / n_col
Init_Row = Init_Col = 7     # passed as init_pos_row / init_pos_col
P_FIRE, P_TREE = 0.03, 0.1
# Symbols for cells
TREE, EMPTY, FIRE = 0, 1, 2
FREEZE = 8  # Helicopter movements between automaton updates (moves_before_updating)

In [None]:
if __name__ == '__main__':
    # Create the environment driven by the rollout policy, using the
    # parameters and cell symbols defined in the configuration cell.
    env = helicopter.EnvMakerForestFire(n_row = N_ROW, n_col = N_COL, p_tree = P_TREE, p_fire = P_FIRE,
                 init_pos_row = Init_Row, init_pos_col = Init_Col, moves_before_updating = FREEZE,  
                 tree = TREE, empty = EMPTY, fire = FIRE)  
  
    # First observation
    observation = env.reset()
    
    # Second environment, copied from the first, driven by the plain heuristic.
    # Both start from the same initial observation.
    env_1 = Copy(env)
    observation_1 = observation   
    
    # Checkpoints of the initial state; every test reloads them before it starts.
    checkpoint_env = env.make_checkpoint()
    checkpoint_env_1 = env_1.make_checkpoint()
    
    # Empty policy table, filled while the rollout runs.
    policy= Policy()
    
    #Rollout Variables
    N_TEST= 20      # Number of complete test runs (outer loop of the experiment)
    N_STEPS=50      # Each test runs FREEZE * N_STEPS agent moves (50 * 8 = 400)
    A=0.9           # Discount factor for future costs in the simulated trajectories
    K=10            # Number of heuristic steps simulated after the first action
    N_SAMPLES=20    # Number of sampled trajectories averaged per candidate action
    vision= 1       # Range, in cells, that the helicopter heuristic looks at
    epsilon=0.99    # Initial exploration probability; multiplied by 0.99 after every step

In [None]:
    # Results of every test: final totals and per-step cumulative curves,
    # for the rollout policy (RO_*) and the plain heuristic (H_*).
    RO_RESULTS=[]
    H_RESULTS=[]
    RO_RESULTS_C=[]
    H_RESULTS_C=[]
    # Observations that belong to the two checkpoints. They are restored at
    # the start of every test: reloading a checkpoint resets the environment
    # but not these variables, so tests after the first used to start from
    # the last observation of the previous test.
    # NOTE(review): run this cell right after the setup cell, while
    # `observation` / `observation_1` still hold the initial observations.
    initial_observation = copy.deepcopy(observation)
    initial_observation_1 = copy.deepcopy(observation_1)
    start = time.time()
    for n_test in range(N_TEST):
        print("Test:",n_test)
        env.load_checkpoint(checkpoint_env)
        env_1.load_checkpoint(checkpoint_env_1)
        observation = copy.deepcopy(initial_observation)
        observation_1 = copy.deepcopy(initial_observation_1)
        rollout_cost=0
        heuristic_cost=0
        rollout_cost_step=[]
        heuristic_cost_step=[]
        for i in tqdm.tqdm(range(FREEZE * N_STEPS)):
            print("Step:",i)
            #env.render()
            r_action, q_value=rollout(env,Heuristic,observation,K,A,N_SAMPLES,vision,epsilon)
            # Exploration decays after every step and is never reset between tests.
            epsilon=epsilon*0.99
            h_action=Heuristic(observation_1,vision)
            # Update the policy table with the state seen before the move
            policy.new(env.Encode(),r_action,q_value)

            print("Rollout Action:",r_action)
            print("Heuristic Action:",h_action)
            # Advance both environments with their own action
            observation, ro_cost, _, _ = env.step(r_action)
            observation_1, h_cost, _, _ = env_1.step(h_action)

            rollout_cost += ro_cost  # Cumulative cost for rollout
            rollout_cost_step.append(rollout_cost)  # Cumulative cost over time

            heuristic_cost += h_cost
            heuristic_cost_step.append(heuristic_cost)

            print("Rollout in step {} is: {}".format(i,rollout_cost))
            print("Heuristic in step {} is: {}".format(i,heuristic_cost))
        print("Rollout:",rollout_cost)
        print("heuristic:",heuristic_cost)
        # Final cost per test
        RO_RESULTS.append(rollout_cost)
        H_RESULTS.append(heuristic_cost)
        # Cumulative cost curve per test
        RO_RESULTS_C.append(rollout_cost_step)
        H_RESULTS_C.append(heuristic_cost_step)
    print("Total time execution %.3f s"%(time.time()-start))

In [None]:
    # Dump the cumulative rollout cost curves: one inner sequence per test,
    # one value per step.
    print(RO_RESULTS_C)

In [None]:
    # Average cumulative cost per step, taken over all tests.
    # Rows are tests, columns are steps, so the mean over axis 0 gives one
    # value per step. np.mean is used for both curves: statistics.mean is
    # not specified for NumPy scalar types, which is what indexing the
    # rollout array yields.
    RO_RESULTS_C=np.array(RO_RESULTS_C)
    ITER = np.mean(RO_RESULTS_C, axis=0).tolist()
    ITER2 = np.mean(np.array(H_RESULTS_C), axis=0).tolist()

    x = np.arange(RO_RESULTS_C.shape[1])
    plt.xlabel('Steps')
    plt.ylabel('Average Reward')
    # The number of tests in the title follows the data instead of being fixed at 20.
    plt.title('Average reward over {} test (1 step-1 vision)'.format(RO_RESULTS_C.shape[0]))
    plt.plot(x,ITER ,label='Rollout')
    plt.plot(x,ITER2, label='Heuristic')
    plt.legend()

In [None]:
    # Graph of the final total cost of every test
    RO_RESULTS = np.array(RO_RESULTS)
    H_RESULTS = np.array(H_RESULTS)
    # Mean of the first test's cumulative rollout curve (not used by the plot below).
    RO_RESULTS_MEAN=[]
    RO_RESULTS_MEAN.append(float(np.mean(RO_RESULTS_C[0])))
    # One x position per test; the fixed np.arange(20) broke the plot
    # whenever the number of tests was not 20.
    x = np.arange(len(RO_RESULTS))
    plt.xlabel('Test')
    plt.ylabel('Final Total Reward (1 step- 1 vision)')
    plt.title('Rollout Tests')
    plt.plot(x,RO_RESULTS ,label='Rollout')
    plt.plot(x,H_RESULTS, label='Heuristic')
    plt.legend()

In [None]:
    # Run the environment with the trained policy, next to a heuristic-only copy.
    observation = env.reset()
    env_1 = env.Copia()
    observation_=observation
    fig=env.render()
    fig.savefig('Pictures/Env.png')
    total_reward = 0
    total_reward_h = 0
    total_reward_step=[]
    total_reward_step_h=[]
    N_STEPS=50
    for j in tqdm.tqdm(range(N_STEPS*FREEZE)):
        p_action=policy.call(env.Encode(),"stochastic")
        print(env.Encode())
        h_action=Heuristic(observation_,vision)
        # Use the policy table when it knows the state; otherwise fall back
        # to a fresh rollout with a small exploration rate.
        if p_action:
            print("Tomando Accion de Politica")
            next_action = p_action
        else:
            print("Usando Heurística")
            action=rollout(env,Heuristic,observation,K,A,N_SAMPLES,vision,0.1)
            print(action[0])
            next_action = action[0]
        # Both branches step, render and save a frame in the same way.
        observation, cost, done, info = env.step(next_action)
        fig=env.render()
        s='Pictures/Env' + str(j) + '.png'
        fig.savefig(s)
        # The second environment always follows the plain heuristic.
        observation_, cost_, done_, info_ = env_1.step(h_action)
        total_reward += cost
        total_reward_h += cost_
        total_reward_step.append(total_reward)
        total_reward_step_h.append(total_reward_h)

    print(total_reward)
    print(total_reward_h)
    env.render()
    # End of simulation

In [None]:
    # Cumulative reward of the policy-driven run versus the heuristic-only run.
    total_reward_step = np.array(total_reward_step)
    total_reward_step_h = np.array(total_reward_step_h)

    for series, series_label in ((total_reward_step, 'Rollout'),
                                 (total_reward_step_h, 'Heuristic')):
        plt.plot(series, label=series_label)
    plt.title('Reward over time')
    plt.xlabel('Episodes')
    plt.ylabel('Reward')
    plt.legend()
    

In [None]:
# Write the policy table to a text file: a header line followed by one line
# per state with its controls, q-values, control frequencies and total frequency.
states=policy.policy.keys()
# The context manager closes the file even if formatting a state fails.
# The header is written before the loop so an empty policy still produces it.
with open("Policy_1s_1v.txt","w") as f:
    f.write("Q-Table with {0} states".format(len(states)))
    for state in states:
        controls, q_values, freqs, total_freq = policy.policy[state]
        f.write("\n{0} {1} {2} {3} {4}".format(state, controls, q_values, freqs, total_freq))