# Plume tracking

## Create the environment

* State, S_t = [x_t, y_t, c_t]
* Transition (next) state, S_(t+1) = [x_(t+1), y_(t+1), c_(t+1) + Noise]

In [1]:
import numpy as np
from scipy.sparse import spdiags
import matplotlib.pyplot as plt
import random

class EnvGeneration:
    """Explicit finite-difference solver for a 2-D advection-diffusion plume.

    The field ``u`` is stored on an ``n x n`` grid spanning
    ``[-Lx/2, Lx/2] x [-Ly/2, Ly/2]``.  The first array index follows
    ``self.x`` and the second follows ``self.y``.  A Gaussian source term
    ``f`` feeds the field, and ``solve`` integrates it with a classical
    four-stage Runge-Kutta scheme over ``Nt - 1`` steps of size ``dt``.

    Typical use: construct, optionally call the ``random_*`` helpers, then
    ``init_solution()`` followed by ``solve()``; the result is in ``self.u``.
    """

    def __init__(self, grid_size=50, D=0.2,
                 velocity_vector=(lambda t: 3.0, lambda t: 2.0),
                 source_pos=(-1.6, -1.6),
                 source_strength=1, source_spread=15, Lx=4, Ly=4, Lt=2, Nt=2000):
        """Store the discretisation and the physical parameters.

        Args:
            grid_size: number of nodes per side of the square grid.
            D: base diffusion coefficient.
            velocity_vector: pair of callables ``(V_x(t), V_y(t))``.
            source_pos: pair of source coordinates in domain units.
            source_strength: amplitude of the Gaussian source.
            source_spread: exponent factor of the Gaussian source.
            Lx, Ly: domain extents.
            Lt: total simulated time.
            Nt: number of time samples (``Nt - 1`` integration steps).
        """
        self.n = grid_size
        self.Nt = Nt

        self.Lx = Lx
        self.Ly = Ly
        self.Lt = Lt

        self.dx = Lx / (grid_size - 1)
        self.dy = Ly / (grid_size - 1)
        self.dt = Lt / (Nt - 1)
        self.x = np.linspace(-Lx / 2, Lx / 2, grid_size)
        self.y = np.linspace(-Ly / 2, Ly / 2, grid_size)
        self.t = np.linspace(0, Lt, Nt)
        self.X, self.Y = np.meshgrid(self.x, self.y)

        self.D = D
        self.V_x, self.V_y = velocity_vector
        # Copy so that instances never share (or mutate) the default object.
        self.source_pos = list(source_pos)
        self.source_strength = source_strength
        self.source_spread = source_spread

        self.u_new = np.zeros((grid_size, grid_size))
        self.u_old = np.zeros((grid_size, grid_size))

    def init_solution(self):
        """Build the source term and flatten the state vectors for ``solve``.

        The source is a Gaussian centred on ``source_pos`` evaluated on the
        interior nodes only; the outermost ring of ``f`` stays zero.
        """
        self.f = np.zeros((self.n, self.n))
        # f[i, j] pairs x[i] with y[j]; broadcasting reproduces the double loop.
        dist_x = (self.x[1:-1] - self.source_pos[0]) ** 2
        dist_y = (self.y[1:-1] - self.source_pos[1]) ** 2
        self.f[1:-1, 1:-1] = self.source_strength * np.exp(
            -self.source_spread * (dist_x[:, None] + dist_y[None, :])
        )

        self.u_vec = np.zeros((self.n * self.n))
        self.u_new = self.u_new.flatten()
        self.u_old = self.u_old.flatten()
        self.f_vec = self.f.flatten()

        self.E = self.sparseE(self.n)
        self.e = np.ones(self.n * self.n)

    def sparseE(self, n):
        """Return an ``n*n x n*n`` CSR matrix with five diagonals of ones.

        The diagonals sit at offsets ``-n, -1, 0, 1, n``.  Selected entries of
        the two rows placed on the ``-n`` and ``n`` offsets are zeroed with a
        ``% n`` mask before the matrix is assembled.
        """
        total_nodes = n * n
        diagonals = np.zeros((5, total_nodes))
        main_diag = np.ones(total_nodes)
        diagonals[2, :] = main_diag
        upper_diag = np.ones(total_nodes - n)
        diagonals[3, :-n] = upper_diag
        lower_diag = np.ones(total_nodes - n)
        diagonals[1, n:] = lower_diag
        left_diag = np.ones(total_nodes - 1)
        left_diag[np.arange(1, total_nodes) % n == 0] = 0
        diagonals[0, 1:] = left_diag
        right_diag = np.ones(total_nodes - 1)
        right_diag[np.arange(total_nodes - 1) % n == n - 1] = 0
        diagonals[4, :-1] = right_diag
        offsets = [-n, -1, 0, 1, n]
        return spdiags(diagonals, offsets, total_nodes, total_nodes, format='csr')

    def C_Derivative(self, t, u_vec):
        """Evaluate the time derivative ``A(t) @ u_vec + f`` of the flat field.

        A fresh Gaussian perturbation (sigma 0.2) is added to ``D`` on every
        call.  The same perturbed coefficient is used for all five stencil
        weights so that the row sum is zero and a constant field is left
        unchanged by the operator.

        Args:
            t: time at which the velocity callables are evaluated.
            u_vec: flattened field of length ``n * n``.

        Returns:
            Flattened derivative of length ``n * n``.
        """
        D_fluct = self.D + np.random.normal(loc=0, scale=0.2)
        v_x = self.V_x(t)
        v_y = self.V_y(t)
        alpha_y = D_fluct / (self.dy**2) - v_y / (2 * self.dy)
        alpha_x = D_fluct / (self.dx**2) - v_x / (2 * self.dx)
        # Was `2 * self.D / dy**2`: mixing D and D_fluct broke the zero row sum.
        beta = - (2 * D_fluct / (self.dx**2) + 2 * D_fluct / (self.dy**2))
        gamma_x = D_fluct / (self.dx**2) + v_x / (2 * self.dx)
        gamma_y = D_fluct / (self.dy**2) + v_y / (2 * self.dy)

        # The +-1 diagonals are not masked at row ends; only boundary nodes are
        # affected and solve() zeroes the derivative there.
        row = [gamma_y * self.e, gamma_x * self.e, beta * self.e, alpha_x * self.e, alpha_y * self.e]
        diags = [-self.n, -1, 0, 1, self.n]

        A = spdiags(row, diags, self.n * self.n, self.n * self.n, format='csr')

        return A.dot(u_vec) + self.f_vec

    def _zero_boundary(self, k):
        """Zero ``k`` in place on the outer ring of the grid and return it."""
        k[0:self.n] = 0             # first grid row
        k[-self.n:] = 0             # last grid row
        k[::self.n] = 0             # first grid column
        k[self.n - 1::self.n] = 0   # last grid column
        return k

    def solve(self):
        """Integrate the field with RK4 and store the result in ``self.u``.

        After the time loop negative values are clipped to zero and each edge
        is filled with half of the adjacent interior line.

        Raises:
            RuntimeError: if the final field exceeds 100 anywhere.
        """
        for step in range(self.Nt - 1):
            t_now = self.t[step]
            k1 = self._zero_boundary(self.C_Derivative(t_now, self.u_old))
            k2 = self._zero_boundary(
                self.C_Derivative(t_now + self.dt / 2, self.u_old + k1 * self.dt / 2))
            k3 = self._zero_boundary(
                self.C_Derivative(t_now + self.dt / 2, self.u_old + k2 * self.dt / 2))
            k4 = self._zero_boundary(
                self.C_Derivative(t_now + self.dt, self.u_old + k3 * self.dt))

            self.u_new = self.u_old + (1 / 6) * self.dt * (k1 + 2 * k2 + 2 * k3 + k4)
            self.u_old = self.u_new

        self.u_vec = self.u_new
        self.u = self.u_vec.reshape(self.n, self.n)
        self.u[self.u < 0] = 0

        # spread the solution to the edges
        self.u[0, :] = self.u[1, :] * 0.5
        self.u[-1, :] = self.u[-2, :] * 0.5
        self.u[:, 0] = self.u[:, 1] * 0.5
        self.u[:, -1] = self.u[:, -2] * 0.5

        if np.max(self.u) > 100:
            raise RuntimeError("Simulation diverged.")

    def plot_solution(self):
        """Show the field as an image with the source marked by a red cross."""
        plt.imshow(self.u, extent=[-self.Lx/2, self.Lx/2, -self.Ly/2, self.Ly/2], origin='lower', cmap='jet')
        plt.colorbar()
        # The first array index is drawn vertically, hence the swapped order.
        plt.scatter(self.source_pos[1], self.source_pos[0], c='r', marker='x')
        plt.show()

    def scale(self, max_val):
        """Rescale ``self.u`` to ``[0, max_val]`` and round to whole numbers.

        A constant field has no range to stretch; it becomes all zeros instead
        of dividing by zero.
        """
        lowest = self.u.min()
        span = self.u.max() - lowest
        if span == 0:
            self.u = np.zeros_like(self.u, dtype=float)
            return
        self.u = np.round(max_val * (self.u - lowest) / span)

    def random_source_positon(self):
        """Draw a source position uniformly in ``[-1.5, 1.5]`` on each axis."""
        # random inside the grid but not too close to the edges
        self.source_pos = [random.uniform(-1.5, 1.5), random.uniform(-1.5, 1.5)]

    # Correctly spelled alias; the original name is kept for existing callers.
    random_source_position = random_source_positon

    def random_source_strength(self):
        """Draw the source strength uniformly in ``[0.9, 1]``."""
        self.source_strength = random.uniform(0.9, 1)

    def random_source_spread(self):
        """Draw the source spread uniformly in ``[15, 25]``."""
        self.source_spread = random.uniform(15, 25)

    def random_D(self):
        """Draw the diffusion coefficient uniformly in ``[0.2, 0.3]``."""
        self.D = random.uniform(0.2, 0.3)

    def random_velocity_vector(self):
        """Draw a constant velocity whose signs depend on the source quadrant.

        Each component has magnitude in ``[0, 4] * n / 20``.  Quadrant names in
        the comments refer to the plotted image, where ``source_pos[1]`` is
        horizontal and ``source_pos[0]`` is vertical.
        """
        magnitude = self.n / 20
        if self.source_pos[1] < 0 and self.source_pos[0] < 0:
            # Source in bottom left corner, velocity should point towards top right
            random_x = random.uniform(0, 4) * magnitude
            random_y = random.uniform(0, 4) * magnitude
        elif self.source_pos[1] > 0 and self.source_pos[0] > 0:
            # Source in top right corner, velocity should point towards bottom left
            random_x = random.uniform(-4, 0) * magnitude
            random_y = random.uniform(-4, 0) * magnitude
        elif self.source_pos[1] < 0 and self.source_pos[0] > 0:
            # Source in top left corner, velocity should point towards bottom right
            random_x = random.uniform(0, 4) * magnitude
            random_y = random.uniform(-4, 0) * magnitude
        else:
            # Source in bottom right corner, velocity should point towards top left
            random_x = random.uniform(-4, 0) * magnitude
            random_y = random.uniform(0, 4) * magnitude

        self.V_x = lambda t: random_x
        self.V_y = lambda t: random_y

    def random_time(self):
        """Draw the total time uniformly in ``[0.5, 5]`` and rebuild the time grid."""
        self.Lt = random.uniform(0.5, 5)
        self.t = np.linspace(0, self.Lt, self.Nt)
        # Keep the step size in sync; otherwise solve() would still integrate
        # over the duration given to the constructor.
        self.dt = self.Lt / (self.Nt - 1)

    def print_parameters(self):
        """Print the current simulation parameters."""
        print(f"Grid size: {self.n}")
        print(f"D: {self.D}")
        print(f"Velocity vector: {self.V_x(0)}, {self.V_y(0)}")
        print(f"Source position: {[self.source_pos[1], self.source_pos[0]]}")
        print(f"Source strength: {self.source_strength}")
        print(f"Source spread: {self.source_spread}")
        print(f"Time: {self.Lt}")

    def save_solution(self, path, filename):
        """Save ``self.u`` with ``np.save`` to ``path + filename``."""
        np.save(path + filename, self.u)

    def save_image(self, path, filename):
        """Save an image of ``self.u`` to ``path + filename + ".png"``."""
        plt.imshow(self.u, extent=[-self.Lx/2, self.Lx/2, -self.Ly/2, self.Ly/2], origin='lower', cmap='viridis')
        plt.colorbar()
        plt.savefig(path + filename + ".png")
        plt.close()

    def add_noise(self, noise_level):
        """Add zero-mean Gaussian noise scaled by ``mean(u) * noise_level``.

        Negative results are clipped to zero.
        """
        self.u += np.random.normal(loc=0, scale=np.mean(self.u) * noise_level, size=(self.n, self.n))
        self.u[self.u < 0] = 0

In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from matplotlib import cm
import pygame
import matplotlib.pyplot as plt

class MySim(gym.Env):
    """Grid-world in which an agent searches for the peak of a simulated plume.

    Every episode a new concentration field is produced by ``EnvGeneration``
    and stored in ``self.maze``.  The observation is the agent's
    ``[row, col]`` position and the four discrete actions move it by one
    cell.  The episode ends when the agent reaches the cell holding the
    maximum concentration or when the step budget is exhausted.
    """

    # (row, col) displacement per action. Rows are drawn bottom-up by render(),
    # so a larger row index appears higher on screen.
    _ACTION_DELTAS = {
        0: (1, 0),    # Up
        1: (-1, 0),   # Down
        2: (0, -1),   # Left
        3: (0, 1),    # Right
    }

    def __init__(self):
        """Generate the first plume, place the agent and open the pygame window."""
        super().__init__()
        self._generate_plume()

        # Training steps information
        self.total_step = 0
        self.total_reward = 0

        self._place_agent()

        # Observation space and action space
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=np.array([0, 0]),
                                            high=np.array([self.num_rows-1, self.num_cols-1]),
                                            shape=(2,),
                                            dtype=np.int64)

        # Render related
        pygame.init()
        self.cell_size = int(500/self.num_rows)
        self.screen = pygame.display.set_mode((self.num_cols * self.cell_size, self.num_rows * self.cell_size))

        self.action = 0

        self.rewards = []

        self.total_episode = 0

    def _generate_plume(self):
        """Simulate a fresh random plume and cache its parameters on ``self``."""
        env = EnvGeneration(Nt=500, grid_size=20)
        env.random_source_positon()
        env.random_source_strength()
        env.random_source_spread()
        env.random_D()
        env.random_velocity_vector()
        env.random_time()
        env.init_solution()
        env.solve()
        # TODO: env.add_noise(0.1)

        self.maze = np.array(env.u)
        self.num_rows, self.num_cols = self.maze.shape

        # Plume model information.
        # The grid nodes are linspace(-L/2, L/2, n), so a domain coordinate maps
        # to index (n - 1) * (coord + L/2) / L (scaling by n was off by up to
        # one cell near the far edge).
        src_row = round((env.n - 1) * (env.source_pos[0] + env.Lx / 2) / env.Lx)
        src_col = round((env.n - 1) * (env.source_pos[1] + env.Ly / 2) / env.Ly)
        self.source_pos = np.array([src_row, src_col], dtype=np.int64)
        self.r0 = self.source_pos
        self.sigma = env.source_spread
        self.domain_length = env.Lx
        self.domain_width = env.Ly
        self.D = env.D
        self.V_x = env.V_x(0)
        self.V_y = env.V_y(0)

    def _place_agent(self):
        """Pick a uniformly random start cell and reset the visited trail."""
        # randint's upper bound is exclusive, so the cell is always in range.
        self.start_pos = np.array(
            [np.random.randint(0, self.num_rows), np.random.randint(0, self.num_cols)],
            dtype=np.int64,
        )
        self.visited = np.array([self.start_pos])
        self.current_pos = self.start_pos
        self.previous_pos = self.current_pos

    def reset(self, **kwargs):
        """Start a new episode on a newly simulated plume.

        Args:
            **kwargs: Gymnasium reset keywords; ``seed`` is forwarded to
                ``gym.Env.reset``.  The plume and start cell are drawn from the
                global ``random`` / ``np.random`` generators.

        Returns:
            ``(observation, info)`` where the observation is the start position
            and ``info`` is an empty dict.
        """
        super().reset(seed=kwargs.get("seed"))

        self._generate_plume()
        self.total_episode = 0

        self._place_agent()
        self.total_step = 0
        self.total_reward = 0
        self.render()
        return self.current_pos, {}

    def step(self, action):
        """Apply ``action`` and return the Gymnasium five-tuple.

        The reward for a valid move is the normalised concentration (0-10) of
        the cell being left, minus 1, minus 1 more if the destination was
        already visited, plus 20 on reaching the peak cell.  A move leaving
        the grid costs -5 and the agent stays put.

        The step limit is reported through the ``terminated`` flag as well;
        ``truncated`` is always ``False``.
        """
        # Accept plain ints as well as the size-1 arrays returned by predict().
        action = int(np.asarray(action).item())

        # Move the agent based on the selected action
        delta = self._ACTION_DELTAS.get(action, (0, 0))
        new_pos = np.array(self.current_pos) + np.array(delta, dtype=np.int64)

        # get the position of the highest concentration
        max_pos = np.unravel_index(np.argmax(self.maze, axis=None), self.maze.shape)

        # Check if the new position is valid
        if self._is_valid_position(new_pos):
            peak = np.max(self.maze)
            here = self.maze[self.current_pos[0], self.current_pos[1]]
            # An all-zero field has nothing to normalise; treat it as 0.
            normalized_here = here / peak * 10 if peak > 0 else 0.0
            reward = normalized_here - 1
            reward -= 1 if self._is_visited(new_pos) else 0
            self.previous_pos = self.current_pos
            self.current_pos = new_pos
            self.visited = np.append(self.visited, [new_pos], axis=0)
            # reward of 20 if the agent reaches the highest concentration
            if np.array_equal(self.current_pos, max_pos):
                reward += 20
        else:
            # The agent moves out of the domain
            reward = -5.0

        done = (self.total_step > 200) or (np.linalg.norm(self.current_pos - max_pos) < 1)
        self.total_step += 1
        self.total_reward += reward

        self.rewards.append(reward)

        self.action = action

        return self.current_pos, reward, bool(done), False, {}

    def _compute_physical_reward(self, pos, action):
        """Velocity-weighted one-sided concentration difference along ``action``.

        Neighbours outside the grid fall back to the centre value.  "Up" and
        "Down" follow the same row convention as ``step`` (Up is row + 1).

        Returns:
            ``|V_x| * d_horizontal + |V_y| * d_vertical`` where only the
            component matching ``action`` is non-zero.
        """
        # Length and width of each grid cell
        grid_length = self.domain_length / self.num_cols
        grid_width = self.domain_width / self.num_rows

        row, col = pos[0], pos[1]
        centre = self.maze[row, col]
        left = self.maze[row, col - 1] if col > 0 else centre
        right = self.maze[row, col + 1] if col < self.num_cols - 1 else centre
        # step() moves Up to row + 1, so the "up" neighbour is the next row.
        up = self.maze[row + 1, col] if row < self.num_rows - 1 else centre
        down = self.maze[row - 1, col] if row > 0 else centre

        # One-sided difference in the direction of travel
        concentration_gradient = np.zeros(2)
        if action == 0:  # Up
            concentration_gradient[1] = (up - centre) / grid_length
        elif action == 1:  # Down
            concentration_gradient[1] = (down - centre) / grid_length
        elif action == 2:  # Left
            concentration_gradient[0] = (left - centre) / grid_width
        elif action == 3:  # Right
            concentration_gradient[0] = (right - centre) / grid_width

        # TODO: include the diffusion term, i.e.
        # -D * laplacian + V_x * gradient[0] + V_y * gradient[1]
        reward_physical = (np.abs(self.V_x) * concentration_gradient[0]
                           + np.abs(self.V_y) * concentration_gradient[1])

        return reward_physical

    def _compute_reward_2(self, pos, action):
        """Alternative reward; currently just the physical reward.

        Movement, proximity, boundary and revisit terms were experimented with
        but are not part of the returned value.
        """
        return self._compute_physical_reward(pos, action)

    def _is_visited(self, pos):
        """Return True if ``pos`` matches any row of ``self.visited``."""
        return bool(np.any(np.all(pos == self.visited, axis=1)))

    def _is_valid_position(self, pos):
        """Return True if ``pos`` lies inside the grid."""
        row, col = pos
        return bool(0 <= row < self.num_rows and 0 <= col < self.num_cols)

    def _cell_origin(self, row, col):
        """Return the (left, top) pixel of a cell; row 0 is the bottom line."""
        return col * self.cell_size, (self.num_rows - row - 1) * self.cell_size

    def render(self):
        """Draw the field, start, agent, source, trail and status text."""
        self.screen.fill((255, 255, 255))
        size = self.cell_size
        half = size / 2
        # Hoisted out of the cell loop; guarded so an all-zero field still draws.
        peak = np.max(self.maze)

        # Draw env elements one cell at a time
        for row in range(self.num_rows - 1, -1, -1):
            for col in range(self.num_cols):
                cell_left, cell_top = self._cell_origin(row, col)
                cell_rect = (cell_left, cell_top, size, size)
                centre = (cell_left + half, cell_top + half)

                # Draw the grid
                pygame.draw.rect(self.screen, (0, 0, 0), cell_rect, 1)

                # draw the pollution concentration
                level = self.maze[row, col] / peak if peak > 0 else 0.0
                color = cm.jet(level)
                pygame.draw.rect(self.screen, [int(255*color[0]), int(255*color[1]), int(255*color[2])], cell_rect)

                # starting position
                if row == self.start_pos[0] and col == self.start_pos[1]:
                    pygame.draw.rect(self.screen, (0, 255, 0), cell_rect)

                if np.array_equal(np.array(self.current_pos), np.array([row, col])):  # Agent position
                    pygame.draw.circle(self.screen, (255, 255, 255), centre, half)

                if row == self.source_pos[0] and col == self.source_pos[1]:  # Source position
                    pygame.draw.circle(self.screen, (0, 0, 0), centre, half)

        # Plot the path of visited positions with lines. The same row flip as
        # the cells is applied so the trail lines up with the agent marker.
        for i in range(len(self.visited) - 1):
            from_left, from_top = self._cell_origin(self.visited[i][0], self.visited[i][1])
            to_left, to_top = self._cell_origin(self.visited[i+1][0], self.visited[i+1][1])
            pygame.draw.line(self.screen, [0, 0, 0],
                             (from_left + half, from_top + half),
                             (to_left + half, to_top + half), 2)

        # write the total reward on the screen and the current reward
        font = pygame.font.Font(None, 36)
        text = font.render("Total reward: " + str(self.total_reward), True, (255, 255, 255))
        self.screen.blit(text, (10, 10))

        if len(self.rewards) > 0:
            text = font.render("Current reward: " + str(self.rewards[-1]), True, (255, 255, 255))
            self.screen.blit(text, (10, 40))

        # write the current step on the screen in the bottom right corner
        text = font.render(str(self.total_step), True, (255, 255, 255))
        self.screen.blit(text, (self.num_cols * size - 50, self.num_rows * size - 50))

        # write the action on the screen in the bottom left corner
        action_text = ["Up", "Down", "Left", "Right"]
        text = font.render(action_text[self.action], True, (255, 255, 255))
        self.screen.blit(text, (50, self.num_rows * size - 50))

        pygame.display.update()  # Update the display

    def plot_sum_reward(self):
        """Plot the cumulative sum of all rewards recorded so far."""
        plt.plot(np.cumsum(self.rewards))
        plt.show()

    def close(self):
        """Shut down the pygame window; safe to call more than once."""
        if self.screen is not None:
            pygame.display.quit()
            pygame.quit()
            self.isopen = False
            self.screen = None

# Testing the environment

In [None]:
# Uninitialise all pygame modules before creating a new environment below.
pygame.quit()

In [3]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
import numpy as np
import pygame
import matplotlib.pyplot as plt

# Build the environment; the constructor simulates a plume and opens the
# pygame window.
env = MySim()

# obs = env.reset()

# Draw the initial state once as a visual sanity check.
env.render()

#check_env(env, warn=True)

## Train

In [41]:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import VecEnv

class EpisodeRewardCallback(BaseCallback):
    """Accumulate per-episode reward and print a summary when an episode ends.

    Attributes:
        total_episode_reward: running reward sum of the current episode.
        episode_rewards_list: one entry per finished episode.
        step: number of steps taken in the current episode.
        episode: number of finished episodes.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.total_episode_reward = 0
        self.episode_rewards_list = []
        self.step = 0
        self.episode = 0

    def _on_step(self) -> bool:
        """Add this step's reward and report when ``dones`` flags an episode end.

        Returns:
            Always ``True`` so training continues.
        """
        self.total_episode_reward += self.locals["rewards"]
        # Count the step before checking for termination so the final step of
        # an episode is included in the reported total.
        self.step += 1

        dones = self.locals.get("dones")
        # np.any avoids the ambiguous truth value of a multi-element array.
        if dones is not None and np.any(dones):
            self.episode += 1
            print(f"Episode: {self.episode}, Steps: {self.step}, Total reward:", self.total_episode_reward)
            self.episode_rewards_list.append(self.total_episode_reward)
            self.step = 0
            self.total_episode_reward = 0

        return True

class RenderCallback(BaseCallback):
    """Render the environment once every ``render_freq`` callback invocations.

    Args:
        env: object exposing ``render()``.
        render_freq: number of steps between two renders.
    """

    def __init__(self, env: VecEnv, render_freq: int = 1000):
        super().__init__()
        self.env = env
        self.render_freq = render_freq

    def _on_step(self) -> bool:
        """Render when the step counter hits a multiple of ``render_freq``.

        ``self.n_calls`` is initialised and incremented by ``BaseCallback``
        before this hook runs; incrementing it here as well would make the
        render fire twice as often as requested.

        Returns:
            Always ``True`` so training continues.
        """
        if self.n_calls % self.render_freq == 0:
            self.env.render()
        return True

In [42]:
# Train a PPO agent with an MLP policy on the plume environment.
model = PPO('MlpPolicy', env=env, verbose=0)

# Optional hyper-parameter overrides (disabled):
#   model.learning_rate = 0.0001   # change learning rate
#   model.batch_size = 64          # change batch size
#   model.n_epochs = 10            # change number of epochs

model.learn(
    total_timesteps=20000,
    callback=[
        EpisodeRewardCallback(),
        RenderCallback(env),
    ],
)

# Persist the trained policy so the test cell can reload it.
model.save("ppo_maze_rewardFn2")

Episode: 1, Total reward: [-438.01947]
Episode: 2, Total reward: [287.48917]
Episode: 3, Total reward: [-807.0194]
Episode: 4, Total reward: [-508.51456]
Episode: 5, Total reward: [-260.73047]
Episode: 6, Total reward: [-478.84967]
Episode: 7, Total reward: [-543.54266]
Episode: 8, Total reward: [-373.631]
Episode: 9, Total reward: [-433.93506]
Episode: 10, Total reward: [45.358505]
Episode: 11, Total reward: [-297.9713]
Episode: 12, Total reward: [459.52405]
Episode: 13, Total reward: [-444.46576]
Episode: 14, Total reward: [-705.5806]
Episode: 15, Total reward: [426.25378]
Episode: 16, Total reward: [-170.17331]
Episode: 17, Total reward: [364.41156]
Episode: 18, Total reward: [509.7627]
Episode: 19, Total reward: [-54.89569]
Episode: 20, Total reward: [-58.70429]
Episode: 21, Total reward: [-49.004055]
Episode: 22, Total reward: [-64.696724]
Episode: 23, Total reward: [542.7012]
Episode: 24, Total reward: [77.191345]
Episode: 25, Total reward: [-132.56818]
Episode: 26, Total reward:

## Test

In [5]:
import gymnasium as gym
from stable_baselines3 import PPO
import pygame

# env.close()

# Reload the policy saved by the training cell and run one rendered episode.
model = PPO.load("ppo_maze_rewardFn2")

env = MySim()

obs, _ = env.reset()

frame_count = 0
done = False
while not done:
    # Drain the event queue so the window stays responsive, and stop cleanly
    # when the window is closed instead of requiring a kernel interrupt.
    if any(event.type == pygame.QUIT for event in pygame.event.get()):
        break

    action, state = model.predict(observation=obs)  # , deterministic=True)

    # Gymnasium reports episode end through two flags; honour both.
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    env.render()
    frame_count += 1
    # To export frames for the video cell, save env.screen here with
    # pygame.image.save(env.screen, "screen_%04d.png" % frame_count)

    pygame.time.wait(100)

print("FINISHED")
print(f"Step: {frame_count}, Total reward: {env.total_reward}")
env.plot_sum_reward()

env.close()

KeyboardInterrupt: 

### Output mp4

In [135]:
!ffmpeg -r 5 -f image2 -s 500x500 -i screen_%04d.png window_video.mp4

ffmpeg version 4.3.1 Copyright (c) 2000-2020 the FFmpeg developers
  built with Microsoft (R) C/C++ Optimizing Compiler Version 19.00.24215.1 for x64
  configuration: --prefix=/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../prebuilt/windows/x64 --toolchain=msvc --extra-cflags=-MD --arch=x86_64 --disable-x86asm --disable-iconv --disable-network --enable-filter=stereo3d --enable-libmp3lame --enable-libdav1d --extra-cflags='-I/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../zlib-build/prebuilt/include -I/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../libpng-build/prebuilt/include -I/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../lame-build/prebuilt/include -I/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../libaom-build/prebuilt/include -I/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../libdav1d-build/prebuilt/include' --extra-ldflags='-L/c/Work/Source/fdm-qml/ffmpeg-build/windows/ffmpeg/../../../lame-build/prebuilt/windows/