In [13]:
import warnings
warnings.filterwarnings('ignore')
import ale_py
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv
import torch
import numpy as np
import wandb
from wandb.integration.sb3 import WandbCallback

from gymnasium.wrappers import MaxAndSkipObservation, ResizeObservation, GrayscaleObservation, FrameStackObservation, ReshapeObservation
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt
import os
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import CheckpointCallback
gym.register_envs(ale_py)
from datetime import datetime
from stable_baselines3 import A2C
from stable_baselines3.ppo.policies import MlpPolicy
from wandb.integration.sb3 import WandbCallback
import collections
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecEnvWrapper
from stable_baselines3 import PPO
import cv2
from gymnasium.spaces import Discrete
import imageio
from PIL import Image
import PIL.ImageDraw as ImageDraw

In [14]:
model_path = "best_model.zip"
num_episodes = 2 # number of episodes to run
climb_auto = True # if True, the agent will automatically climb ladders

In [15]:
# configuration file
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 1000000,
    "env_name": "ALE/DonkeyKong-v5", 
    "model_name": "ALE/DonkeyKong-v5",
    "export_path": "./exports/",
    "videos_path": "./videos/",
}

In [16]:
class MyVecTransposeImage(VecEnvWrapper):
    def __init__(self, venv, skip=False):
        super().__init__(venv)
        self.skip = skip

        # Get original shape: e.g., (84, 84, 4)
        old_shape = self.observation_space.shape
        # Transpose shape to (C, H, W)
        new_shape = (old_shape[2], old_shape[0], old_shape[1])  # (4, 84, 84)

        # Use the original low/high if they are uniform; if not, use min/max appropriately
        low_val = self.observation_space.low.min()
        high_val = self.observation_space.high.max()

        self.observation_space = gym.spaces.Box(
            low=low_val,
            high=high_val,
            shape=new_shape,
            dtype=self.observation_space.dtype
        )

    def reset(self):
        obs = self.venv.reset()
        return self.transpose_observations(obs)

    def step_async(self, actions):
        self.venv.step_async(actions)

    def step_wait(self):
        obs, rewards, dones, infos = self.venv.step_wait()
        return self.transpose_observations(obs), rewards, dones, infos

    def transpose_observations(self, obs):
        if self.skip:
            return obs
        if isinstance(obs, dict):
            for key, val in obs.items():
                obs[key] = self._transpose(val)
            return obs
        else:
            return self._transpose(obs)

    def _transpose(self, obs):
        # obs shape is (n_envs, H, W, C) -> transpose to (n_envs, C, H, W)
        return obs.transpose(0, 3, 1, 2)


In [17]:
def get_agent_level_position(image):
    if image is None:
        raise ValueError("Image not loaded. Check the path and file.")
    
    # remove 0 to 25 pixels from the top
    image = image[32:, :]
    # plt.imshow(image, cmap='gray')

    image[149:160, 36:44] = 0
    # display image with black
    # plt.imshow(image, cmap='gray')

    # Lines detection
    # copy image
    gray_image = image.copy()

    # print("Image shape:", gray_image.shape)

    # Perform edge detection
    edges = cv2.Canny(gray_image, threshold1=30, threshold2=100)


    # Detect horizontal lines using Hough Transform
    lines = cv2.HoughLinesP(
        edges, 
        rho=1, 
        theta=np.pi / 180, 
        threshold=30, 
        minLineLength=10, 
        maxLineGap=20
    )

    # Draw detected lines on a debug image
    debug_line_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    horizontal_lines = []

    if lines is not None:
        # print(f"Total lines detected (before filtering): {len(lines)}")
        for line in lines:
            x1, y1, x2, y2 = line[0]
            # Check for horizontal lines with a more lenient threshold
            vertical_diff = abs(y2 - y1)
            horizontal_diff = abs(x2 - x1)
            
            if vertical_diff < horizontal_diff * 0.1:  # Allow slight vertical tilt
                horizontal_lines.append((x1, y1, x2, y2))
                # cv2.line(debug_line_image, (x1, y1), (x2, y2), (255, 0, 0), 2)  # Blue for horizontal lines
    # else:
    #     print("No lines detected.")


    # detect the agent and it position
    # Perform binary thresholding to highlight the agent and objects
    _, binary = cv2.threshold(gray_image, 50, 255, cv2.THRESH_BINARY)

    # Apply morphological operations to clean up noise
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    binary_cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Detect contours in the cleaned binary image
    contours, _ = cv2.findContours(binary_cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Initialize variables for the agent's position
    agent_detection_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    agent_position = None

    # Filter contours to find the agent
    # print(f"Total contours detected: {len(contours)}")

    for contour in contours:
        # Get bounding box of the contour
        x, y, w, h = cv2.boundingRect(contour)
        
        # Filter based on size: Assuming agent is small
        if 5 <= w <= 30 and 5 <= h <= 30:  # Adjust range based on resolution
            # Further filter based on aspect ratio to avoid line-like objects
            aspect_ratio = max(w / h, h / w)
            if aspect_ratio < 2.0:  # Allow only nearly square contours
                agent_position = (x + w // 2, y + h // 2)  # Center of the bounding box
                

                break  # Assuming only one agent in the frame

    
    # Detect level
    # Sort lines by their average y-value, descending (bottom to top)
    lines_sorted = sorted(horizontal_lines, key=lambda line: (line[1] + line[3]) / 2, reverse=True)

    def cluster_lines(lines, desired_clusters=7, proximity_threshold=10):
        clusters = []
        current_cluster = [lines[0]]
        for line in lines[1:]:
            line_y = (line[1] + line[3]) // 2
            current_cluster_y = sum((l[1]+l[3])//2 for l in current_cluster) / len(current_cluster)
            # If the difference is small, add to current cluster, else start a new one
            if abs(line_y - current_cluster_y) < proximity_threshold:
                current_cluster.append(line)
            else:
                clusters.append(current_cluster)
                current_cluster = [line]
        clusters.append(current_cluster)
        
        return clusters

    proximity_threshold = 10  # Adjust as needed
    clusters = cluster_lines(lines_sorted, desired_clusters=7, proximity_threshold=proximity_threshold)

    # Compute representative y-value for each cluster (average)
    boundary_y_values = []
    for cluster in clusters:
        avg_y = sum((l[1] + l[3]) // 2 for l in cluster) / len(cluster)
        boundary_y_values.append(avg_y)

    # Sort boundaries again in descending order (bottom = largest y, top = smallest y)
    boundary_y_values.sort(reverse=True)

    agent_level = None
    if agent_position:
        agent_y = agent_position[1]
        # Find which level agent_y falls into
        for i in range(6):
            if boundary_y_values[i] >= agent_y > boundary_y_values[i+1]:
                agent_level = i + 1
                break


    # Draw minimal annotation: just draw the agent and print its level
    final_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)


    return agent_level, agent_position# agent position is (x, y)

In [18]:
# For intermediate rewards
class IntermediateRewardWrapper_climb_not_auto(gym.Wrapper):
    def __init__(self, env):
        super(IntermediateRewardWrapper_climb_not_auto, self).__init__(env)
        self.ladder_postion = [110,82,90,70,110,78]
        self.last_level = 1
        self.previous_additional_reward = 0.0
        self.last_y_position = 0
        # self.num_preprocessings = 1

    def step(self, action):

        obs, reward, terminated, truncated, info = self.env.step(action)
        # print(f"from intermediate reward {obs.shape}")
        # print(f"from intermediate reward {reward}")

        # done = terminated or truncated

        # reward = reward * 1.5
        additional_reward = 0.0

        # if self.num_preprocessings == 1:
        # get agent level and position
        agent_level, agent_position = get_agent_level_position(obs)

 

        if agent_level is not None and 1 <= agent_level <= len(self.ladder_postion):
            agent_level_reward = (7 - agent_level)* (-0.01)
            additional_reward += agent_level_reward

            if agent_level > self.last_level:
                additional_reward += 5000.0
                self.last_level = agent_level
            

            diff = 0
            if agent_position is not None and len(agent_position) > 0:
                # get absolute difference between agent position and ladder position
                diff = abs(self.ladder_postion[agent_level - 1] - agent_position[0])
                agent_position_reward = diff * (-0.1)
                additional_reward += agent_position_reward

                if action == 2 and diff <= 1 and agent_position[1] > self.last_y_position:
                    additional_reward += 30.0
                self.last_y_position = agent_position[1]



        else:
            # If agent_level or agent_position not found, use the previous reward
            additional_reward = self.previous_additional_reward


        # Round the additional reward to 2 decimal places
        additional_reward = round(additional_reward, 2)

        # Update the previous reward
        self.previous_additional_reward = additional_reward

        #     self.num_preprocessings = 0
        # else:
        #     self.num_preprocessings = 1
        #     additional_reward = self.previous_additional_reward


        # Add the additional reward to the original reward
        reward += additional_reward

        # print(f"obs: {obs.shape} , Agent Level: {agent_level}, Agent Position: {agent_position}, Additional Reward: {additional_reward}, Total Reward: {reward}")

        return obs, reward, terminated, truncated, info

In [19]:
class IntermediateRewardWrapper_climb_auto(gym.Wrapper):
    def __init__(self, env):
        super(IntermediateRewardWrapper_climb_auto, self).__init__(env)
        self.ladder_postion = [110,82,90,70,110,78]
        self.last_level = 1
        self.previous_additional_reward = 0.0
        self.last_y_position = 0
        # self.num_preprocessings = 1

    def step(self, action):

        obs, reward, terminated, truncated, info = self.env.step(action)

        additional_reward = 0.0

        # if self.num_preprocessings == 1:
        # get agent level and position
        agent_level, agent_position = get_agent_level_position(obs)



        if agent_level is not None and 1 <= agent_level <= len(self.ladder_postion):
            agent_level_reward = (7 - agent_level)* (-0.01)
            additional_reward += agent_level_reward

            if agent_level > self.last_level:
                additional_reward += 500.0
                self.last_level = agent_level

            diff = 0
            if agent_position is not None and len(agent_position) > 0:
                # get absolute difference between agent position and ladder position
                diff = abs(self.ladder_postion[agent_level - 1] - agent_position[0])
                agent_position_reward = diff * (-0.1)
                additional_reward += agent_position_reward

                if action == 2 and diff <= 1 and agent_position[1] > self.last_y_position:
                    additional_reward += 50.0
                    for i in range(35):
                        obs, reward, terminated, truncated, info = self.env.step(action)
                        if terminated or truncated:
                            break
                self.last_y_position = agent_position[1]

        else:
            # If agent_level or agent_position not found, use the previous reward
            additional_reward = self.previous_additional_reward

        # Round the additional reward to 2 decimal places
        additional_reward = round(additional_reward, 2)

        # Update the previous reward
        self.previous_additional_reward = additional_reward

        # Add the additional reward to the original reward
        reward += additional_reward

        # print(f"obs: {obs.shape} , Agent Level: {agent_level}, Agent Position: {agent_position}, Additional Reward: {additional_reward}, Total Reward: {reward}")

        return obs, reward, terminated, truncated, info

In [20]:
class ActionFilterWrapper(gym.ActionWrapper):
    def __init__(self, env, allowed_actions):
        super().__init__(env)
        self.allowed_actions = allowed_actions
        # The new action space matches the number of allowed actions
        self.action_space = Discrete(len(self.allowed_actions))

    def action(self, act):
        # Map the reduced action space index to the original action
        return self.allowed_actions[act]

In [21]:
class ScaledFloatFrame(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        # The original shape remains (84,84,1), but the dtype and range change
        self.observation_space = gym.spaces.Box(
            low=0.0,
            high=1.0,
            shape=self.observation_space.shape,
            dtype=np.float32
        )
        
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0



class FireResetEnv(gym.Wrapper):
    def __init__(self, env=None):
        super().__init__(env)
        # Check that 'FIRE' is a valid action in the environment
        assert 'FIRE' in env.unwrapped.get_action_meanings(), "Environment does not support 'FIRE' action"
        assert len(env.unwrapped.get_action_meanings()) >= 3, "Action space too small for expected actions"

    def step(self, action):
        return self.env.step(action)

    def reset(self, **kwargs):
        # Reset the environment
        obs, info = self.env.reset(**kwargs)

        # Perform the FIRE action
        obs, _, terminated, truncated, _ = self.env.step(1)
        if terminated or truncated:  # If game ends after FIRE, reset again
            obs, info = self.env.reset(**kwargs)

        return obs, info
        
# Custom wrapper to add channel dimension
class AddChannelDimension(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape
        # Update the observation space to include a channel dimension
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(obs_shape[0], obs_shape[1], 1),
            dtype=np.uint8,
        )

    def observation(self, observation):
        # Add a channel dimension
        return np.expand_dims(observation, axis=-1)




def make_env(env_name, allowed_actions, obs_type="grayscale", render_mode=None,):
    def _init():
        env = gym.make(env_name, obs_type="grayscale", render_mode=render_mode)
        print("Standard Env.        : {}".format(env.observation_space.shape))
        env = FireResetEnv(env)
        print("FireResetEnv          : {}".format(env.observation_space.shape))
        # Wrap the environment with the custom ActionFilterWrapper
        env = ActionFilterWrapper(env, allowed_actions)
        print("ActionFilterWrapper   : {}".format(env.observation_space.shape))
        # Wrap the environment to add intermediate rewards
        if climb_auto:
            env = IntermediateRewardWrapper_climb_auto(env)
        else:
            env = IntermediateRewardWrapper_climb_not_auto(env)

        print("IntermediateReward    : {}".format(env.observation_space.shape))

        env = ResizeObservation(env, (84, 84))
        print("ResizeObservation    : {}".format(env.observation_space.shape))
        env = AddChannelDimension(env)  # Add channel dimension here
        print("AddChannelDimension  : {}".format(env.observation_space.shape))
        
        env = ScaledFloatFrame(env)
        print("ScaledFloatFrame     : {}".format(env.observation_space.shape))
        

        return env
    return _init



In [22]:
# select relevant actions
allowed_actions = [0, 1, 2, 3, 4, 11, 12]
# env = DummyVecEnv([make_env(config["env_name"], render_mode="rgb_array")])
env = make_vec_env(env_id=make_env(config["env_name"], allowed_actions= allowed_actions,render_mode="rgb_array"), n_envs=1)
# stack 4 frames
env = VecFrameStack(env, n_stack=4)
print("Post VecFrameStack Shape: {}".format(env.observation_space.shape))

# convert back to PyTorch format (channel-first)
env = MyVecTransposeImage(env)
print("Final Observation Space: {}".format(env.observation_space.shape))

print("Render mode after wrapping:", env.render_mode)

Standard Env.        : (210, 160)
FireResetEnv          : (210, 160)
ActionFilterWrapper   : (210, 160)
IntermediateReward    : (210, 160)
ResizeObservation    : (84, 84)
AddChannelDimension  : (84, 84, 1)
ScaledFloatFrame     : (84, 84, 1)
Post VecFrameStack Shape: (84, 84, 4)
Final Observation Space: (4, 84, 84)
Render mode after wrapping: rgb_array


In [23]:
# load model
model = PPO.load(model_path)

In [24]:
rewards_glb = []

for i in range(num_episodes):
    frames = []
    rewards_episode = []
    done = False
    obs = env.reset()

    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # done = terminated or truncated
        rewards_episode.append(reward)

        frames.append(env.render())

    rewards_glb.append(sum(rewards_episode))
    # e.g. fps=50 == duration=20 (1000 * 1/50)
    imageio.mimwrite("model_name" +'_'+ str(i) +'.gif', frames, duration=20)

print("Rewards:", rewards_glb)
print("Average Reward:", np.mean(rewards_glb))

Rewards: [array([657.0702], dtype=float32), array([1444.3865], dtype=float32)]
Average Reward: 1050.7283
