In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from level2 import CustomFrozenLakeEnv

# Set your environment name
environment_name = 'FrozenLake-v1'
render_mode = 'human'

# Create a custom FrozenLake environment
custom_lake = CustomFrozenLakeEnv(size=8, num_holes=8, start_point=(0, 0), end_point=(7, 7))

# ------------------------------------------------------------------------------------------
# training on random board that has been generated


from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import os
import time

# ... (unchanged CustomFrozenLakeEnv definition)

# # Create the environment
env = custom_lake
env = DummyVecEnv([lambda: env])  # Wrap the environment for compatibility

# # Create the PPO agent
ppo_agent = PPO("MlpPolicy", env, verbose=1)

# # Number of training steps
num_steps = 1000000

# # Train the agent
ppo_agent.learn(total_timesteps=num_steps)

# # Save the trained model
model_save_path = "/Users/software/Desktop/reinforcement_learning_practise/hackathon/"
os.makedirs(model_save_path, exist_ok=True)
location = os.path.join(model_save_path, "level2_custom_frozenlake_model")
ppo_agent.save(location)




# ------------------------------------------------------------------------------------------


# trial runs



Using cpu device
-----------------------------
| time/              |      |
|    fps             | 7153 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 4557        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016015537 |
|    clip_fraction        | 0.0958      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -31         |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00947    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0121     |
|    value_loss           | 0.00893     |
-----------------------------------------
-----------------

In [2]:
custom_lake.render()
custom_lake = [''.join(sublist) for sublist in custom_lake.desc]

print(custom_lake)

['SFFFHFFF', 'FHFFFFFF', 'FFFFFFFF', 'FFFFHFFF', 'FFFFFFFF', 'FFHFFHFF', 'FFFFFHFF', 'HHFFFFFG']


In [3]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from level2 import CustomFrozenLakeEnv

environment_name = 'FrozenLake-v1'
render_mode = 'human'

try:
    gym_env = gym.make(environment_name, desc=custom_lake, render_mode=render_mode)
    vec_env = DummyVecEnv([lambda: gym_env])
except gym.error.UnregisteredEnv:
    # If the Gymnasium environment is not available, use the OpenAI Gym environment
    vec_env = DummyVecEnv([lambda: gym.make(environment_name, desc=custom_lake, render_mode=render_mode)])





# Load the trained model
model = PPO.load(location)

# Set the number of episodes for the trial
num_episodes = 50

# Run a trial of various episodes
for episode in range(num_episodes):
    obs = vec_env.reset()
    total_reward = 0
    done = False
    episode_path = {"observations": [], "actions": [], "rewards": []}

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)

        # Store observations, actions, and rewards during the episode
        episode_path["observations"].append(obs.copy())
        episode_path["actions"].append(action)
        episode_path["rewards"].append(reward)

        total_reward += reward

    print(f"Episode {episode + 1} - Total Reward: {total_reward}")

    # Check if the episode was successful
    if total_reward == 1:
        print("Episode succeeded!")
        print("Observations:", episode_path["observations"])
        print("Actions:", episode_path["actions"])
        print("Rewards:", episode_path["rewards"])

# Close the environment
vec_env.close()



Episode 1 - Total Reward: [0.]
Episode 2 - Total Reward: [0.]
Episode 3 - Total Reward: [0.]
Episode 4 - Total Reward: [0.]
Episode 5 - Total Reward: [0.]
Episode 6 - Total Reward: [0.]
Episode 7 - Total Reward: [0.]
Episode 8 - Total Reward: [0.]
Episode 9 - Total Reward: [0.]
Episode 10 - Total Reward: [0.]
Episode 11 - Total Reward: [0.]
Episode 12 - Total Reward: [0.]
Episode 13 - Total Reward: [0.]
Episode 14 - Total Reward: [0.]
Episode 15 - Total Reward: [0.]
Episode 16 - Total Reward: [0.]
Episode 17 - Total Reward: [0.]
Episode 18 - Total Reward: [1.]
Episode succeeded!
Observations: [array([1]), array([1]), array([2]), array([3]), array([2]), array([3]), array([11]), array([12]), array([13]), array([21]), array([20]), array([21]), array([20]), array([21]), array([22]), array([23]), array([31]), array([30]), array([29]), array([37]), array([38]), array([37]), array([38]), array([46]), array([54]), array([46]), array([54]), array([55]), array([54]), array([55]), array([0])]
Act

: 

In [2]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from level2 import CustomFrozenLakeEnv

environment_name = 'FrozenLake-v1'
render_mode = 'human'

try:
    gym_env = gym.make(environment_name, desc=['SFFFHFFF', 'FHFFFFFF', 'FFFFFFFF', 'FFFFHFFF', 'FFFFFFFF', 'FFHFFHFF', 'FFFFFHFF', 'HHFFFFFG'], render_mode=render_mode)
    vec_env = DummyVecEnv([lambda: gym_env])
except gym.error.UnregisteredEnv:
    # If the Gymnasium environment is not available, use the OpenAI Gym environment
    vec_env = DummyVecEnv([lambda: gym.make(environment_name, desc=custom_lake, render_mode=render_mode)])


location = "/Users/software/Desktop/reinforcement_learning_practise/hackathon/level2_custom_frozenlake_model.zip"


# Load the trained model
model = PPO.load(location)

# Set the number of episodes for the trial
num_episodes = 50

# Run a trial of various episodes
for episode in range(num_episodes):
    obs = vec_env.reset()
    total_reward = 0
    done = False
    episode_path = {"observations": [], "actions": [], "rewards": []}

    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)

        # Store observations, actions, and rewards during the episode
        episode_path["observations"].append(obs.copy())
        episode_path["actions"].append(action)
        episode_path["rewards"].append(reward)

        total_reward += reward

    print(f"Episode {episode + 1} - Total Reward: {total_reward}")

    # Check if the episode was successful
    if total_reward == 1:
        print("Episode succeeded!")
        print("Observations:", episode_path["observations"])
        print("Actions:", episode_path["actions"])
        print("Rewards:", episode_path["rewards"])

# Close the environment
vec_env.close()



Episode 1 - Total Reward: [0.]
Episode 2 - Total Reward: [0.]
Episode 3 - Total Reward: [0.]
Episode 4 - Total Reward: [0.]
Episode 5 - Total Reward: [0.]
Episode 6 - Total Reward: [0.]
Episode 7 - Total Reward: [0.]
Episode 8 - Total Reward: [0.]
Episode 9 - Total Reward: [0.]
Episode 10 - Total Reward: [0.]


KeyboardInterrupt: 