<a href="https://colab.research.google.com/github/2303A41447/ADM-2303A51447/blob/main/reinforcement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class SimpleDroneEnv(gym.Env):
    """
    A simple 2D drone environment for RL navigation.

    The drone starts at (1, 1) and must reach the goal at
    (grid_size-2, grid_size-2) while avoiding randomly placed obstacles.

    Observation: float32 vector [x, y, goal_x, goal_y].
    Actions: 0=up (y+1), 1=down (y-1), 2=left (x-1), 3=right (x+1).
    Rewards: -0.1 per step, -10.0 for hitting an obstacle, +20.0 for
    reaching the goal.
    An episode terminates on an obstacle hit or on reaching the goal, and is
    truncated once `max_steps` steps have been taken.
    """

    def __init__(self, grid_size=10):
        """Build the spaces, place the obstacles and reset the episode state.

        Args:
            grid_size: Side length of the square grid.
        """
        super().__init__()
        self.grid_size = grid_size
        self.max_steps = 200

        # Fixed start/goal cells; kept as tuples so they can be excluded
        # from obstacle placement.
        self._start = (1, 1)
        self._goal = (grid_size - 2, grid_size - 2)

        # Observation: [x, y, goal_x, goal_y]
        self.observation_space = spaces.Box(low=0, high=grid_size, shape=(4,), dtype=np.float32)

        # Actions: 0=up, 1=down, 2=left, 3=right
        self.action_space = spaces.Discrete(4)

        self.obstacles = self._generate_obstacles()
        self.reset()

    def _generate_obstacles(self):
        """Return a set of up to 10 random interior (x, y) obstacle cells.

        Cells are drawn from the grid interior (borders excluded). The start
        and goal cells are never used, otherwise the goal could be blocked
        and the episode would be unwinnable. Duplicate draws collapse, so
        fewer than 10 obstacles may be returned.
        """
        reserved = {self._start, self._goal}
        obstacles = set()
        for _ in range(10):
            ox, oy = np.random.randint(1, self.grid_size - 1, size=2)
            cell = (int(ox), int(oy))
            if cell not in reserved:
                obstacles.add(cell)
        return obstacles

    def _get_obs(self):
        """Return the current observation [x, y, goal_x, goal_y] as float32."""
        return np.concatenate((self.agent_pos, self.goal_pos)).astype(np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode with the drone back at the start cell.

        The obstacle layout is not regenerated here.

        Returns:
            A tuple (observation, info) where info is an empty dict.
        """
        super().reset(seed=seed)
        self.steps = 0
        self.agent_pos = np.array(self._start)
        self.goal_pos = np.array(self._goal)
        return self._get_obs(), {}

    def step(self, action):
        """Move the drone one cell and score the move.

        Args:
            action: 0=up, 1=down, 2=left, 3=right. Any other value leaves
                the position unchanged.

        Returns:
            A tuple (observation, reward, terminated, truncated, info).
            `terminated` is True when the drone hits an obstacle or reaches
            the goal; `truncated` is True when the step limit is reached
            without termination.
        """
        self.steps += 1
        x, y = self.agent_pos

        if action == 0:   # up
            y += 1
        elif action == 1: # down
            y -= 1
        elif action == 2: # left
            x -= 1
        elif action == 3: # right
            x += 1

        # Stay within bounds; plain ints keep the tuple comparable with
        # the obstacle cells.
        x = int(np.clip(x, 0, self.grid_size - 1))
        y = int(np.clip(y, 0, self.grid_size - 1))
        new_pos = (x, y)

        reward = -0.1  # small step penalty
        terminated = False

        if new_pos in self.obstacles:
            reward = -10.0
            terminated = True
        elif np.array_equal(new_pos, self.goal_pos):
            reward = 20.0
            terminated = True

        self.agent_pos = np.array(new_pos)
        obs = self._get_obs()

        # A time limit is a truncation, not a terminal state of the task.
        truncated = (not terminated) and self.steps >= self.max_steps

        return obs, reward, terminated, truncated, {}

    def render(self):
        """Print the grid as text: 'X' obstacle, 'D' drone, 'G' goal.

        Row 0 of the printout is the top of the grid (largest y). The goal
        is drawn last, so it overwrites the drone marker when both share a
        cell.
        """
        grid = np.full((self.grid_size, self.grid_size), ' ')
        for (ox, oy) in self.obstacles:
            grid[self.grid_size-oy-1, ox] = 'X'
        grid[self.grid_size-self.agent_pos[1]-1, self.agent_pos[0]] = 'D'
        grid[self.grid_size-self.goal_pos[1]-1, self.goal_pos[0]] = 'G'
        print("\n".join("".join(row) for row in grid))
        print()



In [None]:
import gymnasium as gym
!pip install stable-baselines3
from stable_baselines3 import PPO
from __main__ import SimpleDroneEnv

# Create environment
env = SimpleDroneEnv(grid_size=10)

# Train PPO agent
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=50_000)

# Save model
model.save("ppo_drone_simple")

# Evaluate: roll out a single episode with the trained policy, rendering
# the grid after every step.
obs, _ = env.reset()
total_reward = 0.0
# Bound the rollout by the environment's own step limit instead of a
# duplicated literal.
for _ in range(env.max_steps):
    action, _ = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    env.render()
    # Stop on either end-of-episode signal returned by step().
    if terminated or truncated:
        break

print(f"✅ Test complete. Total reward: {total_reward:.2f}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 27.3     |
|    ep_rew_mean     | -12.6    |
| time/              |          |
|    fps             | 1198     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------


  return datetime.utcnow().replace(tzinfo=utc)


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 37.3        |
|    ep_rew_mean          | -13.6       |
| time/                   |             |
|    fps                  | 845         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019479834 |
|    clip_fraction        | 0.333       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.37       |
|    explained_variance   | 0.0453      |
|    learning_rate        | 0.0003      |
|    loss                 | 2.72        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0368     |
|    value_loss           | 8.62        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 51.7  