# PPO on PastureEnv
This notebook runs a hyperparameter grid sweep of Stable-Baselines3 PPO on the custom `PastureEnv`, logs per-run and per-episode evaluation results to CSV, plots the metrics, and renders the best saved policy.

In [2]:
!pip  install   stable-baselines3
import os, sys, json, time, random, csv
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from custom_env import make_env
from eval_utils import evaluate_policy
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOMlpPolicy
MODELS_DIR =  Path('models') / 'ppo'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

Collecting stable-baselines3
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:

# Seed every RNG the notebook touches so that results can be reproduced.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
try:
    import torch
except ImportError:
    # Only a missing torch installation is tolerated; any error raised while
    # seeding must surface instead of silently leaving torch unseeded.
    torch = None
else:
    torch.manual_seed(SEED)

# Minimal provenance record stored next to the models and logs.
run_meta = {
    'algo': 'PPO',
    'seed': SEED,
    'env': 'PastureEnv',
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
}
with open(MODELS_DIR / 'run_meta.json', 'w', encoding='utf-8') as f:
    json.dump(run_meta, f, indent=2)
run_meta

{'algo': 'PPO',
 'seed': 42,
 'env': 'PastureEnv',
 'timestamp': '2025-11-24 11:06:34'}

In [None]:

from itertools import product

# --- Sweep budget -----------------------------------------------------------
TOTAL_TIMESTEPS = 200_000   # passed to model.learn() for every configuration
EVAL_EPISODES = 5           # evaluation episodes per trained model
RUN_LIMIT = None            # set to an int to keep only the first N configurations

# --- Hyperparameter grid ----------------------------------------------------
# Each key is forwarded to the PPO constructor as a keyword argument; each
# list holds the candidate values explored for that key.
grid = dict(
    learning_rate=[3e-4, 1e-4],
    gamma=[0.99, 0.95],
    n_steps=[1024, 2048],
    batch_size=[64, 128],
    gae_lambda=[0.95, 0.9],
    clip_range=[0.2, 0.1],
    ent_coef=[0.0, 0.01],
    vf_coef=[0.5],
)

# Cartesian product of all candidate values, one dict per configuration.
# dicts preserve insertion order, so grid.values() lines up with `keys`.
keys = list(grid)
combos = [dict(zip(keys, choice)) for choice in product(*grid.values())]
if RUN_LIMIT is not None:
    combos = combos[:RUN_LIMIT]
len(combos)

128

In [None]:

from collections import Counter
def eval_model(model, episodes=EVAL_EPISODES, n_actions=5):
    """Evaluate a trained model and collect per-episode rewards and action usage.

    Evaluation happens in two passes: ``evaluate_policy`` (from ``eval_utils``)
    produces the aggregate ``stats``, then a separate deterministic rollout on
    a fresh environment records every chosen action and each episode's total
    reward.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns ``(action, state)``.
        episodes: Number of episodes for both passes.
        n_actions: Length of the returned action histogram. The default of 5
            keeps the previously hard-coded size.

    Returns:
        Tuple ``(stats, action_counts, ep_rewards)`` where ``stats`` is
        whatever ``evaluate_policy`` returned, ``action_counts[i]`` is how
        often action ``i`` was chosen in the manual rollout, and
        ``ep_rewards`` lists the total reward of each rollout episode.
    """
    env_fn = lambda: make_env(render_mode=None)
    stats = evaluate_policy(env_fn, model, episodes=episodes)
    env = env_fn()
    counts = Counter()
    ep_rewards = []
    try:
        for _ in range(episodes):
            obs, _ = env.reset()
            done = trunc = False
            total = 0.0
            while not (done or trunc):
                action, _ = model.predict(obs, deterministic=True)
                # Convert once so the counter key and the env see the same value.
                action = int(action)
                counts[action] += 1
                obs, r, done, trunc, info = env.step(action)
                total += r
            ep_rewards.append(total)
    finally:
        # Release the environment even if predict/step raises mid-rollout.
        env.close()
    action_counts = [counts.get(i, 0) for i in range(n_actions)]
    return stats, action_counts, ep_rewards

In [None]:
# Training loop with logging and saving: output locations.
# models_dir is an alias of MODELS_DIR; both CSV logs sit directly inside it.
models_dir = MODELS_DIR
runs_csv = models_dir.joinpath('ppo_runs.csv')          # one summary row per run
episodes_csv = models_dir.joinpath('ppo_episodes.csv')  # one row per evaluation episode
def ensure_csv_headers(file, headers):
    """Create ``file`` with a single CSV header row unless it already exists.

    Args:
        file: Path-like object providing ``exists()``; the CSV file to create.
        headers: Sequence of column names written as the first row.

    An existing file is left untouched, whatever its contents.
    """
    if file.exists():
        return
    with file.open('w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(headers)
# Create both CSV logs with their header row on first use. Existing files are
# kept, so re-running this cell appends rows to the earlier logs.
ensure_csv_headers(runs_csv, ['run_id','timestamp','params','avg_reward','std_reward','avg_length','mean_hunger','mean_thirst','grazing_balance','action_counts','best','time_sec'])
ensure_csv_headers(episodes_csv, ['run_id','episode','reward'])

best_mean = -1e9   # sentinel below any reward expected from the sweep
best_path = None   # stays None until the first checkpoint is written
run_id = 0         # keeps the name defined even when `combos` is empty
for run_id, params in enumerate(combos, start=1):
    t0 = time.time()
    env = make_env(render_mode=None)
    try:
        model = PPO(
            policy=PPOMlpPolicy,
            env=env,
            verbose=0,
            # Seed each run explicitly so a configuration can be reproduced on
            # its own, independent of RNG state left behind by earlier runs.
            seed=SEED,
            tensorboard_log=str(models_dir / 'tb'),
            **params
        )
        model.learn(total_timesteps=TOTAL_TIMESTEPS)
    finally:
        # Release the training environment even if construction/training fails.
        env.close()

    stats, action_counts, ep_rewards = eval_model(model, episodes=EVAL_EPISODES)
    # Prefer the helper's aggregate; fall back to the mean of the manual rollout.
    mean_reward = stats.get('avg_reward', float(np.mean(ep_rewards)))

    # Keep only the best checkpoint so far; it is overwritten in place.
    is_best = ''
    if mean_reward > best_mean:
        best_mean = mean_reward
        best_path = models_dir / 'best_ppo.zip'
        model.save(best_path)
        is_best = 'yes'

    # NOTE(review): the 'grazing_balance' column is filled from the
    # 'grazing_balance_mean' stats key - confirm that is the key
    # evaluate_policy actually returns.
    run_row = [
        run_id,
        time.strftime('%Y-%m-%d %H:%M:%S'),
        str(params),
        stats.get('avg_reward'),
        stats.get('std_reward'),
        stats.get('avg_length'),
        stats.get('mean_hunger'),
        stats.get('mean_thirst'),
        stats.get('grazing_balance_mean'),
        '|'.join(map(str, action_counts)),
        is_best,
        f'{time.time()-t0:.1f}',
    ]
    with open(runs_csv, 'a', newline='') as f:
        csv.writer(f).writerow(run_row)
    with open(episodes_csv, 'a', newline='') as f:
        csv.writer(f).writerows(
            [run_id, i, r] for i, r in enumerate(ep_rewards, 1)
        )
best_path

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:

# MODELS_DIR already points at models/ppo, and the training cell writes both
# logs directly into it, so no extra 'ppo' path component belongs here.
runs_csv = MODELS_DIR / 'ppo_runs.csv'
episodes_csv = MODELS_DIR / 'ppo_episodes.csv'

# Mean evaluation reward of every logged run, in log order.
if runs_csv.exists():
    runs = pd.read_csv(runs_csv)
    display(runs.tail())
    plt.figure(figsize=(7,4))
    plt.plot(runs['avg_reward'], marker='o')
    plt.title('PPO sweep: mean reward per run')
    plt.xlabel('run idx')
    plt.ylabel('avg_reward')
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    # Say so explicitly instead of silently producing no figure.
    print(f'No run log found at {runs_csv}. Run training first.')

# One line per run_id showing the reward of each evaluation episode.
if episodes_csv.exists():
    eps = pd.read_csv(episodes_csv)
    plt.figure(figsize=(7,4))
    for rid, df in eps.groupby('run_id'):
        plt.plot(df['episode'], df['reward'], alpha=0.5)
    plt.title('Per-episode rewards across runs (PPO)')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print(f'No episode log found at {episodes_csv}. Run training first.')

In [None]:

# Replay one deterministic episode with the best checkpoint saved by the sweep.
best_path = MODELS_DIR / 'best_ppo.zip'
if best_path.exists():
    # NOTE(review): the environment is created with render_mode=None, so the
    # env.render() call below presumably draws nothing - confirm which render
    # modes make_env supports if a visual replay is intended.
    env = make_env(render_mode=None, render_fps=10)
    try:
        model = PPO.load(str(best_path), env=env)
        obs, _ = env.reset()
        done = trunc = False
        total = 0.0
        while not (done or trunc):
            action, _ = model.predict(obs, deterministic=True)
            obs, r, done, trunc, info = env.step(int(action))
            total += r
            env.render()
    finally:
        # Close the environment even if loading or stepping raises.
        env.close()
    print('Episode reward:', total)
else:
    print('Best model not found yet. Run training first.')