# REINFORCE on PastureEnv
This notebook implements a simple episodic REINFORCE (policy gradient) agent for the custom `PastureEnv`, logs results to CSV, plots metrics, and renders the best policy.

In [19]:
import os, sys, json, time, random, csv
from pathlib import Path
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
from custom_env import make_env
from eval_utils import evaluate_policy
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOMlpPolicy
MODELS_DIR =  Path('models') / 'reinforce'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [20]:
# 2) Reproducibility and Run Metadata
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
run_meta = {
    'algo': 'REINFORCE',
    'seed': SEED,
    'env': 'PastureEnv',
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
}
with open(MODELS_DIR / 'run_meta.json', 'w') as f: json.dump(run_meta, f, indent=2)
run_meta

{'algo': 'REINFORCE',
 'seed': 42,
 'env': 'PastureEnv',
 'timestamp': '2025-11-24 11:27:25'}

In [None]:
from dataclasses import dataclass
import torch.nn as nn
from typing import List


@dataclass
class ReinforceConfig:
    learning_rate: float = 1e-3
    gamma: float = 0.99
    hidden_size: int = 128
    entropy_coef: float = 0.01

class ReinforcePolicy(nn.Module):
    def __init__(self, obs_dim: int, act_dim: int, hidden: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, act_dim)
        )
    def forward(self, x):
        return self.net(x)
    def predict(self, obs):
        with torch.no_grad():
            x = torch.from_numpy(obs).float().unsqueeze(0)
            logits = self.forward(x)
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            return dist.sample().item()

def compute_returns(rewards: List[float], gamma: float) -> List[float]:
    G = 0.0; out = []
    for r in reversed(rewards):
        G = r + gamma * G
        out.append(G)
    return list(reversed(out))

In [None]:

from collections import Counter
def eval_policy(policy, episodes=5):
    env_fn = lambda: make_env(render_mode=None)
    stats = evaluate_policy(env_fn, policy, episodes=episodes)
    env = env_fn()
    counts = Counter()
    ep_rewards = []
    for _ in range(episodes):
        obs, _ = env.reset()
        done, trunc = False, False
        total = 0.0
        while not (done or trunc):
            action = policy.predict(obs)
            counts[int(action)] += 1
            obs, r, done, trunc, info = env.step(int(action))
            total += r
        ep_rewards.append(total)
    env.close()
    action_counts = [counts.get(i, 0) for i in range(5)]
    return stats, action_counts, ep_rewards

In [None]:
# Hyperparameters and Training Loop
TOTAL_EPISODES = 10000
EVAL_EPISODES = 5
RUN_LIMIT = None
grid = {
    'learning_rate': [1e-3, 5e-4],
    'gamma': [0.95, 0.99],
    'hidden_size': [64, 128],
    'entropy_coef': [0.0, 0.01],
}
from itertools import product
import torch.optim as optim
keys = list(grid.keys())
combos = [dict(zip(keys, vals)) for vals in product(*[grid[k] for k in keys])]
if RUN_LIMIT is not None: combos = combos[:RUN_LIMIT]
def train_run(params: dict, run_id: int):
    env = make_env(render_mode=None)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    policy = ReinforcePolicy(obs_dim, act_dim, params['hidden_size'])
    optimizer = optim.Adam(policy.parameters(), lr=params['learning_rate'])
    baseline = None
    all_returns = []
    for ep in range(1, TOTAL_EPISODES + 1):
        obs, _ = env.reset()
        done = False; trunc = False
        ep_rewards = []
        ep_log_probs = []
        ep_entropies = []
        while not (done or trunc):
            x = torch.from_numpy(obs).float().unsqueeze(0)
            logits = policy(x)
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            entropy = dist.entropy().mean()
            nxt, r, done, trunc, info = env.step(int(action.item()))
            ep_rewards.append(r)
            ep_log_probs.append(log_prob)
            ep_entropies.append(entropy)
            obs = nxt
        returns = compute_returns(ep_rewards, params['gamma'])
        returns_t = torch.tensor(returns, dtype=torch.float32)
        if baseline is None: baseline = returns_t.mean().item()
        baseline = 0.9 * baseline + 0.1 * returns_t.mean().item()
        adv = returns_t - baseline
        loss = -(torch.stack(ep_log_probs) * adv.detach()).mean() - params['entropy_coef'] * torch.stack(ep_entropies).mean()
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        all_returns.append(sum(ep_rewards))
        if ep % 50 == 0:
            print(f"[REINFORCE Run {run_id}] Episode {ep}/{TOTAL_EPISODES} avg_last50={np.mean(all_returns[-50:]):.2f}")
    env.close()
    return policy
models_dir = MODELS_DIR
runs_csv = models_dir / 'reinforce_runs.csv'
episodes_csv = models_dir / 'reinforce_episodes.csv'
def ensure_csv_headers(file, headers):
    if not file.exists():
        with open(file, 'w', newline='') as f:
            csv.writer(f).writerow(headers)
ensure_csv_headers(runs_csv, ['run_id','timestamp','params','avg_reward','std_reward','avg_length','mean_hunger','mean_thirst','grazing_balance','best','time_sec'])
ensure_csv_headers(episodes_csv, ['run_id','episode','reward'])
best_mean = -1e9
best_path = None
run_id = 0
for params in combos:
    run_id += 1
    t0 = time.time()
    policy = train_run(params, run_id)
    stats, action_counts, ep_rewards = eval_policy(policy, episodes=EVAL_EPISODES)
    mean_reward = stats.get('avg_reward', float(np.mean(ep_rewards)))
    is_best = ''
    if mean_reward > best_mean:
        best_mean = mean_reward
        best_path = models_dir / 'best_reinforce.pt'
        torch.save({'state_dict': policy.state_dict(), 'config': params}, best_path)
        is_best = 'yes'
    with open(runs_csv, 'a', newline='') as f:
        csv.writer(f).writerow([run_id, time.strftime('%Y-%m-%d %H:%M:%S'), str(params), stats.get('avg_reward'), stats.get('std_reward'), stats.get('avg_length'), stats.get('mean_hunger'), stats.get('mean_thirst'), stats.get('grazing_balance_mean'), is_best, f'{time.time()-t0:.1f}'])
    with open(episodes_csv, 'a', newline='') as f:
        w = csv.writer(f)
        for i, r in enumerate(ep_rewards, 1):
            w.writerow([run_id, i, r])
best_path

[REINFORCE Run 1] Episode 50/10000 avg_last50=34.18
[REINFORCE Run 1] Episode 100/10000 avg_last50=111.51
[REINFORCE Run 1] Episode 150/10000 avg_last50=141.83
[REINFORCE Run 1] Episode 200/10000 avg_last50=108.70
[REINFORCE Run 1] Episode 250/10000 avg_last50=163.98
[REINFORCE Run 1] Episode 300/10000 avg_last50=195.31
[REINFORCE Run 1] Episode 350/10000 avg_last50=195.93
[REINFORCE Run 1] Episode 400/10000 avg_last50=236.30
[REINFORCE Run 1] Episode 450/10000 avg_last50=172.89
[REINFORCE Run 1] Episode 500/10000 avg_last50=158.94
[REINFORCE Run 1] Episode 550/10000 avg_last50=249.95
[REINFORCE Run 1] Episode 600/10000 avg_last50=207.47
[REINFORCE Run 1] Episode 650/10000 avg_last50=217.76
[REINFORCE Run 1] Episode 700/10000 avg_last50=243.96
[REINFORCE Run 1] Episode 750/10000 avg_last50=220.14
[REINFORCE Run 1] Episode 800/10000 avg_last50=216.55
[REINFORCE Run 1] Episode 850/10000 avg_last50=240.64
[REINFORCE Run 1] Episode 900/10000 avg_last50=246.92
[REINFORCE Run 1] Episode 950/

PosixPath('models/reinforce/best_reinforce.pt')

In [None]:
runs_csv = MODELS_DIR / 'reinforce' / 'reinforce_runs.csv'
episodes_csv = MODELS_DIR / 'reinforce' / 'reinforce_episodes.csv'
if runs_csv.exists():
    runs = pd.read_csv(runs_csv)
    display(runs.tail())
    plt.figure(figsize=(7,4))
    plt.plot(runs['avg_reward'], marker='o')
    plt.title('REINFORCE sweep: mean reward per run')
    plt.xlabel('run idx')
    plt.ylabel('avg_reward')
    plt.grid(True, alpha=0.3)
    plt.show()
if episodes_csv.exists():
    eps = pd.read_csv(episodes_csv)
    plt.figure(figsize=(7,4))
    for rid, df in eps.groupby('run_id'):
        plt.plot(df['episode'], df['reward'], alpha=0.5)
    plt.title('Per-episode rewards across runs (REINFORCE)')
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.grid(True, alpha=0.3)
    plt.show()

In [25]:
# 7) Visualize Best Model
from pathlib import Path
best_path = MODELS_DIR / 'best_reinforce.pt'
if Path(best_path).exists():
    env = make_env(render_mode=None, render_fps=10)
    # Load saved model
    ckpt = torch.load(best_path, map_location='cpu')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    hidden = ckpt.get('config', {}).get('hidden_size', 128)
    policy = ReinforcePolicy(obs_dim, act_dim, hidden)
    policy.load_state_dict(ckpt['state_dict'])
    policy.eval()
    obs, _ = env.reset()
    done = False; trunc = False; total = 0.0
    while not (done or trunc):
        action = policy.predict(obs)
        obs, r, done, trunc, info = env.step(int(action))
        total += r
        env.render()
    env.close()
    print('Episode reward:', total)
else:
    print('Best model not found yet. Run training first.')

Episode reward: 407.84884188771224
