
# Implementación manual — A2C, PPO y SAC sobre *PointEnv*

Este notebook consolida **tus tres scripts originales** en un único flujo ejecutable, con breves explicaciones:

- **A2C** (entorno discreto)  
- **PPO** (entorno discreto)  
- **SAC** (entorno continuo)  

Se preserva la lógica original de tus archivos `.py` y se **mantienen las salidas** (gráficos y PDFs) en las carpetas correspondientes.


In [None]:

# Configuración básica para notebooks
%matplotlib inline
import os
print("CWD:", os.getcwd())

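# To redirect the outputs, set the environment variables OUT_A2C, OUT_PPO and/or OUT_SAC
# here (e.g. os.environ["OUT_A2C"] = "./my_a2c_output") before running the cells below.
# By default each script creates its own ./output_* folder if it does not exist.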



---
## 1) A2C — *Advantage Actor–Critic* (acciones discretas)

**Idea:** una política categórica elige entre las acciones {-1, 0, +1} y un crítico estima el valor del estado \( V(s_t) \).  
Se entrena con la ventaja \( \hat{A}_t = G_t - V(s_t) \) (retorno descontado menos valor estimado) y un **bono de entropía** para fomentar la exploración.

**Ejecución:** al correr la celda siguiente, entrena por 400 episodios, guarda el modelo y genera:
- `./output_a2c_pointenv/a2c_pointenv_model.pth`
- `./output_a2c_pointenv/convergence_a2c.png`
- `./output_a2c_pointenv/A2C_PointEnv_Report.pdf`


In [None]:
#!/usr/bin/env python3
"""
a2c_pointenv.py
A2C (Advantage Actor-Critic) on the discrete PointEnv (actions {-1, 0, +1}).
Saves: a2c_pointenv_model.pth, convergence_a2c.png, A2C_PointEnv_Report.pdf, README.txt
Requirements: torch, matplotlib, numpy
Run: python a2c_pointenv.py
"""
import os, math, random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime

# ----------------------------
# Output dir (portable)
OUT_DIR = os.environ.get("OUT_A2C", "./output_a2c_pointenv")
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------------
# Entorno discreto simple
class PointEnvDiscrete:
    """One-dimensional point-reaching task with three discrete actions.

    The state is the position of a point on the real line and the goal is
    the target at 0.0. Action indices 0, 1 and 2 push the point by
    ``-force``, ``0`` and ``+force`` respectively, plus Gaussian noise.
    """

    def __init__(self, max_steps=50, force=0.6, noise_scale=0.02):
        self.max_steps = max_steps
        self.force = force
        self.noise_scale = noise_scale
        self.reset()

    def _observation(self):
        # The observation is just the current position as a float32 vector.
        return np.array([self.pos], dtype=np.float32)

    def reset(self):
        """Start a new episode at a uniformly drawn position and return the observation."""
        self.target = 0.0
        self.steps = 0
        self.pos = np.random.uniform(-1.5, 1.5)
        return self._observation()

    def step(self, action_index):
        """Apply one action and return ``(observation, reward, done, info)``.

        The reward is the negative distance to the target, with a +1.0 bonus
        when the point lands within 0.05 of it. The episode ends on reaching
        the target or after ``max_steps`` steps.
        """
        direction = (-1, 0, 1)[action_index]
        jitter = np.random.normal(scale=self.noise_scale)
        self.pos = self.pos + direction * self.force + jitter
        self.steps += 1

        distance = abs(self.pos - self.target)
        reached = distance < 0.05
        reward = -distance + 1.0 if reached else -distance
        done = bool(reached or self.steps >= self.max_steps)
        return self._observation(), float(reward), done, {}

    @property
    def obs_shape(self):
        """Shape of a single observation."""
        return (1,)

    @property
    def n_actions(self):
        """Number of discrete actions."""
        return 3

# ----------------------------
# Actor-Critic network
class ActorCritic(nn.Module):
    """Actor-critic network: a shared trunk feeding a policy head and a value head."""

    def __init__(self, obs_dim, act_dim, hidden=64):
        super().__init__()
        half = hidden // 2
        # Submodule names and creation order are kept stable so saved
        # state_dicts and seeded initialisation stay compatible.
        self.shared = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh())
        self.policy = nn.Sequential(
            nn.Linear(hidden, half),
            nn.Tanh(),
            nn.Linear(half, act_dim),
        )
        self.value = nn.Sequential(
            nn.Linear(hidden, half),
            nn.Tanh(),
            nn.Linear(half, 1),
        )

    def forward(self, x):
        """Return ``(logits, value)``; the value has its trailing size-1 dimension removed."""
        features = self.shared(x)
        return self.policy(features), self.value(features).squeeze(-1)

# ----------------------------
# Helpers
def select_action(logits):
    """Sample one action from the categorical policy defined by ``logits``.

    The distribution is built directly from the logits instead of from
    ``softmax(logits)``, which avoids re-normalising and clamping the
    probabilities before taking their log.

    Args:
        logits: Tensor of unnormalised action scores for a single state
            (the sampled action is converted with ``.item()``).

    Returns:
        Tuple ``(action, log_prob, entropy)``: the sampled action index as a
        Python int, and the log-probability of that action and the policy
        entropy as tensors attached to the autograd graph of ``logits``.
    """
    dist = torch.distributions.Categorical(logits=logits)
    a = dist.sample()
    return a.item(), dist.log_prob(a), dist.entropy()

def discounted_returns(rewards, dones, last_value, gamma=0.99):
    """Compute discounted returns for a rollout, walking it backwards.

    Args:
        rewards: Sequence of per-step rewards.
        dones: Sequence of per-step termination flags, same length as
            ``rewards``. A truthy flag resets the running return so nothing
            is bootstrapped across an episode boundary.
        last_value: Value estimate used to bootstrap after the final step.
        gamma: Discount factor.

    Returns:
        List of returns, one per step, in the same order as ``rewards``.
    """
    R = last_value
    returns = []
    for r, d in zip(reversed(rewards), reversed(dones)):
        if d:
            R = 0.0
        R = r + gamma * R
        # Append and reverse once at the end: inserting at index 0 on every
        # step would make the loop quadratic in the rollout length.
        returns.append(R)
    returns.reverse()
    return returns

# ----------------------------
# Hiperparámetros
# Seed NumPy (environment noise / resets) and PyTorch (weights / sampling).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Training schedule
NUM_EPISODES = 400
MAX_STEPS = 50
PRINT_EVERY = 25

# Optimisation and loss weights
GAMMA = 0.99
LR = 2.5e-4
VALUE_COEF = 0.5
ENTROPY_COEF = 1e-3

# ----------------------------
# Environment, model and optimizer
env = PointEnvDiscrete(max_steps=MAX_STEPS)
obs_dim = env.obs_shape[0]
act_dim = env.n_actions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ActorCritic(obs_dim, act_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Per-episode reward history and its exponential moving average.
episode_rewards, smoothed = [], []
alpha_smooth = 0.04

# Training loop
for ep in range(1, NUM_EPISODES + 1):
    # --- Rollout: play one full episode with the current policy ---
    obs = env.reset()
    obs_t = torch.tensor(obs, dtype=torch.float32, device=device)
    done = False
    ep_reward = 0.0

    # Per-step buffers; logps, values and entropies keep their autograd
    # graph because the loss below is back-propagated through them.
    logps = []
    values = []
    rewards = []
    dones = []
    entropies = []

    for step in range(MAX_STEPS):
        logits, value = model(obs_t.unsqueeze(0))
        logits = logits.squeeze(0)
        value = value.squeeze(0)
        action, logp, entropy = select_action(logits)
        next_obs, reward, done, _ = env.step(action)
        ep_reward += reward

        logps.append(logp)
        values.append(value)
        rewards.append(reward)
        dones.append(done)
        entropies.append(entropy)

        obs_t = torch.tensor(next_obs, dtype=torch.float32, device=device)
        if done:
            break

    # Bootstrap from the critic only when the rollout stopped without `done`.
    if done:
        last_value = 0.0
    else:
        with torch.no_grad():
            _, last_value = model(obs_t.unsqueeze(0))
            last_value = last_value.item()

    # --- Update: one gradient step on the whole episode ---
    returns = discounted_returns(rewards, dones, last_value, GAMMA)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    values = torch.stack(values)
    logps = torch.stack(logps)
    entropies = torch.stack(entropies)

    # The advantage is a fixed weight for the policy gradient, so it is
    # detached from the critic.
    advantages = returns - values.detach()

    # The critic loss must use the NON-detached values; computing it from the
    # detached advantages yields a constant with no gradient, so the value
    # head would never be trained.
    value_loss = (returns - values).pow(2).mean()
    policy_loss = -(logps * advantages).mean()
    entropy_loss = -entropies.mean()

    loss = policy_loss + VALUE_COEF * value_loss + ENTROPY_COEF * entropy_loss

    optimizer.zero_grad()
    loss.backward()
    # optional: torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    # --- Bookkeeping: raw reward and exponential moving average ---
    episode_rewards.append(ep_reward)
    if len(smoothed) == 0:
        smoothed.append(ep_reward)
    else:
        smoothed.append(smoothed[-1] * (1 - alpha_smooth) + ep_reward * alpha_smooth)

    if (ep % PRINT_EVERY == 0) or (ep == 1):
        # episode_rewards is never empty here: it was appended to just above.
        avg100 = np.mean(episode_rewards[-100:])
        print(f"Ep {ep}/{NUM_EPISODES} Reward {ep_reward:.3f} Avg100 {avg100:.3f} Loss {loss.item():.4f}")

# Save model and results
# Persist the trained weights.
model_path = os.path.join(OUT_DIR, "a2c_pointenv_model.pth")
torch.save(model.state_dict(), model_path)

# Convergence plot: raw episode rewards plus their exponential moving average.
plot_path = os.path.join(OUT_DIR, "convergence_a2c.png")
episodes = np.arange(1, len(episode_rewards) + 1)
plt.figure(figsize=(9, 5))
plt.plot(episodes, episode_rewards, alpha=0.25, label="Reward por episodio")
plt.plot(episodes, smoothed, label=f"Media exponencial α={alpha_smooth}")
plt.xlabel("Episodio")
plt.ylabel("Reward")
plt.title("A2C Convergencia - PointEnv (discreto)")
plt.legend()
plt.grid(True)
plt.savefig(plot_path, bbox_inches="tight")
plt.close()

# PDF report: description page, code-summary page, then the plot.
a4_size = (8.27, 11.69)
summary_lines = [
    "A2C aplicado a PointEnv (discreto)",
    "",
    f"Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    "Problema: punto 1D que debe llegar a 0.0. Acciones discretas {-1,0,+1}.",
    "Algoritmo: A2C (Actor-Critic).",
    "",
    f"Episodes: {NUM_EPISODES}, max_steps: {MAX_STEPS}",
    "",
    "Se incluyen fragmentos de código y gráfica de convergencia.",
]
snippet_lines = [
    "Fragmentos de código:",
    "",
    "ActorCritic: shared -> policy logits ; value scalar",
    "select_action: Categorical(softmax(logits))",
    "returns = discounted_rewards; advantages = returns - values.detach()",
    "loss = policy_loss + VALUE_COEF * value_loss + ENTROPY_COEF * entropy_loss",
]
pdf_path = os.path.join(OUT_DIR, "A2C_PointEnv_Report.pdf")
with PdfPages(pdf_path) as pdf:
    # Pages 1 and 2 are plain text pages that differ only in content and font size.
    for page_lines, font_size in ((summary_lines, 11), (snippet_lines, 10)):
        plt.figure(figsize=a4_size)
        plt.axis("off")
        plt.text(0.02, 0.98, "\n".join(page_lines), va="top", wrap=True, fontsize=font_size)
        pdf.savefig()
        plt.close()

    # Page 3 embeds the PNG that was just written to disk.
    plt.figure(figsize=a4_size)
    plt.imshow(plt.imread(plot_path))
    plt.axis("off")
    pdf.savefig()
    plt.close()

# README listing the generated artefacts.
readme_path = os.path.join(OUT_DIR, "README.txt")
readme_text = (
    "A2C PointEnv - Output\n"
    f"Model: {model_path}\nPlot: {plot_path}\nPDF: {pdf_path}\nEpisodes: {NUM_EPISODES}\n"
)
with open(readme_path, "w") as f:
    f.write(readme_text)
print("A2C: entrenamiento finalizado. Archivos en:", OUT_DIR)



---
## 2) PPO — *Proximal Policy Optimization* (discreto)

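**Clipped surrogate objective**: maximiza una cota inferior (pesimista) del objetivo de política, recortando el cociente de probabilidades \(r_t(\theta)\) al intervalo \([1-\epsilon,\, 1+\epsilon]\) para evitar actualizaciones demasiado grandes;  
usa **GAE(\(\lambda\))** para estimar las ventajas y realiza múltiples épocas de optimización sobre el mismo lote de trayectorias.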

**Ejecución:** al correr la celda siguiente, entrena por 400 episodios, guarda y produce:
- `./output_ppo_pointenv/ppo_pointenv_model.pth`
- `./output_ppo_pointenv/convergence_ppo.png`
- `./output_ppo_pointenv/PPO_PointEnv_Report.pdf`


In [None]:
#!/usr/bin/env python3
"""
ppo_pointenv.py
PPO (clipped surrogate objective) on the discrete PointEnv.
Saves: ppo_pointenv_model.pth, convergence_ppo.png, PPO_PointEnv_Report.pdf, README.txt
Run: python ppo_pointenv.py
"""
import os, random
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime

OUT_DIR = os.environ.get("OUT_PPO", "./output_ppo_pointenv")
os.makedirs(OUT_DIR, exist_ok=True)

# Same discrete environment as A2C
class PointEnvDiscrete:
    """1-D point environment with discrete actions {-1, 0, +1} and target 0.0."""

    # Displacement direction selected by each action index.
    _DIRECTIONS = (-1, 0, 1)

    def __init__(self, max_steps=50, force=0.6, noise_scale=0.02):
        self.max_steps = max_steps
        self.force = force
        self.noise_scale = noise_scale
        self.reset()

    def reset(self):
        """Draw a new uniform start position, clear the step counter, return the observation."""
        self.pos = np.random.uniform(-1.5, 1.5)
        self.steps, self.target = 0, 0.0
        return np.array([self.pos], dtype=np.float32)

    def step(self, action_index):
        """Move the point and return ``(observation, reward, done, info)``.

        Reward is minus the distance to the target, plus 1.0 when the point
        ends within 0.05 of it. ``done`` is set on reaching the target or
        when ``max_steps`` steps have elapsed.
        """
        push = self._DIRECTIONS[action_index] * self.force
        noise = np.random.normal(scale=self.noise_scale)
        self.pos = self.pos + push + noise
        self.steps += 1

        dist = abs(self.pos - self.target)
        at_target = dist < 0.05
        timed_out = self.steps >= self.max_steps
        reward = -dist
        if at_target:
            reward += 1.0
        observation = np.array([self.pos], dtype=np.float32)
        return observation, float(reward), bool(at_target or timed_out), {}

    @property
    def obs_shape(self):
        """Shape of a single observation."""
        return (1,)

    @property
    def n_actions(self):
        """Number of discrete actions."""
        return 3

# Network: shared base, logits and value
class ActorCritic(nn.Module):
    """Shared Tanh trunk with separate heads for action logits and state value."""

    def __init__(self, obs_dim, act_dim, hidden=64):
        super().__init__()
        head_width = hidden // 2

        def make_head(out_features):
            # Two-layer MLP head on top of the shared features.
            return nn.Sequential(
                nn.Linear(hidden, head_width),
                nn.Tanh(),
                nn.Linear(head_width, out_features),
            )

        # Creation order (shared, policy, value) is preserved so that seeded
        # initialisation and state_dict keys match the original layout.
        self.shared = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh())
        self.policy = make_head(act_dim)
        self.value = make_head(1)

    def forward(self, x):
        """Return ``(logits, v)`` where ``v`` has its last size-1 dimension squeezed."""
        hidden_features = self.shared(x)
        state_value = self.value(hidden_features).squeeze(-1)
        return self.policy(hidden_features), state_value

def sample_action_and_logp(logits):
    """Sample an action from the categorical policy given by ``logits``.

    The distribution is parameterised directly by the logits rather than by
    ``softmax(logits)``, so log-probabilities come from a log-softmax instead
    of the log of re-normalised, clamped probabilities.

    Args:
        logits: Tensor of unnormalised action scores for a single state
            (the sampled action is converted with ``.item()``).

    Returns:
        Tuple ``(action, log_prob, entropy)``: the action index as a Python
        int, plus the log-probability of that action and the distribution
        entropy as tensors.
    """
    dist = torch.distributions.Categorical(logits=logits)
    a = dist.sample()
    return a.item(), dist.log_prob(a), dist.entropy()

def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """Compute GAE(lambda) advantages and the matching value targets.

    Args:
        rewards: Sequence of per-step rewards.
        values: Sequence of value estimates for the visited states, same
            length as ``rewards``.
        dones: Sequence of per-step termination flags; a truthy flag stops
            both the bootstrap and the advantage accumulation at that step.
        last_value: Value estimate for the state after the final step.
        gamma: Discount factor.
        lam: GAE smoothing parameter.

    Returns:
        Tuple ``(returns, adv)``: ``returns`` is a list holding
        ``advantage + value`` for each step, and ``adv`` is a NumPy array
        with the advantages themselves.
    """
    values = list(values) + [last_value]
    gae = 0.0
    returns = []
    for step in reversed(range(len(rewards))):
        not_done = 1 - dones[step]
        delta = rewards[step] + gamma * values[step + 1] * not_done - values[step]
        gae = delta + gamma * lam * not_done * gae
        # Collected back-to-front and reversed once below; inserting at the
        # head on each step would be quadratic in the rollout length.
        returns.append(gae + values[step])
    returns.reverse()
    adv = np.array(returns) - np.array(values[:-1])
    return returns, adv

# Hiperparams
# Seed NumPy (environment, mini-batch shuffling) and PyTorch (weights, sampling).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Rollout schedule: one episode is collected per outer iteration.
NUM_EPOCHS = 400
MAX_STEPS = 50

# Return / advantage estimation
GAMMA = 0.99
LAM = 0.95

# Optimisation
LR = 3e-4
CLIP = 0.2
UPDATE_EPOCHS = 6
BATCH_SIZE = 64
ENTROPY_COEF = 1e-3
VALUE_COEF = 0.5

# Environment, model and optimizer
env = PointEnvDiscrete(max_steps=MAX_STEPS)
obs_dim = env.obs_shape[0]
act_dim = env.n_actions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ActorCritic(obs_dim, act_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Per-episode reward history and its exponential moving average.
episode_rewards = []
smoothed = []
alpha = 0.04

# Training loop - collect trajectories then update
for ep in range(1, NUM_EPOCHS+1):
    # --- Rollout: collect one episode with the current (old) policy ---
    obs = env.reset()
    obs_t = torch.tensor(obs, dtype=torch.float32, device=device)
    done = False
    ep_reward = 0.0

    obs_buf = []
    actions_buf = []
    logp_buf = []
    rewards_buf = []
    dones_buf = []
    values_buf = []

    for step in range(MAX_STEPS):
        # Only plain numbers are stored from the rollout, so the forward pass
        # does not need an autograd graph.
        with torch.no_grad():
            logits, value = model(obs_t.unsqueeze(0))
            a, logp, _ = sample_action_and_logp(logits.squeeze(0))
        next_obs, reward, done, _ = env.step(a)
        ep_reward += reward

        obs_buf.append(obs_t.cpu().numpy())
        actions_buf.append(a)
        logp_buf.append(logp.item())
        rewards_buf.append(reward)
        dones_buf.append(done)
        values_buf.append(value.squeeze(0).item())

        obs_t = torch.tensor(next_obs, dtype=torch.float32, device=device)
        if done:
            break

    # Bootstrap from the critic only when the rollout stopped without `done`.
    if done:
        last_value = 0.0
    else:
        with torch.no_grad():
            _, last_value = model(obs_t.unsqueeze(0))
            last_value = last_value.item()

    returns, advantages = compute_gae(rewards_buf, values_buf, dones_buf, last_value, GAMMA, LAM)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    advantages = torch.tensor(advantages, dtype=torch.float32, device=device)
    obs_arr = torch.tensor(np.vstack(obs_buf), dtype=torch.float32, device=device)
    actions_arr = torch.tensor(actions_buf, dtype=torch.long, device=device)
    old_logps = torch.tensor(logp_buf, dtype=torch.float32, device=device)

    # --- PPO update: several epochs of shuffled mini-batches over the episode ---
    dataset_size = len(actions_buf)
    for _ in range(UPDATE_EPOCHS):
        idxs = np.arange(dataset_size)
        np.random.shuffle(idxs)
        for start in range(0, dataset_size, BATCH_SIZE):
            mb_idx = idxs[start:start + BATCH_SIZE]
            mb_obs = obs_arr[mb_idx]
            mb_actions = actions_arr[mb_idx]
            mb_oldlogp = old_logps[mb_idx]
            mb_returns = returns[mb_idx]
            mb_adv = advantages[mb_idx]

            logits, vals = model(mb_obs)
            dist = torch.distributions.Categorical(logits=logits)
            mb_logp = dist.log_prob(mb_actions)

            # Clipped surrogate: take the more pessimistic of the unclipped
            # and clipped probability-ratio objectives.
            ratio = torch.exp(mb_logp - mb_oldlogp)
            surr1 = ratio * mb_adv
            surr2 = torch.clamp(ratio, 1.0 - CLIP, 1.0 + CLIP) * mb_adv
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = (mb_returns - vals).pow(2).mean()
            entropy_loss = -dist.entropy().mean()
            loss = policy_loss + VALUE_COEF * value_loss + ENTROPY_COEF * entropy_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # --- Bookkeeping: raw reward and exponential moving average ---
    episode_rewards.append(ep_reward)
    if len(smoothed) == 0:
        smoothed.append(ep_reward)
    else:
        smoothed.append(smoothed[-1] * (1 - alpha) + ep_reward * alpha)

    if (ep % 25 == 0) or (ep == 1):
        # episode_rewards is never empty here: it was appended to just above.
        avg100 = np.mean(episode_rewards[-100:])
        print(f"Ep {ep}/{NUM_EPOCHS} Reward {ep_reward:.3f} Avg100 {avg100:.3f}")

# Save
# Persist the trained weights.
model_path = os.path.join(OUT_DIR, "ppo_pointenv_model.pth")
torch.save(model.state_dict(), model_path)

# Convergence plot: raw episode rewards plus their exponential moving average.
episodes = np.arange(1, len(episode_rewards)+1)
plot_path = os.path.join(OUT_DIR, "convergence_ppo.png")
plt.figure(figsize=(9,5))
plt.plot(episodes, episode_rewards, alpha=0.3, label="Reward por episodio")
plt.plot(episodes, smoothed, label="Media exponencial")
plt.xlabel("Episodio")
plt.ylabel("Reward")
plt.title("PPO Convergencia - PointEnv")
plt.legend()
plt.grid(True)
plt.savefig(plot_path, bbox_inches="tight")
plt.close()

# PDF report: title page, settings page, then the plot.
pdf_path = os.path.join(OUT_DIR, "PPO_PointEnv_Report.pdf")
page_size = (8.27,11.69)
with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=page_size)
    plt.axis("off")
    txt = ["PPO sobre PointEnv (discreto)", "", f"Fecha: {datetime.now().isoformat()}"]
    plt.text(0.02, 0.98, "\n".join(txt), va="top", wrap=True, fontsize=11)
    pdf.savefig()
    plt.close()

    plt.figure(figsize=page_size)
    plt.axis("off")
    code = ["Fragmentos: PPO clip, GAE, update_epochs", "", "Clip:", str(CLIP)]
    plt.text(0.02, 0.98, "\n".join(code), va="top", wrap=True, fontsize=10)
    pdf.savefig()
    plt.close()

    # Last page embeds the PNG that was just written to disk.
    img = plt.imread(plot_path)
    plt.figure(figsize=page_size)
    plt.imshow(img)
    plt.axis("off")
    pdf.savefig()
    plt.close()

# README listing the generated artefacts.
readme_path = os.path.join(OUT_DIR, "README.txt")
with open(readme_path, "w") as f:
    f.write("PPO PointEnv - Output\n")
    f.write(f"Model: {model_path}\nPlot: {plot_path}\nPDF: {pdf_path}\n")
print("PPO: terminado. Archivos en:", OUT_DIR)



---
## 3) SAC — *Soft Actor–Critic* (continuo)

**Política estocástica gaussiana con *squashing* Tanh** para acotar las acciones a \([-1, 1]\),  
**dos críticos (Q1, Q2)** con sus *target networks*, y ajuste automático del coeficiente de temperatura **\(\alpha\)** para regular la entropía.
Incluye *replay buffer* y *soft updates* (\(\tau\)) de las redes objetivo.

**Ejecución:** al correr la celda siguiente, entrena por 400 episodios y genera:
- `./output_sac_pointenv/sac_pointenv_models.pth` (actor + Qs)
- `./output_sac_pointenv/convergence_sac.png`
- `./output_sac_pointenv/SAC_PointEnv_Report.pdf`


In [None]:
#!/usr/bin/env python3
"""
sac_pointenv.py
Soft Actor-Critic on a continuous 1-D environment (PointEnvContinuous).
Saves: sac_pointenv_models.pth (actor + both critics), convergence_sac.png
(per-episode reward), SAC_PointEnv_Report.pdf, README.txt
Run: python sac_pointenv.py
Requirements: torch, numpy, matplotlib
"""
import os, random, math
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime
from collections import deque

OUT_DIR = os.environ.get("OUT_SAC", "./output_sac_pointenv")
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------------
# Entorno continuo
class PointEnvContinuous:
    """1-D point-reaching task with a continuous scalar action.

    The action is clipped to [-1, 1] and scaled by ``force``; the target is
    at 0.0 and Gaussian noise is added to every move.
    """

    def __init__(self, max_steps=50, force=0.6, noise_scale=0.02):
        self.max_steps = max_steps
        self.force = force
        self.noise_scale = noise_scale
        self.reset()

    def _observation(self):
        # The observation is just the current position as a float32 vector.
        return np.array([self.pos], dtype=np.float32)

    def reset(self):
        """Start a new episode at a uniformly drawn position and return the observation."""
        self.target = 0.0
        self.steps = 0
        self.pos = np.random.uniform(-1.5, 1.5)
        return self._observation()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Reward is minus the distance to the target, plus 1.0 when the point
        ends within 0.05 of it. ``done`` is set on reaching the target or
        when ``max_steps`` steps have elapsed.
        """
        clipped = float(np.clip(action, -1.0, 1.0))
        jitter = np.random.normal(scale=self.noise_scale)
        self.pos = self.pos + clipped * self.force + jitter
        self.steps += 1

        distance = abs(self.pos - self.target)
        reached = distance < 0.05
        reward = -distance + 1.0 if reached else -distance
        done = bool(reached or self.steps >= self.max_steps)
        return self._observation(), float(reward), done, {}

    @property
    def obs_shape(self):
        """Shape of a single observation."""
        return (1,)

    @property
    def action_dim(self):
        """Dimensionality of the action vector."""
        return 1

# ----------------------------
# Replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(s, a, r, s2, d)`` transitions.

    Transitions live in a plain list used as a ring buffer, so the uniform
    sampling in :meth:`sample` gets O(1) random access. A ``deque`` only
    offers fast access near its ends, and sampling indexes into the middle.
    """

    def __init__(self, maxlen=100000):
        """Create an empty buffer.

        Args:
            maxlen: Maximum number of stored transitions. Once full, each new
                transition overwrites the oldest one. ``None`` means
                unbounded.

        Raises:
            ValueError: If ``maxlen`` is negative.
        """
        if maxlen is not None and maxlen < 0:
            raise ValueError("maxlen must be non-negative")
        self.maxlen = maxlen
        self.buf = []
        # Slot that will be overwritten next once the buffer is full.
        self._next = 0

    def push(self, s, a, r, s2, d):
        """Store one transition, evicting the oldest when at capacity."""
        item = (s, a, r, s2, d)
        if self.maxlen is None or len(self.buf) < self.maxlen:
            self.buf.append(item)
        elif self.maxlen > 0:
            self.buf[self._next] = item
            self._next = (self._next + 1) % self.maxlen
        # With maxlen == 0 nothing is ever stored.

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(s, a, r, s2, d)`` of NumPy arrays: states, actions and
            next states are stacked row-wise with ``np.vstack``; rewards and
            done flags are 1-D float32 arrays.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buf, batch_size)
        s, a, r, s2, d = zip(*batch)
        return (np.vstack(s), np.vstack(a), np.array(r, dtype=np.float32),
                np.vstack(s2), np.array(d, dtype=np.float32))

    def __len__(self):
        return len(self.buf)

# ----------------------------
# Networks
# Bounds applied to the predicted log standard deviation.
LOG_STD_MIN = -20
LOG_STD_MAX = 2


class GaussianActor(nn.Module):
    """Tanh-squashed Gaussian policy with state-dependent mean and std."""

    def __init__(self, obs_dim, action_dim, hidden=64):
        super().__init__()
        # Attribute names and creation order are kept stable so saved
        # state_dicts and seeded initialisation stay compatible.
        self.fc = nn.Sequential(
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
        )
        self.mean = nn.Linear(hidden, action_dim)
        self.log_std = nn.Linear(hidden, action_dim)

    def forward(self, x):
        """Return ``(mean, std)`` of the pre-squash Gaussian."""
        features = self.fc(x)
        bounded_log_std = torch.clamp(self.log_std(features), LOG_STD_MIN, LOG_STD_MAX)
        return self.mean(features), bounded_log_std.exp()

    def sample(self, x):
        """Draw a reparameterised action.

        Returns:
            Tuple ``(action, logp, deterministic_action)``: the tanh-squashed
            sample, its log-probability summed over the action dimension
            (last dimension kept), and ``tanh(mean)``.
        """
        mu, sigma = self.forward(x)
        gaussian = torch.distributions.Normal(mu, sigma)
        pre_tanh = gaussian.rsample()
        squashed = torch.tanh(pre_tanh)
        # Change-of-variables correction for the tanh squashing; the 1e-6
        # keeps the log finite when |action| approaches 1.
        correction = torch.log(1 - squashed.pow(2) + 1e-6)
        log_prob = (gaussian.log_prob(pre_tanh) - correction).sum(-1, keepdim=True)
        return squashed, log_prob, torch.tanh(mu)

class QNetwork(nn.Module):
    """State-action value network: an MLP over the concatenated ``[s, a]``."""

    def __init__(self, obs_dim, action_dim, hidden=64):
        super().__init__()
        layers = [
            nn.Linear(obs_dim + action_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, s, a):
        """Return Q(s, a) with the trailing size-1 dimension removed."""
        state_action = torch.cat((s, a), dim=-1)
        q_value = self.net(state_action)
        return q_value.squeeze(-1)

# ----------------------------
# Hyperparams
# Seed every RNG the script draws from: `random` (replay-buffer sampling),
# NumPy (environment resets/noise) and PyTorch (weights, policy sampling).
# Leaving `random` unseeded would make mini-batches differ between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Training schedule
NUM_EPISODES = 400
MAX_STEPS = 50

# Optimisation
GAMMA = 0.99
LR = 3e-4
BATCH_SIZE = 64
REPLAY_INIT = 1000      # updates start once the buffer holds more than this many transitions
REPLAY_SIZE = 100000
TAU = 0.005             # soft-update rate for the target critics
AUTO_ALPHA = True       # learn the entropy temperature instead of fixing it
TARGET_ENTROPY = -1.0  # for 1-d action space
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Setup
env = PointEnvContinuous(max_steps=MAX_STEPS)
obs_dim = env.obs_shape[0]
action_dim = env.action_dim
buffer = ReplayBuffer(maxlen=REPLAY_SIZE)

# Actor, twin critics, and target critics initialised as copies of the critics.
actor = GaussianActor(obs_dim, action_dim).to(DEVICE)
q1 = QNetwork(obs_dim, action_dim).to(DEVICE)
q2 = QNetwork(obs_dim, action_dim).to(DEVICE)
q1_target = QNetwork(obs_dim, action_dim).to(DEVICE)
q2_target = QNetwork(obs_dim, action_dim).to(DEVICE)
q1_target.load_state_dict(q1.state_dict())
q2_target.load_state_dict(q2.state_dict())

actor_opt = optim.Adam(actor.parameters(), lr=LR)
q1_opt = optim.Adam(q1.parameters(), lr=LR)
q2_opt = optim.Adam(q2.parameters(), lr=LR)

# Entropy temperature: optimised in log-space when AUTO_ALPHA is set,
# otherwise a fixed constant.
if AUTO_ALPHA:
    log_alpha = torch.tensor(0.0, requires_grad=True, device=DEVICE)
    alpha_opt = optim.Adam([log_alpha], lr=LR)
else:
    alpha = 0.2

# Per-episode reward history and its exponential moving average.
episode_rewards = []
smoothed = []
alpha_smooth = 0.04

# Interaction and training
total_steps = 0
# Main SAC loop: act in the environment one step at a time, store every
# transition in the replay buffer and, once the buffer holds more than
# REPLAY_INIT transitions, run one gradient update per environment step.
for ep in range(1, NUM_EPISODES + 1):
    s = env.reset()
    s_t = torch.tensor(s, dtype=torch.float32, device=DEVICE)
    ep_reward = 0.0
    for step in range(MAX_STEPS):
        # sample action from current policy (for exploration)
        with torch.no_grad():
            action_t, _, _ = actor.sample(s_t.unsqueeze(0))
            action = action_t.cpu().numpy().reshape(-1)
        next_s, r, done, _ = env.step(float(action[0]))
        # NOTE(review): `done` is also True when the environment hits its step
        # limit, so time-limit truncations are stored as terminal and are not
        # bootstrapped in the target below - confirm this is intended.
        buffer.push(s.reshape(1,-1), action.reshape(1,-1), r, next_s.reshape(1,-1), done)
        s = next_s
        s_t = torch.tensor(s, dtype=torch.float32, device=DEVICE)
        ep_reward += r
        total_steps += 1

        # update if enough data
        if len(buffer) > REPLAY_INIT:
            # Sample a mini-batch and move it to the training device.
            s_b, a_b, r_b, s2_b, d_b = buffer.sample(BATCH_SIZE)
            s_b = torch.tensor(s_b, dtype=torch.float32, device=DEVICE)
            a_b = torch.tensor(a_b, dtype=torch.float32, device=DEVICE)
            r_b = torch.tensor(r_b, dtype=torch.float32, device=DEVICE)
            s2_b = torch.tensor(s2_b, dtype=torch.float32, device=DEVICE)
            d_b = torch.tensor(d_b, dtype=torch.float32, device=DEVICE)

            # target
            # Soft Bellman backup:
            #   r + gamma * (1 - d) * (min(Q1', Q2')(s', a') - alpha * log pi(a'|s'))
            # with a' sampled from the current policy and Q' the target critics.
            with torch.no_grad():
                a2, logp_a2, _ = actor.sample(s2_b)
                q1_t = q1_target(s2_b, a2)
                q2_t = q2_target(s2_b, a2)
                qmin = torch.min(q1_t, q2_t)
                if AUTO_ALPHA:
                    # Re-read the temperature as a tensor (no gradient inside no_grad).
                    alpha = log_alpha.exp()
                target = r_b + GAMMA * (1 - d_b) * (qmin - alpha * logp_a2.squeeze(-1))

            # Q losses
            # Both critics regress onto the same target, each with its own optimizer.
            q1_pred = q1(s_b, a_b)
            q2_pred = q2(s_b, a_b)
            q1_loss = nn.MSELoss()(q1_pred, target)
            q2_loss = nn.MSELoss()(q2_pred, target)
            q1_opt.zero_grad(); q1_loss.backward(); q1_opt.step()
            q2_opt.zero_grad(); q2_loss.backward(); q2_opt.step()

            # actor loss
            # Minimise alpha * log pi(a|s) - min(Q1, Q2)(s, a) for freshly sampled
            # actions. Only actor_opt steps here; the gradients this backward pass
            # leaves on q1/q2 are cleared by their zero_grad() on the next update.
            a_pi, logp_pi, _ = actor.sample(s_b)
            q1_pi = q1(s_b, a_pi); q2_pi = q2(s_b, a_pi)
            q_pi = torch.min(q1_pi, q2_pi)
            actor_loss = (alpha * logp_pi.squeeze(-1) - q_pi).mean()
            actor_opt.zero_grad(); actor_loss.backward(); actor_opt.step()

            # alpha loss
            # The log-prob term is detached, so only log_alpha receives a gradient.
            if AUTO_ALPHA:
                alpha_loss = -(log_alpha * (logp_pi + TARGET_ENTROPY).detach()).mean()
                alpha_opt.zero_grad(); alpha_loss.backward(); alpha_opt.step()
                # alpha is a Python float from here until the next update
                # re-reads it from log_alpha as a tensor.
                alpha = log_alpha.exp().item()

            # soft updates
            # Polyak averaging: target <- TAU * online + (1 - TAU) * target.
            for param, target_param in zip(q1.parameters(), q1_target.parameters()):
                target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)
            for param, target_param in zip(q2.parameters(), q2_target.parameters()):
                target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)

        if done:
            break

    # Track the raw episode reward and its exponential moving average.
    episode_rewards.append(ep_reward)
    if len(smoothed) == 0:
        smoothed.append(ep_reward)
    else:
        smoothed.append(smoothed[-1] * (1 - alpha_smooth) + ep_reward * alpha_smooth)

    # Progress line on the first episode and every 25th thereafter.
    if ep % 25 == 0 or ep == 1:
        print(f"Ep {ep}/{NUM_EPISODES} Reward {ep_reward:.3f} Avg100 {np.mean(episode_rewards[-100:]):.3f}")

# Save actor+critics
# Persist actor and both critics in a single checkpoint file.
model_path = os.path.join(OUT_DIR, "sac_pointenv_models.pth")
checkpoint = {
    'actor': actor.state_dict(),
    'q1': q1.state_dict(),
    'q2': q2.state_dict(),
}
torch.save(checkpoint, model_path)

# Convergence plot: raw episode rewards plus their exponential moving average.
plot_path = os.path.join(OUT_DIR, "convergence_sac.png")
episodes = np.arange(1, len(episode_rewards) + 1)
plt.figure(figsize=(9,5))
plt.plot(episodes, episode_rewards, alpha=0.25, label="Reward por episodio")
plt.plot(episodes, smoothed, label="Media exponencial")
plt.xlabel("Episodio")
plt.ylabel("Reward")
plt.title("SAC Convergencia - PointEnv Continuous")
plt.legend()
plt.grid(True)
plt.savefig(plot_path, bbox_inches="tight")
plt.close()

# PDF report: title page, settings page, then the plot.
pdf_path = os.path.join(OUT_DIR, "SAC_PointEnv_Report.pdf")
page_size = (8.27,11.69)
with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=page_size)
    plt.axis("off")
    txt = ["SAC aplicado a PointEnv (continuo)", "", f"Fecha: {datetime.now().isoformat()}"]
    plt.text(0.02, 0.98, "\n".join(txt), va="top", wrap=True, fontsize=11)
    pdf.savefig()
    plt.close()

    plt.figure(figsize=page_size)
    plt.axis("off")
    code = ["Fragmentos: actor gaussiano (tanh), 2 Qs, auto-alpha, replay buffer", "", f"REPLAY_INIT: {REPLAY_INIT}"]
    plt.text(0.02, 0.98, "\n".join(code), va="top", wrap=True, fontsize=10)
    pdf.savefig()
    plt.close()

    # Last page embeds the PNG that was just written to disk.
    img = plt.imread(plot_path)
    plt.figure(figsize=page_size)
    plt.imshow(img)
    plt.axis("off")
    pdf.savefig()
    plt.close()

# README listing the generated artefacts.
readme_path = os.path.join(OUT_DIR, "README.txt")
with open(readme_path, "w") as f:
    f.write("SAC PointEnv - Output\n")
    f.write(f"Model bundle: {model_path}\n")
    f.write(f"Plot: {plot_path}\nPDF: {pdf_path}\nEpisodes: {NUM_EPISODES}\n")
print("SAC: terminado. Archivos en:", OUT_DIR)



---
### Notas
- Los tres bloques reproducen **exactamente** tu código fuente original, sin cambios en la lógica.
- Podés ajustar hiperparámetros dentro de cada celda (número de episodios, LR, etc.).
- Si querés correrlos por separado, podés reiniciar el kernel entre corridas para limpiar memoria/GPU.
