# Анализ эксперимента pendulum-v1

## Постановка эксперимента

**Задача**: управление маятником (Gymnasium `Pendulum-v1`) с помощью нейросетевого контроллера, обученного методом TD3.

**Окружение**: `Pendulum-v1`.
- **Наблюдение**: `[cos(θ), sin(θ), θ̇]` — угол и угловая скорость маятника.
- **Действие**: крутящий момент (torque) `∈ [-2, 2]`.
- **Награда**: `−(θ² + 0.1·θ̇² + 0.001·torque²)`, где `θ` нормализован в `[−π, π]` — штраф за отклонение от вертикали, за угловую скорость и за величину управляющего момента (расход энергии).
- **Эпизод**: 200 шагов.

**Алгоритм**: TD3, MLP 256×256, `gamma = 0.98`, `tau = 0.005`, `policy_freq = 2`, `lr = 0.001`.

In [None]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from pathlib import Path
from nn_laser_stabilizer.config.config import load_config
from nn_laser_stabilizer.paths import get_experiment_dir

In [None]:
# Identify the run to analyse: experiment name plus launch date and time.
EXPERIMENT_NAME = "pendulum-v1"
EXPERIMENT_DATE = "2026-02-14"
EXPERIMENT_TIME = "13-38-10"

# Ask the project helper for the directory that belongs to this run.
EXPERIMENT_DIR_PATH = get_experiment_dir(
    experiment_name=EXPERIMENT_NAME,
    experiment_date=EXPERIMENT_DATE,
    experiment_time=EXPERIMENT_TIME,
)

# Load the configuration stored next to the run and echo the key settings,
# one per line.
config = load_config(EXPERIMENT_DIR_PATH / "config.yaml")
print(
    f"Эксперимент: {config.experiment_name}",
    f"Окружение: {config.env.name}",
    f"Алгоритм: {config.algorithm.type}",
    f"Exploration steps: {config.exploration.steps}",
    f"Train start step: {config.training.train_start_step}",
    f"Evaluation frequency: {config.evaluation.frequency}",
    f"Log frequency: {config.training.log_frequency}",
    sep="\n",
)

## Загрузка и парсинг данных

In [None]:
def load_jsonl(path: Path, source: str | None = None, event: str | None = None) -> pd.DataFrame:
    """Загрузка JSONL-файла с опциональной фильтрацией по source и event."""
    rows = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if source and record.get("source") != source:
                continue
            if event and record.get("event") != event:
                continue
            rows.append(record)
    return pd.DataFrame(rows)

In [None]:
# Location of the training log inside the run directory, taken from the config.
TRAIN_LOG_PATH = EXPERIMENT_DIR_PATH / config.training.log_dir / config.training.log_file

# Training-step records.
train_df = load_jsonl(TRAIN_LOG_PATH, source="train", event="step")
print(f"Шаги обучения: {len(train_df)} записей")
# With no matching records the DataFrame has no 'step' column, so the range is
# only reported when rows exist (same guard as for the evaluation records).
if len(train_df) > 0:
    print(f"Диапазон шагов: {train_df['step'].min()} — {train_df['step'].max()}")

# Evaluation records.
eval_df = load_jsonl(TRAIN_LOG_PATH, source="train", event="evaluation")
print(f"\nEvaluation: {len(eval_df)} записей")
if len(eval_df) > 0:
    print(f"Диапазон шагов: {eval_df['step'].min()} — {eval_df['step'].max()}")

# Last expression of the cell: preview the first training rows.
train_df.head()

## Evaluation: динамика награды

In [None]:
# Mean evaluation reward per evaluation record, with a min–max band around it.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(
    eval_df['step'], eval_df['reward_mean'], 'o-',
    markersize=3, linewidth=1.0, color='tab:blue', label='reward_mean',
)
ax.fill_between(
    eval_df['step'], eval_df['reward_min'], eval_df['reward_max'],
    alpha=0.2, color='tab:blue', label='reward min–max',
)
ax.set(
    title='Evaluation: средняя награда за шаг',
    xlabel='Шаг обучения',
    ylabel='Reward (mean per step)',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Total evaluation reward per evaluation record.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(
    eval_df['step'], eval_df['reward_sum'], 'o-',
    markersize=3, linewidth=1.0, color='tab:green', label='reward_sum',
)
ax.set(
    title='Evaluation: суммарная награда за эпизод',
    xlabel='Шаг обучения',
    ylabel='Reward (sum)',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Анализ процесса обучения

In [None]:
# Loss of the first critic over the logged training steps.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(
    train_df['step'], train_df['loss_q1'],
    alpha=0.7, linewidth=0.8, color='tab:blue', label='Q1 Loss',
)
ax.set(title='Critic Q1 Loss', xlabel='Шаг обучения', ylabel='Loss')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Loss of the second critic over the logged training steps.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(
    train_df['step'], train_df['loss_q2'],
    alpha=0.7, linewidth=0.8, color='tab:green', label='Q2 Loss',
)
ax.set(title='Critic Q2 Loss', xlabel='Шаг обучения', ylabel='Loss')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Keep only the rows where an actor loss was actually logged.
has_actor_loss = train_df['actor_loss'].notna()
actor_df = train_df.loc[has_actor_loss]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(
    actor_df['step'], actor_df['actor_loss'],
    alpha=0.7, linewidth=0.8, color='tab:red', label='Actor Loss',
)
ax.set(title='Actor Loss', xlabel='Шаг обучения', ylabel='Loss')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Logged replay-buffer size over the training steps.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(train_df['step'], train_df['buffer_size'], linewidth=1.0, color='tab:purple')
ax.set(
    title='Размер буфера воспроизведения',
    xlabel='Шаг обучения',
    ylabel='Размер буфера',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Анализ моделей

In [None]:
from nn_laser_stabilizer.rl.model.actor import MLPActor
from nn_laser_stabilizer.rl.model.critic import MLPCritic

# Saved networks are read from the "models" sub-directory of the run.
MODELS_DIR = EXPERIMENT_DIR_PATH / "models"

# Load each checkpoint and call .eval() on the loaded model.
actor, actor_target = (
    MLPActor.load(MODELS_DIR / filename).eval()
    for filename in ("actor.pth", "actor_target.pth")
)
critic1, critic2 = (
    MLPCritic.load(MODELS_DIR / filename).eval()
    for filename in ("critic1.pth", "critic2.pth")
)

# Total number of scalar parameters in the actor and in the first critic.
actor_param_count = sum(p.numel() for p in actor.parameters())
critic1_param_count = sum(p.numel() for p in critic1.parameters())

print("Actor:")
print(actor)
print(f"\nВсего параметров actor: {actor_param_count:,}")
print(f"Всего параметров critic1: {critic1_param_count:,}")

In [None]:
def plot_weight_histograms(model: torch.nn.Module, title: str):
    """Plot weight and bias histograms for every ``torch.nn.Linear`` in *model*.

    The figure has one row per linear layer: the flattened weights on the
    left and the bias on the right. A layer created without a bias gets an
    empty, labelled bias panel. When the model contains no linear layers a
    short message is printed and no figure is created.

    Args:
        model: Module whose sub-modules are scanned for linear layers.
        title: Figure-level title.
    """
    linear_layers = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    ]
    n = len(linear_layers)
    if n == 0:
        # plt.subplots() cannot build a grid with zero rows.
        print(f"{title}: линейные слои не найдены")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single layer,
    # so axes[i, j] indexing works without a manual reshape.
    fig, axes = plt.subplots(n, 2, figsize=(14, 3 * n), squeeze=False)

    for i, (name, layer) in enumerate(linear_layers):
        weight_ax, bias_ax = axes[i, 0], axes[i, 1]

        w = layer.weight.detach().cpu().numpy().flatten()
        weight_ax.hist(w, bins=80, alpha=0.7, color='tab:blue', edgecolor='black', linewidth=0.3)
        weight_ax.set_title(f'{name} weights [{layer.weight.shape[1]}→{layer.weight.shape[0]}]')
        weight_ax.set_ylabel('Частота')
        weight_ax.grid(True, alpha=0.3)

        bias_ax.grid(True, alpha=0.3)
        if layer.bias is None:
            # Linear(..., bias=False) has no bias tensor to plot.
            bias_ax.set_title(f'{name} bias [none]')
            continue
        b = layer.bias.detach().cpu().numpy().flatten()
        bias_ax.hist(b, bins=40, alpha=0.7, color='tab:orange', edgecolor='black', linewidth=0.3)
        bias_ax.set_title(f'{name} bias [{layer.bias.shape[0]}]')

    fig.suptitle(title, fontsize=14)
    fig.tight_layout()
    plt.show()

In [None]:
# Weight and bias histograms for every linear layer of the loaded actor.
plot_weight_histograms(actor, 'Распределение весов: Actor')

In [None]:
# Weight and bias histograms for every linear layer of the first critic.
plot_weight_histograms(critic1, 'Распределение весов: Critic 1')

### Визуализация политики

Observation Pendulum-v1: `[cos(θ), sin(θ), θ̇]`.

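Ниже политика визуализируется двумя способами: сначала график `torque(θ)` при `θ̇ = 0` — какой момент прикладывает агент в зависимости от угла маятника (при нулевой скорости), затем heatmap `action(θ, θ̇)` на сетке углов и угловых скоростей.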

In [None]:
# Policy as a function of the angle θ at zero angular velocity (θ̇ = 0).
theta_range = np.linspace(-np.pi, np.pi, 500)
obs_1d = np.stack(
    [np.cos(theta_range), np.sin(theta_range), np.zeros_like(theta_range)],
    axis=1,
)
obs_t = torch.tensor(obs_1d, dtype=torch.float32)

# The actor returns a pair; only its first element (the action) is used here.
with torch.no_grad():
    actions_1d, _ = actor(obs_t)
actions_1d_np = actions_1d.cpu().numpy().flatten()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(np.degrees(theta_range), actions_1d_np, linewidth=2, color='tab:red')
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5, alpha=0.5)
ax.axvline(x=0, color='green', linestyle='--', linewidth=1, alpha=0.7, label='θ=0 (вертикаль)')
ax.set(
    xlabel='θ (градусы)',
    ylabel='action (torque)',
    title='Политика актора: torque(θ) при θ̇=0',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of the action over a (θ, θ̇) grid.
grid_n = 200
theta_grid_range = np.linspace(-np.pi, np.pi, grid_n)
thetadot_range = np.linspace(-8, 8, grid_n)
# Rows of the mesh vary θ̇, columns vary θ.
theta_grid, thetadot_grid = np.meshgrid(theta_grid_range, thetadot_range)

# Flatten the mesh into a batch of observations [cos(θ), sin(θ), θ̇].
theta_flat = theta_grid.flatten()
thetadot_flat = thetadot_grid.flatten()
obs_grid = np.stack([np.cos(theta_flat), np.sin(theta_flat), thetadot_flat], axis=1)

obs_tensor = torch.tensor(obs_grid, dtype=torch.float32)

with torch.no_grad():
    actions_grid, _ = actor(obs_tensor)
actions_np = actions_grid.cpu().numpy().reshape(grid_n, grid_n)

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(actions_np, extent=[-180, 180, -8, 8], origin='lower', aspect='auto', cmap='RdBu_r')
fig.colorbar(im, ax=ax, label='action (torque)')
ax.set(
    xlabel='θ (градусы)',
    ylabel='θ̇ (рад/с)',
    title='Политика актора: torque(θ, θ̇)',
)
ax.grid(False)
fig.tight_layout()
plt.show()

### Сравнение actor и actor_target

In [None]:
# L2 distance between matching parameters of actor and actor_target.
# Parameters are paired by iteration order; the actor's names are used as keys.
l2_distances = {
    name: (p - p_target).norm().item()
    for (name, p), (_, p_target) in zip(
        actor.named_parameters(), actor_target.named_parameters()
    )
}

print("L2-расстояние между actor и actor_target по слоям:")
for name, dist in l2_distances.items():
    print(f"  {name}: {dist:.6f}")

# Overall distance: square root of the summed squared differences.
squared_diff_sum = sum(
    (p - p_t).pow(2).sum().item()
    for p, p_t in zip(actor.parameters(), actor_target.parameters())
)
total_l2 = squared_diff_sum ** 0.5
print(f"\nОбщее L2-расстояние: {total_l2:.6f}")

In [None]:
# Difference between actor and actor_target actions on the θ × θ̇ grid.
with torch.no_grad():
    actions_target_grid, _ = actor_target(obs_tensor)
actions_target_np = actions_target_grid.cpu().numpy().reshape(grid_n, grid_n)

diff_np = actions_np - actions_target_np

# Symmetric colour limits so that zero difference sits at the colormap centre.
vmax = np.abs(diff_np).max()

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(
    diff_np, extent=[-180, 180, -8, 8], origin='lower', aspect='auto',
    cmap='RdBu_r', vmin=-vmax, vmax=vmax,
)
fig.colorbar(im, ax=ax, label='action − action_target')
ax.set(
    xlabel='θ (градусы)',
    ylabel='θ̇ (рад/с)',
    title='Разница политик: actor − actor_target',
)
ax.grid(False)
fig.tight_layout()
plt.show()

## Анализ буфера воспроизведения

In [None]:
from nn_laser_stabilizer.rl.data.replay_buffer import ReplayBuffer

# Load the replay buffer saved with the run.
buffer = ReplayBuffer.load(EXPERIMENT_DIR_PATH / "data" / "replay_buffer.pth")
n = buffer.size
print(f"Размер буфера: {n}")
print(f"Ёмкость: {buffer.capacity}")
print(f"obs_dim: {buffer.observations.shape[1]}, action_dim: {buffer.actions.shape[1]}")

# Take the first n entries of each storage tensor as NumPy arrays.
buf_obs = buffer.observations[:n].numpy()            # columns: cos(θ), sin(θ), θ̇
buf_actions = buffer.actions[:n].numpy()             # torque
buf_rewards = buffer.rewards[:n].numpy()
buf_next_obs = buffer.next_observations[:n].numpy()
buf_dones = buffer.dones[:n].numpy()

print("\nСтатистика наблюдений:")
for i, name in enumerate(['cos(θ)', 'sin(θ)', 'θ̇']):
    column = buf_obs[:, i]
    print(f"  {name}: mean={column.mean():.4f}, std={column.std():.4f}, "
          f"min={column.min():.4f}, max={column.max():.4f}")

print("\nСтатистика действий:")
print(f"  torque: mean={buf_actions.mean():.4f}, std={buf_actions.std():.4f}, "
      f"min={buf_actions.min():.4f}, max={buf_actions.max():.4f}")

print("\nСтатистика наград:")
print(f"  reward: mean={buf_rewards.mean():.4f}, std={buf_rewards.std():.4f}, "
      f"min={buf_rewards.min():.4f}, max={buf_rewards.max():.4f}")

# Count and share of transitions flagged as done.
print(f"\nDones: {buf_dones.sum():.0f} / {n} ({buf_dones.mean() * 100:.2f}%)")

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One entry per panel: (axes, data, title, colour, x-label, y-label).
# A label of None means the panel gets no label on that axis.
panels = [
    (axes[0, 0], buf_obs[:, 0], 'cos(θ)', 'tab:blue', None, 'Частота'),
    (axes[0, 1], buf_obs[:, 1], 'sin(θ)', 'tab:cyan', None, None),
    (axes[1, 0], buf_actions.flatten(), 'action (torque)', 'tab:purple', 'Значение', 'Частота'),
    (axes[1, 1], buf_rewards.flatten(), 'reward', 'tab:green', 'Значение', None),
]

for ax, data, panel_title, color, xlabel, ylabel in panels:
    ax.hist(data, bins=100, alpha=0.7, color=color, edgecolor='black', linewidth=0.3)
    ax.set_title(panel_title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

fig.suptitle('Распределения данных в буфере воспроизведения', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Recover θ from cos(θ) and sin(θ).
theta_buf = np.arctan2(buf_obs[:, 1], buf_obs[:, 0])

plt.figure(figsize=(10, 8))
# Very low alpha so that dense regions of the buffer remain readable.
sc = plt.scatter(np.degrees(theta_buf), buf_obs[:, 2], c=buf_actions.flatten(),
                 cmap='RdBu_r', alpha=0.05, s=1)
cbar = plt.colorbar(sc, label='action (torque)')
# The colorbar inherits the scatter's alpha (0.05) and would be almost
# invisible; draw its colour patch fully opaque instead.
cbar.solids.set_alpha(1.0)
plt.xlabel('θ (градусы)')
plt.ylabel('θ̇ (рад/с)')
plt.title('Буфер: фазовый портрет, цвет = action')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# 2-D histogram of the states stored in the buffer over (θ, θ̇).
fig, ax = plt.subplots(figsize=(10, 8))
# hist2d returns (counts, x-edges, y-edges, image); only the image is needed
# as the colorbar's mappable.
*_, density_img = ax.hist2d(np.degrees(theta_buf), buf_obs[:, 2], bins=100, cmap='hot_r')
fig.colorbar(density_img, ax=ax, label='Количество переходов')
ax.set(
    xlabel='θ (градусы)',
    ylabel='θ̇ (рад/с)',
    title='Покрытие пространства состояний в буфере',
)
fig.tight_layout()
plt.show()

## Оценка Q-функции

In [None]:
# Buffer observations and actions as float32 tensors.
obs_t = torch.tensor(buf_obs, dtype=torch.float32)
act_t = torch.tensor(buf_actions, dtype=torch.float32)

with torch.no_grad():
    # Both critics evaluated on the actions stored in the buffer.
    q1_buffer, _ = critic1(obs_t, act_t)
    q2_buffer, _ = critic2(obs_t, act_t)

    # Both critics evaluated on the actions the current actor would take.
    actor_actions, _ = actor(obs_t)
    q1_policy, _ = critic1(obs_t, actor_actions)
    q2_policy, _ = critic2(obs_t, actor_actions)

# Element-wise minimum of the two critics, flattened to 1-D arrays.
q_min_buffer = torch.min(q1_buffer, q2_buffer).numpy().flatten()
q_min_policy = torch.min(q1_policy, q2_policy).numpy().flatten()

print(f"Q(s, a_buffer): mean={q_min_buffer.mean():.4f}, std={q_min_buffer.std():.4f}, "
      f"min={q_min_buffer.min():.4f}, max={q_min_buffer.max():.4f}")
print(f"Q(s, π(s)):     mean={q_min_policy.mean():.4f}, std={q_min_policy.std():.4f}, "
      f"min={q_min_policy.min():.4f}, max={q_min_policy.max():.4f}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (axes, Q-values, title, colour, y-label or None).
panels = [
    (axes[0], q_min_buffer, 'Q(s, a) — действия из буфера', 'tab:blue', 'Частота'),
    (axes[1], q_min_policy, 'Q(s, π(s)) — действия актора', 'tab:red', None),
]

for ax, q_values, panel_title, color, ylabel in panels:
    ax.hist(q_values, bins=100, alpha=0.7, color=color, edgecolor='black', linewidth=0.3)
    ax.set_title(panel_title)
    ax.set_xlabel('Q-value')
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

fig.suptitle('Распределение Q-значений (min(Q1, Q2))', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of Q(s, π(s)) on the θ × θ̇ grid.
with torch.no_grad():
    grid_actions, _ = actor(obs_tensor)
    q1_grid, _ = critic1(obs_tensor, grid_actions)
    q2_grid, _ = critic2(obs_tensor, grid_actions)
# Element-wise minimum of the two critics, reshaped back onto the grid.
q_grid = torch.min(q1_grid, q2_grid).numpy().reshape(grid_n, grid_n)

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(q_grid, extent=[-180, 180, -8, 8], origin='lower', aspect='auto', cmap='viridis')
fig.colorbar(im, ax=ax, label='Q(s, π(s))')
ax.set(
    xlabel='θ (градусы)',
    ylabel='θ̇ (рад/с)',
    title='Q-значение политики: Q(s, π(s))',
)
ax.grid(False)
fig.tight_layout()
plt.show()