# Эксперимент по обучению с использованием сохраненных данных из буфера воспроизведения

In [None]:
from pathlib import Path
import torch
from omegaconf import OmegaConf
from torchrl.data import LazyTensorStorage, TensorDictReplayBuffer

def make_buffer(config):
    """Create an empty replay buffer sized from the experiment config.

    Args:
        config: Configuration object exposing ``data.buffer_size``.

    Returns:
        A ``TensorDictReplayBuffer`` constructed with ``batch_size=1`` and a
        ``LazyTensorStorage`` whose ``max_size`` is ``config.data.buffer_size``.
    """
    storage = LazyTensorStorage(max_size=config.data.buffer_size)
    return TensorDictReplayBuffer(storage=storage, batch_size=1)

# Identify the training run whose saved artifacts are inspected in this notebook.
EXPERIMENT_NAME = "td3_train_real_async"
EXPERIMENT_DATE = "2025-09-30"
EXPERIMENT_TIME = "13-08-38"

# Run directory layout: ../experiments/<name>/<date>_<time>
PATH_TO_EXP_DIR = Path(f"../experiments/{EXPERIMENT_NAME}/{EXPERIMENT_DATE}_{EXPERIMENT_TIME}")

ENV_LOG_DIR = PATH_TO_EXP_DIR.joinpath("env_logs")
TRAIN_LOG_DIR = PATH_TO_EXP_DIR.joinpath("train_logs")

# The configuration is read from the run's .hydra/config.yaml.
config_path = PATH_TO_EXP_DIR.joinpath(".hydra", "config.yaml")
config = OmegaConf.load(config_path)

# Build an empty buffer from the config, then fill it from the saved file.
buffer_path = PATH_TO_EXP_DIR / "saved_data/replay_buffer.pkl"
scan_buffer = make_buffer(config)
scan_buffer.load(buffer_path)

print(f"Replay buffer загружен. Размер: {len(scan_buffer)} сэмплов")

In [None]:
# Draw one mini-batch from the loaded buffer as a sanity check.
batch_size = 64
batch = scan_buffer.sample(batch_size=batch_size)
print(f"Сэмплирован batch из {len(batch)} сэмплов")

In [None]:
# Inspect the first element of the batch sampled above.
sample = batch[0]

print("Поля сэмпла:", sample.keys())

# Nested entries under "next" are addressed with tuple keys.
print("\nObservation:", sample["observation"])
print("Next Observation:", sample["next", "observation"])
print("Action:", sample["action"])
print("Reward:", sample["next", "reward"])
print("Done:", sample["done"])

In [None]:
# Build a cleaned training set from the loaded buffer: skip transitions whose
# action leaves the allowed band, skip duplicates, and rescale observations.

# Transitions with any action component beyond +/- this value are skipped.
ACTION_ABS_LIMIT = 0.9
# Divisors applied to observation components 0 and 1 (row 0 of the batch).
OBSERVATION_SCALES = (125.0, 10.0)
# The loop stops as soon as the filtered buffer holds more than this many samples.
FILTERED_SIZE_LIMIT = 2500
# Upper bound on draws, expressed per stored sample (see the loop below).
MAX_DRAWS_PER_STORED_SAMPLE = 50


def _rescaled(observation):
    """Return a copy of ``observation`` with row-0 components divided by OBSERVATION_SCALES.

    Working on a clone keeps the tensor held by the sampled TensorDict (and
    anything it may share memory with) untouched.
    """
    scaled = observation.clone()
    for component, scale in enumerate(OBSERVATION_SCALES):
        scaled[0][component] = scaled[0][component] / scale
    return scaled


filtered_buffer = make_buffer(config)
# NOTE(review): relies on a private attribute so that iterating the buffer
# yields one sample at a time.
scan_buffer._batch_size = 1
seen = set()

# Iterating the buffer draws samples from it; presumably (default torchrl
# sampler) this never runs out, so without a cap the loop would spin forever
# when fewer than FILTERED_SIZE_LIMIT + 1 unique samples pass the filter.
max_draws = MAX_DRAWS_PER_STORED_SAMPLE * max(len(scan_buffer), 1)

for draw_index, sample in enumerate(scan_buffer):
    if draw_index >= max_draws:
        print(
            f"Stopped after {max_draws} draws: fewer than "
            f"{FILTERED_SIZE_LIMIT + 1} unique samples passed the filter"
        )
        break

    action = sample["action"]

    if (action[0].abs() > ACTION_ABS_LIMIT).any():
        continue

    # De-duplicate on the raw (not yet rescaled) observation/action pair.
    obs_tuple = tuple(sample["observation"].flatten().tolist())
    action_tuple = tuple(action.flatten().tolist())
    key = (obs_tuple, action_tuple)

    if key in seen:
        continue
    seen.add(key)

    sample["observation"] = _rescaled(sample["observation"])
    sample["next"]["observation"] = _rescaled(sample["next"]["observation"])

    filtered_buffer.add(sample)

    if len(filtered_buffer) > FILTERED_SIZE_LIMIT:
        break

print(f"Filtered buffer создан. Размер: {len(filtered_buffer)} сэмплов")

In [None]:
from collections import deque
from nn_laser_stabilizer.envs.utils import make_specs
from nn_laser_stabilizer.logging.utils import set_seeds
from nn_laser_stabilizer.agents.td3 import (
    make_td3_agent,
    make_loss_module,
    make_optimizers,
    make_target_updater,
    train_step,
    warmup_from_specs
)
from nn_laser_stabilizer.envs.normalization import denormalize_kp, denormalize_ki, denormalize_kd

# Offline TD3 training on the filtered replay buffer.
set_seeds(config.seed)

specs = make_specs(config.env.bounds)
action_spec = specs["action"]
observation_spec = specs["observation"]

# Override the agent hyper-parameters stored in the loaded config.
config.agent.learning_rate_actor = 1e-4
config.agent.learning_rate_critic = 1e-4
config.agent.noise_level = 0.2
config.agent.noise_clip = 0.5

actor, qvalue = make_td3_agent(config, observation_spec, action_spec)
warmup_from_specs(observation_spec, action_spec, actor, qvalue)

loss_module = make_loss_module(config, actor, qvalue, action_spec)
optimizer_actor, optimizer_critic = make_optimizers(config, loss_module)
target_net_updater = make_target_updater(config, loss_module)

train_config = config.train

# Counts completed outer rounds; each round performs `update_to_data` updates.
total_train_steps = 0
recent_qvalue_losses = deque(maxlen=train_config.update_to_data)
# The actor is updated only on every `update_actor_freq`-th inner step. Keep at
# least one slot: with maxlen=0 the deque would stay empty and the average
# below would divide by zero.
recent_actor_losses = deque(
    maxlen=max(1, train_config.update_to_data // train_config.update_actor_freq)
)

print("Training process initiated")

try:
    for _ in range(1000):
        try:
            for i in range(train_config.update_to_data):
                batch = filtered_buffer.sample(train_config.batch_size)
                update_actor = i % train_config.update_actor_freq == 0
                loss_qvalue_val, loss_actor_val = train_step(
                    batch, loss_module, optimizer_actor, optimizer_critic,
                    target_net_updater, update_actor
                )

                recent_qvalue_losses.append(loss_qvalue_val)
                if loss_actor_val is not None:
                    recent_actor_losses.append(loss_actor_val)

            avg_qvalue_loss = sum(recent_qvalue_losses) / len(recent_qvalue_losses)
            avg_actor_loss = sum(recent_actor_losses) / len(recent_actor_losses)
            print(f"step={total_train_steps} Loss/Critic={avg_qvalue_loss} Loss/Actor={avg_actor_loss}")

            total_train_steps += 1

            # Probe the actor on the first element of the last batch, then put
            # it back into its previous mode so the next round does not train
            # a module that was left in eval mode.
            was_training = actor.training
            actor.eval()
            try:
                with torch.no_grad():
                    action = actor(batch[0])
            finally:
                actor.train(was_training)

            action_list = action["action"].tolist()[0]
            print(f"kp = {denormalize_kp(action_list[0])} ki = {denormalize_ki(action_list[1])} kd = {denormalize_kd(action_list[2])}")

        except Exception as ex:
            # Best effort: report the failure and carry on with the next round.
            print(f"Error while training: {ex}")

except KeyboardInterrupt:
    # Handled outside the loop so that an interrupt actually stops training
    # instead of merely skipping the current round.
    print("Training interrupted by user.")

finally:
    print("Training finished")
    print(f"Final buffer size: {len(filtered_buffer)} samples")

In [None]:
# Spot-check the actor and the Q-network on a few samples from the filtered buffer.
# Switching to eval mode is idempotent, so it is done once up front.
actor.eval()
qvalue.eval()

for _ in range(5):
    sample = filtered_buffer.sample(1)[0]
    # Read the stored action before the actor is called on the sample.
    sample_action = sample["action"]

    with torch.no_grad():
        action = actor(sample)

    print("Observation:", sample["observation"])
    print("Action (from buffer):", sample_action)
    print("Action predicted by actor:", action["action"])

    action_list = action["action"].tolist()[0]
    print(f"kp = {denormalize_kp(action_list[0])} ki = {denormalize_ki(action_list[1])} kd = {denormalize_kd(action_list[2])}")

    with torch.no_grad():
        q_pred = qvalue(sample)

    print("Predicted Q-value:", q_pred["state_action_value"])
    print()

In [None]:
# Collect every stored transition into per-quantity lists, then stack them.
error_mean_list, error_std_list = [], []
kp_list, ki_list, kd_list = [], [], []
reward_list, done_list = [], []

for i in range(len(scan_buffer)):
    sample = scan_buffer[i]
    obs = sample["observation"]
    act = sample["action"]

    # Observation components 0 and 1.
    error_mean_list.append(obs[0].cpu())
    error_std_list.append(obs[1].cpu())

    # Action components 0, 1 and 2.
    kp_list.append(act[0].cpu())
    ki_list.append(act[1].cpu())
    kd_list.append(act[2].cpu())

    reward_list.append(sample["next", "reward"].cpu())
    done_list.append(sample["done"].cpu())

# One tensor per quantity, with the buffer index as the leading dimension.
error_mean = torch.stack(error_mean_list)
error_std = torch.stack(error_std_list)
kp = torch.stack(kp_list)
ki = torch.stack(ki_list)
kd = torch.stack(kd_list)
reward = torch.stack(reward_list)
done = torch.stack(done_list)

def stats(tensor, name):
    """Print the min, max, mean and standard deviation of ``tensor``.

    Args:
        tensor: Tensor to summarise; each reduction is taken over all elements
            and converted to a Python scalar with ``.item()``.
        name: Title printed in the ``=== ... ===`` header line.

    Returns:
        None. The summary is written to stdout, followed by an empty line.
    """
    print(f"=== {name} ===")
    # Each reduction is evaluated right before it is printed.
    reductions = (
        ("Min", tensor.min),
        ("Max", tensor.max),
        ("Mean", tensor.mean),
        ("Std", tensor.std),
    )
    for label, reduce_fn in reductions:
        print(f"{label}:", reduce_fn().item())
    print()

# Print summary statistics for every quantity collected from the buffer.
for tensor, title in (
    (error_mean, "Error Mean (obs[0])"),
    (error_std, "Error Std (obs[1])"),
    (kp, "KP"),
    (ki, "KI"),
    (kd, "KD"),
    (reward, "Reward"),
    (done.float(), "Done flags"),  # cast to float before summarising
):
    stats(tensor, title)

## А теперь обучение на ступенчатых данных

In [None]:
from pathlib import Path
import torch
from torchrl.data import LazyTensorStorage, TensorDictReplayBuffer
from tensordict import TensorDict
from nn_laser_stabilizer.envs.normalization import (
    normalize_kp, normalize_ki, normalize_kd,
    denormalize_kp, denormalize_ki, denormalize_kd
)
from nn_laser_stabilizer.envs.reward import ExponentialErrorReward

# Grid-scan run whose PID controller log is parsed further down.
EXPERIMENT_NAME = "pid_grid_scan"
EXPERIMENT_DATE = "2025-09-26"
EXPERIMENT_TIME = "16-48-36"

# NOTE(review): date and time are nested directories here (<date>/<time>),
# unlike the <date>_<time> layout used for the TD3 run in this file — confirm.
PATH_TO_EXP_DIR = Path(f"../experiments/{EXPERIMENT_NAME}/{EXPERIMENT_DATE}/{EXPERIMENT_TIME}")
LOG_DIR = PATH_TO_EXP_DIR.joinpath("logs")

import pandas as pd
import re
from collections import defaultdict

def parse_pid_logfile(path: str) -> pd.DataFrame:
    """Parse a PID controller log file into a DataFrame with one row per step.

    Every non-empty line must consist of whitespace-separated tokens:
    ``<name>=<int step> <name>=<float timestamp> <direction> [payload]``,
    where ``direction`` is e.g. ``send`` or ``recv`` and the optional payload
    holds ``key=value`` pairs with numeric values. All lines sharing a step
    number are merged into a single row: the timestamp is stored as
    ``time_<direction>`` and each payload pair becomes a float column.

    Args:
        path: Location of the log file (anything ``open`` accepts).

    Returns:
        A DataFrame with a ``step`` column plus the merged columns, sorted by
        ``step`` with a fresh 0..n-1 index. Values missing for a step are NaN.

    Raises:
        IndexError, ValueError: If a non-empty line does not start with the
            three header tokens described above.
    """
    # Signed decimal number with optional fraction and exponent. A character
    # class such as "[0-9.+-eE]" must not be used here: "+-e" is a *range*
    # that also matches ',', '=', ':', letters, etc., so values would swallow
    # delimiters and fail in float().
    pattern = re.compile(r"(\w+)=([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)")
    steps = defaultdict(dict)

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # At most three header tokens; the remainder is the payload.
            parts = line.split(maxsplit=3)
            step = int(parts[0].split("=")[1])
            timestamp = float(parts[1].split("=")[1])
            direction = parts[2]  # send / recv

            steps[step][f"time_{direction}"] = timestamp

            if len(parts) > 3:
                for key, value in pattern.findall(parts[3]):
                    steps[step][key] = float(value)

    df = pd.DataFrame.from_dict(steps, orient="index").reset_index()
    df = df.rename(columns={"index": "step"})
    return df.sort_values("step").reset_index(drop=True)

# Turn the grid-scan log into replay-buffer transitions: one per block of steps.
log_df = parse_pid_logfile(LOG_DIR / "log.txt")

SETPOINT = 1200

# Drop everything before step 1000 and express the signal as a deviation
# from the setpoint.
log_df = log_df[log_df["step"] >= 1000].copy()
log_df["error"] = log_df["process_variable"] - SETPOINT

BLOCK_SIZE = 2000  # number of consecutive step ids forming one block
LAST_STEPS = 200   # only the last rows of a block are summarised

# Divisors applied to the window's error mean / std to form the observation.
ERROR_MEAN_SCALE = 250.0
ERROR_STD_SCALE = 200.0


def _tail_window(df, start_step):
    """Return the last LAST_STEPS rows of the block starting at ``start_step``.

    Returns None when the block holds fewer than LAST_STEPS rows.
    """
    in_block = (df["step"] >= start_step) & (df["step"] < start_step + BLOCK_SIZE)
    block_df = df[in_block]
    if len(block_df) < LAST_STEPS:
        return None
    return block_df.tail(LAST_STEPS)


def _window_observation(window_df):
    """Return the float32 tensor [error mean / scale, error std / scale] of a window."""
    errors = window_df["error"]
    return torch.tensor(
        [errors.mean() / ERROR_MEAN_SCALE, errors.std() / ERROR_STD_SCALE],
        dtype=torch.float32,
    )


scan_buffer = TensorDictReplayBuffer(
    batch_size=1,
    storage=LazyTensorStorage(max_size=10000),
)

reward_func = ExponentialErrorReward(k=20)

print(f"Всего записей в логе: {len(log_df)}")
print(f"Шаги от {log_df['step'].min()} до {log_df['step'].max()}")

total_samples = 0
block_count = 0

# With an empty log min()/max() are NaN and cannot be passed to range().
if log_df.empty:
    block_starts = range(0)
else:
    block_starts = range(int(log_df["step"].min()), int(log_df["step"].max()), BLOCK_SIZE)

for start_step in block_starts:
    end_step = start_step + BLOCK_SIZE

    last_steps_df = _tail_window(log_df, start_step)
    if last_steps_df is None:
        continue

    observation = _window_observation(last_steps_df)

    # The gains are taken from the first row of the window (not averaged).
    avg_kp = last_steps_df["kp"].iloc[0]
    avg_ki = last_steps_df["ki"].iloc[0]
    avg_kd = last_steps_df["kd"].iloc[0]

    action = torch.tensor([
        normalize_kp(avg_kp),
        normalize_ki(avg_ki),
        normalize_kd(avg_kd)
    ], dtype=torch.float32)

    # Reward of the transition: mean per-step reward over the window.
    rewards = [reward_func(pv, SETPOINT) for pv in last_steps_df["process_variable"]]
    avg_reward = sum(rewards) / len(rewards)
    reward_tensor = torch.tensor([avg_reward], dtype=torch.float32)

    # The next observation comes from the following block when it has enough
    # rows; otherwise the current observation is reused.
    next_last_steps_df = _tail_window(log_df, end_step)
    if next_last_steps_df is not None:
        next_observation = _window_observation(next_last_steps_df)
    else:
        next_observation = observation.clone()

    # NOTE(review): every transition is marked terminal, whether or not a next
    # block exists — confirm this is intended.
    done = torch.tensor([True], dtype=torch.bool)

    sample = TensorDict({
        "observation": observation.unsqueeze(0),
        "action": action.unsqueeze(0),
        "next": TensorDict({
            "observation": next_observation.unsqueeze(0),
            "reward": reward_tensor.unsqueeze(0),
            "done": done.unsqueeze(0)
        }, batch_size=1)
    }, batch_size=1)

    scan_buffer.add(sample)
    total_samples += 1
    block_count += 1

    if block_count % 10 == 0:
        print(f"Обработано блоков: {block_count}, образцов в буфере: {len(scan_buffer)}")

print(f"\nИтоговая статистика:")
print(f"Обработано блоков: {block_count}")
print(f"Образцов в буфере: {len(scan_buffer)}")

# Show one stored sample, including the denormalised controller gains.
if len(scan_buffer) > 0:
    sample = scan_buffer.sample(1)[0]
    print(f"\nПример образца:")
    print(f"Observation: {sample['observation']}")
    print(f"Action: {sample['action']}")
    print(f"Reward: {sample['next']['reward']}")
    print(f"Done: {sample['next']['done']}")

    action_list = sample['action'].squeeze().tolist()
    print(f"Денормализованные действия:")
    print(f"KP: {denormalize_kp(action_list[0]):.4f}")
    print(f"KI: {denormalize_ki(action_list[1]):.4f}")
    print(f"KD: {denormalize_kd(action_list[2]):.4f}")

In [None]:
from collections import deque
from nn_laser_stabilizer.envs.utils import make_specs
from nn_laser_stabilizer.logging.utils import set_seeds
from nn_laser_stabilizer.agents.td3 import (
    make_td3_agent,
    make_loss_module,
    make_optimizers,
    make_target_updater,
    train_step,
    warmup_from_specs
)
from nn_laser_stabilizer.envs.normalization import denormalize_kp, denormalize_ki, denormalize_kd
from omegaconf import OmegaConf

# Offline TD3 training on the buffer built from the grid-scan log, using the
# configuration saved with the TD3 run below.
EXPERIMENT_NAME = "td3_train_real_async"
EXPERIMENT_DATE = "2025-09-30"
EXPERIMENT_TIME = "13-08-38"

PATH_TO_EXP_DIR = Path(f"../experiments/{EXPERIMENT_NAME}/{EXPERIMENT_DATE}_{EXPERIMENT_TIME}")

ENV_LOG_DIR = PATH_TO_EXP_DIR / "env_logs"
TRAIN_LOG_DIR = PATH_TO_EXP_DIR / "train_logs"

config_path = PATH_TO_EXP_DIR / ".hydra" / "config.yaml"

config = OmegaConf.load(config_path)

set_seeds(config.seed)

specs = make_specs(config.env.bounds)
action_spec = specs["action"]
observation_spec = specs["observation"]

# Override the agent hyper-parameters stored in the loaded config.
config.agent.learning_rate_actor = 1e-4
config.agent.learning_rate_critic = 1e-4
config.agent.noise_level = 0.2
config.agent.noise_clip = 0.5

actor, qvalue = make_td3_agent(config, observation_spec, action_spec)
warmup_from_specs(observation_spec, action_spec, actor, qvalue)

loss_module = make_loss_module(config, actor, qvalue, action_spec)
optimizer_actor, optimizer_critic = make_optimizers(config, loss_module)
target_net_updater = make_target_updater(config, loss_module)

train_config = config.train

# Counts completed outer rounds; each round performs `update_to_data` updates.
total_train_steps = 0
recent_qvalue_losses = deque(maxlen=train_config.update_to_data)
# The actor is updated only on every `update_actor_freq`-th inner step. Keep at
# least one slot: with maxlen=0 the deque would stay empty and the average
# below would divide by zero.
recent_actor_losses = deque(
    maxlen=max(1, train_config.update_to_data // train_config.update_actor_freq)
)

print("Training process initiated")

try:
    for _ in range(1000):
        try:
            for i in range(train_config.update_to_data):
                batch = scan_buffer.sample(train_config.batch_size)
                update_actor = i % train_config.update_actor_freq == 0
                loss_qvalue_val, loss_actor_val = train_step(
                    batch, loss_module, optimizer_actor, optimizer_critic,
                    target_net_updater, update_actor
                )

                recent_qvalue_losses.append(loss_qvalue_val)
                if loss_actor_val is not None:
                    recent_actor_losses.append(loss_actor_val)

            avg_qvalue_loss = sum(recent_qvalue_losses) / len(recent_qvalue_losses)
            avg_actor_loss = sum(recent_actor_losses) / len(recent_actor_losses)
            print(f"step={total_train_steps} Loss/Critic={avg_qvalue_loss} Loss/Actor={avg_actor_loss}")

            total_train_steps += 1

            # Probe the actor on the first element of the last batch, then put
            # it back into its previous mode so the next round does not train
            # a module that was left in eval mode.
            was_training = actor.training
            actor.eval()
            try:
                with torch.no_grad():
                    action = actor(batch[0])
            finally:
                actor.train(was_training)

            action_list = action["action"].tolist()[0]
            print(f"kp = {denormalize_kp(action_list[0])} ki = {denormalize_ki(action_list[1])} kd = {denormalize_kd(action_list[2])}")

        except Exception as ex:
            # Best effort: report the failure and carry on with the next round.
            print(f"Error while training: {ex}")

except KeyboardInterrupt:
    # Handled outside the loop so that an interrupt actually stops training
    # instead of merely skipping the current round.
    print("Training interrupted by user.")

finally:
    print("Training finished")
    print(f"Final buffer size: {len(scan_buffer)} samples")