In [18]:
from app_copy import init_device
from app_copy import initialize_ppo
import numpy as np
import torch

def _load_npz_array(path, key='arr_0'):
    """Read a single array from an ``.npz`` archive and close the archive afterwards."""
    # np.load on an .npz returns an NpzFile that holds the file open until closed,
    # so use it as a context manager instead of leaking the handle.
    with np.load(path) as archive:
        return archive[key]


def behavior_cloning_pretrain(model, epochs=5, batch_size=64, lr=3e-4, save_path="ppo_bc_model.zip"):
    """Pretrain ``model.policy`` by behavior cloning on recorded demonstrations.

    Demonstrations are read from ``./images.npz``, ``./sensors.npz`` and
    ``./actions.npz`` (array ``arr_0`` of each). Every image / sensor sample is
    expected to carry a leading singleton axis, which is squeezed away before
    batching. The policy is trained to maximise the log-probability of the
    recorded actions (negative log-likelihood loss), with gradient-norm
    clipping at 0.5.

    Args:
        model: Object exposing ``policy`` (with ``parameters()`` and
            ``get_distribution(obs_dict)``), ``device`` and ``save(path)``.
        epochs: Number of passes over the dataset.
        batch_size: Samples per optimisation step; must be positive.
        lr: Adam learning rate.
        save_path: Destination passed to ``model.save``.

    Returns:
        The same ``model`` instance, after training and saving.

    Raises:
        ValueError: If ``batch_size`` is not positive or the dataset is empty.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Load the dataset
    image_np = _load_npz_array('./images.npz')
    sensor_np = _load_npz_array('./sensors.npz')
    action_np = _load_npz_array('./actions.npz')

    # The image array defines the number of samples, as in the original loop.
    num_samples = len(image_np)
    if num_samples == 0:
        raise ValueError("Behavior cloning dataset is empty: './images.npz' contains no samples")

    # Set up the optimizer
    optimizer = torch.optim.Adam(model.policy.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        # Shuffle by drawing an index permutation; the arrays themselves stay untouched.
        order = np.random.permutation(num_samples)
        total_loss = 0.0
        num_batches = 0
        for start in range(0, num_samples, batch_size):
            idx = order[start:start + batch_size]
            # Fancy indexing yields (batch, 1, ...); drop the per-sample singleton axis.
            obs_batch = {
                "image": torch.tensor(np.squeeze(image_np[idx], axis=1), dtype=torch.float32).to(model.device),
                "sensor_data": torch.tensor(np.squeeze(sensor_np[idx], axis=1), dtype=torch.float32).to(model.device),
            }
            action_batch = torch.tensor(action_np[idx], dtype=torch.int64).to(model.device)

            # Get the action distribution for this batch of observations
            distribution = model.policy.get_distribution(obs_batch)
            # Log-probability of the demonstrated actions
            log_prob = distribution.log_prob(action_batch)
            # Negative log-likelihood loss
            loss = -log_prob.mean()

            # Backpropagation with gradient clipping
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.policy.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Average over the batches actually processed, so a trailing partial batch
        # (or a dataset smaller than one batch) is counted correctly.
        print(f"BC Epoch {epoch+1}/{epochs}, Loss: {total_loss / num_batches}")

    # Save the model weights
    model.save(save_path)
    print(f"BC model saved to {save_path}")
    return model

# Initialise the device and build the PPO model plus its environment. Both helpers
# come from app_copy; presumably init_device must run first — confirm against app_copy.
device = init_device()
model, env = initialize_ppo()

# Run behavior-cloning pretraining with the default hyperparameters. As the last
# expression of the cell, the returned model is echoed as the cell output.
behavior_cloning_pretrain(model)

Tank Env initialized
Using cuda device
BC Epoch 1/5, Loss: 3.8944361209869385
BC Epoch 2/5, Loss: 3.8362802267074585
BC Epoch 3/5, Loss: 3.774782657623291
BC Epoch 4/5, Loss: 3.7113945484161377
BC Epoch 5/5, Loss: 3.645296812057495
BC model saved to ppo_bc_model.zip


<stable_baselines3.ppo.ppo.PPO at 0x7f325170e790>

In [13]:
# Sanity check: number of samples (length of the first axis) in the image dataset.
# Note this binds image_np at notebook scope, separate from the function's local copy.
image_np = np.load('./images.npz')['arr_0']
len(image_np)

128