In [29]:
import pandas as pd
import json
import ast

# Load the behaviour-cloning table; the file is decoded with the cp949 codec.
df = pd.read_csv(
    './bc_data.csv',
    encoding='cp949',
)

In [11]:
import numpy as np

# Ten evenly spaced values from 0.05 to 0.5 (both endpoints included).
a = np.linspace(start=0.05, stop=0.5, num=10)
a

array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ])

In [61]:
# Open the saved .npz archive and display the array stored under its
# default positional key.
a = np.load("./actions.npz")
a["arr_0"]

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [2, 1],
       [2, 1],
       [2, 1],
       [2, 1],
       [2, 1],
       [2, 1]])

In [None]:
import gymnasium as gym
from gymnasium.spaces import Dict, Box, MultiDiscrete
import numpy as np
import heapq
import torch
import torch.nn as nn
import torch.nn.functional as F
from flask import Flask, request
import threading
import time
from stable_baselines3 import PPO
from stable_baselines3.common.buffers import DictRolloutBuffer
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.policies import MultiInputActorCriticPolicy
import cv2

# Device setup: use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rule-based model (example)
def rule_based_model(obs):
    """Map one (unbatched) observation to an action using fixed, hand-written rules.

    Args:
        obs: Mapping with a "sensors" entry: a 1-D sequence or array holding at
            least two values.  Any "image" entry is ignored by the rules.

    Returns:
        dict with
            "main":  int64 array ``[action_idx, weight_idx]`` where
                     ``action_idx = clip(sensors[0] * 4, 0, 3)`` and
                     ``weight_idx = clip(sensors[1] * 10, 0, 9)``, each
                     truncated to an int;
            "extra": float32 array with (up to) the first five sensor values.
    """
    # np.asarray lets plain Python lists (e.g. decoded JSON) work as well as
    # ndarrays; the original relied on ``.astype`` and so required an ndarray.
    sensors = np.asarray(obs["sensors"])
    action_idx = int(np.clip(sensors[0] * 4, 0, 3))
    weight_idx = int(np.clip(sensors[1] * 10, 0, 9))
    return {
        "main": np.array([action_idx, weight_idx], dtype=np.int64),
        "extra": sensors[:5].astype(np.float32),
    }

# Behavior Cloning pretraining
def behavior_cloning_pretrain(model, bc_dataset, epochs=5, batch_size=64, lr=3e-4):
    """Pretrain ``model.policy`` by imitating the actions stored in ``bc_dataset``.

    Args:
        model: Object exposing ``policy`` (a torch module).  The policy is called
            as ``policy(obs, action=...)`` and must return a 3-tuple whose second
            item is the log-probability of the given actions;
            ``policy.forward(obs)[0]["extra"]`` must be the predicted "extra"
            vector.
        bc_dataset: Sequence of ``{"obs": {"image", "sensors"},
            "action": {"main", "extra"}}`` records.  It is not reordered.
        epochs: Number of passes over the dataset.
        batch_size: Records per optimisation step.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object, with its policy parameters updated in place.
    """
    params = list(model.policy.parameters())
    optimizer = torch.optim.Adam(params, lr=lr)
    if len(bc_dataset) == 0:
        # Nothing to imitate; the original divided by zero while reporting here.
        return model
    # Put the batches on the device the policy actually lives on, so the inputs
    # always match the parameters.
    policy_device = params[0].device

    for epoch in range(epochs):
        # Shuffle indices rather than the caller's list.
        order = np.random.permutation(len(bc_dataset))
        total_loss = 0.0
        num_batches = 0
        for start in range(0, len(order), batch_size):
            batch = [bc_dataset[j] for j in order[start:start + batch_size]]
            obs_batch = {
                "image": torch.tensor(np.stack([d["obs"]["image"] for d in batch]), dtype=torch.float32).to(policy_device),
                "sensors": torch.tensor(np.stack([d["obs"]["sensors"] for d in batch]), dtype=torch.float32).to(policy_device)
            }
            action_batch = {
                "main": torch.tensor(np.stack([d["action"]["main"] for d in batch]), dtype=torch.int64).to(policy_device),
                "extra": torch.tensor(np.stack([d["action"]["extra"] for d in batch]), dtype=torch.float32).to(policy_device)
            }
            # Forward pass WITH gradients: under torch.no_grad() the log-prob
            # term is a constant and the imitation loss cannot train the policy.
            # NOTE(review): the stock stable-baselines3 ActorCriticPolicy.forward
            # takes no `action` keyword (evaluate_actions(obs, actions) is the
            # documented way to score given actions) - confirm the policy class.
            _, log_prob, _ = model.policy(obs_batch, action=action_batch)
            # Loss: maximize log probability for main, minimize MSE for extra
            main_loss = -log_prob.mean()
            extra_loss = F.mse_loss(model.policy.forward(obs_batch)[0]["extra"], action_batch["extra"])
            loss = main_loss + extra_loss
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(params, 0.5)
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Average over the batches actually processed (includes a final partial batch).
        print(f"BC Epoch {epoch+1}/{epochs}, Loss: {total_loss / num_batches}")
    return model

# Collect BC dataset
def collect_bc_dataset(env, rule_model, num_samples=1000):
    """Roll out ``rule_model`` in ``env`` and record (observation, action) pairs.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns either the 5-tuple
            ``(obs, reward, terminated, truncated, info)`` or the 4-tuple
            ``(obs, reward, done, info)``.
        rule_model: Callable mapping an observation to an action.
        num_samples: Number of transitions to record.

    Returns:
        List of ``{"obs": ..., "action": ...}`` dicts, one per step taken.
    """
    dataset = []
    obs, _ = env.reset()
    for _ in range(num_samples):
        action = rule_model(obs)
        dataset.append({"obs": obs, "action": action})
        step_result = env.step(action)
        if len(step_result) == 5:
            # 5-tuple API: either flag ends the episode.
            obs, _reward, terminated, truncated, _info = step_result
            done = terminated or truncated
        else:
            # 4-tuple API, as unpacked by the original code.
            obs, _reward, done, _info = step_result
        if done:
            obs, _ = env.reset()
    return dataset

# Custom DummyVecEnv
class CustomDummyVecEnv(DummyVecEnv):
    """DummyVecEnv variant that steps its environments inside ``step_async``.

    ``reset`` returns ``(batched_obs, info_of_first_env)``; ``step_wait``
    collapses each env's ``terminated``/``truncated`` pair into one ``done``
    flag and returns ``(batched_obs, rewards, dones, infos)``.  Observations
    are stacked per key into a freshly allocated ``self.buf_obs`` on every call.
    """

    def __init__(self, env_fns):
        super().__init__(env_fns)
        # Raw per-env step() results, produced by step_async and consumed by step_wait.
        self.step_results = []

    def _blank_obs_buffer(self):
        """Allocate a zero-filled array per observation key with a leading env axis."""
        blank = {}
        for key in self.observation_space.spaces.keys():
            space = self.observation_space[key]
            blank[key] = np.zeros((self.num_envs,) + space.shape, dtype=space.dtype)
        return blank

    def reset(self, seed=None, options=None):
        """Reset every env with the same ``seed``/``options`` and stack the observations."""
        self.buf_obs = self._blank_obs_buffer()
        infos = []
        for env_idx, env in enumerate(self.envs):
            obs, info = env.reset(seed=seed, options=options)
            for key, slot in self.buf_obs.items():
                slot[env_idx] = obs[key]
            infos.append(info)
        print(f"Reset buf_obs: image={self.buf_obs['image'].shape}, sensors={self.buf_obs['sensors'].shape}, image_nonzero={np.any(self.buf_obs['image'])}")
        first_info = infos[0] if infos else {}
        return self.buf_obs, first_info

    def step_async(self, actions):
        """Step each env immediately with ``actions[env_idx]`` and keep the raw results."""
        collected = []
        self.step_results = collected
        for env_idx, env in enumerate(self.envs):
            collected.append(env.step(actions[env_idx]))

    def step_wait(self):
        """Stack the results stored by ``step_async`` into batched outputs."""
        self.buf_obs = self._blank_obs_buffer()
        rewards = []
        dones = []
        infos = []
        for i, result in enumerate(self.step_results):
            obs, rew, terminated, truncated, info = result
            for key, slot in self.buf_obs.items():
                slot[i] = obs[key]
            rewards.append(rew)
            dones.append(terminated or truncated)
            infos.append(info)
        print(f"Step buf_obs: image={self.buf_obs['image'].shape}, sensors={self.buf_obs['sensors'].shape}, image_nonzero={np.any(self.buf_obs['image'])}")
        return self.buf_obs, np.array(rewards), np.array(dones), infos

# Flask app
app = Flask(__name__)

# Queue of simulator payloads shared between the request handler and the env.
data_heap = []

# One independent lock per piece of shared state.
heap_lock, action_lock, obs_lock = (threading.Lock() for _ in range(3))

# Runtime state, unset until the server has been initialised / has seen data.
current_action = current_obs = None
rollout_buffer = model = env = None

# Training schedule: steps taken so far, total step budget, steps per policy update.
step_counter, total_steps, n_steps = 0, 100000, 2048

# Toggles the request handler between its "predict action" and "step" turns.
is_first_request = True

# Simulator request handling
@app.route('/simulator_data', methods=['POST'])
def receive_data():
    """Handle one simulator POST, alternating between an "action" turn and a "step" turn.

    The JSON body is pushed onto ``data_heap`` keyed by the negated arrival time
    and the newest entry is popped straight back.  While ``is_first_request`` is
    True, that payload becomes ``current_obs`` and a sampled action is stored in
    ``current_action``.  On the next request the stored observation is run
    through the policy again, ``env.step`` is called, the transition is written
    to ``rollout_buffer`` and, every ``n_steps`` steps, ``model.train()`` runs.

    Returns:
        ``(dict, 200)`` carrying a ``"status"`` message in every branch.
    """
    global step_counter, current_action, current_obs, is_first_request
    data = request.json
    timestamp = time.time()
    with heap_lock:
        # Negated timestamp: heapq is a min-heap, so the newest payload pops first.
        # NOTE(review): two entries with equal timestamps would fall back to
        # comparing the payload dicts, which raises TypeError - confirm whether
        # a tie-breaker is needed.
        heapq.heappush(data_heap, (-timestamp, data))
        # Assumes the payload is a mapping with an 'image' key.
        print(f"Data received at: {timestamp}, Heap size: {len(data_heap)}, Image shape: {np.array(data['image']).shape}")
    
    # The payload stays queued when the model/env have not been created yet.
    if model is None or env is None:
        return {"status": "Model not initialized"}, 200
    
    with heap_lock:
        if not data_heap:
            return {"status": "No data"}, 200
        # Takes the newest queued payload (normally the one pushed just above).
        _, latest_data = heapq.heappop(data_heap)
    
    if is_first_request:
        # First request: Action prediction
        obs = {
            "image": np.array(latest_data["image"], dtype=np.uint8),  # [128, 128, 2]
            "sensors": np.array(latest_data["sensors"], dtype=np.float32)  # [9]
        }
        # Convert to torch.Tensor with batch dimension
        obs_tensor = {
            "image": torch.tensor(obs["image"], dtype=torch.float32).unsqueeze(0).to(device),  # [1, 128, 128, 2]
            "sensors": torch.tensor(obs["sensors"], dtype=torch.float32).unsqueeze(0).to(device)  # [1, 9]
        }
        with obs_lock:
            # Redundant: current_obs is already declared global at the top of the function.
            global current_obs
            current_obs = obs
        
        # NOTE(review): assumes the policy call returns (action, value, log_prob)
        # with a dict-like action - confirm against the policy class in use.
        with torch.no_grad():
            action, value, log_prob = model.policy(obs_tensor, deterministic=False)
        
        with action_lock:
            main_action = action["main"]
            # NOTE(review): this unpacking needs action["main"] to iterate as
            # exactly two items (no leading batch axis) - confirm.
            action_idx, weight_idx = main_action
            weight = model.env.envs[0].weight_bins[weight_idx]
            current_action = {"action": int(action_idx), "weight": float(weight), "extra": action["extra"]}
        
        # The next request takes the "step" branch.
        is_first_request = False
        return {"status": "Action predicted"}, 200
    else:
        # Second request: env.step()
        # Uses the observation stored on the previous turn; latest_data popped
        # above is not read in this branch.
        with obs_lock:
            obs = current_obs
        
        # Convert to torch.Tensor
        obs_tensor = {
            "image": torch.tensor(obs["image"], dtype=torch.float32).unsqueeze(0).to(device),
            "sensors": torch.tensor(obs["sensors"], dtype=torch.float32).unsqueeze(0).to(device)
        }
        
        # A fresh stochastic sample; not the action stored in current_action.
        with torch.no_grad():
            action, value, log_prob = model.policy(obs_tensor, deterministic=False)
        
        # env.step() call with Dict action
        new_obs, reward, done, info = env.step(action)
        print(f"Env output: image={new_obs['image'].shape}, sensors={new_obs['sensors'].shape}, done={done}")
        
        # RolloutBuffer storage
        obs_np = {
            "image": obs_tensor["image"].cpu().numpy()[0],
            "sensors": obs_tensor["sensors"].cpu().numpy()[0]
        }
        action_np = {
            "main": action["main"].cpu().numpy() if isinstance(action["main"], torch.Tensor) else action["main"],
            "extra": action["extra"].cpu().numpy() if isinstance(action["extra"], torch.Tensor) else action["extra"]
        }
        # NOTE(review): stable-baselines3 documents RolloutBuffer.add with the
        # parameters (obs, action, reward, episode_start, value, log_prob); the
        # keyword names below differ and include `dones` - confirm against the
        # installed version.
        rollout_buffer.add(
            obs=obs_np,
            actions=action_np,
            rewards=np.array([reward]),
            dones=np.array([done]),
            values=value.cpu(),
            log_probs=log_prob.cpu(),
            episode_starts=np.array([False])
        )
        step_counter += 1
        
        # Policy update
        if step_counter % n_steps == 0:
            new_obs_tensor = {
                "image": torch.tensor(new_obs["image"], dtype=torch.float32).unsqueeze(0).to(device),
                "sensors": torch.tensor(new_obs["sensors"], dtype=torch.float32).unsqueeze(0).to(device)
            }
            # Bootstrap value of the observation that follows the last stored step.
            with torch.no_grad():
                next_value = model.policy.predict_values(new_obs_tensor)
            rollout_buffer.compute_returns_and_advantage(last_values=next_value.cpu(), dones=np.array([done]))
            # NOTE(review): confirm model.train() consumes this module-level
            # rollout_buffer rather than a buffer owned by the model itself.
            model.train()
            rollout_buffer.reset()
        
        # Learning complete
        # Returns before the episode-reset block below, so is_first_request
        # stays False on this path.
        if step_counter >= total_steps:
            model.save("ppo_custom_model")
            print("Learning completed")
            return {"status": "Learning completed"}, 200
        
        # Episode reset
        if done:
            # Re-seeds the env from the last observation; index 0 is presumably
            # the single env's slot of a batched observation - verify.
            obs, _ = env.reset(options={
                "image": new_obs["image"][0],
                "sensor_data": new_obs["sensors"][0].tolist()
            })
            with obs_lock:
                current_obs = obs
            is_first_request = True
        else:
            # Same assignment as the `done` branch: the next request always
            # returns to the "action prediction" turn.
            is_first_request = True
        
        return {"status": "Step processed"}, 200

# Custom environment
class CustomEnv(gym.Env):
    """Gymnasium environment fed by simulator payloads queued in ``data_heap``.

    Observations are dicts with an ``"image"`` (uint8, 128x128x2) and a
    ``"sensors"`` (float32, 9 values) entry.  Actions are dicts with a
    ``"main"`` MultiDiscrete([4, 10]) part and a 5-value float ``"extra"`` part.
    The environment does not send actions anywhere itself: ``step`` only
    consumes the next payload that has been pushed onto the heap.
    """

    def __init__(self, simulator_url="http://localhost:5000", total_steps=100000):
        super().__init__()
        self.observation_space = Dict({
            "image": Box(low=0, high=255, shape=(128, 128, 2), dtype=np.uint8),
            "sensors": Box(low=-np.inf, high=np.inf, shape=(9,), dtype=np.float32)
        })
        self.action_space = Dict({
            "main": MultiDiscrete([4, 10]),
            "extra": Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32)
        })
        self.simulator_url = simulator_url
        self.max_steps = 1000  # per-episode step limit
        self.step_count = 0
        self.total_steps = total_steps
        # Ten weights 0.0, 0.1, ..., 0.9, indexed by the second "main" component.
        self.weight_bins = np.linspace(0.0, 0.9, 10)
        self.render_mode = None
        self.extra_param = None

    def _pop_queued_data(self, attempts, interval=0.25):
        """Pop the next payload from ``data_heap``, polling up to ``attempts`` times.

        The lock is held only for each individual check and never while
        sleeping.  Returns the payload, or None if the heap stayed empty.
        """
        for attempt in range(attempts):
            with heap_lock:
                if data_heap:
                    _, data = heapq.heappop(data_heap)
                    return data
            if attempt < attempts - 1:
                time.sleep(interval)
        return None

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to ``gym.Env.reset``.
            options: Optional dict with ``"image"`` and ``"sensor_data"``; when
                given (and non-empty) it is used as the initial observation
                instead of reading the heap.

        Returns:
            ``(observation, {})``.

        Raises:
            TimeoutError: no payload arrived within about one second.
        """
        super().reset(seed=seed)
        self.step_count = 0
        if options:
            image = np.array(options["image"], dtype=np.uint8)
            sensors = np.array(options["sensor_data"], dtype=np.float32)
            print(f"Reset with options: image={image.shape}, sensors={sensors.shape}")
            return {"image": image, "sensors": sensors}, {}
        # One immediate check plus four retries 0.25 s apart.  The original
        # re-acquired the non-reentrant heap_lock while already holding it and
        # therefore deadlocked whenever the heap was empty.
        data = self._pop_queued_data(attempts=5)
        if data is None:
            raise TimeoutError("No initial data from simulator")
        image = np.array(data["image"], dtype=np.uint8)
        sensors = np.array(data["sensors"], dtype=np.float32)
        print(f"Reset with heap: image={image.shape}, sensors={sensors.shape}")
        return {"image": image, "sensors": sensors}, {}

    def step(self, action):
        """Consume the next queued payload as the outcome of ``action``.

        Args:
            action: Dict with ``"main"`` and ``"extra"`` entries (only logged here).

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` mirrors the payload's ``"terminated"`` flag,
            ``truncated`` is True once ``max_steps`` is reached, and ``info``
            carries ``"step_time"`` in seconds.

        Raises:
            TimeoutError: no payload was available after two checks.
        """
        start_time = time.time()
        main_action = action["main"]
        extra_data = action["extra"]
        print(f"Step action: main={main_action}, extra={extra_data}")
        if self.extra_param:
            print(f"Using extra param: {self.extra_param}")
        data = self._pop_queued_data(attempts=2)
        if data is None:
            raise TimeoutError("No result data from simulator")
        # .get: a payload without a "timestamp" field must not abort the step.
        print(f"Step data at: {data.get('timestamp')}")

        image = np.array(data["image"], dtype=np.uint8)
        sensors = np.array(data["sensors"], dtype=np.float32)
        reward = float(data["reward"])
        self.step_count += 1
        # Gymnasium convention: the simulator ending the episode is termination,
        # hitting the step limit is truncation.
        terminated = bool(data.get("terminated", False))
        truncated = self.step_count >= self.max_steps
        info = {"step_time": time.time() - start_time}
        return {"image": image, "sensors": sensors}, reward, terminated, truncated, info

    def render(self):
        """Show channel 0 of a sampled image when ``render_mode`` is "human".

        Returns the displayed image, or None for any other render mode (the
        original raised UnboundLocalError in that case).
        """
        if self.render_mode != "human":
            return None
        # NOTE(review): this draws a random sample from the observation space,
        # not the latest observation - confirm that is intended.
        image = self.observation_space["image"].sample()
        cv2.imshow("Environment", image[:, :, 0])
        cv2.waitKey(1)
        return image

    def close(self):
        """Close any OpenCV windows opened by ``render``."""
        cv2.destroyAllWindows()

# Custom feature extractor
class CustomFeaturesExtractor(BaseFeaturesExtractor):
    """Two-branch feature extractor: a CNN for "image" and an MLP for "sensors".

    The flattened CNN output and the 64-dim sensor embedding are concatenated
    and projected to ``features_dim`` by a final Linear + ReLU.
    """

    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 256):
        super().__init__(observation_space, features_dim)

        # Image branch: three conv layers over a 2-channel input.
        conv_stack = [
            nn.Conv2d(2, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_stack).to(device)

        # Run a dummy 2x128x128 image through the CNN to learn its flattened size.
        with torch.no_grad():
            probe = torch.zeros(1, 2, 128, 128).to(device)
            n_flatten = self.cnn(probe).shape[1]

        # Sensor branch: 9 inputs -> 64 -> 64.
        sensor_stack = [
            nn.Linear(9, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
        ]
        self.mlp = nn.Sequential(*sensor_stack).to(device)

        # Fusion head over the concatenated branch outputs.
        fusion_stack = [
            nn.Linear(n_flatten + 64, features_dim),
            nn.ReLU(),
        ]
        self.linear = nn.Sequential(*fusion_stack).to(device)

    def forward(self, observations):
        """Return fused features for a batch of dict observations.

        The image is moved from channels-last to channels-first and scaled by
        1/255 before entering the CNN.
        """
        image = observations["image"].permute(0, 3, 1, 2).float() / 255.0
        print(f"Forward input: image={image.shape}, sensors={observations['sensors'].shape}")
        branch_outputs = [
            self.cnn(image),
            self.mlp(observations["sensors"]),
        ]
        fused = torch.cat(branch_outputs, dim=1)
        return self.linear(fused)

# PPO initialization with BC pretraining
def initialize_ppo():
    """Create the environment, rollout buffer and PPO model, then BC-pretrain the policy.

    Assigns the module-level ``model``, ``env`` and ``rollout_buffer`` and also
    returns them.

    Returns:
        ``(model, env, rollout_buffer)`` where ``env`` is the vectorised wrapper
        around a single ``CustomEnv``.
    """
    global model, env, rollout_buffer
    # Bind the raw environment to a local name.  The original used
    # `lambda: env`, which closes over the *global* `env`; that name is rebound
    # to the vectorised wrapper on the very same line, so the factory would
    # return the wrapper itself if it were ever called again.
    base_env = CustomEnv(total_steps=total_steps)
    env = CustomDummyVecEnv([lambda: base_env])
    # NOTE(review): this buffer is separate from any buffer the PPO object
    # builds for itself - confirm which one model.train() reads from.
    rollout_buffer = DictRolloutBuffer(
        buffer_size=n_steps,
        observation_space=env.observation_space,
        action_space=env.action_space,
        device=device,
        gae_lambda=0.95,
        gamma=0.99,
        n_envs=1,
    )
    model = PPO(
        policy=MultiInputActorCriticPolicy,
        env=env,
        policy_kwargs={"features_extractor_class": CustomFeaturesExtractor},
        learning_rate=3e-4,
        n_steps=n_steps,
        batch_size=64,
        n_epochs=10,
        ent_coef=0.01,  # Encourage exploration post-BC
        verbose=1,
        device=device,
    )
    # Behavior Cloning pretraining
    # NOTE(review): `env` here is the vectorised wrapper, so the rule model is
    # fed observations carrying a leading env axis - confirm it expects that.
    bc_dataset = collect_bc_dataset(env, rule_based_model, num_samples=1000)
    model = behavior_cloning_pretrain(model, bc_dataset, epochs=5, batch_size=64, lr=3e-4)
    return model, env, rollout_buffer

# Flask server start
if __name__ == "__main__":
    # Build the env, model and buffer (including the BC warm-start) before serving.
    model, env, rollout_buffer = initialize_ppo()
    # Serve on every interface, port 5000, handling requests in separate threads.
    app.run(threaded=True, port=5000, host="0.0.0.0")

In [7]:
# Snap a value down to the 0.05 grid: count the whole 0.05 steps in `a`,
# then scale back and round away the float noise.
a = 0.184069
b = divmod(a, 0.05)[0]

round(0.05 * b, 2)

0.15