# VGGT test for multipleView video


## Load the images

In [3]:
import os
def get_image_files(folder_path, extensions=['.jpg', '.jpeg', '.png', '.gif', '.bmp']):
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if any(file.lower().endswith(ext) for ext in extensions):
                relative_path = os.path.relpath(os.path.join(root, file), folder_path)
                full_path = os.path.join(folder_path, relative_path)
                file_paths.append(full_path)
    return file_paths
images_folder = "../collected_frames/"

if os.path.exists(images_folder):
        image_files = get_image_files(images_folder)
        print(f"\nFound {len(image_files)} Images:")
        #print(image_files)
        for img_path in image_files:
            print(img_path)
            
else:
        print(f"Folder {images_folder} doesn't exist")


Found 10 Images:
../collected_frames/images/cam10_frame_00001.jpg
../collected_frames/images/cam9_frame_00001.jpg
../collected_frames/images/cam8_frame_00001.jpg
../collected_frames/images/cam7_frame_00001.jpg
../collected_frames/images/cam6_frame_00001.jpg
../collected_frames/images/cam5_frame_00001.jpg
../collected_frames/images/cam4_frame_00001.jpg
../collected_frames/images/cam3_frame_00001.jpg
../collected_frames/images/cam2_frame_00001.jpg
../collected_frames/images/cam1_frame_00001.jpg


## using the VGGT model

In [14]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
dtype = torch.float16


cuda


In [15]:
import struct
import numpy as np
from pathlib import Path
import torch

def write_cameras_bin(cameras, output_path):
    """写入 cameras.bin 文件"""
    with open(output_path, 'wb') as f:
        f.write(struct.pack('<Q', len(cameras)))
        
        for camera_id, camera in cameras.items():
            f.write(struct.pack('<I', camera_id))
            f.write(struct.pack('<i', camera['model']))  # 1=PINHOLE
            f.write(struct.pack('<Q', camera['width']))
            f.write(struct.pack('<Q', camera['height']))
            
            for param in camera['params']:
                f.write(struct.pack('<d', param))

def write_images_bin(images, output_path):
    """写入 images.bin 文件"""
    with open(output_path, 'wb') as f:
        f.write(struct.pack('<Q', len(images)))
        
        for image_id, image in images.items():
            f.write(struct.pack('<I', image_id))
            
            # 四元数 (qw, qx, qy, qz)
            for q in image['quat']:
                f.write(struct.pack('<d', q))
            
            # 平移向量 (tx, ty, tz)
            for t in image['trans']:
                f.write(struct.pack('<d', t))
            
            f.write(struct.pack('<I', image['camera_id']))
            
            # 图像名称
            name_bytes = image['name'].encode('utf-8') + b'\0'
            f.write(name_bytes)
            
            # 2D点 (暂时为空)
            f.write(struct.pack('<Q', 0))

def write_points3d_bin(points3d, output_path):
    """写入 points3D.bin 文件"""
    with open(output_path, 'wb') as f:
        f.write(struct.pack('<Q', len(points3d)))
        
        for point_id, point in points3d.items():
            f.write(struct.pack('<Q', point_id))
            
            # 3D坐标
            for coord in point['xyz']:
                f.write(struct.pack('<d', coord))
            
            # RGB颜色
            for color in point['rgb']:
                f.write(struct.pack('<B', color))
            
            f.write(struct.pack('<d', point['error']))
            
            # 观测信息 (暂时为空)
            f.write(struct.pack('<Q', 0))

def rotation_matrix_to_quaternion(R):
    """将旋转矩阵转换为四元数 (qw, qx, qy, qz)"""
    R = R.cpu().numpy() if torch.is_tensor(R) else R
    
    trace = np.trace(R)
    if trace > 0:
        s = np.sqrt(trace + 1.0) * 2
        qw = 0.25 * s
        qx = (R[2, 1] - R[1, 2]) / s
        qy = (R[0, 2] - R[2, 0]) / s
        qz = (R[1, 0] - R[0, 1]) / s
    else:
        if R[0, 0] > R[1, 1] and R[0, 0] > R[2, 2]:
            s = np.sqrt(1.0 + R[0, 0] - R[1, 1] - R[2, 2]) * 2
            qw = (R[2, 1] - R[1, 2]) / s
            qx = 0.25 * s
            qy = (R[0, 1] + R[1, 0]) / s
            qz = (R[0, 2] + R[2, 0]) / s
        elif R[1, 1] > R[2, 2]:
            s = np.sqrt(1.0 + R[1, 1] - R[0, 0] - R[2, 2]) * 2
            qw = (R[0, 2] - R[2, 0]) / s
            qx = (R[0, 1] + R[1, 0]) / s
            qy = 0.25 * s
            qz = (R[1, 2] + R[2, 1]) / s
        else:
            s = np.sqrt(1.0 + R[2, 2] - R[0, 0] - R[1, 1]) * 2
            qw = (R[1, 0] - R[0, 1]) / s
            qx = (R[0, 2] + R[2, 0]) / s
            qy = (R[1, 2] + R[2, 1]) / s
            qz = 0.25 * s
    
    return np.array([qw, qx, qy, qz])

def convert_vggt_to_colmap(extrinsic, intrinsic, point_map_by_unprojection, 
                          depth_conf, image_files, output_dir):
    """将VGGT输出转换为COLMAP格式"""
    
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)
    
    # 转换为numpy
    if torch.is_tensor(extrinsic):
        extrinsic = extrinsic.cpu().numpy()
    if torch.is_tensor(intrinsic):
        intrinsic = intrinsic.cpu().numpy()
    if torch.is_tensor(point_map_by_unprojection):
        point_map_by_unprojection = point_map_by_unprojection.cpu().numpy()
    if torch.is_tensor(depth_conf):
        depth_conf = depth_conf.cpu().numpy()
    
    num_images = len(image_files)
    
    # 1. 准备相机数据
    cameras = {}
    for i in range(num_images):
        camera_id = i + 1
        K = intrinsic[i]  # 3x3 内参矩阵
        
        cameras[camera_id] = {
            'model': 1,  # PINHOLE
            'width': point_map_by_unprojection.shape[-1],  # 图像宽度
            'height': point_map_by_unprojection.shape[-2], # 图像高度
            'params': [K[0, 0], K[1, 1], K[0, 2], K[1, 2]]  # fx, fy, cx, cy
        }
    
    # 2. 准备图像数据 (相机位姿)
    images = {}
    for i in range(num_images):
        image_id = i + 1
        
        # 外参矩阵 (4x4)
        ext = extrinsic[i]
        R = ext[:3, :3]  # 旋转矩阵
        t = ext[:3, 3]   # 平移向量
        
        # COLMAP使用 world-to-camera 变换
        # 如果VGGT输出的是camera-to-world，需要求逆
        # 这里假设VGGT输出的是camera-to-world，所以需要求逆
        R_inv = R.T
        t_inv = -R_inv @ t
        
        # 转换为四元数
        quat = rotation_matrix_to_quaternion(R_inv)
        
        images[image_id] = {
            'quat': quat.tolist(),
            'trans': t_inv.tolist(),
            'camera_id': camera_id,
            'name': Path(image_files[i]).name
        }
    
    # 3. 准备3D点数据
    points3d = {}
    point_id = 1
    
    # 从point_map_by_unprojection提取3D点
    # 形状应该是 [num_images, 3, height, width]
    confidence_threshold = 0.5  # 置信度阈值
    
    for img_idx in range(num_images):
        points_3d = point_map_by_unprojection[img_idx]  # [3, H, W]
        conf = depth_conf[img_idx] if depth_conf is not None else np.ones_like(points_3d[0])
        
        # 采样点 (避免点太多)
        H, W = points_3d.shape[1], points_3d.shape[2]
        step = max(1, min(H, W) // 100)  # 采样步长
        
        for y in range(0, H, step):
            for x in range(0, W, step):
                if conf[y, x] > confidence_threshold:
                    xyz = points_3d[:, y, x]
                    
                    # 检查点是否有效
                    if not np.any(np.isnan(xyz)) and not np.any(np.isinf(xyz)):
                        points3d[point_id] = {
                            'xyz': xyz.tolist(),
                            'rgb': [128, 128, 128],  # 默认灰色
                            'error': 0.0
                        }
                        point_id += 1
    
    # 4. 写入文件
    write_cameras_bin(cameras, output_dir / 'cameras.bin')
    write_images_bin(images, output_dir / 'images.bin')
    write_points3d_bin(points3d, output_dir / 'points3D.bin')
    
    print(f"COLMAP文件已保存到: {output_dir}")
    print(f"- cameras.bin: {len(cameras)} 个相机")
    print(f"- images.bin: {len(images)} 张图像")
    print(f"- points3D.bin: {len(points3d)} 个3D点")
    
    return output_dir


In [None]:
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device) 
images = load_and_preprocess_images(image_files).to(device)
print(f"Type: {type(images)}")
print(f"Shape: {images.shape if hasattr(images, 'shape') else 'No shape'}")


with torch.no_grad():
    with torch.cuda.amp.autocast(dtype=dtype):
        images = images[None]
        aggregated_tokens_list, ps_idx = model.aggregator(images)
                
    # Predict Cameras
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Predict Depth Maps
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # Predict Point Maps
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)
        
    # Construct 3D Points from Depth Maps and Cameras
    # which usually leads to more accurate 3D points than point map branch
    point_map_by_unprojection = unproject_depth_map_to_point_map(depth_map.squeeze(0), 
                                                                extrinsic.squeeze(0), 
                                                                intrinsic.squeeze(0))

    # Predict Tracks
    # choose your own points to track, with shape (N, 2) for one scene
    query_points = torch.FloatTensor([[100.0, 200.0], 
                                        [60.72, 259.94]]).to(device)
    track_list, vis_score, conf_score = model.track_head(aggregated_tokens_list, images, ps_idx, query_points=query_points[None])

    colmap_output_dir = convert_vggt_to_colmap(
        extrinsic.squeeze(0),           
        intrinsic.squeeze(0),             
        point_map_by_unprojection,      
        depth_conf.squeeze(0),          
        image_files,                    
        output_dir="./colmap_output"
    )


Type: <class 'torch.Tensor'>
Shape: torch.Size([10, 3, 518, 518])
COLMAP文件已保存到: colmap_output
- cameras.bin: 10 个相机
- images.bin: 10 张图像
- points3D.bin: 15540 个3D点
