# VGGT-X vs InfiniteVGGT Comparison on Bicycle Scene

This notebook runs the VGGT-X and InfiniteVGGT pipelines, one after the other, on the same bicycle dataset and compares their outputs.

## Comparison Goals

1. **Run both methods** on the same input images (images_4 - 4x downsampled)
2. **Compare outputs**:
   - Camera intrinsics
   - Camera extrinsics (poses)
   - 3D point clouds
   - Reconstruction statistics
3. **Save to separate directories** for downstream Gaussian splatting comparison
4. **Bundle adjustment testing** via matching procedure to get tracks across images

## Directory Structure

```
/workspace/bicycle/
├── images_4/                    # 4x downsampled images
├── vggtx_output/                # VGGT-X results
│   ├── preproc/
│   │   ├── colmap/sparse/0/
│   │   ├── transforms.json
│   │   └── sparse_pc.ply
│   └── matches/                 # Matching results for BA
└── infinitevggt_output/         # InfiniteVGGT results
    ├── preproc/
    │   ├── colmap/sparse/0/
    │   ├── transforms.json
    │   └── sparse_pc.ply
    └── matches/                 # Matching results for BA
```

## Setup and Imports

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
from pathlib import Path
import time
import json
import numpy as np
import pandas as pd
import torch
import cv2
from tqdm import tqdm
import pycolmap

# Show which interpreter is executing the notebook
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")

# The interpreter path is used as a cheap indicator of the active conda env
if 'nerfstudio' in sys.executable:
    print("\n✓ Running in nerfstudio environment")
else:
    print("\n⚠️  WARNING: Not running in nerfstudio conda environment!")
    print("Please activate with: conda activate nerfstudio")

# Make the InfiniteVGGT sources importable; the membership test keeps
# re-running this cell from appending the same entry twice.
infinitevggt_root = Path("/workspace/InfiniteVGGT")
_infinitevggt_src = str(infinitevggt_root / "src")
if _infinitevggt_src not in sys.path:
    sys.path.append(_infinitevggt_src)

# Import InfiniteVGGT components
from streamvggt.models.streamvggt import StreamVGGT
from streamvggt.utils.load_fn import load_and_preprocess_images
from streamvggt.utils.pose_enc import pose_encoding_to_extri_intri

# Import nerfstudio utilities
from nerfstudio.process_data import vggt_utils, colmap_utils
from nerfstudio.process_data.process_data_utils import CameraModel
from nerfstudio.utils.rich_utils import CONSOLE

from vggt.utils.load_fn import load_and_preprocess_images_ratio

print("✓ All imports successful")

## Configuration

In [None]:
# ============================================================================
# Dataset Configuration
# ============================================================================

# Input: Downsampled bicycle images
base_dir = Path("/workspace/bicycle")
image_dir = base_dir / "images_4"  # 4x downsampled images

# Output directories (one tree per method so results never overwrite each other)
vggtx_output_dir = base_dir / "vggtx_output"
vggtx_preproc_dir = vggtx_output_dir / "preproc"
vggtx_colmap_dir = vggtx_preproc_dir / "colmap"
vggtx_matches_dir = vggtx_output_dir / "matches"

infinitevggt_output_dir = base_dir / "infinitevggt_output"
infinitevggt_preproc_dir = infinitevggt_output_dir / "preproc"
infinitevggt_colmap_dir = infinitevggt_preproc_dir / "colmap"
infinitevggt_matches_dir = infinitevggt_output_dir / "matches"

# Create directories (parents=True also creates the preproc/ and output roots)
for _out_dir in (
    vggtx_colmap_dir,
    vggtx_matches_dir,
    infinitevggt_colmap_dir,
    infinitevggt_matches_dir,
):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Check input images.
# Suffixes are compared case-insensitively so ".jpg", ".jpeg" and ".PNG" files
# are counted too; the previous "*.JPG" / "*.png" patterns are a strict subset.
image_extensions = {".jpg", ".jpeg", ".png"}
image_paths = sorted(
    p for p in image_dir.glob("*") if p.suffix.lower() in image_extensions
)
print(f"Input images: {image_dir}")
print(f"Found {len(image_paths)} images")
if not image_paths:
    # Later cells divide by the image count, so flag the problem early.
    print(f"⚠️  WARNING: no images found in {image_dir}")
print(f"\nVGGT-X output: {vggtx_output_dir}")
print(f"InfiniteVGGT output: {infinitevggt_output_dir}")

# ============================================================================
# Model Parameters
# ============================================================================

# VGGT-X parameters
vggtx_camera_model = "PINHOLE"
vggtx_scale_factor = 1.0
vggtx_use_global_alignment = True
vggtx_conf_threshold = 50.0

# InfiniteVGGT parameters
infinitevggt_checkpoint = Path("/workspace/InfiniteVGGT/checkpoints/streamvggt.pth")
infinitevggt_total_budget = 1200000
infinitevggt_conf_threshold = 50.0
infinitevggt_scale_factor = 1.0
infinitevggt_shared_camera = True

# Common parameters
max_points_for_colmap = 500000

print(f"\nVGGT-X Config:")
print(f"  Camera model: {vggtx_camera_model}")
print(f"  Scale factor: {vggtx_scale_factor}")
print(f"  Global alignment: {vggtx_use_global_alignment}")
print(f"\nInfiniteVGGT Config:")
print(f"  Total budget: {infinitevggt_total_budget:,}")
print(f"  Checkpoint: {infinitevggt_checkpoint}")
print(f"  Checkpoint exists: {infinitevggt_checkpoint.exists()}")

## Pipeline 1: VGGT-X

Run VGGT-X (standard VGGT) on the bicycle images.

In [None]:
print("="*70)
print("RUNNING VGGT-X PIPELINE")
print("="*70)

# Track timing with a monotonic clock: unlike time.time(), time.perf_counter()
# cannot jump when the system clock is adjusted, so the interval stays valid.
vggtx_start_time = time.perf_counter()

# Run VGGT; global alignment is switched by vggtx_use_global_alignment
vggt_utils.run_vggt(
    image_dir=image_dir,
    colmap_dir=vggtx_colmap_dir,
    camera_model=vggtx_camera_model,
    scale_factor=vggtx_scale_factor,
    use_global_alignment=vggtx_use_global_alignment,
    lambda_depth=0.25,
    verbose=True,
)

vggtx_end_time = time.perf_counter()
# Elapsed seconds for the whole run_vggt call; read by the summary cell
vggtx_inference_time = vggtx_end_time - vggtx_start_time

print(f"\n✓ VGGT-X reconstruction complete!")
print(f"  Time: {vggtx_inference_time:.2f}s")
print(f"  Output: {vggtx_colmap_dir / 'sparse' / '0'}")

In [None]:
# Export the VGGT-X COLMAP model in nerfstudio's layout
print("\nConverting VGGT-X to nerfstudio format...")

vggtx_sparse_dir = vggtx_colmap_dir / "sparse" / "0"
colmap_utils.colmap_to_json(recon_dir=vggtx_sparse_dir, output_dir=vggtx_preproc_dir)

# Read transforms.json back (expected to have been written by the call above)
# to obtain the applied_transform entry needed for the point cloud export.
vggtx_transforms_path = vggtx_preproc_dir / "transforms.json"
vggtx_transforms = json.loads(vggtx_transforms_path.read_text())
vggtx_applied_transform = torch.tensor(vggtx_transforms["applied_transform"])

# Write the sparse points as a PLY file next to transforms.json
vggtx_ply_filename = "sparse_pc.ply"
colmap_utils.create_ply_from_colmap(
    filename=vggtx_ply_filename,
    recon_dir=vggtx_sparse_dir,
    output_dir=vggtx_preproc_dir,
    applied_transform=vggtx_applied_transform,
)

# Record the PLY file name in transforms.json and rewrite the file
vggtx_transforms["ply_file_path"] = vggtx_ply_filename
vggtx_transforms_path.write_text(json.dumps(vggtx_transforms, indent=2))

print("✓ VGGT-X nerfstudio format complete")
print(f"  transforms.json: {vggtx_transforms_path}")
print(f"  Point cloud: {vggtx_preproc_dir / vggtx_ply_filename}")

## Pipeline 2: InfiniteVGGT

Run InfiniteVGGT on the same bicycle images.

In [None]:
print("="*70)
print("RUNNING INFINITEVGGT PIPELINE")
print("="*70)

# Reset PyTorch's CUDA peak-memory counter so the value read after the run
# covers this cell only. Skipped when no CUDA device is available.
_infinitevggt_cuda = torch.cuda.is_available()
if _infinitevggt_cuda:
    torch.cuda.reset_peak_memory_stats()

# Monotonic clock, so the interval is immune to system clock adjustments
infinitevggt_start_time = time.perf_counter()

# Run InfiniteVGGT. The camera model is deliberately the one used for VGGT-X
# so both reconstructions export the same intrinsics layout.
# NOTE(review): with overwrite=False the call may reuse existing outputs, in
# which case the timing below would not reflect a full run — confirm.
vggt_utils.run_infinite_vggt(
    image_dir=image_dir,
    colmap_dir=infinitevggt_colmap_dir,
    camera_model=vggtx_camera_model,
    scale_factor=infinitevggt_scale_factor,
    use_global_alignment=True,
    lambda_depth=0.25,
    verbose=True,
    overwrite=False,
)

infinitevggt_end_time = time.perf_counter()
# Both names below are read by the performance summary cell
infinitevggt_inference_time = infinitevggt_end_time - infinitevggt_start_time

# Peak memory as tracked by PyTorch's allocator in this process; NaN marks
# "not measured" while still being formattable with ':.2f'.
if _infinitevggt_cuda:
    infinitevggt_peak_memory_gb = torch.cuda.max_memory_allocated() / 1024**3
else:
    infinitevggt_peak_memory_gb = float("nan")

print(f"\n✓ InfiniteVGGT reconstruction complete!")
print(f"  Time: {infinitevggt_inference_time:.2f}s")
print(f"  Peak GPU memory: {infinitevggt_peak_memory_gb:.2f} GB")
print(f"  Output: {infinitevggt_colmap_dir / 'sparse' / '0'}")

In [None]:
# Export the InfiniteVGGT COLMAP model in nerfstudio's layout
print("\nConverting InfiniteVGGT to nerfstudio format...")

infinitevggt_sparse_dir = infinitevggt_colmap_dir / "sparse" / "0"
colmap_utils.colmap_to_json(
    recon_dir=infinitevggt_sparse_dir,
    output_dir=infinitevggt_preproc_dir,
)

# Read transforms.json back (expected to have been written by the call above)
# to obtain the applied_transform entry needed for the point cloud export.
infinitevggt_transforms_path = infinitevggt_preproc_dir / "transforms.json"
infinitevggt_transforms = json.loads(infinitevggt_transforms_path.read_text())
infinitevggt_applied_transform = torch.tensor(infinitevggt_transforms["applied_transform"])

# Write the sparse points as a PLY file next to transforms.json
infinitevggt_ply_filename = "sparse_pc.ply"
colmap_utils.create_ply_from_colmap(
    filename=infinitevggt_ply_filename,
    recon_dir=infinitevggt_sparse_dir,
    output_dir=infinitevggt_preproc_dir,
    applied_transform=infinitevggt_applied_transform,
)

# Record the PLY file name in transforms.json and rewrite the file
infinitevggt_transforms["ply_file_path"] = infinitevggt_ply_filename
infinitevggt_transforms_path.write_text(json.dumps(infinitevggt_transforms, indent=2))

print("✓ InfiniteVGGT nerfstudio format complete")
print(f"  transforms.json: {infinitevggt_transforms_path}")
print(f"  Point cloud: {infinitevggt_preproc_dir / infinitevggt_ply_filename}")

## Comparison: Camera Intrinsics and Extrinsics

In [None]:
_rule = "=" * 70
print(_rule)
print("COMPARING CAMERA PARAMETERS")
print(_rule)

# Read both sparse models back from disk; these objects are reused by the
# comparison cells that follow.
vggtx_reconstruction = pycolmap.Reconstruction(str(vggtx_colmap_dir / "sparse" / "0"))
infinitevggt_reconstruction = pycolmap.Reconstruction(str(infinitevggt_colmap_dir / "sparse" / "0"))

# Same three counts for each method, printed in the same order
for _label, _recon in (
    ("VGGT-X", vggtx_reconstruction),
    ("InfiniteVGGT", infinitevggt_reconstruction),
):
    print(f"\n{_label} Reconstruction:")
    print(f"  Cameras: {len(_recon.cameras)}")
    print(f"  Images: {len(_recon.images)}")
    print(f"  Points3D: {len(_recon.points3D):,}")

In [None]:
# Compare camera intrinsics
print("\n" + "="*70)
print("CAMERA INTRINSICS COMPARISON")
print("="*70)


def _pinhole_intrinsics(params):
    """Return ``(fx, fy, cx, cy)`` from a COLMAP camera parameter vector.

    Three parameters are read as the SIMPLE_PINHOLE layout ``(f, cx, cy)`` with
    ``fx == fy == f``; four or more are read as ``(fx, fy, cx, cy, ...)``.

    NOTE(review): only the parameter count is inspected, so this assumes the
    camera is PINHOLE or SIMPLE_PINHOLE (the notebook configures "PINHOLE").
    Other 4-parameter models would be misread — confirm if the model changes.

    Raises:
        ValueError: if fewer than three parameters are given.
    """
    values = [float(v) for v in params]
    if len(values) == 3:
        f, cx, cy = values
        return f, f, cx, cy
    if len(values) >= 4:
        fx, fy, cx, cy = values[:4]
        return fx, fy, cx, cy
    raise ValueError(f"Expected at least 3 camera parameters, got {len(values)}")


# Get first camera from each reconstruction
vggtx_cam = list(vggtx_reconstruction.cameras.values())[0]
infinitevggt_cam = list(infinitevggt_reconstruction.cameras.values())[0]

print(f"\nVGGT-X Camera:")
print(f"  Model: {vggtx_cam.model}")
print(f"  Width: {vggtx_cam.width}")
print(f"  Height: {vggtx_cam.height}")
print(f"  Params: {vggtx_cam.params}")

print(f"\nInfiniteVGGT Camera:")
print(f"  Model: {infinitevggt_cam.model}")
print(f"  Width: {infinitevggt_cam.width}")
print(f"  Height: {infinitevggt_cam.height}")
print(f"  Params: {infinitevggt_cam.params}")

# Extract focal length and principal point; a 3-parameter camera no longer
# raises IndexError on params[3].
vggtx_fx, vggtx_fy, vggtx_cx, vggtx_cy = _pinhole_intrinsics(vggtx_cam.params)
infinitevggt_fx, infinitevggt_fy, infinitevggt_cx, infinitevggt_cy = _pinhole_intrinsics(
    infinitevggt_cam.params
)

# Percentages are relative to the VGGT-X value
print(f"\nFocal Length Comparison:")
print(f"  VGGT-X fx: {vggtx_fx:.2f}, fy: {vggtx_fy:.2f}")
print(f"  InfiniteVGGT fx: {infinitevggt_fx:.2f}, fy: {infinitevggt_fy:.2f}")
print(f"  Difference fx: {abs(vggtx_fx - infinitevggt_fx):.2f} ({abs(vggtx_fx - infinitevggt_fx)/vggtx_fx*100:.2f}%)")
print(f"  Difference fy: {abs(vggtx_fy - infinitevggt_fy):.2f} ({abs(vggtx_fy - infinitevggt_fy)/vggtx_fy*100:.2f}%)")

print(f"\nPrincipal Point Comparison:")
print(f"  VGGT-X cx: {vggtx_cx:.2f}, cy: {vggtx_cy:.2f}")
print(f"  InfiniteVGGT cx: {infinitevggt_cx:.2f}, cy: {infinitevggt_cy:.2f}")
print(f"  Difference cx: {abs(vggtx_cx - infinitevggt_cx):.2f}")
print(f"  Difference cy: {abs(vggtx_cy - infinitevggt_cy):.2f}")

In [None]:
# Compare camera extrinsics (poses)
print("\n" + "="*70)
print("CAMERA EXTRINSICS COMPARISON")
print("="*70)

# Calculate pose differences for every image ID present in both models.
# NOTE(review): the two reconstructions are not aligned to a common frame or
# scale in this cell, so the numbers include any global offset between them.
pose_differences = []

for img_id, vggtx_img in vggtx_reconstruction.images.items():
    if img_id not in infinitevggt_reconstruction.images:
        continue
    infinitevggt_img = infinitevggt_reconstruction.images[img_id]

    # Get rotation and translation
    vggtx_R = vggtx_img.rotmat()
    vggtx_t = vggtx_img.tvec
    infinitevggt_R = infinitevggt_img.rotmat()
    infinitevggt_t = infinitevggt_img.tvec

    pose_differences.append({
        'image_id': img_id,
        'image_name': vggtx_img.name,
        # Rotation difference (Frobenius norm of the matrix difference)
        'R_diff': np.linalg.norm(vggtx_R - infinitevggt_R, 'fro'),
        # Translation difference (Euclidean distance)
        't_diff': np.linalg.norm(vggtx_t - infinitevggt_t),
    })

# Create DataFrame for analysis. Columns are passed explicitly so the frame
# has the expected schema even when no image IDs overlap.
pose_df = pd.DataFrame(
    pose_differences,
    columns=['image_id', 'image_name', 'R_diff', 't_diff'],
)

if pose_df.empty:
    # Without this guard the statistics below would be computed on nothing.
    print("\nNo image IDs are shared by the two reconstructions; nothing to compare.")
else:
    print(f"\nPose Differences (N={len(pose_df)}):")
    print(f"  Rotation (Frobenius norm):")
    print(f"    Mean: {pose_df['R_diff'].mean():.4f}")
    print(f"    Std: {pose_df['R_diff'].std():.4f}")
    print(f"    Min: {pose_df['R_diff'].min():.4f}")
    print(f"    Max: {pose_df['R_diff'].max():.4f}")

    print(f"\n  Translation (Euclidean distance):")
    print(f"    Mean: {pose_df['t_diff'].mean():.4f}")
    print(f"    Std: {pose_df['t_diff'].std():.4f}")
    print(f"    Min: {pose_df['t_diff'].min():.4f}")
    print(f"    Max: {pose_df['t_diff'].max():.4f}")

    # Show top 5 images with largest differences (ranked by translation)
    print(f"\nTop 5 images with largest pose differences:")
    top5 = pose_df.nlargest(5, 't_diff')[['image_name', 'R_diff', 't_diff']]
    print(top5.to_string(index=False))

## Comparison: Point Clouds

In [None]:
_rule = "=" * 70
print(_rule)
print("POINT CLOUD COMPARISON")
print(_rule)


def _xyz_array(reconstruction):
    """Collect the xyz coordinates of every 3D point into one array."""
    return np.array([pt.xyz for pt in reconstruction.points3D.values()])


# Extract 3D points
vggtx_points = _xyz_array(vggtx_reconstruction)
infinitevggt_points = _xyz_array(infinitevggt_reconstruction)

# Per-method statistics; the bounding-box extent is kept for the comparison
_bbox_extent = {}
for _label, _pts in (("VGGT-X", vggtx_points), ("InfiniteVGGT", infinitevggt_points)):
    print(f"\n{_label} Point Cloud:")
    print(f"  Number of points: {len(_pts):,}")
    _lo = _pts.min(axis=0)
    print(f"  Bounding box min: {_lo}")
    _hi = _pts.max(axis=0)
    print(f"  Bounding box max: {_hi}")
    print(f"  Mean position: {_pts.mean(axis=0)}")
    _bbox_extent[_label] = _hi - _lo

# Axis-aligned bounding-box size per method (max corner minus min corner)
vggtx_bbox_size = _bbox_extent["VGGT-X"]
infinitevggt_bbox_size = _bbox_extent["InfiniteVGGT"]

print(f"\nBounding Box Size Comparison:")
print(f"  VGGT-X: {vggtx_bbox_size}")
print(f"  InfiniteVGGT: {infinitevggt_bbox_size}")
print(f"  Difference: {np.abs(vggtx_bbox_size - infinitevggt_bbox_size)}")

## Visualization: Side-by-Side Point Clouds

In [None]:
import pyvista as pv

print("Loading point clouds for visualization...")

# Read the PLY files exported by the conversion cells
vggtx_pcd = pv.PolyData(str(vggtx_preproc_dir / vggtx_ply_filename))
infinitevggt_pcd = pv.PolyData(str(infinitevggt_preproc_dir / infinitevggt_ply_filename))

print(f"\nVGGT-X point cloud: {vggtx_pcd.n_points:,} points")
print(f"InfiniteVGGT point cloud: {infinitevggt_pcd.n_points:,} points")

# One row, two columns: VGGT-X on the left, InfiniteVGGT on the right
plotter = pv.Plotter(shape=(1, 2))
_panels = (("VGGT-X", vggtx_pcd), ("InfiniteVGGT", infinitevggt_pcd))
for _column, (_title, _cloud) in enumerate(_panels):
    plotter.subplot(0, _column)
    plotter.add_text(_title, font_size=14)
    plotter.add_mesh(_cloud, point_size=2, render_points_as_spheres=True)
    plotter.add_axes()

# Link cameras for synchronized viewing
plotter.link_views()

plotter.show()

## Performance Summary

In [None]:
print("="*70)
print("PERFORMANCE SUMMARY")
print("="*70)

# Timing and memory figures are produced by the pipeline cells. They are
# looked up defensively so the summary still renders (showing "n/a") when a
# pipeline cell was skipped or did not record a value, instead of raising
# NameError.
_num_images = len(image_paths)
_vggtx_time = globals().get("vggtx_inference_time")
_infinitevggt_time = globals().get("infinitevggt_inference_time")
_infinitevggt_peak_gb = globals().get("infinitevggt_peak_memory_gb")


def _format_seconds(seconds):
    """Format a duration in seconds, or 'n/a' when it was not recorded."""
    return "n/a" if seconds is None else f"{seconds:.2f}s"


def _format_per_frame(seconds):
    """Format seconds per image, or 'n/a' without a duration or any images."""
    if seconds is None or _num_images == 0:
        return "n/a"
    return f"{seconds / _num_images:.3f}s"


print(f"\nDataset:")
print(f"  Images: {_num_images}")
print(f"  Input directory: {image_dir}")

print(f"\nVGGT-X:")
print(f"  Inference time: {_format_seconds(_vggtx_time)}")
print(f"  Time per frame: {_format_per_frame(_vggtx_time)}")
print(f"  Cameras: {len(vggtx_reconstruction.cameras)}")
print(f"  Images registered: {len(vggtx_reconstruction.images)}")
print(f"  3D points: {len(vggtx_reconstruction.points3D):,}")
print(f"  Output: {vggtx_output_dir}")

print(f"\nInfiniteVGGT:")
print(f"  Inference time: {_format_seconds(_infinitevggt_time)}")
print(f"  Time per frame: {_format_per_frame(_infinitevggt_time)}")
if _infinitevggt_peak_gb is None:
    print("  Peak GPU memory: n/a")
else:
    print(f"  Peak GPU memory: {_infinitevggt_peak_gb:.2f} GB")
print(f"  Cameras: {len(infinitevggt_reconstruction.cameras)}")
print(f"  Images registered: {len(infinitevggt_reconstruction.images)}")
print(f"  3D points: {len(infinitevggt_reconstruction.points3D):,}")
print(f"  Output: {infinitevggt_output_dir}")

print(f"\nSpeedup:")
# Both durations must be present and positive for the ratio (and its
# reciprocal) to be defined.
if (
    _vggtx_time is not None
    and _infinitevggt_time is not None
    and _vggtx_time > 0
    and _infinitevggt_time > 0
):
    speedup = _vggtx_time / _infinitevggt_time
    if speedup > 1:
        print(f"  InfiniteVGGT is {speedup:.2f}x faster than VGGT-X")
    else:
        print(f"  VGGT-X is {1/speedup:.2f}x faster than InfiniteVGGT")
else:
    print("  n/a (timing was not recorded for both pipelines)")

print("="*70)

## Bundle Adjustment: Feature Matching

To test how bundle adjustment affects each method, we need to run the matching procedure to get tracks across images. This uses feature matching to create correspondences.

In [None]:
print("="*70)
print("FEATURE MATCHING FOR BUNDLE ADJUSTMENT")
print("="*70)

# This cell only reports on match outputs; it does not run matching itself.
# The match outputs are expected to include:
# - Feature keypoints for each image
# - Feature descriptors
# - Matches between image pairs
# - Confidence scores

# Imported for reference only; it is not called in this cell.
from nerfstudio.process_data.vggt_utils import _run_global_alignment

print("\nNote: Feature matching was already performed during global alignment.")
print("Match outputs are stored in the following variables:")
print("  - match_outputs (from InfiniteVGGT global alignment)")
print("\nTo use these matches for bundle adjustment, you can:")
print("  1. Extract tracks from match_outputs")
print("  2. Run pycolmap bundle adjustment with these tracks")
print("  3. Compare refined vs. original camera parameters")

# Report match statistics when a match_outputs variable exists in the session.
# NOTE(review): assumes match_outputs['matches'] is a sequence with one sized
# entry per image pair — the schema is not visible here, confirm.
_match_outputs = globals().get("match_outputs")
print(f"\nMatch statistics:")
if _match_outputs is None:
    # Say so explicitly rather than printing a header followed by nothing.
    print("  match_outputs is not defined in this session; no statistics available.")
else:
    _pair_matches = _match_outputs['matches']
    total_matches = sum(len(m) for m in _pair_matches)
    print(f"  Total feature matches: {total_matches:,}")
    if len(_pair_matches) > 0:
        print(f"  Average matches per pair: {total_matches / len(_pair_matches):.1f}")
    else:
        # Avoid dividing by zero when there are no image pairs
        print("  Average matches per pair: n/a (no image pairs)")

In [None]:
# Example: Extract tracks from matches for bundle adjustment
# This creates a track graph that can be used with pycolmap BA

def create_track_graph_from_matches(match_outputs, image_paths) -> dict:
    """Placeholder for building feature tracks from pairwise matches.

    No tracks are built yet: the function prints an outline of the intended
    implementation and returns an empty dictionary. Both arguments are accepted
    so the signature is ready for a real implementation, but neither is read.

    Args:
        match_outputs: Match data from the matching stage (currently unused).
        image_paths: Paths of the matched images (currently unused).

    Returns:
        A dictionary intended to map track_id -> [(image_idx, keypoint_idx), ...];
        currently always empty.
    """
    tracks = {}

    # This is a simplified version - in practice, you'd want to use
    # transitive closure to connect matches across multiple images
    print("Track creation would go here...")
    print("In a full implementation, you would:")
    print("  1. Build a graph of feature correspondences")
    print("  2. Find connected components (tracks)")
    print("  3. Associate each track with a 3D point")
    print("  4. Use these tracks in pycolmap bundle adjustment")

    return tracks

# Note: For actual bundle adjustment, you can use pycolmap's BA functions
# with the reconstruction objects we created:
#   - vggtx_reconstruction
#   - infinitevggt_reconstruction
#
# The printed example works on deep copies: a plain assignment would only
# alias the reconstruction, leaving nothing to compare the refined result to.

print("\nTo run bundle adjustment on the reconstructions:")
print("""\n
# Example code (check the calls against your installed pycolmap version):
import copy

import pycolmap

ba_options = pycolmap.BundleAdjustmentOptions()
ba_options.refine_focal_length = True
ba_options.refine_principal_point = False
ba_options.refine_extra_params = False

# Run BA on a copy of the VGGT-X reconstruction
vggtx_reconstruction_ba = copy.deepcopy(vggtx_reconstruction)
pycolmap.bundle_adjustment(vggtx_reconstruction_ba, ba_options)

# Run BA on a copy of the InfiniteVGGT reconstruction
infinitevggt_reconstruction_ba = copy.deepcopy(infinitevggt_reconstruction)
pycolmap.bundle_adjustment(infinitevggt_reconstruction_ba, ba_options)

# Then compare the camera parameters of the originals with the *_ba copies
""")

## Next Steps

### 1. Train Gaussian Splatting Models

You can now train Gaussian splatting models on both outputs:

```bash
# Train on VGGT-X output
ns-train splatfacto \
  --data /workspace/bicycle/vggtx_output/preproc \
  --output-dir /workspace/bicycle/vggtx_output/training

# Train on InfiniteVGGT output
ns-train splatfacto \
  --data /workspace/bicycle/infinitevggt_output/preproc \
  --output-dir /workspace/bicycle/infinitevggt_output/training
```

### 2. Compare Training Quality

After training, compare:
- PSNR, SSIM, LPIPS metrics
- Visual quality of renderings
- Training convergence speed
- Number of Gaussians

### 3. Bundle Adjustment Testing

Use the matching outputs to:
- Extract feature tracks
- Run bundle adjustment on both reconstructions
- Compare refined vs. initial camera parameters
- Evaluate impact on final rendering quality