# Download

In [None]:
# Download imports
!pip install objaverse --upgrade -q

import objaverse
import objaverse.xl as oxl

# Downloading annotations (everything is stored under the "objaverse" directory)
annotations = oxl.get_annotations(download_dir="objaverse")

# Downloading objects: a random subset of the entries whose source is Sketchfab.
# No random_state is passed, so a different subset is drawn on every run.
num_samples = 100
is_sketchfab = annotations["source"] == "sketchfab"
sketchfab_annotations = annotations[is_sketchfab]
sketchfab_samples = sketchfab_annotations.sample(n=num_samples)

oxl.download_objects(objects=sketchfab_samples, download_dir="objaverse", processes=2)

# Conversion

In [None]:
# Conversion imports
!pip -q install trimesh[easy] pyrender

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"

import trimesh
import pyrender
import numpy as np
from PIL import Image
from tqdm import tqdm
import traceback

# Conversion settings
# Folder that is searched recursively for .glb files
input_folder = "/content/objaverse/hf-objaverse-v1/glbs"
# Renders with the random orientation only
images_folder = "/content/pngs"
# Renders with the additional fixed rotation applied (same file names as in images_folder)
images_folder_rotated = "/content/pngs_rotated"

# Size of the rendered images in pixels
image_width = 256
image_height = 256
# Angle (in degrees) of the extra rotation applied before the second render
STATIC_DEGREES = 45
# Files larger than this are skipped by the conversion loop
max_glb_size_bytes = 100 * 1024 * 1024  # 100 MB

os.makedirs(images_folder, exist_ok=True)
os.makedirs(images_folder_rotated, exist_ok=True)

# Fixed transform applied between the first and the second render
static_radians = np.radians(STATIC_DEGREES)
# NOTE(review): the name says "vertical", but this is the x unit vector - confirm the intended axis
vertical_axis = [1, 0, 0]
rotation_matrix = trimesh.transformations.rotation_matrix(static_radians, vertical_axis)

def convert_glb_to_images(glb_path, output_path_original, output_path_rotated):
  """Render a .glb model twice and save both renders as RGBA images.

  The loaded scene is centred, given a random orientation and rendered to
  `output_path_original`. The module-level `rotation_matrix` is then applied on
  top and the scene is rendered again to `output_path_rotated`. The render size
  comes from the module-level `image_width` and `image_height`.

  Args:
    glb_path: Path of the .glb file to load.
    output_path_original: Destination of the randomly oriented render.
    output_path_rotated: Destination of the additionally rotated render.

  Returns:
    True when both images were written. False when the file did not load as a
    scene or when any step raised; in the latter case the traceback is printed
    and files written by this call are removed again.
  """
  renderer = None
  # Output paths this call has started writing; removed again if the call fails
  touched_paths = []

  try:
    # Loading .glb in scene
    scene = trimesh.load(glb_path, force="scene", process=False)
    if isinstance(scene, trimesh.Trimesh):
      scene = trimesh.Scene(scene)
    if not isinstance(scene, trimesh.Scene):
      return False

    # Fixing 2-channel (LA) textures if object has them
    for geometry in scene.geometry.values():
      visuals = getattr(geometry, "visual", None)
      if isinstance(visuals, trimesh.visual.texture.TextureVisuals):
        material = getattr(visuals, "material", None)
        texture = getattr(material, "baseColorTexture", None) if hasattr(material, "baseColorTexture") else getattr(material, "image", None)
        if isinstance(texture, Image.Image) and len(texture.getbands()) == 2:
          converted_texture = texture.convert("RGBA")
          if hasattr(material, "baseColorTexture"):
            material.baseColorTexture = converted_texture
          elif hasattr(material, "image"):
            material.image = converted_texture

    # Centering (best effort: a scene that cannot be centred is rendered as it is)
    if scene.geometry:
      try:
        scene.apply_translation(-scene.bounds.mean(axis=0))
      except Exception:
        pass

    # Applying random rotation
    random_angle = np.random.rand() * 2 * np.pi
    random_axis = trimesh.unitize(np.random.rand(3) - 0.5)
    random_rotation_matrix = trimesh.transformations.rotation_matrix(random_angle, random_axis)
    scene.apply_transform(random_rotation_matrix)

    # Trying to get or set a camera pose
    try:
      camera_pose = scene.camera_transform
    except Exception:
      scene.set_camera(angles=np.random.rand(3) * (np.pi / 4) - (np.pi / 8), fov=60)
      camera_pose = scene.camera_transform

    # Setting up offscreen renderer
    renderer = pyrender.OffscreenRenderer(viewport_width=image_width, viewport_height=image_height)

    # Rendering and saving an image from the scene
    def render_and_save(trimesh_scene, output_path):
      pyrender_scene = pyrender.Scene.from_trimesh_scene(
        trimesh_scene, ambient_light=[0.15, 0.15, 0.15], bg_color=[0, 0, 0, 0]
      )
      # The same captured camera pose is used for the camera and the light of both renders
      if not pyrender_scene.main_camera_node:
        pyrender_scene.add(
          pyrender.PerspectiveCamera(yfov=np.pi / 3.0, aspectRatio=image_width / image_height),
          pose=camera_pose
        )
      if not pyrender_scene.lights:
        pyrender_scene.add(
          pyrender.DirectionalLight(color=[1, 1, 1], intensity=300),
          pose=camera_pose
        )
      color, _ = renderer.render(pyrender_scene, flags=pyrender.RenderFlags.RGBA)
      touched_paths.append(output_path)
      Image.fromarray(color, "RGBA").save(output_path)

    # First image: random rotation
    render_and_save(scene, output_path_original)

    # Second image: with extra defined rotation
    scene.apply_transform(rotation_matrix)
    render_and_save(scene, output_path_rotated)

    return True

  except Exception:
    traceback.print_exc()
    # A failure after the first save would otherwise leave an image without its counterpart
    for touched_path in touched_paths:
      try:
        if os.path.exists(touched_path):
          os.remove(touched_path)
      except OSError:
        traceback.print_exc()
    return False

  finally:
    # Release the offscreen renderer on every exit path, not only on success
    if renderer is not None:
      try:
        renderer.delete()
      except Exception:
        traceback.print_exc()

# Finding all .glb files (recursive, case-insensitive extension match)
glb_files = [
  os.path.join(root, filename)
  for root, _, filenames in os.walk(input_folder)
  for filename in filenames
  if filename.lower().endswith(".glb")
]

print(f"Found {len(glb_files)} .glb files.")

success_count = 0
failure_count = 0

for glb_path in tqdm(glb_files, desc="Processing .glb files"):
  # Both renders share one file name; only the folder differs
  base_filename, _glb_extension = os.path.splitext(os.path.basename(glb_path))
  output_path_original = os.path.join(images_folder, f"{base_filename}.png")
  output_path_rotated = os.path.join(images_folder_rotated, f"{base_filename}.png")

  # Skipping large files (they are counted as failures)
  too_large = os.path.getsize(glb_path) > max_glb_size_bytes
  if too_large:
    failure_count += 1
    continue

  success = convert_glb_to_images(glb_path, output_path_original, output_path_rotated)
  if success:
    success_count += 1
  else:
    failure_count += 1

# Summary
print(f"\nDone: {success_count} succeeded, {failure_count} failed.")
print(f"Outputs saved to: '{images_folder}' and '{images_folder_rotated}'")

# Evaluation

In [None]:
# Evaluation imports
import os
import sys
import cv2
import torch
import numpy as np
from huggingface_hub import snapshot_download
from skimage.metrics import (
  peak_signal_noise_ratio as psnr_metric,
  structural_similarity as ssim_metric,
  mean_squared_error as mse_metric,
  normalized_root_mse as nrmse_metric
)

import pandas as pd
import torch
import PIL
from PIL import Image

try:
  !wget -nc https://github.com/DominykasDo/GMMTeamProject/raw/refs/heads/main/pipeline_zero1to3.py
  from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
except ImportError:
  raise Exception(f"'pipeline_zero1to3.py' not found.")

try:
  !pip install lpips
  import lpips
  lpips_alex = lpips.LPIPS(net="alex")
  LPIPS_AVAILABLE = True
except ImportError:
  LPIPS_AVAILABLE = False

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Placeholder for models
MODELS = {}

# Redefinitions (Since Colab could crash after lpips import)
STATIC_DEGREES = 45
images_folder = "/content/pngs"
images_folder_rotated = "/content/pngs_rotated"

# Data loading utilities
def load_image_rgba(image_path):
  """Read an image file and return its pixels as an RGBA NumPy array.

  Raises:
    Exception: If `image_path` does not exist.
  """
  if os.path.exists(image_path):
    rgba_image = Image.open(image_path).convert("RGBA")
    return np.array(rgba_image)
  raise Exception(f"Image {image_path} not found.")

def get_image_files(directory):
  """Return the sorted names of the entries in `directory` that end in ".png".

  The extension match is case-insensitive and only names (not full paths) are
  returned.

  Raises:
    Exception: If `directory` is not an existing directory.
  """
  if os.path.isdir(directory):
    png_names = filter(lambda entry: entry.lower().endswith(".png"), os.listdir(directory))
    return sorted(png_names)
  raise Exception(f"Directory {directory} not found.")

def convert_to_rgb_with_whitened_transparency(input_image_rgba):
  """Composite an image onto a white background and return it as an RGB PIL image.

  Args:
    input_image_rgba: Either a PIL image (any mode; it is converted to RGBA
      first) or an array that `Image.fromarray(..., mode="RGBA")` accepts.

  Returns:
    A PIL image in mode "RGB" in which transparent areas have been blended
    with white.
  """
  # Accept every PIL image, not only PngImageFile, and do not depend on the
  # PNG plugin module having been loaded already
  if isinstance(input_image_rgba, Image.Image):
    input_image = input_image_rgba
  else:
    input_image = Image.fromarray(input_image_rgba, mode="RGBA")

  # alpha_composite requires both operands to be RGBA
  if input_image.mode != "RGBA":
    input_image = input_image.convert("RGBA")

  white_bg = Image.new("RGBA", input_image.size, (255, 255, 255, 255))
  return Image.alpha_composite(white_bg, input_image).convert("RGB")

# Metrics calculation
def calculate_metrics_for_pair(predicted_image_rgba, ground_truth_image_rgba):
  """Compute similarity metrics between a predicted image and its ground truth.

  Both images are composited onto a white background and compared as 8-bit RGB.

  Args:
    predicted_image_rgba: Predicted image, or None.
    ground_truth_image_rgba: Ground truth image, or None.

  Returns:
    A dict with the keys "PSNR", "SSIM", "LPIPS", "MSE", "RMSE", "NRMSE" and
    "MAE". Every value is NaN when either input is None. "SSIM" is NaN when its
    computation fails or yields exactly 1.0. "LPIPS" is NaN when
    LPIPS_AVAILABLE is false or its computation fails.
  """
  metric_keys = ("PSNR", "SSIM", "LPIPS", "MSE", "RMSE", "NRMSE", "MAE")
  if predicted_image_rgba is None or ground_truth_image_rgba is None:
    return {key: float("nan") for key in metric_keys}

  # Images preprocessing
  predicted_image_rgb = convert_to_rgb_with_whitened_transparency(predicted_image_rgba)
  ground_truth_image_rgb = convert_to_rgb_with_whitened_transparency(ground_truth_image_rgba)
  pred_rgb = np.clip(predicted_image_rgb, 0, 255).astype(np.uint8)
  gt_rgb = np.clip(ground_truth_image_rgb, 0, 255).astype(np.uint8)

  # Computing basic metrics
  psnr_val = psnr_metric(gt_rgb, pred_rgb, data_range=255)
  mse_val = mse_metric(gt_rgb, pred_rgb)
  # RMSE is the square root of the MSE (on the 0-255 scale). Scaling the
  # euclidean-normalised NRMSE by 255 gives 255 * RMSE / rms(ground truth) instead.
  rmse_val = float(np.sqrt(mse_val))
  nrmse_val = nrmse_metric(gt_rgb, pred_rgb)
  mae_val = np.mean(np.abs(gt_rgb.astype(np.float32) - pred_rgb.astype(np.float32)))

  # SSIM: odd window of at most 7 pixels that still fits the smaller image side
  min_dim = min(gt_rgb.shape[:2])
  win_size = min(min_dim, 7)
  if win_size % 2 == 0:
    win_size -= 1
  try:
    ssim_val = ssim_metric(
      gt_rgb, pred_rgb,
      data_range=255,
      channel_axis=2,
      win_size=win_size,
      gaussian_weights=True
    )
    # Pixel-identical pairs are reported as invalid rather than as a perfect score
    if ssim_val == 1.0:
      ssim_val = float("nan")
  except Exception:
    ssim_val = float("nan")

  # LPIPS
  lpips_val = float("nan")
  if LPIPS_AVAILABLE:
    def to_tensor(img):
      # (H, W, 3) uint8 -> (1, 3, H, W) float32 scaled to [-1, 1]
      img_t = torch.from_numpy(img.astype(np.float32) / 255.0)
      img_t = img_t.permute(2, 0, 1).unsqueeze(0) * 2 - 1
      return img_t
    try:
      # Inference only: no autograd graph is needed
      with torch.no_grad():
        lpips_val = lpips_alex(to_tensor(pred_rgb), to_tensor(gt_rgb)).item()
    except Exception:
      lpips_val = float("nan")

  return {
    "PSNR": psnr_val,
    "SSIM": ssim_val,
    "LPIPS": lpips_val,
    "MSE": mse_val,
    "RMSE": rmse_val,
    "NRMSE": nrmse_val,
    "MAE": mae_val
  }

# def add_pixel_noise_test(image_rgba, num_pixels=10, noise_strength=10):
#   noisy_image = image_rgba.copy()
#   height, width, _ = noisy_image.shape
#   for _ in range(num_pixels):
#     y = np.random.randint(0, height)
#     x = np.random.randint(0, width)
#     noise = np.random.randint(-noise_strength, noise_strength + 1, size=4)  # RGBA
#     noisy_image[y, x] = np.clip(noisy_image[y, x] + noise, 0, 255)
#   return noisy_image.astype(np.uint8)

# Main evaluation loop
def _prepare_prediction(prediction, target_hw, model_name, image_name):
  """Validate a model output and coerce it to a uint8 RGBA array of size `target_hw`."""
  if not isinstance(prediction, np.ndarray):
    raise Exception(f"{model_name} returned non-array for {image_name}")

  if prediction.ndim == 2:
    prediction = cv2.cvtColor(prediction, cv2.COLOR_GRAY2RGBA)
  elif prediction.ndim == 3 and prediction.shape[2] == 3:
    # RGB output: append a fully opaque alpha channel
    alpha_channel = np.full((*prediction.shape[:2], 1), 255, dtype=np.uint8)
    prediction = np.concatenate((prediction, alpha_channel), axis=2)

  # Checking ndim first keeps arrays that are not (H, W, C) from failing with an IndexError
  if prediction.ndim != 3 or prediction.shape[2] != 4:
    raise Exception(f"{model_name} output shape not RGBA: {prediction.shape}")

  prediction = np.clip(prediction, 0, 255).astype(np.uint8)

  if prediction.shape[:2] != target_hw:
    print(f"Resizing prediction from {prediction.shape[:2]} to {target_hw}.")
    # cv2.resize expects the target size as (width, height)
    prediction = cv2.resize(prediction, (target_hw[1], target_hw[0]), interpolation=cv2.INTER_AREA)

  return prediction


def run_evaluation():
  """Evaluate every model registered in MODELS on all image pairs.

  For each PNG in `images_folder` the model's prediction function is called
  with the input image path and its output is compared with the image of the
  same name in `images_folder_rotated`. An image contributes to the statistics
  of a model only when all of its metrics are finite.

  Returns:
    A DataFrame indexed by model name with one "<metric>_avg" and one
    "<metric>_median" column per metric; an empty DataFrame when there are no
    input images or no registered models.

  Raises:
    Exception: If a ground truth image is missing or a model returns something
      that cannot be interpreted as an image array.
  """
  image_files = get_image_files(images_folder)
  if not image_files:
    print(f"No PNG images found in '{images_folder}'. Ensure ground truth images exist in '{images_folder_rotated}'.")
    return pd.DataFrame()

  if not MODELS:
    print("No models found in the MODELS variable.")
    return pd.DataFrame()

  all_results = {}  # Average and Median results for each model for these metrics:
  metric_names = ["PSNR", "SSIM", "LPIPS", "MSE", "RMSE", "NRMSE", "MAE"]

  for model_name, predict_function in MODELS.items():
    print(f"\n--- Evaluating Model: {model_name} ---")

    metric_sums = {m: 0.0 for m in metric_names}
    valid_counts = {m: 0 for m in metric_names}
    metric_values = {m: [] for m in metric_names}
    total_images = 0

    for image_name in image_files:
      input_image_path = os.path.join(images_folder, image_name)
      ground_truth_image_path = os.path.join(images_folder_rotated, image_name)

      # load_image_rgba raises when the ground truth file does not exist
      ground_truth_rgba = load_image_rgba(ground_truth_image_path)

      print(f"Processing {image_name}...")
      total_images += 1

      predicted_rgba = _prepare_prediction(
        predict_function(input_image_path),
        ground_truth_rgba.shape[:2],
        model_name,
        image_name
      )
      metrics = calculate_metrics_for_pair(predicted_rgba, ground_truth_rgba)

      metric_str = ", ".join(f"{m}={metrics[m]:.4f}" if np.isfinite(metrics[m]) else f"{m}=NaN" for m in metric_names)
      print(f"{image_name} Metrics: {metric_str}")

      # One invalid metric excludes the image from all statistics of this model
      if not all(np.isfinite(metrics.get(m, float("nan"))) for m in metric_names):
        print(f"Skipping {image_name} due to found invalid metric.")
        continue

      for m in metric_names:
        val = metrics[m]
        metric_sums[m] += val
        metric_values[m].append(val)
        valid_counts[m] += 1

    avg_metrics = {
      m: (metric_sums[m] / valid_counts[m]) if valid_counts[m] > 0 else float("nan")
      for m in metric_names
    }

    median_metrics = {
      m: float(np.median(metric_values[m])) if metric_values[m] else float("nan")
      for m in metric_names
    }

    all_results[model_name] = {f"{m}_avg": avg_metrics[m] for m in metric_names}
    all_results[model_name].update({f"{m}_median": median_metrics[m] for m in metric_names})

    print(f"Average and Median for {model_name}:")
    for m in metric_names:
      print(f"  {m}: avg={avg_metrics[m]:.4f}, median={median_metrics[m]:.4f} (valid {valid_counts[m]}/{total_images})")

  results_df = pd.DataFrame.from_dict(all_results, orient="index")
  print("\n--- Overall Evaluation Results ---")
  print(results_df)
  return results_df

### --- Models' prediction functions' definitions

### Model's prediction function template for adding new models:
# def new_nvs_model_predict(input_img_path: Path to the input image):
#   Processes an input image and returns a novel view synthesis prediction.
#
#   Returns: np.ndarray
#     The predicted image as a NumPy array (H, W, 4) in RGBA format, dtype=np.uint8.

# --- Passthrough for comparison with models (returns input)
def passthrough_predict(input_img_path):
  """Baseline "model": return the input image itself as a uint8 RGBA array.

  Returns None when the image loader returns None.
  """
  loaded_rgba = load_image_rgba(input_img_path)
  if loaded_rgba is not None:
    return loaded_rgba.copy().astype(np.uint8)
  return None

# --- Zero123-105000 model definition
# Pipeline instance shared by all Zero123-105000 predictions; created on first use
zero123_105000_pipeline = None

def load_zero123_105000_model():
  """Download and initialise the Zero123-105000 pipeline unless it already exists.

  The checkpoint is placed in ./model_cache/zero123_105000, loaded in float16,
  moved to `device` and stored in the global `zero123_105000_pipeline`.
  """
  global zero123_105000_pipeline

  # Nothing to do when an earlier call already created the pipeline
  if zero123_105000_pipeline is not None:
    return

  cache_root = "./model_cache/"
  os.makedirs(cache_root, exist_ok=True)
  checkpoint_dir = os.path.join(cache_root, "zero123_105000")
  snapshot_download("kxic/zero123-105000", local_dir=checkpoint_dir)

  pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.float16,
    use_safetensors=False
  )
  pipeline.enable_vae_tiling()
  pipeline.enable_attention_slicing()
  zero123_105000_pipeline = pipeline.to(device)

def zero123_105000_predict(input_img_path):
  """Predict the rotated view of an image with the Zero123-105000 pipeline.

  The input is flattened onto a white background and resized to 256x256 before
  inference. The prediction is also saved under /content/results/zero123_105000
  using the input's file name.

  Returns:
    The predicted image as a uint8 RGB NumPy array.
  """
  load_zero123_105000_model()
  pipeline = zero123_105000_pipeline
  if pipeline is None:
    raise Exception("'zero123_105000_pipeline' is None")

  side = 256
  flattened_input = convert_to_rgb_with_whitened_transparency(Image.open(input_img_path))
  model_input = flattened_input.resize((side, side), Image.LANCZOS)

  # NOTE(review): meaning and sign of the pose components are defined by pipeline_zero1to3 - confirm
  target_pose = [float(-STATIC_DEGREES), 0.0, 0.0]

  print(f"Zero123 (105000) predicting for {input_img_path} with pose: {target_pose}")

  pipeline_output = pipeline(
    input_imgs=[model_input],
    prompt_imgs=[model_input],
    poses=[target_pose],
    height=side,
    width=side,
    guidance_scale=3.0,
    num_images_per_prompt=1,
    num_inference_steps=40
  )
  prediction_rgb = pipeline_output.images[0].convert("RGB")
  prediction_array = np.array(prediction_rgb).astype(np.uint8)

  # Keep a copy of every prediction for later inspection
  results_folder = "/content/results/zero123_105000"
  os.makedirs(results_folder, exist_ok=True)
  prediction_rgb.save(f"{results_folder}/{os.path.basename(input_img_path)}")
  return prediction_array
### --- End of Zero123-105000 model definition

# --- Zero123-165000 model definition
# Pipeline instance shared by all Zero123-165000 predictions; created on first use
zero123_165000_pipeline = None

def load_zero123_165000_model():
  """Download and initialise the Zero123-165000 pipeline unless it already exists.

  The checkpoint is placed in ./model_cache/zero123_165000, loaded in float16,
  moved to `device` and stored in the global `zero123_165000_pipeline`.
  """
  global zero123_165000_pipeline

  # Nothing to do when an earlier call already created the pipeline
  if zero123_165000_pipeline is not None:
    return

  cache_root = "./model_cache/"
  os.makedirs(cache_root, exist_ok=True)
  checkpoint_dir = os.path.join(cache_root, "zero123_165000")
  snapshot_download("kxic/zero123-165000", local_dir=checkpoint_dir)

  pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.float16,
    use_safetensors=False
  )
  pipeline.enable_vae_tiling()
  pipeline.enable_attention_slicing()
  zero123_165000_pipeline = pipeline.to(device)

def zero123_165000_predict(input_img_path):
  """Predict the rotated view of an image with the Zero123-165000 pipeline.

  The input is flattened onto a white background and resized to 256x256 before
  inference. The prediction is also saved under /content/results/zero123_165000
  using the input's file name.

  Returns:
    The predicted image as a uint8 RGB NumPy array.
  """
  load_zero123_165000_model()
  pipeline = zero123_165000_pipeline
  if pipeline is None:
    raise Exception("'zero123_165000_pipeline' is None")

  side = 256
  flattened_input = convert_to_rgb_with_whitened_transparency(Image.open(input_img_path))
  model_input = flattened_input.resize((side, side), Image.LANCZOS)

  # NOTE(review): meaning and sign of the pose components are defined by pipeline_zero1to3 - confirm
  target_pose = [float(-STATIC_DEGREES), 0.0, 0.0]

  print(f"Zero123 (165000) predicting for {input_img_path} with pose: {target_pose}")

  pipeline_output = pipeline(
    input_imgs=[model_input],
    prompt_imgs=[model_input],
    poses=[target_pose],
    height=side,
    width=side,
    guidance_scale=3.0,
    num_images_per_prompt=1,
    num_inference_steps=40
  )
  prediction_rgb = pipeline_output.images[0].convert("RGB")
  prediction_array = np.array(prediction_rgb).astype(np.uint8)

  # Keep a copy of every prediction for later inspection
  results_folder = "/content/results/zero123_165000"
  os.makedirs(results_folder, exist_ok=True)
  prediction_rgb.save(f"{results_folder}/{os.path.basename(input_img_path)}")
  return prediction_array
### --- End of Zero123-165000 model definition

# --- Zero123-XL model definition
# Pipeline instance shared by all Zero123-XL predictions; created on first use
zero123_xl_pipeline = None

def load_zero123_xl_model():
  """Download and initialise the Zero123-XL pipeline unless it already exists.

  The checkpoint is placed in ./model_cache/zero123_xl, loaded in float16,
  moved to `device` and stored in the global `zero123_xl_pipeline`.
  """
  global zero123_xl_pipeline

  # Nothing to do when an earlier call already created the pipeline
  if zero123_xl_pipeline is not None:
    return

  cache_root = "./model_cache/"
  os.makedirs(cache_root, exist_ok=True)
  checkpoint_dir = os.path.join(cache_root, "zero123_xl")
  snapshot_download("kxic/zero123-xl", local_dir=checkpoint_dir)

  pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.float16,
    use_safetensors=False
  )
  pipeline.enable_vae_tiling()
  pipeline.enable_attention_slicing()
  zero123_xl_pipeline = pipeline.to(device)

def zero123_xl_predict(input_img_path):
  """Predict the rotated view of an image with the Zero123-XL pipeline.

  The input is flattened onto a white background and resized to 256x256 before
  inference. The prediction is also saved under /content/results/zero123_xl
  using the input's file name.

  Returns:
    The predicted image as a uint8 RGB NumPy array.
  """
  load_zero123_xl_model()
  pipeline = zero123_xl_pipeline
  if pipeline is None:
    raise Exception("'zero123_xl_pipeline' is None")

  side = 256
  flattened_input = convert_to_rgb_with_whitened_transparency(Image.open(input_img_path))
  model_input = flattened_input.resize((side, side), Image.LANCZOS)

  # NOTE(review): meaning and sign of the pose components are defined by pipeline_zero1to3 - confirm
  target_pose = [float(-STATIC_DEGREES), 0.0, 0.0]

  print(f"Zero123-XL predicting for {input_img_path} with pose: {target_pose}")

  pipeline_output = pipeline(
    input_imgs=[model_input],
    prompt_imgs=[model_input],
    poses=[target_pose],
    height=side,
    width=side,
    guidance_scale=3.0,
    num_images_per_prompt=1,
    num_inference_steps=40
  )
  prediction_rgb = pipeline_output.images[0].convert("RGB")
  prediction_array = np.array(prediction_rgb).astype(np.uint8)

  # Keep a copy of every prediction for later inspection
  results_folder = "/content/results/zero123_xl"
  os.makedirs(results_folder, exist_ok=True)
  prediction_rgb.save(f"{results_folder}/{os.path.basename(input_img_path)}")
  return prediction_array
### --- End of Zero123-XL model definition

# --- Stable-Zero123 model definition
# Pipeline instance shared by all Stable-Zero123 predictions; created on first use
stable_zero123_pipeline = None

def load_stable_zero123_model():
  """Download and initialise the Stable-Zero123 pipeline unless it already exists.

  The checkpoint is placed in ./model_cache/stable_zero123, loaded in float16,
  moved to `device` and stored in the global `stable_zero123_pipeline`.
  """
  global stable_zero123_pipeline

  # Nothing to do when an earlier call already created the pipeline
  if stable_zero123_pipeline is not None:
    return

  cache_root = "./model_cache/"
  os.makedirs(cache_root, exist_ok=True)
  checkpoint_dir = os.path.join(cache_root, "stable_zero123")
  snapshot_download("kxic/stable-zero123", local_dir=checkpoint_dir)

  pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.float16,
    use_safetensors=False
  )
  pipeline.enable_vae_tiling()
  pipeline.enable_attention_slicing()
  stable_zero123_pipeline = pipeline.to(device)

def stable_zero123_predict(input_img_path):
  """Predict the rotated view of an image with the Stable-Zero123 pipeline.

  The input is flattened onto a white background and resized to 256x256 before
  inference. The prediction is also saved under /content/results/stable_zero123
  using the input's file name.

  Returns:
    The predicted image as a uint8 RGB NumPy array.
  """
  load_stable_zero123_model()
  pipeline = stable_zero123_pipeline
  if pipeline is None:
    raise Exception("'stable_zero123_pipeline' is None")

  side = 256
  flattened_input = convert_to_rgb_with_whitened_transparency(Image.open(input_img_path))
  model_input = flattened_input.resize((side, side), Image.LANCZOS)

  # NOTE(review): meaning and sign of the pose components are defined by pipeline_zero1to3 - confirm
  target_pose = [float(-STATIC_DEGREES), 0.0, 0.0]

  print(f"Stable Zero123 predicting for {input_img_path} with pose: {target_pose}")

  pipeline_output = pipeline(
    input_imgs=[model_input],
    prompt_imgs=[model_input],
    poses=[target_pose],
    height=side,
    width=side,
    guidance_scale=3.0,
    num_images_per_prompt=1,
    num_inference_steps=40
  )
  prediction_rgb = pipeline_output.images[0].convert("RGB")
  prediction_array = np.array(prediction_rgb).astype(np.uint8)

  # Keep a copy of every prediction for later inspection
  results_folder = "/content/results/stable_zero123"
  os.makedirs(results_folder, exist_ok=True)
  prediction_rgb.save(f"{results_folder}/{os.path.basename(input_img_path)}")
  return prediction_array
### --- End of Stable-Zero123 model definition

# Models register (insertion order is the evaluation order)
MODELS.update({
  "Passthrough": passthrough_predict,
  "Zero123_105000": zero123_105000_predict,
  "Zero123_165000": zero123_165000_predict,
  "Zero123_XL": zero123_xl_predict,
  "Stable_Zero123": stable_zero123_predict,
})

print(
  f"Currently testing rotation degrees: {STATIC_DEGREES}",
  f"Input images expected in: {images_folder}",
  f"Ground truth images expected in: {images_folder_rotated}",
  "----------------------------------------",
  sep="\n"
)

evaluation_results_df = run_evaluation()

# CSV (only written when the evaluation produced results)
has_results = not evaluation_results_df.empty
if has_results:
  evaluation_results_df.to_csv("nvs_evaluation_results.csv")
  print("\nResults saved to nvs_evaluation_results.csv")