### Initialization

In [2]:
from pathlib import Path
import open3d as o3d
import os
import sys
from pathlib import Path

# Add the parent directory to sys.path so 'src' can be imported
sys.path.append(str(Path.cwd().parent))

from pytorch_lightning import seed_everything

from src.dataset_utils import (
    get_singleview_data_trt,
    get_singleview_data,
    get_multiview_data,
    get_voxel_data_json,
    get_image_transform_latent_model,
    get_pointcloud_data,
    get_mv_dm_data,
    get_sv_dm_data,
    get_sketch_data
)
from src.model_utils import Model
from src.mvdream_utils import load_mvdream_model
import argparse
from PIL import Image


from comet_ml import start
from comet_ml.integration.pytorch import log_model
from comet_ml.api import API

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

import time

import Optim_Utils
from Optim_Utils import Optim_Visualizations
import torch

import boto3


  _TORCH_CUSTOM_FWD = amp.custom_fwd(cast_inputs=torch.float16)
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):
  def backward(ctx, grad_output):


In [None]:

def simplify_mesh(obj_path, target_num_faces=1000):
    mesh = o3d.io.read_triangle_mesh(obj_path)
    simplified_mesh = mesh.simplify_quadric_decimation(target_num_faces)
    o3d.io.write_triangle_mesh(obj_path, simplified_mesh)


def generate_3d_object(
    model,
    data,
    data_idx,
    scale,
    diffusion_rescale_timestep,
    experiment,
    save_dir="examples",
    output_format="obj",
    target_num_faces=None,
    seed=42,
    
):
    # Set seed
    seed_everything(seed, workers=True)

    model.set_inference_fusion_params(scale, diffusion_rescale_timestep)
    low, high = model.forward(
        data, data_idx, experiment,  output_format=output_format
    )

    return low,high    

In [None]:
os.environ["XFORMERS_DISABLED"] = "1"



model_name = 'ADSKAILab/WaLa-SV-1B'
images_path = Path('examples/single_view/')
single_image = ['examples/single_view/table.png']

output_dir = 'examples/Test_Gen'
output_format = 'obj'
target_num_faces = 25000
scale = 1.8
seed = 42
diffusion_rescale_timestep = 5

print(f"Loading model")


model = Model.from_pretrained(pretrained_model_name_or_path=model_name)
image_transform = get_image_transform_latent_model()
model.set_inference_fusion_params(
        scale, diffusion_rescale_timestep
    )
experiment = None

### Test 1 - 3 images

In [None]:
def run_inference(image_path,save_dir,model,experiment= None):
    lh_dic = {}
    for idx, image_path in enumerate(image_path.iterdir()):  # Enumerate to get the index
        timer1 = time.time()
        print(f"Processing image: {image_path}")
        data = get_singleview_data(
            image_file=Path(image_path),
            image_transform=image_transform,
            device=model.device,
            image_over_white=False,
        )
        data_idx = 0
        save_dir = Path(output_dir) 
        base_name = os.path.basename(image_path)
        image_name = os.path.splitext(base_name)[0]  
        model.set_inference_fusion_params(
            scale, diffusion_rescale_timestep
        )
        low, high = generate_3d_object(
            model,
            data,
            data_idx,
            scale,
            diffusion_rescale_timestep,
            experiment,
            output_format,
            target_num_faces,
            seed,
        )
        delta = time.time() - timer1
        print('Total Inference time', delta)

        try:
            delta = time.time() - timer1
            experiment.log_metric("Default Delta", delta)
            print('Total Inference time', delta)
        except:
            pass
        
        lh_dic[image_name] = (low,high)

    return lh_dic
lh_dic = run_inference(images_path,output_dir,model)


## Dataset Extraction and Utilization for Benchmarking

#### Utils

In [None]:
source_folder = os.path.expanduser('/Google_Dataset_Outputs')
check_folder = Path("/home/rayhub-user/Optim-WaLa/Benchmark_quality/Original")
destination_folder = Path("/home/rayhub-user/Optim-WaLa/Benchmark_quality/Tensor_RT")

Optim_Utils.find_and_move_matching_files(source_folder, check_folder, destination_folder, TRT = True)


[DEBUG] Files in check_folder (/home/rayhub-user/Optim-WaLa/Benchmark_quality/Original): 20 found
[DEBUG] Files in source_folder (/Google_Dataset_Outputs): 999 found
Moved: 2_of_Jenga_Classic_Game_trt.obj from /Google_Dataset_Outputs to /home/rayhub-user/Optim-WaLa/Benchmark_quality/Tensor_RT
Moved: 50_BLOCKS_trt.obj from /Google_Dataset_Outputs to /home/rayhub-user/Optim-WaLa/Benchmark_quality/Tensor_RT
Moved: 5_HTP_trt.obj from /Google_Dataset_Outputs to /home/rayhub-user/Optim-WaLa/Benchmark_quality/Tensor_RT
[DEBUG] No match for: ASICS_GELAce_Pro_Pearl_WhitePink_trt.obj (match_name: ASICS_GELAce_Pro_Pearl_WhitePink.obj)
[DEBUG] No match for: ASICS_GELBlur33_20_GS_BlackWhiteSafety_Orange_trt.obj (match_name: ASICS_GELBlur33_20_GS_BlackWhiteSafety_Orange.obj)
[DEBUG] No match for: Asus_M5A78LMUSB3_Motherboard_Micro_ATX_Socket_AM3_trt.obj (match_name: Asus_M5A78LMUSB3_Motherboard_Micro_ATX_Socket_AM3.obj)
[DEBUG] No match for: Asus_M5A99FX_PRO_R20_Motherboard_ATX_Socket_AM3_trt.obj (m

In [None]:
source_folder = os.path.expanduser('/Google_Dataset_Outputs')
check_folder = Path("/home/rayhub-user/Optim-WaLa/Benchmark_quality/Original")
destination_folder = Path("/home/rayhub-user/Optim-WaLa/Benchmark_quality/Tensor_RT")

# Check the contents of each folder
Optim_Utils.check_folder_contents(source_folder)
Optim_Utils.check_folder_contents(check_folder)
Optim_Utils.check_folder_contents(destination_folder)

Contents of /Google_Dataset_Outputs:
- 2_of_Jenga_Classic_Game_trt.obj
- 50_BLOCKS_trt.obj
- 5_HTP_trt.obj
- ASICS_GELAce_Pro_Pearl_WhitePink_trt.obj
- ASICS_GELBlur33_20_GS_BlackWhiteSafety_Orange_trt.obj
- Asus_M5A78LMUSB3_Motherboard_Micro_ATX_Socket_AM3_trt.obj
- Asus_M5A99FX_PRO_R20_Motherboard_ATX_Socket_AM3_trt.obj
- Asus_Sabertooth_990FX_20_Motherboard_ATX_Socket_AM3_trt.obj
- Asus_Sabertooth_Z97_MARK_1_Motherboard_ATX_LGA1150_Socket_trt.obj
- BlackBlack_Nintendo_3DSXL_trt.obj
- Black_Decker_Stainless_Steel_Toaster_4_Slice_trt.obj
- Black_Elderberry_Syrup_54_oz_Gaia_Herbs_trt.obj
- BlueBlack_Nintendo_3DSXL_trt.obj
- Blue_Jasmine_Includes_Digital_Copy_UltraViolet_DVD_trt.obj
- Cootie_Game_trt.obj
- Dell_Series_9_Color_Ink_Cartridge_MK993_High_Yield_trt.obj
- Frozen_Scrabble_Jr_trt.obj
- Google_Cardboard_Original_package_trt.obj
- HP_Card_Invitation_Kit_trt.obj
- Hyaluronic_Acid_trt.obj
- LEGO_Creationary_Game_trt.obj
- LEGO_Creationary_Game_ZJa163wlWp2_trt.obj
- Lenovo_Yoga_2_11

### Dataset download and extraction

In [None]:
zip_dir = '/Google_Dataset'         # Folder with zip files
output_dir = '/Google_Extracted_Dataset/'  # Where to save extracted thumbnails
os.makedirs(output_dir, exist_ok=True)

Optim_Utils.extract_first_thumbnail('/Google_Dataset/Dataset', '/Google_Dataset/Dataset/Single_Images')

In [None]:
folder1 = "/home/ray/WaLa/Benchmark_quality/Original"
folder2 = "/home/ray/WaLa/Benchmark_quality/Simplified"
Optim_Utils.compare_file_sizes(folder1, folder2)

### Dataset Inference Tests

In [None]:
experiment = start(
  api_key="mqrUAXjKBRul24uX6pxR3gRHX*eyJiYXNlVXJsIjoiaHR0cHM6Ly9jb21ldC5kZXYuY2xvdWRvcy5hdXRvZGVzay5jb20ifQ",
  project_name="wala-time-checks",
  workspace="alessandro-giuliano")


  # Define the hyperparameters
hyperparameters = {
    "model_name": "ADSKAILab/WaLa-SV-1B",
    "images_path": str(Path("examples/single_view/")),
    "single_image": ["examples/single_view/table.png"],
    "output_dir": "examples/Test_Gen",
    "output_format": "obj",
    "target_num_faces": None,
    "scale": 1.8,
    "seed": 42,
    "diffusion_rescale_timestep": 5,
    "Mcubes-Optimized": True,
    "Object-3D_simplified_mesh": 'Quadratic (Post)',
}

# Log the hyperparameters to Comet
experiment.log_parameters(hyperparameters)

In [None]:
### Scanned Objects by Google Research
google_test_img_path = Path(os.path.expanduser('~/test_images'))  # Expands '~' to the full home directory path
output_dir = os.path.expanduser('~/Obj_generated')

model = Model.from_pretrained(pretrained_model_name_or_path=model_name)
image_transform = get_image_transform_latent_model()

run_inference(google_test_img_path,output_dir,model, experiment)

experiment.end()

### Data Visualization

In [None]:
api_key="mqrUAXjKBRul24uX6pxR3gRHX*eyJiYXNlVXJsIjoiaHR0cHM6Ly9jb21ldC5kZXYuY2xvdWRvcy5hdXRvZGVzay5jb20ifQ"
comet_api = API(api_key=api_key)
project_name="wala-time-checks"
workspace="alessandro-giuliano"
experiment_key = '1bf42d8c3ee545628968903512893acf'
experiment_keys = {'Benchmark' : '1bf42d8c3ee545628968903512893acf',
                    'mcubes_fast_write' : '223ac82d90fc4292a33c60cc93e9f929',
                    'Simplified Vertex' : '7fa0d3b99b0d48588f11da362ed52aca',
                    'Simplified Quadratic (Inside)' : '86c28056aca843ef9f450d948beb7c61',
                    'Simplified Quadratic (Outside)' : 'ab20f6ca2c98422399ebae16ad717dff'}
                    
                    
# Retrieve the experiment
experiment = comet_api.get_experiment(workspace, project_name, experiment_keys['Benchmark'])



all_metric_data, metric_names, metric_display_names = Optim_Visualizations.retrieve_and_plot(experiment,plotting = True)


In [None]:

metric_statistics, default_delta_values = Optim_Visualizations.get_stats(experiment, metric_names, display = True)


In [None]:
Optim_Visualizations.plot_metric_pie(metric_statistics,metric_display_names)

In [None]:
Optim_Visualizations.plot_metric_heatmap(
    metric_statistics,
    metric_display_names,
    default_delta_values,
    include_min=True,
    include_max=True,
    include_median=True
)

#### Display Comparisons

In [None]:

# Retrieve both experiments
experiment1 = comet_api.get_experiment(workspace, project_name, experiment_keys['Benchmark'])
experiment2 = comet_api.get_experiment(workspace, project_name, experiment_keys['mcubes_fast_write'])


# Compare "export obj time"
Optim_Visualizations.compare_metric_distribution(
    experiment1, experiment2, 
    metric_name="export obj time", 
    label1="WaLa Original", 
    label2="WaLa MCubes Fast Write", 
    display_name="Object File Writing Time"
)

# Compare "Default Delta" (total runtime)
Optim_Visualizations.compare_metric_distribution(
    experiment1, experiment2, 
    metric_name="Default Delta", 
    label1="WaLa Original", 
    label2="WaLa MCubes Fast Write",
    display_name="Total Runtime"
)



### Custom Optimization Implementations

#### Mcubes object export/ write object to file

In [None]:

# Generate a large random mesh for testing
num_vertices = 100_000
num_faces = 200_000

vertices = np.random.rand(num_vertices, 3)
triangles = np.random.randint(0, num_vertices, size=(num_faces, 3))


# Test and compare
filename1 = "test_original.obj"
filename2 = "test_optimized.obj"

t1 = Optim_Utils.benchmark_export(Optim_Utils.export_obj_fast, vertices, triangles, filename1)
t2 = Optim_Utils.benchmark_export(Optim_Utils.export_obj_original, vertices, triangles, filename2)

print(f"Original export_obj_original time: {t2:.4f} seconds")
print(f"Optimized export_obj time: {t1:.4f} seconds")
print(f"Speedup: {t2/t1:.2f}x faster")

### Straight Through Visualization with Forward

In [None]:
from src.diffusion_modules.dwt import DWTInverse3d
import mcubes
args_max_depth = 3
args_wavelet = 'bior6.8'
args_padding_mode = 'constant'
args_resolution = 256


def save_visualization_obj(image_name, obj_path, samples):
        """Save a visualization object."""
        low, highs = samples

        dwt_inverse_3d = DWTInverse3d(args_max_depth, args_wavelet,args_padding_mode )
        sdf_recon = dwt_inverse_3d((low, highs))
        vertices, triangles = mcubes.marching_cubes(
            sdf_recon.cpu().detach().numpy()[0, 0], 0.0
        )

        vertices = (vertices / args_resolution) * 2.0 - 1.0
        triangles = triangles[:, ::-1]
        mcubes.export_obj(vertices, triangles, obj_path)
     
      
def display_visualization_3d(image_name, samples, output_dir):
    """Generate and save 3D visualization as multiple viewpoint images."""
    import open3d as o3d
    import numpy as np
    import mcubes
    from pathlib import Path

    # Create output directory for renders
    render_dir = Path(output_dir) / "renders" / image_name
    render_dir.mkdir(parents=True, exist_ok=True)

    # Get samples
    low, highs = samples

    # Reconstruct 3D object
    dwt_inverse_3d = DWTInverse3d(args_max_depth, args_wavelet, args_padding_mode)
    sdf_recon = dwt_inverse_3d((low, highs))
    vertices, triangles = mcubes.marching_cubes(
        sdf_recon.cpu().detach().numpy()[0, 0], 0.0
    )

    # Normalize vertices
    vertices = (vertices / args_resolution) * 2.0 - 1.0
    triangles = triangles[:, ::-1]

    # Create Open3D mesh
    mesh = o3d.geometry.TriangleMesh()
    mesh.vertices = o3d.utility.Vector3dVector(vertices)
    mesh.triangles = o3d.utility.Vector3iVector(triangles)
    mesh.compute_vertex_normals()

    # Create offscreen renderer
    render = o3d.visualization.rendering.OffscreenRenderer(800, 600)
    mat = o3d.visualization.rendering.MaterialRecord()
    mat.shader = 'defaultLit'
    render.scene.add_geometry("mesh", mesh, mat)

    # Render from multiple viewpoints
    viewpoints = [
        ([0, 0, 1], [0, 1, 0]),  # front
        ([1, 0, 0], [0, 1, 0]),  # right
        ([0, 0, -1], [0, 1, 0]), # back
        ([-1, 0, 0], [0, 1, 0]), # left
        ([0, 1, 0], [0, 0, 1]),  # top
    ]

    for i, (eye, up) in enumerate(viewpoints):
        render.setup_camera(60.0, [0, 0, 0], eye, up)
        img = render.render_to_image()
        img_path = render_dir / f"view_{i}.png"
        o3d.io.write_image(str(img_path), img)

    return render_dir


In [None]:
temp_save_path = '/home/ray/WaLa/examples/Test_Gen'
save_dir = Path(temp_save_path)
lh_dic = run_inference(images_path,output_dir,model)

for image_name, samples_tuple in lh_dic.items(): # Use .items() to get key-value pairs
    obj_path = os.path.join(save_dir, f"{image_name}.obj")
    low_pred, highs_pred = samples_tuple # Unpack the tuple from the dictionary value
      

    save_visualization_obj( image_name,
        obj_path=obj_path, samples=(low_pred, highs_pred)
    )
    

## ONNX Optimization

In [None]:
import torch.nn.functional as F
import traceback

orig_interpolate = F.interpolate  # Save the original function


def debug_interpolate(*args, **kwargs):
    mode = kwargs.get('mode', None)
    antialias = kwargs.get('antialias', None)
    if mode == 'bicubic' and (antialias is None or antialias is True):
        print("Bicubic with antialias detected!")
        traceback.print_stack()
    return orig_interpolate(*args, **kwargs)
F.interpolate = debug_interpolate

In [None]:
import onnxruntime as ort
import numpy as np


image_path = Path('examples/single_view/')
single_image = ['examples/single_view/table.png']
image_transform = get_image_transform_latent_model()
lh_dic = {}

# Load your ONNX model
onnx_model_path = "model.onnx"  # Update with your actual model path
#session = ort.InferenceSession(onnx_model_path)

# Use CUDA (GPU) if available
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession(onnx_model_path, providers=providers)

for idx, image_path in enumerate(image_path.iterdir()):  # Enumerate to get the index

        print(f"Processing image: {image_path}")
        data = get_singleview_data(
            image_file=Path(image_path),
            image_transform=image_transform,
            device='cuda:0',
            image_over_white=False,
        )
        data_idx = 0
        base_name = os.path.basename(image_path)
        image_name = os.path.splitext(base_name)[0]  

        # Prepare ONNX input dictionary using model input names
        input_feed = {}
        print([inp.name for inp in session.get_inputs()])
        # If 'data' is a dictionary, flatten it for ONNX input
        input_feed = {
            "data": data["images"].cpu().numpy() if hasattr(data["images"], "cpu") else np.array(data["images"]),
            "data_low": data["low"].cpu().numpy() if hasattr(data["low"], "cpu") else np.array(data["low"]),
            "data_idx": (
                np.array([data["img_idx"].cpu().item()], dtype=np.int64)
                if hasattr(data["img_idx"], "cpu")
                else np.array([data["img_idx"]], dtype=np.int64)
            )
        }

        # Run inference
        t_inf_onnx = time.time()
        outputs = session.run(None, input_feed)
        print("Inference Time of new ONNX model", time.time() - t_inf_onnx, 's')
        lh_dic[image_name] = outputs
 

print("ONNX inference output:", outputs)

In [None]:
import importlib
import torch
importlib.reload(Optim_Utils)
temp_save_path = '/home/rayhub-user/Optim-WaLa/examples/Test_Gen'
save_dir = Path(temp_save_path)

for image_name, samples_tuple in lh_dic.items(): # Use .items() to get key-value pairs
    obj_path = os.path.join(save_dir, f"{image_name}.obj")
    low_pred, highs_pred = samples_tuple # Unpack the tuple from the dictionary value
    
    
    # Convert numpy arrays to torch tensors
    if isinstance(low_pred, np.ndarray):
        low_pred = torch.from_numpy(low_pred)
    if isinstance(highs_pred, np.ndarray):
        highs_pred = torch.from_numpy(highs_pred)
     # If highs_pred is not a list, wrap it in a list
    if not isinstance(highs_pred, (list, tuple)):
        highs_pred = [highs_pred]

    # Move all tensors to the correct device
    low_pred = low_pred.to('cuda:0')
    highs_pred = [h.to('cuda:0') for h in highs_pred]

    Optim_Utils.save_visualization_obj( image_name,
        obj_path=obj_path, samples=(low_pred, highs_pred)
    )

In [None]:
import boto3
from pathlib import Path

def upload_files_to_s3(file_paths, bucket_name, s3_prefix=""):
    """
    Uploads a list of files to an S3 bucket.

    Args:
        file_paths (list): List of file paths (str or Path) to upload.
        bucket_name (str): Name of the S3 bucket.
        s3_prefix (str): Optional prefix (folder) in the bucket.
    """
    s3 = boto3.client('s3')
    for file_path in file_paths:
        file_path = Path(file_path)
        s3_key = f"{s3_prefix}/{file_path.name}" if s3_prefix else file_path.name
        with open(file_path, "rb") as f:
            s3.upload_fileobj(f, bucket_name, s3_key)
        print(f"Uploaded {file_path} to s3://{bucket_name}/{s3_key}")

# Example usage:
files = ["model.onnx.data", "model.onnx"]
upload_files_to_s3(files, "giuliaa-optim", s3_prefix="/ONNX")

Uploaded model.onnx.data to s3://giuliaa-optim//ONNX/model.onnx.data
Uploaded model.onnx to s3://giuliaa-optim//ONNX/model.onnx


### TensorRT

In [None]:
import onnx
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import io
s3 = boto3.client('s3')

bucket_name = 'giuliaa-optim'
s3_key = '/TRT/model_base.trt'

# Convert ONNX to TensorRT engine
onnx_model_path = "model.onnx"
trt_engine_path = "model.trt"
trt_engine_path_optimized = "model_optimized.trt"

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(network_flags)
parser = trt.OnnxParser(network, TRT_LOGGER)

with open(onnx_model_path, "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("Failed to parse ONNX model")


config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 10 << 30) # 1 GiB
#config.set_flag(trt.BuilderFlag.FP16)
#config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
#config.set_tactic_sources(trt.TacticSource.CUBLAS | trt.TacticSource.CUDNN | trt.TacticSource.EDGE_MASK_CONVOLUTIONS)
#config.builder_optimization_level = 3

# Build the TensorRT engine (TensorRT 8.x+)
serialized_engine = builder.build_serialized_network(network, config)

with open(trt_engine_path, "wb") as f:
    f.write(serialized_engine)



s3.upload_fileobj(io.BytesIO(serialized_engine), bucket_name, s3_key)
print(f"Uploaded TensorRT engine to s3://{bucket_name}/{s3_key}")


[05/30/2025-17:28:15] [TRT] [W] ModelImporter.cpp:516: Make sure input data_idx has Int64 binding.
[05/30/2025-17:28:32] [TRT] [W] Unused Input: data_idx
[05/30/2025-17:28:32] [TRT] [W] Unused Input: data_low
[05/30/2025-17:28:37] [TRT] [W] [RemoveDeadLayers] Input Tensor data_idx is unused or used only at compile-time, but is not being removed.
[05/30/2025-17:28:37] [TRT] [W] [RemoveDeadLayers] Input Tensor data_low is unused or used only at compile-time, but is not being removed.
Uploaded TensorRT engine to s3://giuliaa-optim//TRT/model_base.trt


In [None]:
import tensorrt as trt
import numpy as np
import time
import torch
from pathlib import Path
import os
import pycuda.driver as cuda
import pycuda.autoinit
import io
from src.dataset_utils import get_singleview_data, get_image_transform_latent_model
import Optim_Utils

# Load TensorRT engine
def load_engine(engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f:
        runtime = trt.Runtime(logger)
        return runtime.deserialize_cuda_engine(f.read())

def load_engine_from_s3(bucket_name, s3_key):
    """
    Loads a TensorRT engine directly from S3 into memory and checks buffer size.
    """


    s3 = boto3.client('s3')
    # Get expected file size from S3
    obj = s3.head_object(Bucket=bucket_name, Key=s3_key)
    expected_size = obj['ContentLength']

    engine_buffer = io.BytesIO()
    s3.download_fileobj(bucket_name, s3_key, engine_buffer)
    engine_buffer.seek(0)
    actual_size = len(engine_buffer.getvalue())
    print(f"S3 engine size: {expected_size} bytes, Downloaded buffer size: {actual_size} bytes")
    if actual_size != expected_size:
        raise RuntimeError("Downloaded engine size does not match S3 object size!")

    logger = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(logger)
    return runtime.deserialize_cuda_engine(engine_buffer.read())




# Modern TensorRT inference function using tensor API
def run_trt_inference(engine, input_data,stream,context):
    """
    Run inference using TensorRT with modern tensor API
    
    Args:
        engine: TensorRT engine
        input_data: List of input numpy arrays
        
    Returns:
        List of output numpy arrays
    """


    # Get input and output tensor names
    input_names = []
    output_names = []
    
    time_input_names = time.time()
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            input_names.append(name)
        else:
            output_names.append(name)
    print(f"Time to get input/output names: {time.time() - time_input_names:.4f} seconds")
    # Prepare device memory
    device_memory = []
    
    time_handle = time.time()
    # Handle inputs
    for i, name in enumerate(input_names):
        if i >= len(input_data):
            continue
            
        data = input_data[i]
        
        # Handle dynamic input shapes
        if -1 in engine.get_tensor_shape(name):
            context.set_input_shape(name, data.shape)
        
        # Allocate device memory and copy input data
        mem = cuda.mem_alloc(data.nbytes)
        cuda.memcpy_htod_async(mem, data, stream)
        device_memory.append(mem)
        context.set_tensor_address(name, int(mem))
    print(f"Time to handle inputs: {time.time() - time_handle:.4f} seconds")

    # Prepare outputs
    outputs = []
    output_memory = []
    
    time_allocate = time.time()
    # Allocate output memory
    for name in output_names:
        shape = context.get_tensor_shape(name)
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        
        # Allocate device memory for output
        size = trt.volume(shape) * np.dtype(dtype).itemsize
        mem = cuda.mem_alloc(size)
        device_memory.append(mem)
        output_memory.append(mem)
        
        # Set output tensor address
        context.set_tensor_address(name, int(mem))
        
        # Create host output array
        output = np.empty(shape, dtype=dtype)
        outputs.append(output)
    print(f"Time to allocate output memory: {time.time() - time_allocate:.4f} seconds")

    # Run inference
    time_asyncv3 = time.time()
    context.execute_async_v3(stream_handle=stream.handle)
    print(f"Async inference time: {time.time() - time_asyncv3:.4f} seconds")

    time_copy = time.time()
    # Copy outputs from device to host
    for i, mem in enumerate(output_memory):
        cuda.memcpy_dtoh_async(outputs[i], mem, stream)
    print(f"Time to copy outputs: {time.time() - time_copy:.4f} seconds")

    time_sync = time.time()
    # Synchronize to ensure all operations are complete
    stream.synchronize()
    print(f"Time to synchronize: {time.time() - time_sync:.4f} seconds")

    time_free = time.time()
    # Free device memory
    for mem in device_memory:
        mem.free()
    print(f"Time to free device memory: {time.time() - time_free:.4f} seconds")


    return outputs

# Main experiment function
def run_tensorrt_experiment(image_dir, engine_path,bucket_name = 'giuliaa-optim',s3_key='TRT/model_base.trt', use_s3 = False, experiment = None):
    # Load image transform
    image_transform = get_image_transform_latent_model()
    
    
    if use_s3:
        # Load TensorRT engine from S3
        engine = load_engine_from_s3(bucket_name, s3_key)
    else:
        # Load TensorRT engine from local path
        if not os.path.exists(engine_path):
            raise FileNotFoundError(f"TensorRT engine file {engine_path} does not exist.")
        engine = load_engine(engine_path)
    
    # Results storage
    results = []
    image_path = Path(image_dir)
    save_dir = Path('/home/rayhub-user/Optim-WaLa/examples/Test_Gen')
    os.makedirs(save_dir, exist_ok=True)


    time_contx = time.time()
    # Create execution context
    context = engine.create_execution_context()
    print(f"Time to create execution context: {time.time() - time_contx:.4f} seconds")

    time_stream = time.time()
    # Create a CUDA stream for async operations
    stream = cuda.Stream()
    print(f"Time to create CUDA stream: {time.time() - time_stream:.4f} seconds")

    try:

        experiment.log_metric("Time to create execution context", time.time() - time_contx)
        experiment.log_metric("Time to create CUDA stream", time.time() - time_stream)
    except:
        pass

    # Process each image
    for img_file in image_path.iterdir():      
        time_default = time.time()
  
        print(f"\nProcessing image: {img_file.name}")
        img_name = img_file.stem
        
        # Get image data
        data = get_singleview_data(
            image_file=img_file,
            image_transform=image_transform,
            image_over_white=False,
            device = 'cpu'
        )
        
        


        # Prepare input data as numpy arrays
        inputs = [
            data["images"].cpu().numpy() if hasattr(data["images"], "cpu") else np.array(data["images"]),
            data["low"].cpu().numpy() if hasattr(data["low"], "cpu") else np.array(data["low"]),
            data["img_idx"].cpu().numpy() if hasattr(data["img_idx"], "cpu") else np.array([data["img_idx"]], dtype=np.int64)
        ]


####### For Debug
        #for i, arr in enumerate(data.values()):
            #print(f"Input[{i}] min/max/mean:", np.nanmin(arr), np.nanmax(arr), np.nanmean(arr))
            #print(f"Any NaN in input[{i}]?", np.isnan(arr).any())
#######
        # Run TensorRT inference and measure time
        start_time = time.time()
        outputs = run_trt_inference(engine, inputs,stream,context)
        inference_time = time.time() - start_time

        try:
            experiment.log_metric("TensorRT Inference Time", inference_time)
        except: 
            pass
####### For Debug        
        #print("Output[0] min/max/mean:", np.nanmin(outputs[0]), np.nanmax(outputs[0]), np.nanmean(outputs[0]))
        #print("Output[1] min/max/mean:", np.nanmin(outputs[1]), np.nanmax(outputs[1]), np.nanmean(outputs[1]))
        #print("Any NaN in output[0]?", np.isnan(outputs[0]).any())
        #print("Any NaN in output[1]?", np.isnan(outputs[1]).any())
#######



        print(f"TensorRT inference time: {inference_time:.4f} seconds")
        
        # Store result
        results.append({
            "image": img_name,
            "inference_time": inference_time
        })
        
        # Generate 3D object from TensorRT outputs
        low_trt = torch.from_numpy(outputs[0]).to('cuda:0')
        high_trt = [torch.from_numpy(outputs[1]).to('cuda:0')]
        obj_path = str(save_dir / f"{img_name}_trt.obj")
        # Save visualization
        Optim_Utils.save_visualization_obj(img_name, obj_path, (low_trt, high_trt))
        print('Total Inference time with Writing', time.time() - time_default)
        
        try:
            experiment.log_metric("Default Delta", time.time() - time_default)
        except:
            pass

        
    # Calculate average inference time
    avg_time = sum(item["inference_time"] for item in results) / len(results)
    print(f"\nAverage TensorRT inference time: {avg_time:.4f} seconds")
    
    return results

# Run the experiment
image_dir = 'examples/single_view/'
trt_engine_path = "model.trt"
s3_key='/TRT/model_base.trt'
bucket_name = 'giuliaa-optim'

results = run_tensorrt_experiment(image_dir, trt_engine_path,bucket_name, s3_key, use_s3 = True, experiment = None)

S3 engine size: 5301395820 bytes, Downloaded buffer size: 5301395820 bytes
Time to create execution context: 1.3477 seconds
Time to create CUDA stream: 0.0000 seconds

Processing image: table.png
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0002 seconds
Time to allocate output memory: 0.0003 seconds
Async inference time: 1.3587 seconds
Time to copy outputs: 0.3741 seconds
Time to synchronize: 0.0002 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7341 seconds
Total Inference time with Writing 1.853771686553955

Processing image: apple.jpeg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3501 seconds
Time to copy outputs: 0.3745 seconds
Time to synchronize: 0.0002 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7260 seconds
Total Inference time with Writing 1.7829906940460205

Processing image: 

### Inference TRT Google Dataset

In [None]:
experiment = start(
  api_key="mqrUAXjKBRul24uX6pxR3gRHX*eyJiYXNlVXJsIjoiaHR0cHM6Ly9jb21ldC5kZXYuY2xvdWRvcy5hdXRvZGVzay5jb20ifQ",
  project_name="wala-time-checks",
  workspace="alessandro-giuliano")


  # Define the hyperparameters
hyperparameters = {
    "model_name": "ADSKAILab/WaLa-SV-1B",
    "output_format": "obj",
    "target_num_faces": None,
    "scale": 1.8,
    "seed": 42,
    "diffusion_rescale_timestep": 5,
    "Mcubes-Optimized": True,
    "Object-3D_simplified_mesh": None,
    "TRT Model": "model.trt",
    "TRT Optimization": "Base",
}

# Log the hyperparameters to Comet
experiment.log_parameters(hyperparameters)

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


In [None]:
import tensorrt as trt
import numpy as np
import time
import torch
from pathlib import Path
import os
import pycuda.driver as cuda
import pycuda.autoinit

from src.dataset_utils import get_singleview_data, get_image_transform_latent_model
import Optim_Utils



# Load TensorRT engine
def load_engine(engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f:
        runtime = trt.Runtime(logger)
        return runtime.deserialize_cuda_engine(f.read())

In [None]:
google_test_img_path = '/GD_Dataset_Single_View'
output_dir = '/Google_Dataset_Outputs'

image_path = Path(google_test_img_path)
save_dir = Path(output_dir)
os.makedirs(save_dir, exist_ok=True)
image_transform = get_image_transform_latent_model()

engine = load_engine(trt_engine_path)
context = engine.create_execution_context()
stream = cuda.Stream()

results = []
for img_file in image_path.glob('*.[jp][pn]g'):
    time_default = time.time()
    print(f"\nProcessing image: {img_file.name}")
    img_name = img_file.stem

    data = get_singleview_data(
        image_file=img_file,
        image_transform=image_transform,
        image_over_white=False,
        device='cpu'
    )

    inputs = [
        data["images"].cpu().numpy() if hasattr(data["images"], "cpu") else np.array(data["images"]),
        data["low"].cpu().numpy() if hasattr(data["low"], "cpu") else np.array(data["low"]),
        data["img_idx"].cpu().numpy() if hasattr(data["img_idx"], "cpu") else np.array([data["img_idx"]], dtype=np.int64)
    ]

    start_time = time.time()
    outputs = run_trt_inference(engine, inputs, stream, context)
    inference_time = time.time() - start_time

    experiment.log_metric("TensorRT Inference Time", inference_time, step=None)
    print(f"TensorRT inference time: {inference_time:.4f} seconds")

    results.append({
        "image": img_name,
        "inference_time": inference_time
    })

    low_trt = torch.from_numpy(outputs[0]).to('cuda:0')
    high_trt = [torch.from_numpy(outputs[1]).to('cuda:0')]
    obj_path = str(save_dir / f"{img_name}_trt.obj")
    Optim_Utils.save_visualization_obj(img_name, obj_path, (low_trt, high_trt))
    print('Total Inference time with Writing', time.time() - time_default)

avg_time = sum(item["inference_time"] for item in results) / len(results)
experiment.log_metric("Average TensorRT Inference Time", avg_time)
print(f"\nAverage TensorRT inference time: {avg_time:.4f} seconds")

experiment.end()


Processing image: 2_of_Jenga_Classic_Game.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3603 seconds
Time to copy outputs: 0.3737 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7358 seconds
Total Inference time with Writing 1.765320062637329

Processing image: 50_BLOCKS.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3505 seconds
Time to copy outputs: 0.3742 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7266 seconds
Total Inference time with Writing 1.7941734790802002

Processing image: 5_HTP.jpg
Time to get input/output names: 0.0001 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Asyn

[1;38;5;196mCOMET ERROR:[0m Due to connectivity issues, there's an error in processing the heartbeat. The experiment's status updates might be inaccurate until the connection issues are resolved.


Async inference time: 1.3528 seconds
Time to copy outputs: 0.3744 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7290 seconds
Total Inference time with Writing 1.8434162139892578

Processing image: Olive_Kids_Robots_Pencil_Case.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3524 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7285 seconds
Total Inference time with Writing 1.8121654987335205

Processing image: Olive_Kids_Trains_Planes_Trucks_Bogo_Backpack.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3516 seconds
Time to copy outputs: 0.3744 seconds
Time to synchronize: 0.0000 seconds
Time to free

[1;38;5;196mCOMET ERROR:[0m Failed to send metrics batch message, got 502 b'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n'


Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7278 seconds
Total Inference time with Writing 1.8455686569213867

Processing image: PETS_ACCESSORIES.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3529 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7288 seconds
Total Inference time with Writing 1.8034608364105225

Processing image: Paul_Frank_Dot_Lunch_Box.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3516 seconds
Time to copy outputs: 0.3744 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7278 seconds


[1;38;5;196mCOMET ERROR:[0m Failed to send stdout/stderr message batch, got 502 b'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n'


Time to copy outputs: 0.3739 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7273 seconds
Total Inference time with Writing 1.818810224533081

Processing image: Reebok_SMOOTHFLEX_CUSHRUN_20.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3516 seconds
Time to copy outputs: 0.3740 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7273 seconds
Total Inference time with Writing 1.814833402633667

Processing image: SAMOA.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3508 seconds
Time to copy outputs: 0.3740 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7264 seconds
Total Inf

[1;38;5;196mCOMET ERROR:[0m Failed to send metrics batch message, got 502 b'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n'


Time to copy outputs: 0.3741 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7275 seconds
Total Inference time with Writing 1.8165645599365234

Processing image: Ecoforms_Plant_Saucer_S14NATURAL.jpg
Time to get input/output names: 0.0001 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3527 seconds
Time to copy outputs: 0.3742 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7286 seconds
Total Inference time with Writing 1.8147735595703125

Processing image: Ecoforms_Pot_Nova_6_Turquoise.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3515 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference 

[1;38;5;196mCOMET ERROR:[0m Failed to send stdout/stderr message batch, got 502 b'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n'


Async inference time: 1.3513 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0001 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7274 seconds
Total Inference time with Writing 1.8636047840118408

Processing image: Perricone_MD_Best_of_Perricone_7Piece_Collection_MEGsO6GIsyL.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3519 seconds
Time to copy outputs: 0.3744 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7281 seconds
Total Inference time with Writing 1.8452603816986084

Processing image: Perricone_MD_Cold_Plasma_Body.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3506 seconds
Time to copy outputs: 0.3742 seconds
Time to synchronize: 0.0000 secon

[1;38;5;196mCOMET ERROR:[0m Failed to send metrics batch message, got 502 b'<html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n'


Async inference time: 1.3517 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7279 seconds
Total Inference time with Writing 1.809523582458496

Processing image: Schleich_Allosaurus.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3515 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7277 seconds
Total Inference time with Writing 1.7810800075531006

Processing image: Schleich_Lion_Action_Figure.jpg
Time to get input/output names: 0.0000 seconds
Time to handle inputs: 0.0003 seconds
Time to allocate output memory: 0.0004 seconds
Async inference time: 1.3518 seconds
Time to copy outputs: 0.3743 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 second

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : sensitive_lodge_3633
[1;38;5;39mCOMET INFO:[0m     url                   : https://comet.dev.cloudos.autodesk.com/alessandro-giuliano/wala-time-checks/2ac87d530c3d4b36b3589f4c601c30ce
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     Average TensorRT Inference Time      : 1.7274238674251645
[1;38;5;39mCOMET INFO:[0m     Default Delta [4]                    : (0.04102778434753418, 2.1540329456329346)
[1;38;5;39mCOMET INFO:[0m     TensorRT Inference Time [1004]       : (0.00704193115234375, 1.7427480220794678)
[

Time to copy outputs: 0.3744 seconds
Time to synchronize: 0.0000 seconds
Time to free device memory: 0.0003 seconds
TensorRT inference time: 1.7281 seconds
Total Inference time with Writing 1.7923624515533447

Average TensorRT inference time: 1.7274 seconds


[1;38;5;39mCOMET INFO:[0m Please wait for metadata to finish uploading (timeout is 3600 seconds)
[1;38;5;39mCOMET INFO:[0m Uploading 2 metrics, params and output messages
