In [None]:
# Install the third-party packages used below (CLIP, Kaolin, OpenCLIP, Open3D).
!pip install git+https://github.com/openai/CLIP.git
# Kaolin is pinned to 0.17.0; the wheel index URL names torch 2.5.1 / cu121,
# so the runtime's torch/CUDA build presumably has to match — TODO confirm.
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install open-clip-torch

!pip install open3d

import os

# Clone the project repository only if it is not already present, so that
# re-running this cell does not fail on an existing directory.
if not os.path.exists("AML_group"):
    !git clone https://github.com/AlesCarl/AML_group.git
else:
    print("Repository già clonato.")
# Work from inside the repository: later cells use relative paths such as
# 'data/horse.obj' and import local modules (presumably shipped in the repo).
%cd AML_group



# Part 1

In [2]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision
import open_clip
import open3d as o3d


from itertools import permutations, product
from Normalization import MeshNormalizer


from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    """MLP that turns each input row into a probability distribution over classes.

    Architecture: ``num_layers`` repetitions of Linear -> ReLU -> LayerNorm
    (all of width ``hidden_dim``), then a Linear projection to ``output_dim``
    followed by a softmax over dimension 1.

    Args:
        input_dim: Number of features of each input row.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output classes.
        num_layers: Number of Linear/ReLU/LayerNorm groups before the output head.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()

        stack = []
        in_features = input_dim
        for _ in range(num_layers):
            stack += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.LayerNorm([hidden_dim]),
            ]
            # Only the first group sees the raw input width.
            in_features = hidden_dim

        # Output head: class scores normalised along dim 1.
        stack += [nn.Linear(hidden_dim, output_dim), nn.Softmax(dim=1)]

        self.mlp = nn.ModuleList(stack)

    def forward(self, x):
        """Apply every module in ``self.mlp`` in order and return the result."""
        out = x
        for module in self.mlp:
            out = module(out)
        return out


def get_clip_model(clip_model):
    """Load a CLIP model by name and return it.

    Args:
        clip_model: Model identifier forwarded to ``clip.load`` (e.g. ``'ViT-L/14'``).

    Returns:
        The loaded CLIP model. The preprocessing pipeline that ``clip.load``
        also returns is discarded.
    """
    # Load onto the notebook-wide `device` imported from utils rather than a
    # hard-coded 'cuda': the text tokens and rendered images are moved to that
    # same `device`, so the model must live there too.
    # NOTE(review): utils.device presumably falls back to CPU when no GPU is
    # available — confirm in utils.py.
    model, _preprocess = clip.load(clip_model, device=device)  # jit=True may be faster
    return model



# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the final hard highlight as a coloured PLY and a multi-view render.

    Args:
        log_dir: Directory receiving ``{name}.ply`` and ``final_render.jpg``.
        name: Base file name (without extension) of the exported PLY.
        mesh: Mesh object; it is re-coloured via ``color_mesh`` and exported.
        mlp: Highlighter network. It is switched to eval mode and left that way.
        vertices: Per-vertex inputs fed to ``mlp``.
        colors: Ignored. The highlight/gray palette defined below is always
            used; the parameter is kept for interface compatibility.
        render: Renderer object exposing ``render_views``.
        background: Background colour forwarded to the renderer.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # For the renders: a one-hot version of the prediction, allocated
        # directly on the target device.
        one_hot = torch.zeros(probs.shape, device=device).scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight / 255, gray / 255)).to(device)
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # For the exported mesh: class 0 gets the highlight colour, everything
        # else gray. max_idx is (N, 1) and broadcasts against the 3-element
        # colours; values stay in the 0-255 integer range.
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):
    """Negative CLIP cosine similarity between rendered views and the text prompt.

    The value is meant to be minimised: the best achievable value is
    ``-1`` per evaluation (``-n_augs`` in total when ``n_augs > 0``).

    Args:
        n_augs: Number of augmented passes. ``0`` means a single pass using
            ``clip_transform`` (with normalised image embeddings); ``> 0`` sums
            the loss over ``n_augs`` passes using ``augment_transform``.
        rendered_images: Batch of rendered views.
        encoded_text: Text embeddings, one row per prompt.
        clip_transform: Transform applied to the renders when ``n_augs == 0``.
        augment_transform: Transform applied on every augmented pass.
        clip_model: Object exposing ``encode_image``.

    Returns:
        The loss tensor (negated similarity).

    Raises:
        ValueError: If ``n_augs`` is negative.
    """
    if n_augs < 0:
        # Previously this fell through both branches and failed with an
        # unbound `loss` variable.
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    def _similarity(encoded_renders):
        # Average the view embeddings (and the text embeddings when several
        # prompts are given) before comparing them.
        if encoded_text.shape[0] > 1:
            return torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                           torch.mean(encoded_text, dim=0), dim=0)
        return torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                       encoded_text)

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        # Negated so that this branch has the same sign convention as the
        # augmented branch; it used to return +similarity, which an optimiser
        # would have driven *down*.
        return -_similarity(encoded_renders)

    loss = 0.0
    for _ in range(n_augs):
        augmented_image = augment_transform(rendered_images)
        encoded_renders = clip_model.encode_image(augmented_image)
        loss -= _similarity(encoded_renders)
    return loss

def save_renders(dir, i, rendered_images, name=None):
    """Save a batch of rendered images as a single image file under ``dir``.

    When ``name`` is given the file is ``dir/name``; otherwise it is
    ``dir/renders/iter_{i}.jpg``.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))


Warp 1.5.1 initialized:
   CUDA Toolkit 12.6, Driver 12.2
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.5.1


In [None]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-determinstic)
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


render_res = 224
n_iter = 1000
res = 224
obj_path = 'data/horse.obj' # 'APPROXIMATE.obj' if we want to test the point cloud to mesh conversion
output_dir = './output/'
clip_model = 'ViT-L/14'

input_dim = 3
hidden_dim = 256
output_dim = 2

# Hyper-parameters
learning_rate = 0.0001
n_layers = 3 # depth 4
n_views = 5 # 5
n_augs = 1 # 4

# save_renders() writes per-iteration images into '<output_dir>/renders'.
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()


### --

# Initialize variables
background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir


# CLIP and augmentation transform
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])



# MLP Settings
mlp = NeuralHighlighter(input_dim, hidden_dim, output_dim, n_layers).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# list of possible colors (index 0 = highlight, index 1 = gray)
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# encode prompt with CLIP
model = get_clip_model(clip_model)

known_object = 'horse'
classes = 'Shoes'


prompt = "A 3D render of a gray {} with highlighted {}".format(known_object, classes)
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)


vertices = copy.deepcopy(mesh.vertices)

losses = []

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=1,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, model)
    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")


# save results
save_final_results(log_dir, 'Primo test', mesh, mlp, vertices, colors, render, background)

# Save prompts: an empty marker file named after the prompt is created in the
# log directory. This used to pass the builtin `dir` function to os.path.join,
# which raises TypeError; the intended target is `log_dir`.
with open(os.path.join(log_dir, prompt), "w") as f:
    f.write('')

In [3]:
# Mount Google Drive (Colab only) so that files under '/content/drive' are
# readable; a later cell points its dataset directory there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Part 2

In [5]:
# Path of the OBJ file to convert into a point cloud
obj_path = 'data/horse.obj'

# Helper that loads a mesh file as an Open3D TriangleMesh
def load_obj_as_triangle_mesh(obj_path):
    """Read ``obj_path`` with Open3D and return the resulting triangle mesh.

    Raises:
        ValueError: If Open3D reports the loaded mesh as empty.
    """
    loaded = o3d.io.read_triangle_mesh(obj_path)
    if not loaded.is_empty():
        return loaded
    raise ValueError(f"La mesh nel file {obj_path} non è stata caricata correttamente.")

# Load the mesh (note: this rebinds the notebook-level name `mesh`)
mesh = load_obj_as_triangle_mesh(obj_path)

# Convert the mesh into a point cloud by sampling 2048 points uniformly
pcd = mesh.sample_points_uniformly(2048)

# Colour every point black
pcd.colors = o3d.utility.Vector3dVector(np.zeros((len(pcd.points), 3)))  # Black: [0, 0, 0]


# Export the point cloud in PLY format (currently disabled)
# o3d.io.write_point_cloud("candle_PC.ply", pcd)

In [None]:
# Point cloud to mesh

# Poisson reconstruction is run on `pcd` after estimating per-point normals
# from a hybrid radius / k-nearest-neighbour search.
pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30))

# Run the reconstruction with Open3D's verbosity raised to Debug.
with o3d.utility.VerbosityContextManager(
        o3d.utility.VerbosityLevel.Debug) as cm:
    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
        pcd, depth=9)
print(mesh)

# Save the reconstructed mesh; 'APPROXIMATE.obj' is the alternative input
# mentioned next to `obj_path` in the Part 1 configuration.
o3d.io.write_triangle_mesh("APPROXIMATE.obj", mesh)

# Part 3

In [95]:
import os
from os.path import join as opj
import numpy as np
from torch.utils.data import Dataset
import h5py
import json
import pickle as pkl


def pc_normalize(pc):
    """Centre a point cloud on its centroid and scale it into the unit sphere.

    Args:
        pc: Array of shape (N, D) with one point per row, N >= 1.

    Returns:
        Tuple ``(normalized, centroid, m)`` where ``centroid`` is the per-axis
        mean of the input, ``m`` is the largest distance of any centred point
        from the origin, and ``normalized`` is ``(pc - centroid) / m``.
        When all points coincide (``m == 0``) the centred cloud is returned
        unscaled (all zeros) instead of dividing by zero.

    Raises:
        ValueError: If ``pc`` contains no points.
    """
    if len(pc) == 0:
        raise ValueError("pc_normalize requires at least one point")

    centroid = np.mean(pc, axis=0)
    pc = pc - centroid
    m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
    if m > 0:
        # Skip the division for a degenerate cloud: 0 / 0 would fill the
        # output with NaNs.
        pc = pc / m
    return pc, centroid, m


class AffordNetDataset(Dataset):
    """Full-shape affordance dataset restricted to a single semantic class.

    Records are read from ``<data_dir>/full_shape_<split>_data.pkl`` and only
    those whose ``"semantic class"`` equals ``semantic_class`` are kept.

    Args:
        data_dir: Directory containing the pickle files.
        split: Split name inserted into the file name (e.g. ``'train'``, ``'val'``).
        semantic_class: Object class to keep. Defaults to ``"Scissors"``, the
            value that used to be hard-coded.

    Attributes set by the constructor:
        all_data: List of kept records (``shape_id``, ``semantic class``,
            ``affordance``, ``data_info``).
        affordance: Affordance names taken from the first kept record, or an
            empty list when no record matches.
    """

    def __init__(self, data_dir, split, semantic_class="Scissors"):
        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.semantic_class = semantic_class

        self.load_data()

        # An empty selection yields an empty (length 0) dataset rather than
        # an IndexError at construction time.
        self.affordance = self.all_data[0]["affordance"] if self.all_data else []

    def load_data(self):
        """Populate ``self.all_data`` with the records of the selected class."""
        self.all_data = []
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # data_dir at files from a trusted source.
        with open(opj(self.data_dir, 'full_shape_%s_data.pkl' % self.split), 'rb') as f:
            temp_data = pkl.load(f)
        for info in temp_data:
            if info["semantic class"] != self.semantic_class:
                continue
            self.all_data.append({
                "shape_id": info["shape_id"],
                "semantic class": info["semantic class"],
                "affordance": info["affordance"],
                "data_info": info["full_shape"],  # point coordinates and labels
            })

    def __getitem__(self, index):
        """Return ``(points, points, targets, shape_id, semantic_class)``.

        ``points`` are the first three coordinate columns normalised with
        ``pc_normalize`` (returned twice, as before); ``targets`` holds one
        float32 column per name in ``self.affordance``.
        """
        data_dict = self.all_data[index]
        modelid = data_dict["shape_id"]
        modelcat = data_dict["semantic class"]

        data_info = data_dict["data_info"]
        coordinates = data_info["coordinate"].astype(np.float32)
        labels = data_info["label"]

        # Build the label columns once and concatenate a single time instead
        # of growing the array inside the loop.
        label_columns = [labels[aff].astype(np.float32).reshape(-1, 1)
                         for aff in self.affordance]
        model_data = np.concatenate([coordinates] + label_columns, axis=1)

        datas = model_data[:, :3]
        targets = model_data[:, 3:]

        datas, _, _ = pc_normalize(datas)

        return datas, datas, targets, modelid, modelcat

    def __len__(self):
        return len(self.all_data)

In [8]:
from torch.utils.data import DataLoader
from torch.utils.data import Subset

def build_dataset(data_dir, test=False):
    """Build small validation and test subsets of ``AffordNetDataset``.

    The validation subset is the first (up to) 5 items of the ``'val'`` split;
    the test subset is the first (up to) 10 items of the ``'train'`` split.
    The ``test`` argument is accepted but not used.

    Returns:
        dict with keys ``"val_set"`` and ``"test_set"``.
    """
    def _first_items(full_set, limit):
        # Keep at most `limit` leading samples of the dataset.
        return Subset(full_set, list(range(min(limit, len(full_set)))))

    # NOTE(review): the "test" subset is read from the 'train' split — confirm
    # this is intended.
    train_split = AffordNetDataset(data_dir, 'train')
    val_split = AffordNetDataset(data_dir, 'val')

    return {
        "val_set": _first_items(val_split, 5),
        "test_set": _first_items(train_split, 10),
    }


def build_loader(dataset_dict, test_batch_size=8, val_batch_size=1, num_workers=8):
    """Wrap the validation and test datasets in ``DataLoader`` objects.

    Args:
        dataset_dict: Mapping with keys ``"val_set"`` and ``"test_set"``.
        test_batch_size: Batch size of the test loader (default 8, as before).
        val_batch_size: Batch size of the validation loader (default 1, as before).
        num_workers: Worker processes used by both loaders (default 8, as before).

    Returns:
        dict with keys ``"val_loader"`` and ``"test_loader"``.

    Note:
        The test loader shuffles and uses ``drop_last=True``, so samples that
        do not fill a complete batch are skipped (e.g. 10 samples with batch
        size 8 yield a single batch of 8).
    """
    val_set = dataset_dict["val_set"]
    test_set = dataset_dict["test_set"]

    test_loader = DataLoader(test_set, batch_size=test_batch_size,
                             shuffle=True, drop_last=True, num_workers=num_workers)

    val_loader = DataLoader(val_set, batch_size=val_batch_size,
                            shuffle=False, num_workers=num_workers, drop_last=False)

    return dict(
        val_loader=val_loader,
        test_loader=test_loader,
    )

In [134]:
# Directory holding the 'full_shape_<split>_data.pkl' files read by
# AffordNetDataset. It lives on the mounted Google Drive, so the mount cell
# must have been run first.
# dataset_dir = '/content/drive/My Drive/full-shape'
dataset_dir = '/content/drive/My Drive/ColabNotebooks'

# Small validation / test subsets.
dataset = build_dataset(dataset_dir)

# DataLoaders keyed "val_loader" and "test_loader".
loader = build_loader(dataset)

In [132]:
def pointToMesh(data, output_path="alpha.obj", alpha=0.05):
    """Reconstruct a triangle mesh from a point cloud and write it to disk.

    The first element of ``data`` is converted to an Open3D point cloud,
    normals are estimated and consistently oriented, an alpha-shape surface
    is built, smoothed with 5 Laplacian iterations and saved.

    Args:
        data: Batched tensor of points; only ``data[0]`` is used
            (expected shape [N, 3]).
        output_path: Destination file of the mesh. Defaults to ``"alpha.obj"``.
        alpha: Alpha value of the alpha-shape reconstruction. Defaults to 0.05.

    Returns:
        None. The mesh is only written to ``output_path``.
    """
    # The function used to read a global `targets` tensor here as well; that
    # value was never used and made the function fail whenever the global was
    # undefined or None, so it has been removed.
    single_data = data[0].cpu().numpy()     # expected shape [N, 3]

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(single_data)

    # Normal estimation
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(
        radius=0.08,  # search radius; increase it if the points are further apart
        max_nn=50     # maximum number of neighbours used per normal
    ))

    # Orient the normals consistently
    pcd.orient_normals_consistent_tangent_plane(k=30)

    # Alpha-shape surface reconstruction
    mesh = o3d.geometry.TriangleMesh.create_from_point_cloud_alpha_shape(
        pcd, alpha
    )

    # Smooth the mesh to remove artefacts
    mesh = mesh.filter_smooth_laplacian(number_of_iterations=5)

    # Save the mesh
    o3d.io.write_triangle_mesh(output_path, mesh)


## FUNZIONE

In [113]:
def optimize(vertex, targets, modelCat, prompt=None, n_iter=1000):
    """Train a ``NeuralHighlighter`` on the mesh reconstructed from a point cloud.

    The point cloud is first converted to ``alpha.obj`` via ``pointToMesh``;
    the MLP is then optimised so that CLIP renders of the coloured mesh match
    the text prompt. Renders, a training log and the final results are
    written under ``./output/``.

    Args:
        vertex: Batched point-cloud tensor forwarded to ``pointToMesh``.
        targets: Ground-truth labels. Currently unused; kept for interface
            compatibility.
        modelCat: Object category. Currently unused; kept for interface
            compatibility.
        prompt: Text prompt to optimise for. ``None`` (default) selects the
            scissors prompt that used to be hard-coded.
        n_iter: Number of optimisation steps (default 1000, as before).

    Returns:
        The trained ``NeuralHighlighter`` (left in eval mode by
        ``save_final_results``).
    """
    pointToMesh(vertex)

    # Constrain most sources of randomness.
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    render_res = 224
    res = 224
    obj_path = 'alpha.obj'  # written by pointToMesh above
    output_dir = './output/'
    clip_model = 'ViT-L/14'

    input_dim = 3
    hidden_dim = 256
    output_dim = 2

    # Hyper-parameters
    learning_rate = 0.0001
    n_layers = 5
    n_views = 4
    n_augs = 3

    # save_renders() writes per-iteration images into '<output_dir>/renders'.
    Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

    render = Renderer(dim=(render_res, render_res))
    mesh = Mesh(obj_path)
    MeshNormalizer(mesh)()

    # Initialize variables
    background = torch.tensor((1., 1., 1.)).to(device)

    log_dir = output_dir

    # CLIP and augmentation transform
    clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    clip_transform = transforms.Compose([
        transforms.Resize((res, res)),
        clip_normalizer
    ])
    augment_transform = transforms.Compose([
        transforms.RandomResizedCrop(res, scale=(1, 1)),
        transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
        clip_normalizer
    ])

    # MLP Settings
    mlp = NeuralHighlighter(input_dim, hidden_dim, output_dim, n_layers).to(device)
    optim = torch.optim.Adam(mlp.parameters(), learning_rate)

    # Colour palette: index 0 = highlight, index 1 = gray.
    full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
    colors = torch.tensor(full_colors).to(device)

    # --- Prompt ---
    # encode prompt with CLIP
    model = get_clip_model(clip_model)

    if prompt is None:
        # Default kept byte-for-byte from the previous hard-coded value.
        prompt = "A 3D render of a grey scissors with the cutting edges highlighted "

    with torch.no_grad():
        prompt_token = clip.tokenize([prompt]).to(device)
        encoded_text = model.encode_text(prompt_token)
        encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

    vertices = copy.deepcopy(mesh.vertices)

    losses = []

    # Optimization loop
    for i in tqdm(range(n_iter)):
        optim.zero_grad()

        # predict highlight probabilities
        pred_class = mlp(vertices)

        # color and render mesh
        color_mesh(pred_class, mesh, colors)
        rendered_images, _, _ = render.render_views(mesh, num_views=n_views,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=1,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Calculate CLIP Loss
        loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, model)
        loss.backward(retain_graph=True)

        optim.step()

        # record loss
        with torch.no_grad():
            losses.append(loss.item())

        # report results every 100 iterations
        if i % 100 == 0:
            print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
            save_renders(log_dir, i, rendered_images)
            with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
                f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

    # save results
    save_final_results(log_dir, 'TTest2', mesh, mlp, vertices, colors, render, background)

    return mlp


## Test 2

In [120]:
def calculate_miou(predictions, ground_truths, threshold=0.1):
    """
    Compute the mean Intersection over Union (mIoU) of binarised masks.

    Args:
        predictions (np.ndarray): Predicted scores, shape (B, N, C) or (B, N),
            where B is the batch size and N the number of points. For a 3-D
            array only channel 0 is evaluated.
        ground_truths (np.ndarray): Ground-truth scores, shape (B, N).
        threshold (float): Values ``>= threshold`` count as positive. The same
            threshold is applied to predictions and ground truth.

    Returns:
        float: IoU averaged over the batch. Samples whose union is empty are
        ignored; the result is NaN when every sample is ignored.

    Raises:
        ValueError: If ``ground_truths`` is not 2-D or the shapes do not match.
    """
    print(f"Predictions shape: {predictions.shape}")
    print(f"Ground truths shape: {ground_truths.shape}")

    if ground_truths.ndim != 2:
        raise ValueError(
            f"ground_truths must have shape (B, N), got {ground_truths.shape}")
    if predictions.ndim == 3:
        predictions = predictions[:, :, 0]
    if predictions.shape != ground_truths.shape:
        raise ValueError(
            f"predictions {predictions.shape} and ground_truths "
            f"{ground_truths.shape} do not describe the same points")

    pred_mask = predictions >= threshold   # binarise predictions
    gt_mask = ground_truths >= threshold   # binarise ground truth

    intersection = np.logical_and(pred_mask, gt_mask).sum(axis=1)
    union = np.logical_or(pred_mask, gt_mask).sum(axis=1)

    # Samples without any positive point in either mask have no defined IoU;
    # mark them NaN so that nanmean skips them.
    iou_per_sample = np.full(union.shape, np.nan, dtype=float)
    valid = union > 0
    iou_per_sample[valid] = intersection[valid] / union[valid]

    return np.nanmean(iou_per_sample)

# RUN ALL

In [133]:
# Notebook-level results of this cell; they stay None if the loop body below
# never reaches the unpacking step. The evaluation cell reads all three.
model_instance = None
targets = None
vertex = None

for batch_idx, batch in enumerate(loader["val_loader"]):

    # Skip the first validation sample; the sample at index 1 is the one used.
    if batch_idx == 0:
       continue
    '''
    if batch_idx == 1:
       continue
    '''

    # Unpack the batch
    data, data1, targets, modelid, modelcat = batch   # targets = ground truth (the region that should be highlighted)
    vertex = data
    #pointToMesh(data) #


    # Train a highlighter for this sample.
    model_instance = optimize(data, targets, modelcat)

    # Only one sample is processed per run.
    break



In [None]:
# Model predictions
# Use the notebook-wide `device` (the one the MLP was moved to) instead of a
# hard-coded .cuda().
vertex, targets = vertex.float().to(device), targets.float().to(device)

# The highlighter ends with Softmax(dim=1), i.e. it expects one (N, 3) point
# set at a time. Feeding the batched (B, N, 3) tensor directly would normalise
# over the points axis instead of the class axis, so each sample is run on
# its own.
# NOTE(review): the MLP was trained on MeshNormalizer-normalised mesh vertices
# while `vertex` holds pc_normalize-d points — confirm the frames agree.
model_instance.eval()
with torch.no_grad():
    class_probs = torch.stack([model_instance(points) for points in vertex])  # [B, N, 2]

# The network already outputs probabilities, so no sigmoid is applied (a
# sigmoid would squash them into [0.5, 0.73] and every point would pass the
# mIoU threshold). Class 0 is the highlight class, as in save_final_results.
highlight_mask = class_probs.argmax(dim=-1, keepdim=True) == 0
afford_pred = highlight_mask.float().cpu().numpy()  # Shape: [B, N, 1]

# Column 16 of the targets is the affordance being evaluated.
# NOTE(review): hard-coded index — confirm it matches the prompt's affordance.
new_targets = targets[:, :, 16]

# Ground truth
ground_truth = new_targets.detach().cpu().numpy()  # Shape: [B, N]

# mIoU computation
miou = calculate_miou(afford_pred, ground_truth)
print(f"Mean IOU: {miou}")

In [None]:

# Number of objects in the validation loader
num_val_objects = len(loader["val_loader"].dataset)
print(f"Number of objects in val_loader: {num_val_objects}")

# Number of objects in the test loader
num_test_objects = len(loader["test_loader"].dataset)
print(f"Number of objects in test_loader: {num_test_objects}")


Number of objects in val_loader: 5
Number of objects in test_loader: 10


# TODO: domani:

- OTTENERE UN MESH DI QUALITA' MAGGIORE  !!

In [None]:
# Clean-up: delete the cloned repository directory. After this, the setup
# cell at the top has to be run again before any other cell.
!rm -rf /content/AML_group
