#### Installation

In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install open-clip-torch

!pip install open3d

import os

if not os.path.exists("AML_group"):
    !git clone https://github.com/AlesCarl/AML_group.git
else:
    print("Repository già clonato.")
%cd AML_group

from google.colab import drive
drive.mount('/content/drive')

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-8ghf9l14
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-8ghf9l14
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=a24f7dfb343d8e8e966cfa8107fefb32a212d45a959efcf3ac80dd48c73d981e
  Stored in directory: /tmp/pip-ephem-wheel-cache-j29ru429/wheels/3f/7c/a4/9b490845988bf7a4d

#### Imports

In [None]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision
import open_clip
import open3d as o3d
import numpy as np
import h5py
import json
import pickle as pkl


from itertools import permutations, product
from Normalization import MeshNormalizer
from utils import device, color_mesh
from mesh import Mesh
from render import Renderer

from pathlib import Path
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision.transforms import functional as F
from PIL import Image


# Seed every random number generator used below (PyTorch CPU/CUDA, Python, NumPy).
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
# Turn off cuDNN autotuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Warp 1.5.1 initialized:
   CUDA Toolkit 12.6, Driver 12.2
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.5.1


# Neural Highlighter Class

In [None]:
class NeuralHighlighter(nn.Module):
    """MLP that maps each input point to a probability distribution over classes.

    The network is ``num_layers`` blocks of Linear -> ReLU -> LayerNorm followed
    by a final Linear layer and a softmax over the class axis.

    Args:
        input_dim: Size of the last axis of the input (features per point).
        hidden_dim: Width of every hidden layer.
        output_dim: Number of classes predicted per point.
        num_layers: Number of hidden blocks.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(NeuralHighlighter, self).__init__()

        layers = []
        for i in range(num_layers):
            layers.append(nn.Linear(input_dim if i == 0 else hidden_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.LayerNorm([hidden_dim]))

        layers.append(nn.Linear(hidden_dim, output_dim))
        # Normalise over the LAST axis (the classes). For 2-D input (N, C) this
        # is the same as dim=1, but for batched input (B, N, C) dim=1 would
        # normalise across the N points instead of across the classes.
        layers.append(nn.Softmax(dim=-1))

        # Kept as a ModuleList so parameter names ("mlp.<idx>.*") are unchanged.
        self.mlp = nn.ModuleList(layers)

    def forward(self, x):
        """Return class probabilities with shape ``x.shape[:-1] + (output_dim,)``."""
        for layer in self.mlp:
            x = layer(x)
        return x

## Utility Functions

In [17]:
def get_clip_model(clip_model):
    """Load a CLIP model by name and return it (the preprocessing transform is discarded).

    Args:
        clip_model: Model name passed to ``clip.load`` (e.g. ``'ViT-L/14'``).

    Returns:
        The loaded CLIP model, placed on CUDA when available, otherwise on CPU.
    """
    # Use a distinct local name so the module-level `device` imported from
    # utils is not shadowed, and fall back to CPU instead of failing when CUDA
    # is unavailable.
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, _preprocess = clip.load(clip_model, device=target_device)
    return model

def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Colour the mesh with the MLP's hard (argmax) prediction and save it with a render.

    Writes ``<log_dir>/<name>.ply`` with per-vertex colours and
    ``<log_dir>/final_render.jpg`` with five rendered views.

    Args:
        log_dir: Directory that receives the outputs.
        name: File stem of the exported PLY.
        mesh: Mesh object; it is recoloured in place via ``color_mesh`` and exported.
        mlp: Trained highlighter; switched to eval mode here.
        vertices: Input fed to ``mlp``.
        colors: Accepted for interface compatibility but not used: the palette
            is rebuilt below from the fixed highlight/gray values.
        render: Renderer exposing ``render_views``.
        background: Background colour forwarded to ``render_views``.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        # Turn the soft prediction into a one-hot assignment per vertex.
        one_hot = torch.zeros(probs.shape).to(device).scatter_(1, max_idx, 1)

        # 0-255 integer colours are used for the PLY export; the 0-1 palette
        # derived from them is used for rendering.
        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, mesh, palette)

        rendered_images, _, _ = render.render_views(mesh, num_views=5, show=False, center_azim=0, center_elev=0, std=1, return_views=True, lighting=True, background=background)

        # Class 0 is the highlighted class; everything else is gray.
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')

def _mean_cosine_similarity(encoded_renders, encoded_text):
    """Cosine similarity between the view-averaged image embedding and the text embedding."""
    if encoded_text.shape[0] > 1:
        # Several prompts: compare the mean image embedding with the mean text embedding.
        return torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                       torch.mean(encoded_text, dim=0), dim=0)
    return torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                   encoded_text)


def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):
    """Negative CLIP similarity between rendered views and the encoded text.

    The result is a loss to MINIMISE in both modes: lower means the renders
    match the prompt better.

    Args:
        n_augs: Number of augmented passes to average; ``0`` uses
            ``clip_transform`` once, without augmentation.
        rendered_images: Batch of rendered views.
        encoded_text: Text embeddings with one row per prompt.
        clip_transform: Transform applied when ``n_augs == 0``.
        augment_transform: Transform applied once per augmented pass.
        clip_model: Object exposing ``encode_image``.

    Returns:
        The negated similarity, averaged over ``n_augs`` passes when ``n_augs > 0``.

    Raises:
        ValueError: If ``n_augs`` is negative.
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        # Negated so this branch agrees with the augmented branch; returning the
        # raw similarity would make the optimiser push the render AWAY from the prompt.
        return -_mean_cosine_similarity(encoded_renders, encoded_text)

    loss = 0.0
    for _ in range(n_augs):
        augmented_image = augment_transform(rendered_images)
        encoded_renders = clip_model.encode_image(augmented_image)
        loss -= _mean_cosine_similarity(encoded_renders, encoded_text)
    return loss / n_augs

def save_renders(dir, i, rendered_images, name=None):
    """Save a batch of rendered images as a single image file under ``dir``.

    When ``name`` is given it is used as the file name; otherwise the file is
    written to ``renders/iter_<i>.jpg`` inside ``dir``.
    """
    filename = 'renders/iter_{}.jpg'.format(i) if name is None else name
    torchvision.utils.save_image(rendered_images, os.path.join(dir, filename))

# Optimizer Loop

In [18]:
def optimize(obj_path, learning_rate, n_layers, n_views, n_augs, prompt, augment_transform=None,
             n_iter=1000, output_dir='./output/', clip_model_name='ViT-L/14'):
    """Train a NeuralHighlighter on one mesh so that CLIP matches its renders to ``prompt``.

    Each iteration predicts per-vertex class probabilities, colours the mesh
    with them, renders ``n_views`` views and takes an Adam step on the CLIP
    loss. Every 100 iterations the current renders are saved and a line is
    appended to ``training_info.txt``; at the end the final mesh and render
    are written through ``save_final_results``.

    Args:
        obj_path: Path of the mesh file to load.
        learning_rate: Adam learning rate.
        n_layers: Number of hidden blocks of the highlighter MLP.
        n_views: Number of views rendered per iteration.
        n_augs: Number of augmented passes used by ``clip_loss``.
        prompt: Text prompt encoded with CLIP.
        augment_transform: Optional augmentation pipeline; when ``None`` a
            default crop + random-perspective + CLIP-normalisation pipeline is built.
        n_iter: Number of optimisation steps (default 1000, the previous fixed value).
        output_dir: Directory receiving renders, logs and the final mesh.
        clip_model_name: CLIP variant passed to ``get_clip_model``.

    Returns:
        The trained ``NeuralHighlighter``.
    """
    render_res = 224  # resolution of the rendered views
    res = 224         # resolution of the images handed to CLIP

    input_dim = 3
    hidden_dim = 256
    output_dim = 2    # one probability per entry of `colors` below

    Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

    render = Renderer(dim=(render_res, render_res), background_image='./data/bg1.jpg')
    mesh = Mesh(obj_path)
    MeshNormalizer(mesh)()

    # White background colour passed to the renderer.
    background = torch.tensor((1., 1., 1.)).to(device)

    # CLIP and augmentation transform
    clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    clip_transform = transforms.Compose([
        transforms.Resize((res, res)),
        clip_normalizer
    ])
    if augment_transform is None:
        augment_transform = transforms.Compose([
            transforms.RandomResizedCrop(res, scale=(1, 1)),
            transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
            clip_normalizer
        ])

    # MLP and optimizer settings
    mlp = NeuralHighlighter(input_dim, hidden_dim, output_dim, n_layers).to(device)
    optim = torch.optim.Adam(mlp.parameters(), learning_rate)

    # Palette: row 0 is the highlighter colour, row 1 is gray.
    colors = torch.tensor([[204/255, 1., 0.], [180/255, 180/255, 180/255]]).to(device)

    # Encode the prompt with CLIP once; it stays fixed during the optimisation.
    model = get_clip_model(clip_model_name)

    with torch.no_grad():
        prompt_token = clip.tokenize([prompt]).to(device)
        encoded_text = model.encode_text(prompt_token)
        encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

    vertices = copy.deepcopy(mesh.vertices)

    losses = []

    # Optimization loop
    for i in tqdm(range(n_iter)):
        optim.zero_grad()

        # Predict highlight probabilities
        pred_class = mlp(vertices)

        # Colour and render the mesh
        color_mesh(pred_class, mesh, colors)
        rendered_images, _, _ = render.render_views(mesh, num_views=n_views, show=False, center_azim=0, center_elev=0, std=1, return_views=True, lighting=True, background=background)

        # Calculate CLIP loss
        loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, model)
        # NOTE(review): retain_graph=True is kept from the original; the forward
        # pass is redone every iteration, so it may be unnecessary - confirm
        # before removing.
        loss.backward(retain_graph=True)

        optim.step()

        # .item() already yields a plain Python float, so no no_grad block is needed.
        losses.append(loss.item())

        # Report results
        if i % 100 == 0:
            recent_mean = np.mean(losses[-100:])
            print("Last 100 CLIP score: {}".format(recent_mean))
            save_renders(output_dir, i, rendered_images)
            with open(os.path.join(output_dir, "training_info.txt"), "a") as f:
                f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {recent_mean}, CLIP score {losses[-1]}\n")

    # Save results
    save_final_results(output_dir, 'final_mesh', mesh, mlp, vertices, colors, render, background)

    return mlp

# First Part

In [None]:
# Mesh to highlight.
obj_path = 'data/horse.obj'

# Arguments passed to optimize() below.
learning_rate = 0.0001
n_layers = 5
n_views = 4
n_augs = 4

# Text prompt that optimize() encodes with CLIP.
prompt= "A 3D rendering of a Horse with highlighted Shoes."

# Run the optimisation; optimize() returns the trained MLP.
model_instance = optimize(obj_path, learning_rate, n_layers, n_views, n_augs, prompt)

# Second Part

## Mesh 2 Point Cloud

In [22]:
# Path of the OBJ file
obj_path = 'data/horse.obj'

# Helper that loads an OBJ file as an Open3D TriangleMesh
def load_obj_as_triangle_mesh(obj_path):
    """Read ``obj_path`` with Open3D and return the mesh.

    Raises:
        ValueError: If the mesh that was read is empty.
    """
    triangle_mesh = o3d.io.read_triangle_mesh(obj_path)
    if not triangle_mesh.is_empty():
        return triangle_mesh
    raise ValueError(f"La mesh nel file {obj_path} non è stata caricata correttamente.")

# Load the mesh
mesh = load_obj_as_triangle_mesh(obj_path)

# Convert the mesh into a point cloud by uniformly sampling 2048 points
pcd = mesh.sample_points_uniformly(2048)

# Set the colour of every point to black
pcd.colors = o3d.utility.Vector3dVector(np.zeros((len(pcd.points), 3)))  # Black: [0, 0, 0]

# Export the point cloud in PLY format (currently disabled)
# o3d.io.write_point_cloud("candle_PC.ply", pcd)

## Point Cloud 2 Mesh

In [23]:
def pointToMesh(data, output_path, alpha=0.04, normal_radius=0.06, normal_max_nn=80, orient_k=100):
    """Reconstruct a triangle mesh from the first point cloud of a batch and save it.

    Args:
        data: Batched tensor of points; only ``data[0]`` is used.
        output_path: File path the reconstructed mesh is written to.
        alpha: Alpha value of the alpha-shape reconstruction.
        normal_radius: Search radius for normal estimation.
        normal_max_nn: Maximum number of neighbours for normal estimation.
        orient_k: Neighbour count used to orient normals consistently.

    Raises:
        IOError: If Open3D reports that the mesh could not be written.
    """
    # detach() so tensors that require grad can be converted; float64 because
    # Open3D's Vector3dVector stores doubles.
    single_data = data[0].detach().cpu().numpy().astype(np.float64)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(single_data)

    # Estimate normals
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(
        radius=normal_radius,  # search radius for the estimate
        max_nn=normal_max_nn   # maximum number of neighbours considered
    ))

    # Orient the normals consistently
    pcd.orient_normals_consistent_tangent_plane(k=orient_k)

    # Alpha-shape surface reconstruction
    mesh = o3d.geometry.TriangleMesh.create_from_point_cloud_alpha_shape(
        pcd, alpha
    )

    # Save the mesh. Fail loudly on a write error: otherwise the next stage
    # could silently load a stale file left at output_path by an earlier run.
    if not o3d.io.write_triangle_mesh(output_path, mesh):
        raise IOError(f"Could not write reconstructed mesh to {output_path}")

# Third Part

## AffordanceNet Class to handle the dataset

In [24]:
def pc_normalize(pc):
    """Centre a point cloud on its centroid and scale it into the unit sphere.

    Args:
        pc: Array of shape (num_points, dims).

    Returns:
        Tuple ``(normalized_pc, centroid, m)`` where ``m`` is the largest
        distance of any point from the centroid before scaling.
    """
    centroid = np.mean(pc, axis=0)
    pc = pc - centroid
    m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
    # When every point coincides with the centroid, m is 0; dividing would fill
    # the cloud with NaNs, so the (all-zero) centred cloud is returned as is.
    if m > 0:
        pc = pc / m
    return pc, centroid, m


class AffordNetDataset(Dataset):
    """Dataset over the shapes of one semantic class in a ``full_shape_<split>_data.pkl`` file.

    Args:
        data_dir: Directory containing the pickle file.
        split: Split name inserted into the file name (e.g. ``'train'``, ``'val'``).
        semantic_class: Value of ``"semantic class"`` to keep; defaults to
            ``"Scissors"``, the class that was previously hard-coded.

    Raises:
        ValueError: If no record of the requested class exists in the split.
    """

    def __init__(self, data_dir, split, semantic_class="Scissors"):
        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.semantic_class = semantic_class

        self.load_data()
        if not self.all_data:
            raise ValueError(
                f"No '{semantic_class}' shapes found in split '{split}' under {data_dir}"
            )
        # The affordance names of the first record define the target columns.
        self.affordance = self.all_data[0]["affordance"]

    def load_data(self):
        """Read the split's pickle file and keep only records of ``self.semantic_class``."""
        pkl_path = os.path.join(self.data_dir, 'full_shape_%s_data.pkl' % self.split)
        # NOTE: unpickling can execute arbitrary code; only load dataset files
        # from a trusted source.
        with open(pkl_path, 'rb') as f:
            temp_data = pkl.load(f)
        self.all_data = [
            {
                "shape_id": info["shape_id"],
                "semantic class": info["semantic class"],
                "affordance": info["affordance"],
                "data_info": info["full_shape"],  # coordinates and labels
            }
            for info in temp_data
            if info["semantic class"] == self.semantic_class
        ]

    def __getitem__(self, index):
        """Return ``(points, points, targets, shape_id, semantic_class)`` for one shape.

        ``points`` are the first three coordinate columns after ``pc_normalize``
        (returned twice, as before); ``targets`` holds one column per affordance.
        """
        data_dict = self.all_data[index]
        modelid = data_dict["shape_id"]
        modelcat = data_dict["semantic class"]

        data_info = data_dict["data_info"]
        coords = data_info["coordinate"].astype(np.float32)
        labels = data_info["label"]
        label_columns = [labels[aff].astype(np.float32).reshape(-1, 1) for aff in self.affordance]
        # One concatenation instead of re-allocating the array once per affordance.
        model_data = np.concatenate([coords] + label_columns, axis=1)

        datas = model_data[:, :3]
        targets = model_data[:, 3:]
        datas, _, _ = pc_normalize(datas)

        return datas, datas, targets, modelid, modelcat

    def __len__(self):
        return len(self.all_data)


def build_dataset(data_dir, test=False):
    """Build the small validation and test subsets used by this notebook.

    The validation subset is the first 5 samples (or fewer) of the 'val' split.
    The test subset is up to 10 samples starting at index 5 of the 'train'
    split. The ``test`` argument is accepted but not used.

    Returns:
        Dict with keys ``"val_set"`` and ``"test_set"``.
    """
    n_val, n_test, test_offset = 5, 10, 5

    # NOTE(review): the test subset is drawn from the 'train' split - confirm this is intended.
    test_source = AffordNetDataset(data_dir, 'train')
    val_source = AffordNetDataset(data_dir, 'val')

    # First n_val samples for validation
    val_subset = Subset(val_source, list(range(min(n_val, len(val_source)))))

    # Up to n_test samples for testing, starting at the 6th sample (index 5)
    test_stop = test_offset + min(n_test, len(test_source) - test_offset)
    test_subset = Subset(test_source, list(range(test_offset, test_stop)))

    return {"val_set": val_subset, "test_set": test_subset}

def build_loader(dataset_dict):
    """Wrap the ``"val_set"`` and ``"test_set"`` datasets in batch-size-1 DataLoaders.

    Returns:
        Dict with keys ``"val_loader"`` (sequential) and ``"test_loader"`` (shuffled).
    """
    val_data = dataset_dict["val_set"]
    test_data = dataset_dict["test_set"]
    shared_kwargs = {"batch_size": 1, "drop_last": False, "num_workers": 8}

    # NOTE(review): the test loader shuffles, so a batch index does not
    # identify a fixed sample from one run to the next.
    shuffled_test = DataLoader(test_data, shuffle=True, **shared_kwargs)
    ordered_val = DataLoader(val_data, shuffle=False, **shared_kwargs)
    return {"val_loader": ordered_val, "test_loader": shuffled_test}

## Evaluate the mIOU

In [None]:
def calculate_miou(predictions, ground_truths, threshold):
    """Mean intersection-over-union between binary predictions and thresholded targets.

    Args:
        predictions: Binary (0/1) array or torch tensor of shape (batch, num_points).
        ground_truths: Array of shape (batch, num_points); values ``>= threshold``
            count as positive.
        threshold: Binarisation threshold for ``ground_truths``.

    Returns:
        Mean IoU over the batch. Samples whose union is empty are ignored; the
        result is NaN when that is true for every sample.

    Raises:
        ValueError: If ``predictions`` and ``ground_truths`` differ in shape.
    """
    # Binarise the ground truth
    gt = (np.asarray(ground_truths) >= threshold).astype(int)

    # Accept torch tensors (on any device) as well as plain arrays.
    if isinstance(predictions, torch.Tensor):
        pred = predictions.detach().cpu().numpy()
    else:
        pred = np.asarray(predictions)

    batch_size, num_points = gt.shape
    if pred.shape != gt.shape:
        raise ValueError(
            f"predictions shape {pred.shape} does not match ground truth shape {gt.shape}"
        )

    # Per-sample intersection and union of the two binary masks
    intersection = np.sum(pred * gt, axis=1)
    union = np.sum(pred + gt, axis=1) - intersection

    # Leave NaN where the union is empty so nanmean skips those samples.
    iou_per_sample = np.full(batch_size, np.nan)
    valid = union != 0
    iou_per_sample[valid] = intersection[valid] / union[valid]

    return np.nanmean(iou_per_sample)

## Load the dataset

In [25]:
# dataset_dir = '/content/drive/My Drive/full-shape'
# Directory on the mounted Google Drive that is passed to build_dataset().
dataset_dir = '/content/drive/My Drive/ColabNotebooks'

# Build the validation/test subsets (dict with "val_set" and "test_set").
dataset = build_dataset(dataset_dir)

# Wrap the subsets in DataLoaders (dict with "val_loader" and "test_loader").
loader = build_loader(dataset)

## Run

## Validation


In [None]:
# Reset the globals that the "Model Evaluation" cell reads afterwards.
model_instance = None
targets = None
vertex = None

# Arguments passed to optimize() below.
learning_rate = 0.0001
n_layers = 5
n_views = 5
n_augs = 4

prompt= "A 3D rendering of scissors showing the regions optimized for grasping"

# Skip validation batches 0-2 and run the optimisation on the next batch only.
for batch_idx, batch in enumerate(loader["val_loader"]):
    if batch_idx == 0: # good mesh
       continue

    if batch_idx == 1: # odd-looking scissors
       continue

    if batch_idx == 2: # open scissors
       continue

    # Unpack the batch
    data, data1, targets, modelid, modelcat = batch
    vertex = data
    # Reconstruct a mesh from the point cloud and save it for optimize().
    obj_path = 'alpha.obj'
    pointToMesh(data, obj_path)

    model_instance = optimize(obj_path, learning_rate, n_layers, n_views, n_augs, prompt)
    break

## Test


In [None]:
# Reset the globals that the "Model Evaluation" cell reads afterwards.
model_instance = None
targets = None
vertex = None

# Arguments passed to optimize() below.
learning_rate = 0.0001
n_layers = 5
n_views = 5
n_augs = 4

# Defined explicitly so this cell does not depend on whichever cell last set
# `prompt`; running it after the horse experiment would otherwise reuse the
# horse prompt for scissors.
prompt = "A 3D rendering of scissors showing the regions optimized for grasping"

for batch_idx, batch in enumerate(loader["test_loader"]):
    # Skip the first two batches and optimise on the third one only.
    if batch_idx < 2:
        continue

    # Unpack the batch
    data, data1, targets, modelid, modelcat = batch
    vertex = data
    # Reconstruct a mesh from the point cloud and save it for optimize().
    obj_path = 'alpha.obj'
    pointToMesh(data, obj_path)

    model_instance = optimize(obj_path, learning_rate, n_layers, n_views, n_augs, prompt)
    break

Background image path:  ./data/bg1.jpg
bg_image shape: (5760, 3840)
Background image size:  <built-in method size of Tensor object at 0x7f23c8cc2750>


 31%|███████████▉                           | 274M/890M [00:05<00:12, 50.9MiB/s]

## Model Evaluation

In [None]:
# Fail with a clear message when the Validation/Test cell above did not
# produce a model (for example because the loader had too few batches).
if model_instance is None or vertex is None or targets is None:
    raise RuntimeError(
        "No trained model or sample available: run the Validation or Test cell first."
    )

model_instance1 = model_instance
model_instance1.eval()  # evaluation mode for validation/test

with torch.no_grad():  # no gradients needed for inference
    vertex, targets = vertex.float().to(device), targets.float().to(device)

    # Feed the points as a flat (num_points, 3) batch, the layout the MLP was
    # trained on in optimize(), so its softmax runs over the class axis.
    points = vertex.reshape(-1, vertex.shape[-1])
    afford_pred = model_instance1(points)

    # Predicted class (0 or 1) per point
    afford_pred = torch.argmax(afford_pred, dim=-1)

    # Class 0 is the highlighted class, so flip 0 <-> 1 (XOR with 1) to make
    # "highlighted" the positive label.
    afford_pred = afford_pred ^ 1

    # Restore the batch axis: (batch, num_points)
    afford_pred = afford_pred.reshape(vertex.shape[:-1])

    new_targets = targets[:, :, 0]
    ground_truth = new_targets.detach().cpu().numpy()  # (batch, num_points)

# Compute the mIoU
miou = calculate_miou(afford_pred, ground_truth, threshold=0.1)
print(f"Mean IOU: {miou}")

NameError: name 'model_instance' is not defined

# Extension

In [None]:
class RandomApplySubset:
    """Apply a random subset of transformations, in random order, on every call."""

    def __init__(self, transforms, n_select):
        """
        Initialize with a list of transformations and the number of transformations to apply.
        :param transforms: List of possible transformations.
        :param n_select: Number of transformations to randomly select and apply.
        :raises ValueError: If ``n_select`` is negative or exceeds ``len(transforms)``.
        """
        # Validate here so a bad configuration fails at construction time
        # instead of in the middle of the training loop.
        if not 0 <= n_select <= len(transforms):
            raise ValueError(
                f"n_select must be between 0 and {len(transforms)}, got {n_select}"
            )
        self.transforms = transforms
        self.n_select = n_select

    def __call__(self, image):
        """
        Randomly selects a subset of transformations and applies them to the image.
        :param image: Input image to transform.
        :return: Transformed image.
        """
        # No per-transform logging: this runs for every augmentation of every
        # training iteration and would flood the notebook output.
        for transform in random.sample(self.transforms, self.n_select):
            image = transform(image)
        return image


class BackgroundTransform:
    """Blend each rendered image with a randomly chosen, Gaussian-blurred background."""

    def __init__(self, backgrounds, blur_sigma=(0.1, 2.0)):
        """
        Args:
            backgrounds (list of torch.Tensor): Background images as tensors, or
                1-D tensors holding one value per channel for a solid colour.
            blur_sigma (tuple): Min and max sigma for Gaussian blur.

        Raises:
            ValueError: If ``backgrounds`` is not a non-empty list.
            TypeError: If any background is not a torch tensor.
        """
        if not isinstance(backgrounds, list) or len(backgrounds) == 0:
            raise ValueError("BackgroundTransform requires a non-empty list of background tensors.")
        if not all(isinstance(bg, torch.Tensor) for bg in backgrounds):
            raise TypeError("All backgrounds must be torch tensors.")

        self.backgrounds = backgrounds
        self.blur_sigma = blur_sigma

    def __call__(self, rendered_images):
        """
        Args:
            rendered_images (torch.Tensor): Input images tensor of shape (B, C, H, W).

        Returns:
            torch.Tensor: Images where pixels whose channel sum is not positive
            are replaced by the blurred background.
        """
        transformed_images = []

        for rendered_image in rendered_images:
            height, width = rendered_image.shape[-2:]

            # Select a random background
            bg = random.choice(self.backgrounds).to(rendered_image.device)

            # A 1-D tensor is a solid colour (one value per channel): expand it
            # to a full image so it can be resized and blended like the others.
            if bg.ndim == 1:
                bg = bg.view(-1, 1, 1).expand(-1, height, width).contiguous()

            # Resize the background to match the rendered image
            bg = transforms.Resize((height, width))(bg)

            # Apply Gaussian blur to the background
            blurred_bg = F.gaussian_blur(bg, kernel_size=(5, 5), sigma=random.uniform(*self.blur_sigma))

            # Mask of pixels whose channel sum is positive; these keep the render.
            # NOTE(review): a render drawn on a non-black background is positive
            # everywhere, so nothing would be replaced - confirm the render
            # background used together with this transform.
            mask = (rendered_image.sum(dim=0, keepdim=True) > 0).float()

            # Blend the rendered image with the blurred background
            transformed_image = rendered_image * mask + blurred_bg * (1 - mask)

            # Ensure the result carries a channel axis
            if transformed_image.ndim == 2:
                transformed_image = transformed_image.unsqueeze(0)

            transformed_images.append(transformed_image)

        return torch.stack(transformed_images).to(rendered_images.device)

In [None]:
# Mesh to highlight.
obj_path = 'data/horse.obj'

# Arguments passed to optimize() below.
learning_rate = 0.0001
n_layers = 4
n_views = 3
n_augs = 4

prompt = "A 3D rendering of a Horse with highlighted Shoes."

# Channel-wise normalisation applied as the last step of the augmentation pipeline.
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Candidate backgrounds: three images resized to 224x224 plus a 3-value white colour.
# They are only consumed by the BackgroundTransform step, which is commented out
# in the Compose below.
backgrounds = [
    transforms.ToTensor()(Image.open("./data/bg1.jpg").resize((224, 224))).to(device),
    transforms.ToTensor()(Image.open("./data/bg2.jpg").resize((224, 224))).to(device),
    transforms.ToTensor()(Image.open("./data/bg3.jpg").resize((224, 224))).to(device),
    torch.tensor((1., 1., 1.)).to(device)
]

# Pool of augmentations that RandomApplySubset draws from.
possible_augmentations = [
    transforms.RandomResizedCrop(224, scale=(1, 1)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 2.0))
]

# Custom augmentation that randomly applies a subset
augment_transform = transforms.Compose([
    # BackgroundTransform(backgrounds, blur_sigma=(0.1, 2.0)),
    RandomApplySubset(possible_augmentations, n_select=2),  # Randomly apply 2 of the augmentations on each call
    clip_normalizer
])

# Same optimisation as in the first part, with the custom augmentation pipeline.
model_instance = optimize(obj_path, learning_rate, n_layers, n_views, n_augs, prompt, augment_transform=augment_transform)