In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install open-clip-torch


!git clone https://github.com/AlesCarl/AML_group.git
%cd AML_group


In [2]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision
import open_clip


from itertools import permutations, product
from Normalization import MeshNormalizer


from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(NeuralHighlighter, self).__init__()

        layers = []
        for i in range(num_layers):
            layers.append(nn.Linear(input_dim if i == 0 else hidden_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.LayerNorm([hidden_dim]))

        layers.append(nn.Linear(hidden_dim, output_dim))
        layers.append(nn.Softmax(dim=1))

        self.mlp = nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.mlp:
            x = layer(x)
        return x


def get_clip_model(clip_model):
    device = 'cuda'
    model, preprocess = clip.load(clip_model, device=device) # jit = True for better perfomance
    return model



# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        # for renders
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)
        sampled_mesh = mesh

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        colors = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, sampled_mesh, colors)
        rendered_images, _, _ = render.render_views(sampled_mesh, num_views=5,
                                                                        show=False,
                                                                        center_azim=0,
                                                                        center_elev=0,
                                                                        std=1,
                                                                        return_views=True,
                                                                        lighting=True,
                                                                        background=background)
        # for mesh
        final_color = torch.zeros(vertices.shape[0], 3).to(device)
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        if encoded_text.shape[0] > 1:
            loss = torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                                torch.mean(encoded_text, dim=0), dim=0)
        else:
            loss = torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                                encoded_text)
    elif n_augs > 0:
        loss = 0.0
        for _ in range(n_augs):
            augmented_image = augment_transform(rendered_images)
            encoded_renders = clip_model.encode_image(augmented_image)
            if encoded_text.shape[0] > 1:
                loss -= torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                                    torch.mean(encoded_text, dim=0), dim=0)
            else:
                loss -= torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                                    encoded_text)
    return loss
    # "1-loss" removed -> now best value in output is -1

def save_renders(dir, i, rendered_images, name=None):
    if name is not None:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, name))
    else:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, 'renders/iter_{}.jpg'.format(i)))


Warp 1.5.1 initialized:
   CUDA Toolkit 12.6, Driver 12.2
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.5.1


In [4]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-determinstic)
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


render_res = 224
n_iter = 1000
res = 224
obj_path = 'data/horse.obj' # 'APPROXIMATE.obj' if we want to test the point cloud to mesh conversion
output_dir = './output/'
clip_model = 'ViT-L/14'

input_dim = 3
hidden_dim = 256
output_dim = 2

# Hyper-parameters
learning_rate = 0.0001
n_layers = 3 # depth 4
n_views = 3 # 5
n_augs = 1 # 4

Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# Initialize variables
background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir


# CLIP and augmentation transform
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])


# MLP Settings
mlp = NeuralHighlighter(input_dim, hidden_dim, output_dim, n_layers).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# list of possible colors
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# encode prompt with CLIP
model = get_clip_model(clip_model)

known_object = 'horse'
classes = 'Shoes'

prompt = "A 3D render of a gray {} with highlighted {}".format(known_object, classes)
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)


vertices = copy.deepcopy(mesh.vertices)

losses = []

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=1,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, model)
    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")


# save results
save_final_results(log_dir, 'Primo test', mesh, mlp, vertices, colors, render, background)

# Save prompts
with open(os.path.join(dir, prompt), "w") as f:
    f.write('')

100%|████████████████████████████████████████| 890M/890M [00:08<00:00, 105MiB/s]
  0%|          | 1/1000 [00:02<33:27,  2.01s/it]

Last 100 CLIP score: -0.2398681640625


 10%|█         | 101/1000 [00:22<03:03,  4.89it/s]

Last 100 CLIP score: -0.30088134765625


 20%|██        | 201/1000 [00:43<02:46,  4.81it/s]

Last 100 CLIP score: -0.30932373046875


 30%|███       | 301/1000 [01:04<02:29,  4.69it/s]

Last 100 CLIP score: -0.31841064453125


 40%|████      | 401/1000 [01:25<02:07,  4.71it/s]

Last 100 CLIP score: -0.32242431640625


 50%|█████     | 501/1000 [01:47<01:47,  4.63it/s]

Last 100 CLIP score: -0.3262255859375


 60%|██████    | 601/1000 [02:08<01:28,  4.53it/s]

Last 100 CLIP score: -0.32611328125


 70%|███████   | 701/1000 [02:30<01:05,  4.56it/s]

Last 100 CLIP score: -0.3264404296875


 80%|████████  | 801/1000 [02:52<00:43,  4.59it/s]

Last 100 CLIP score: -0.3250048828125


 90%|█████████ | 901/1000 [03:13<00:21,  4.59it/s]

Last 100 CLIP score: -0.321964111328125


100%|██████████| 1000/1000 [03:35<00:00,  4.64it/s]


TypeError: expected str, bytes or os.PathLike object, not builtin_function_or_method

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mesh to Point Cloud using open3d

!pip install open3d

import open3d as o3d
import numpy as np


# Percorso del file OBJ
obj_path = 'data/candle.obj'

# Funzione per caricare la mesh come TriangleMesh
def load_obj_as_triangle_mesh(obj_path):
    # Carica il file .obj
    mesh = o3d.io.read_triangle_mesh(obj_path)
    if mesh.is_empty():
        raise ValueError(f"La mesh nel file {obj_path} non è stata caricata correttamente.")
    return mesh

# Carica la mesh
mesh = load_obj_as_triangle_mesh(obj_path)

# Converte la mesh in una point cloud campionando punti uniformemente
pcd = mesh.sample_points_uniformly(10000)

# Imposta il colore nero per tutti i punti
pcd.colors = o3d.utility.Vector3dVector(np.zeros((len(pcd.points), 3)))  # Colore nero: [0, 0, 0]


# Esporta la point cloud in formato PLY
# o3d.io.write_point_cloud("candle_PC.ply", pcd)



True

In [None]:
# Point cloud to mesh

pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30))

with o3d.utility.VerbosityContextManager(
        o3d.utility.VerbosityLevel.Debug) as cm:
    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
        pcd, depth=9)
print(mesh)

o3d.io.write_triangle_mesh("APPROXIMATE.obj", mesh)

In [10]:
!pip install transforms3d


from transforms3d.quaternions import quat2mat
from transforms3d.euler import euler2mat



def random_rotation_matrix():
    rand = np.random.rand(3)
    r1 = np.sqrt(1.0 - rand[0])
    r2 = np.sqrt(rand[0])
    pi2 = np.pi * 2.0
    t1 = pi2 * rand[1]
    t2 = pi2 * rand[2]
    q = np.array([np.cos(t2)*r2, np.sin(t1)*r1, np.cos(t1)*r1, np.sin(t2)*r2])
    return quat2mat(q)




def rotate_point_cloud_SO3(batch_data):
    rotated_data = np.zeros(batch_data.shape, dtype=np.float32)
    for k in range(batch_data.shape[0]):
        rotation_matrix = random_rotation_matrix()
        shape_pc = batch_data[k, ...]
        rotated_data[k, ...] = (
            np.matmul(rotation_matrix, shape_pc.reshape((-1, 3)).T)).T
    return rotated_data


def rotate_point_cloud_y(batch_data):

    rotated_data = np.zeros(batch_data.shape, dtype=np.float32)
    for k in range(batch_data.shape[0]):
        rotation_angle = np.random.uniform() * 2 * np.pi
        rotation_matrix = euler2mat(0, rotation_angle, 0)
        shape_pc = batch_data[k, ...]
        rotated_data[k, ...] = (
            np.matmul(rotation_matrix, shape_pc.reshape((-1, 3)).T)).T
    return rotated_data

Collecting transforms3d
  Downloading transforms3d-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Downloading transforms3d-0.4.2-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transforms3d
Successfully installed transforms3d-0.4.2


In [12]:
import os
from os.path import join as opj
import numpy as np
from torch.utils.data import Dataset
import h5py
import json
import pickle as pkl


def pc_normalize(pc):
    centroid = np.mean(pc, axis=0)
    pc = pc - centroid
    m = np.max(np.sqrt(np.sum(pc**2, axis=1)))
    pc = pc / m
    return pc, centroid, m


class AffordNetDataset(Dataset):
    def __init__(self, data_dir, split):
        super().__init__()
        self.data_dir = data_dir
        self.split = split

        self.load_data()

        self.affordance = self.all_data[0]["affordance"]

        return

    def load_data(self):
        self.all_data = []
        with open(opj(self.data_dir, 'full_shape_%s_data.pkl' % self.split), 'rb') as f:
                    temp_data = pkl.load(f)
        for index, info in enumerate(temp_data):
              temp_info = {}
              temp_info["shape_id"] = info["shape_id"]
              temp_info["semantic class"] = info["semantic class"]
              temp_info["affordance"] = info["affordance"]
              temp_info["data_info"] = info["full_shape"]
              self.all_data.append(temp_info)

    def __getitem__(self, index):

        data_dict = self.all_data[index]
        modelid = data_dict["shape_id"]
        modelcat = data_dict["semantic class"]

        data_info = data_dict["data_info"]
        model_data = data_info["coordinate"].astype(np.float32)
        labels = data_info["label"]
        for aff in self.affordance:
            temp = labels[aff].astype(np.float32).reshape(-1, 1)
            model_data = np.concatenate((model_data, temp), axis=1)

        datas = model_data[:, :3]
        targets = model_data[:, 3:]

        datas, _, _ = pc_normalize(datas)

        return datas, datas, targets, modelid, modelcat

    def __len__(self):
        return len(self.all_data)

In [13]:
from torch.utils.data import DataLoader

def build_dataset(data_dir, test=False):
    train_set = AffordNetDataset(
            data_dir, 'train')
    val_set = AffordNetDataset(
            data_dir, 'val')
    dataset_dict = dict(train_set=train_set, val_set=val_set)
    return dataset_dict


def build_loader(dataset_dict):
    train_set = dataset_dict["train_set"]
    val_set = dataset_dict["val_set"]
    batch_size_factor = 1
    train_loader = DataLoader(train_set, batch_size=cfg.training_cfg.batch_size,
                              shuffle=True, drop_last=True, num_workers=8)
    val_loader = DataLoader(val_set, batch_size=1,
                            shuffle=False, num_workers=8, drop_last=False)
    loader_dict = dict(
        train_loader=train_loader,
        val_loader=val_loader
    )

    return loader_dict

In [14]:
import torch
import numpy as np
from os.path import join as opj
from tqdm import tqdm
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score


def evaluation(model, test_loader, affordance):
    results = torch.zeros(
        (len(test_loader), 2048, len(affordance)))
    targets = torch.zeros(
        (len(test_loader), 2048, len(affordance)))
    coordinate = np.zeros((0, 2048, 3))
    modelids = []
    modelcats = []
    with torch.no_grad():
        model.eval()
        total_L2distance = 0
        count = 0.0
        for i,  temp_data in tqdm(enumerate(test_loader), total=len(test_loader), smoothing=0.9):

            # modelcat identifies the category of the 3d model we are passing to our model
            (data, data1, label, modelid, modelcat) = temp_data

            data, label = data.float().cuda(), label.float().cuda()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            num_point = data.size()[2]
            count += batch_size * num_point
            afford_pred = torch.sigmoid(model(data))
            afford_pred = afford_pred.permute(0, 2, 1).contiguous()
            L2distance = torch.sum(
                torch.pow(label-afford_pred, 2), dim=(0, 1))
            total_L2distance += L2distance
            score = afford_pred.squeeze()
            target_score = label.squeeze()
            results[i, :, :] = score
            targets[i, :, :] = target_score
            modelids.append(modelid[0])
            modelcats.append(modelcat[0])

    results = results.detach().cpu().numpy()
    targets = targets.detach().cpu().numpy()
    IOU = np.zeros((targets.shape[0], targets.shape[2]))
    targets = targets >= 0.5
    targets = targets.astype(int)

    IOU_thres = np.linspace(0, 1, 20)
    for i in range(IOU.shape[0]):
        t = targets[i, :, :]
        p = results[i, :, :]
        for j in range(t.shape[1]):
            t_true = t[:, j]
            p_score = p[:, j]
            if np.sum(t_true) == 0:
                IOU[i, j] = np.nan
            else:
                p_mask = (p_score > 0.5).astype(int)
                temp_iou = []
                for thre in IOU_thres:
                    p_mask = (p_score >= thre).astype(int)
                    intersect = np.sum(p_mask & t_true)
                    union = np.sum(p_mask | t_true)
                    temp_iou.append(1.*intersect/union)
                temp_iou = np.array(temp_iou)
                aiou = np.mean(temp_iou)
                IOU[i, j] = aiou

    IOU = np.nanmean(IOU, axis=0)

    outstr = 'Test :: test maIOU: %.6f' % (
        np.mean(IOU))

    return np.mean(IOU)

In [15]:
dataset_dir = './data'

dataset = build_dataset(dataset_dir)
loader = build_loader(dataset)

mIOU = evaluation(mlp, loader, ['Grasp', 'Push', 'Wrap'])
print(mIOU)

FileNotFoundError: [Errno 2] No such file or directory: './data/full_shape_train_data.pkl'