In [38]:
# Setup needed in order to work with GH and COLAB
!git clone https://github.com/GDennis01/Affordance_Highlighting_Project_2024
import sys
sys.path.append("/content/Affordance_Highlighting_Project_2024")
sys.path.append("/content/Affordance_Highlighting_Project_2024/data")



fatal: destination path 'Affordance_Highlighting_Project_2024' already exists and is not an empty directory.


In [39]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.0_cu121.html

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-476b57v9
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-476b57v9
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Looking in links: https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.0_cu121.html


In [40]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision

from itertools import permutations, product
from Normalization.MeshNormalizer import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    def __init__(self):
        super(NeuralHighlighter, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(3, 1024),
            nn.ReLU(),
            nn.LayerNorm(1024),
        )

    def print_model(self):
        print(self.mlp)

    def forward(self, x):
        # raise NotImplementedError("Implement the Neural Highlighter Model as described in the project instructions")
        #  the model is a stack of Linear, ReLU, LayerNorm modules

        x = self.model(x)
        return x

def get_clip_model(clipmodel):
    model, preprocess = clip.load(clipmodel)
    return model, preprocess

# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        # for renders
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)
        sampled_mesh = mesh

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        colors = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, sampled_mesh, colors)
        rendered_images, _, _ = render.render_views(sampled_mesh, num_views=5,
                                                                        show=False,
                                                                        center_azim=0,
                                                                        center_elev=0,
                                                                        std=1,
                                                                        return_views=True,
                                                                        lighting=True,
                                                                        background=background)
        # for mesh
        final_color = torch.zeros(vertices.shape[0], 3).to(device)
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(embedding,images,clip_model):
    # Loss = argmin di  1- (e_i*e_t)/(|e_i|*|e_t|) dove e_i è lo score dell'encoding di Clip Image Encoder e e_t quello del Clip Text encoder
     # raise NotImplementedError("\
    #Implement the Neural Highlighter Model as described in the project instructions \
    #Pass to this function the language embedding, the rendered images and the clip model\
    #return the calculated loss     \
    # ")
    text_features = clip_model.encode_text(embedding)
    img_features = clip_model.encode_image(images)

    cosine_similarity = ((text_features*img_features) / (text_features.norm(dim=-1, keepdim=True)  * img_features.norm(dim=-1, keepdim=True))).mean()
    cosine_similarity = 1- cosine_similarity
    return cosine_similarity
    # implement the loss


def save_renders(dir, i, rendered_images, name=None):
    if name is not None:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, name))
    else:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, 'renders/iter_{}.jpg'.format(i)))


In [41]:
seed = 0
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-determinstic)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


render_res = 224
learning_rate = 0.0001
n_iter = 1000 # TODO: rimettere a 2500
res = 224
obj_path = '/content/Affordance_Highlighting_Project_2024/data/horse.obj'
n_augs = 5
output_dir = './output/'
clip_model = 'ViT-L/14'

Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# Initialize variables
bg = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir


# MLP Settings
mlp = NeuralHighlighter().to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# list of possible colors
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# encode prompt with CLIP
model,preprocess = get_clip_model(clip_model)
# print(model)
prompt = 'horseshoe'
tokenized_text = clip.tokenize([prompt]).to(device)

vertices = copy.deepcopy(mesh.vertices)
n_views = 5

losses = []

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=1,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=bg)

    # Calculate CLIP Loss
    loss = clip_loss(tokenized_text,rendered_images,model)
    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")


# save results
save_final_results(log_dir, "cavallo",mesh, mlp, vertices, colors, render, bg)

# Save prompts
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

  0%|          | 1/1000 [00:00<08:15,  2.01it/s]

Last 100 CLIP score: 0.99951171875


 10%|█         | 101/1000 [00:38<05:48,  2.58it/s]

Last 100 CLIP score: 0.9995166015625


 20%|██        | 201/1000 [01:16<05:07,  2.60it/s]

Last 100 CLIP score: 0.999521484375


 30%|███       | 301/1000 [01:54<04:37,  2.52it/s]

Last 100 CLIP score: 0.99951171875


 40%|████      | 401/1000 [02:32<03:52,  2.58it/s]

Last 100 CLIP score: 0.99953125


 50%|█████     | 501/1000 [03:09<03:13,  2.58it/s]

Last 100 CLIP score: 0.9995263671875


 60%|██████    | 601/1000 [03:47<02:34,  2.59it/s]

Last 100 CLIP score: 0.9995166015625


 70%|███████   | 701/1000 [04:26<01:55,  2.59it/s]

Last 100 CLIP score: 0.9995166015625


 80%|████████  | 801/1000 [05:04<01:26,  2.31it/s]

Last 100 CLIP score: 0.9995263671875


 90%|█████████ | 901/1000 [05:42<00:37,  2.66it/s]

Last 100 CLIP score: 0.99951171875


100%|██████████| 1000/1000 [06:19<00:00,  2.63it/s]


In [42]:
# save results
print(background)


tensor([1., 1., 1.], device='cuda:0')
