## VLMap Creation
It takes around 20 minutes to build a VLMap with around 1000 RGBD frames. We also provide a pre-built VLMap. Skip to the Landmark Indexing part of the code to directly try our map.

In [3]:

import os
data_dir = r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1"

# setup parameters
# @markdown meters per cell size
cs = 0.05 # @param {type: "number"}
VOXEL_SIZE = 0.05 # @param {type: "number"}
# @markdown map resolution (gs x gs)
gs = 1000 # @param {type: "integer"}
# @markdown camera height (used for filtering out points on the floor)
camera_height = 1.5 # @param {type: "number"}
# @markdown depth pixels subsample rate
depth_sample_rate = 100 # @param {type: "integer"}
# @markdown data where rgb, depth, pose are loaded and map are saved
data_dir = data_dir # @param {type: "string"}



In [4]:
import open3d as o3d
from utils.clip_mapping_utils import load_pose, load_semantic, load_obj2cls_dict, save_map, cvt_obj_id_2_cls_id, depth2pc, transform_pc, get_sim_cam_mat, pos2grid_id, project_point
import numpy as np
import cv2
from tqdm import tqdm

def load_depth(depth_filepath):
    with open(depth_filepath, 'rb') as f:
        depth = np.load(f)
    return depth

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [5]:
cd "C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps"

C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps


In [6]:
from lseg.modules.models.lseg_net import LSegEncNet
from lseg.additional_utils.models import resize_image, pad_image, crop_image
import clip
import torch
import torchvision.transforms as transforms


img_save_dir = data_dir
mask_version = 1
crop_size = 480 # 480
base_size = 520 # 520
lang = "door,chair,ground,ceiling,other"
labels = lang.split(",")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
clip_version = "ViT-B/32"
clip_feat_dim = {'RN50': 1024, 'RN101': 512, 'RN50x4': 640, 'RN50x16': 768,
                'RN50x64': 1024, 'ViT-B/32': 512, 'ViT-B/16': 512, 'ViT-L/14': 768}[clip_version]

print("Loading CLIP model...")
clip_model, preprocess = clip.load(clip_version)  # clip.available_models()
clip_model.to(device).eval()
lang_token = clip.tokenize(labels)
lang_token = lang_token.to(device)
with torch.no_grad():
    text_feats = clip_model.encode_text(lang_token)
    text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
text_feats = text_feats.cpu().numpy()
model = LSegEncNet(lang, arch_option=0,
                    block_depth=0,
                    activation='lrelu',
                    crop_size=crop_size)
model_state_dict = model.state_dict()
pretrained_state_dict = torch.load("lseg/checkpoints/demo_e200.ckpt")
pretrained_state_dict = {k.lstrip('net.'): v for k, v in pretrained_state_dict['state_dict'].items()}
model_state_dict.update(pretrained_state_dict)
model.load_state_dict(pretrained_state_dict)

model.eval()
model = model.cuda()

norm_mean= [0.5, 0.5, 0.5]
norm_std = [0.5, 0.5, 0.5]
padding = [0.0] * 3
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

print(f"loading scene {img_save_dir}")
rgb_dir = os.path.join(img_save_dir, "rgb")
depth_dir = os.path.join(img_save_dir, "depth")
pose_dir = os.path.join(img_save_dir, "pose")
semantic_dir = os.path.join(img_save_dir, "semantic")
obj2cls_path = os.path.join(img_save_dir, "obj2cls_dict.txt")

rgb_list = sorted(os.listdir(rgb_dir), key=lambda x: int(
    x.split("_")[-1].split(".")[0]))
depth_list = sorted(os.listdir(depth_dir), key=lambda x: int(
    x.split("_")[-1].split(".")[0]))
pose_list = sorted(os.listdir(pose_dir), key=lambda x: int(
    x.split("_")[-1].split(".")[0]))
pose_list = sorted(os.listdir(pose_dir), key=lambda x: int(
    x.split("_")[-1].split(".")[0]))
semantic_list = sorted(os.listdir(semantic_dir), key=lambda x: int(
    x.split("_")[-1].split(".")[0]))

rgb_list = [os.path.join(rgb_dir, x) for x in rgb_list]
depth_list = [os.path.join(depth_dir, x) for x in depth_list]
pose_list = [os.path.join(pose_dir, x) for x in pose_list]
semantic_list = [os.path.join(semantic_dir, x) for x in semantic_list]


map_save_dir = os.path.join(img_save_dir, "map")
os.makedirs(map_save_dir, exist_ok=True)
color_top_down_save_path = os.path.join(map_save_dir, f"color_top_down_{mask_version}.npy")
gt_save_path = os.path.join(map_save_dir, f"grid_{mask_version}_gt.npy")
grid_save_path = os.path.join(map_save_dir, f"grid_lseg_{mask_version}.npy")
weight_save_path = os.path.join(map_save_dir, f"weight_lseg_{mask_version}.npy")
obstacles_save_path = os.path.join(map_save_dir, "obstacles.npy")

obj2cls = load_obj2cls_dict(obj2cls_path)

# initialize a grid with zero position at the center
color_top_down_height = (camera_height + 1) * np.ones((gs, gs), dtype=np.float32)
color_top_down = np.zeros((gs, gs, 3), dtype=np.uint8)
gt = np.zeros((gs, gs), dtype=np.int32)
grid = np.zeros((gs, gs, clip_feat_dim), dtype=np.float32)
obstacles = np.ones((gs, gs), dtype=np.uint8)
weight = np.zeros((gs, gs), dtype=float)

save_map(color_top_down_save_path, color_top_down)
save_map(gt_save_path, gt)
save_map(grid_save_path, grid)
save_map(weight_save_path, weight)
save_map(obstacles_save_path, obstacles)

tf_list = []
data_iter = zip(rgb_list, depth_list, semantic_list, pose_list)


cuda
Loading CLIP model...


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


loading scene C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1
C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1\map\color_top_down_1.npy is saved.
C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1\map\grid_1_gt.npy is saved.
C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1\map\grid_lseg_1.npy is saved.
C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1\map\weight_lseg_1.npy is saved.
C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\data\5LpN3gDmAk7_1\map\obstacles.npy is saved.


In [7]:
def GetVoxelCoor(point):
    return ((point) / VOXEL_SIZE).astype(int)

def GetVoxelId(point, min_bound_voxel_grid):
    return ((point - min_bound_voxel_grid) / VOXEL_SIZE).astype(int) # returns 3, 1

In [8]:
class Voxel:
    def __init__(self, coord, feature_shape) -> None:
        self.coordinates = coord
        self.feature_weight = 0
        self.color_weight = 0
        self.features = np.zeros(feature_shape)
        self.colors = np.zeros((3,1))
        self.class_mask = 0

    def update_color(self, color):
        self.colors = (self.colors * self.color_weight + color) / (self.color_weight + 1)
        self.color_weight += 1

    def update_feature(self, feature):
        """
        Updates voxel features with weighted average.

        Args:
            weight: A float representing the weight for this voxel.
            feature: A NumPy array with the same shape as self.features 
            representing the feature to be added.
        """

        if self.features.shape != feature.shape:
            raise ValueError("Feature shape must match voxel feature shape")

        # Weighted average update
        self.features = (self.features * self.feature_weight + feature) / (self.feature_weight + 1)
        self.feature_weight += 1

        
    def expectedColor(self):
        return self.sum / self.pc_count

In [11]:
# This thing outputs a image segmentation mask as a np array
def get_lseg_feat(model: LSegEncNet, image: np.array, labels, transform, crop_size=480, \
                 base_size=520, norm_mean=[0.5, 0.5, 0.5], norm_std=[0.5, 0.5, 0.5]):
    vis_image = image.copy()
    image = transform(image).unsqueeze(0).cuda() # adds 1 dimension at the start of the np array
    img = image[0].permute(1,2,0) # switches the order of the dimensions of the image
    img = img * 0.5 + 0.5
    
    batch, _, h, w = image.size() # batch is dimension 1, ignoring channel, y, x
    stride_rate = 2.0/3.0
    stride = int(crop_size * stride_rate)

    long_size = base_size
    if h > w:
        height = long_size
        width = int(1.0 * w * long_size / h + 0.5)
        short_size = width
    else:
        width = long_size
        height = int(1.0 * h * long_size / w + 0.5)
        short_size = height


    cur_img = resize_image(image, height, width, **{'mode': 'bilinear', 'align_corners': True})

    if long_size <= crop_size:
        pad_img = pad_image(cur_img, norm_mean,
                            norm_std, crop_size)
        print(pad_img.shape)
        with torch.no_grad():
            outputs, logits = model(pad_img, labels)
        outputs = crop_image(outputs, 0, height, 0, width)
    else:
        if short_size < crop_size:
            # pad if needed
            pad_img = pad_image(cur_img, norm_mean,
                                norm_std, crop_size)
        else:
            pad_img = cur_img
        _,_,ph,pw = pad_img.shape #.size()
        assert(ph >= height and pw >= width)
        h_grids = int(math.ceil(1.0 * (ph-crop_size)/stride)) + 1
        w_grids = int(math.ceil(1.0 * (pw-crop_size)/stride)) + 1
        with torch.cuda.device_of(image):
            with torch.no_grad():
                outputs = image.new().resize_(batch, model.out_c,ph,pw).zero_().cuda()
                logits_outputs = image.new().resize_(batch, len(labels),ph,pw).zero_().cuda()
            count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
        # grid evaluation
        for idh in range(h_grids):
            for idw in range(w_grids):
                h0 = idh * stride
                w0 = idw * stride
                h1 = min(h0 + crop_size, ph)
                w1 = min(w0 + crop_size, pw)
                crop_img = crop_image(pad_img, h0, h1, w0, w1)
                # pad if needed
                pad_crop_img = pad_image(crop_img, norm_mean,
                                            norm_std, crop_size)
                with torch.no_grad():
                    output, logits = model(pad_crop_img, labels)
                cropped = crop_image(output, 0, h1-h0, 0, w1-w0)
                cropped_logits = crop_image(logits, 0, h1-h0, 0, w1-w0)
                outputs[:,:,h0:h1,w0:w1] += cropped
                logits_outputs[:,:,h0:h1,w0:w1] += cropped_logits
                count_norm[:,:,h0:h1,w0:w1] += 1
        assert((count_norm==0).sum()==0)
        outputs = outputs / count_norm
        logits_outputs = logits_outputs / count_norm
        outputs = outputs[:,:,:height,:width]
        logits_outputs = logits_outputs[:,:,:height,:width]
    outputs = outputs.cpu()
    outputs = outputs.numpy() # B, D, H, W
    predicts = [torch.max(logit, 0)[1].cpu().numpy() for logit in logits_outputs]
    pred = predicts[0]

    return outputs

In [9]:
# num_frames = len(depth_list)
# resolution = 720 * 1080
# channels = 3 #RGB
# dimensions = 3 #XYZ
import math


voxel_grid = {}

for frame_index, data_sample in enumerate(data_iter):    
    rgb_path, depth_path, semantic_path, pose_path = data_sample
    
    bgr = cv2.imread(rgb_path)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # read pose
    pos, rot = load_pose(pose_path)  # z backward, y upward, x to the right
    rot_ro_cam = np.eye(3)
    rot_ro_cam[1, 1] = -1
    rot_ro_cam[2, 2] = -1
    rot = rot @ rot_ro_cam
    pos[1] += camera_height


    pose = np.eye(4)
    pose[:3, :3] = rot
    pose[:3, 3] = pos.reshape(-1)

    tf_list.append(pose)
    if len(tf_list) == 1:
        init_tf_inv = np.linalg.inv(tf_list[0]) 

    tf = init_tf_inv @ pose

    # read depth
    depth = load_depth(depth_path)

    # read semantic
    semantic = load_semantic(semantic_path)
    semantic = cvt_obj_id_2_cls_id(semantic, obj2cls)

    #TODO
    pix_feats = get_lseg_feat(model, rgb, labels, transform, crop_size, base_size, norm_mean, norm_std)
    
    # transform all points to the global frame
    pc, mask = depth2pc(depth)
    shuffle_mask = np.arange(pc.shape[1]) 
    np.random.shuffle(shuffle_mask)
    shuffle_mask = shuffle_mask[::depth_sample_rate]
    mask = mask[shuffle_mask]
    pc = pc[:, shuffle_mask]
    pc = pc[:, mask]
    pc_global = transform_pc(pc, tf)

    rgb_cam_mat = get_sim_cam_mat(rgb.shape[0], rgb.shape[1])
    #TODO
    feat_cam_mat = get_sim_cam_mat(pix_feats.shape[2], pix_feats.shape[3])

    # project all point cloud onto the ground
    for pixel_index, (p, p_local) in enumerate(zip(pc_global.T, pc.T)):
    
        x, y = pos2grid_id(gs, cs, p[0], p[2])

        single_global_point = (tf @ np.vstack([p_local.reshape(3,1), np.ones((1, 1))]) )[:3]

        rgb_px, rgb_py, rgb_pz = project_point(rgb_cam_mat, p_local)
        rgb_v = rgb[rgb_py, rgb_px, :].reshape(3,1)

        semantic_v = semantic[rgb_py, rgb_px] # this calculates the class id of the pixel, which is used to build the Ground Truth map
        if semantic_v == 40:
            semantic_v = -1

        # if not (px < 0 or py < 0 or px >= pix_feats.shape[3] or py >= pix_feats.shape[2]):
        #     feat = pix_feats[0, :, py, px]   #these are for finding the corresponding features for that pixel in the feature matrix
        #     grid[y, x] = (grid[y, x] * weight[y, x] + feat) / (weight[y, x] + 1) # grid[y,x] refers to value stored at current grid cell location whicih is likely the accumulated feature information, 
        #     # weight refers to how many points have contribute to current value of grid cell, 
        #     # feat is the LSEG feature embedding exxtracted for current point being processed
        #     # feat is added to existing features in cell, weighted by number of points already there weight[y,x]
        #     weight[y, x] += 1

        voxel_coor = GetVoxelCoor(single_global_point)
        voxel_key = str(voxel_coor[0,0]) + "," + str(voxel_coor[1,0]) + "," + str(voxel_coor[2,0])
        if voxel_key not in voxel_grid:
            voxel_grid[voxel_key] = Voxel(voxel_coor, pix_feats.shape[1]) #if not in voxel grid, then make it
        voxel_grid[voxel_key].update_color((rgb_v/ 255.0))
        

        # average the visual embeddings if multiple points are projected to the same grid cell
        # we want to change this to average each 3D voxel  
        px, py, pz = project_point(feat_cam_mat, p_local)
        if not (px < 0 or py < 0 or px >= pix_feats.shape[3] or py >= pix_feats.shape[2]):
            feat = pix_feats[0, :, py, px] #these are for finding the corresponding features for that pixel in the feature matrix
            voxel_grid[voxel_key].update_feature(feat)
        # voxel_grid[voxel_key] = (voxel_grid[voxel_key] * weight[y, x] + feat) / (weight[y, x] + 1)



In [12]:
save_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\colormap.npy", np.array([i.colors for i in voxel_grid.values()]))
save_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\featuremap.npy", np.array([i.features for i in voxel_grid.values()]))
save_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\coordinates.npy", np.array([i.coordinates for i in voxel_grid.values()]))

NameError: name 'voxel_grid' is not defined

In [13]:
# @markdown Input the prompt as a string of object names separated by ","
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from utils.clip_mapping_utils import load_map, get_new_pallete, get_new_mask_pallete
from utils.clip_utils import get_text_feats
from utils.mp3dcat import mp3dcat
import clip



colors = load_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\colormap.npy")
features = load_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\featuremap.npy")
coordinates = load_map(r"C:\Users\Andrew Jeon\OneDrive\Desktop\vlmaps\maps\coordinates.npy")

# lang = "big flat counter, sofa, floor, chair, wash basin, other" # @param {type: "string"}
# lang = lang.split(",")

lang = mp3dcat # lang is all the matterport classes
text_feats = get_text_feats(lang, clip_model, clip_feat_dim)

map_feats = features
# map_feats = [i.features for i in voxel_grid.values()]
scores_list = map_feats @ text_feats.T

predicts = np.argmax(scores_list, axis=1)

# for voxel, class_mask in zip(voxel_grid, predicts):
#     voxel.class_mask = class_mask

#predicts = predicts.reshape((xmax - xmin + 1, ymax - ymin + 1))
# floor_mask = predicts == 2

# new_pallete = get_new_pallete(len(lang))

# mask, patches = get_new_mask_pallete(predicts, new_pallete, out_label_flag=True, labels=lang)
# seg = mask.convert("RGBA")
# seg = np.array(seg)
# seg[no_map_mask] = [225, 225, 225, 255]
# seg[floor_mask] = [225, 225, 225, 255]
# seg = Image.fromarray(seg)
# plt.figure(figsize=(10, 6), dpi=120)
# plt.legend(handles=patches, loc='upper left', bbox_to_anchor=(1., 1), prop={'size': 10})
# plt.axis('off')
# plt.title("VLMaps")
# plt.imshow(seg)
# plt.show()

In [14]:

import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from utils.clip_mapping_utils import load_map, get_new_pallete, get_new_mask_pallete
from utils.clip_utils import get_text_feats
from utils.mp3dcat import mp3dcat
import clip


color_list = [[1, 0, 0], [0, 1, 0], [0, 0, 1],
              [1, 0.5, 0], [0.5, 1, 0], [0, 1, 0.5], [0.5, 0, 1], [1, 0, 1],
              [0.7, 0.7, 0.7], [0.3, 0.3, 0.3], [1, 0.8, 0.6], [0.8, 1, 0.8],
              [0.6, 0.8, 1], [0.9, 0.5, 0], [0.5, 0.9, 0], [0, 0.6, 0.9],
              [0.9, 0, 0.6], [1, 1, 0.5], [0.2, 0.5, 0.8], [0.8, 0.2, 0.5],
              [0.5, 0.8, 0.2], [0.7, 0.4, 0.1], [0.1, 0.7, 0.4], [0.4, 0.1, 0.7],
              [0, 1, 1], [0, 0.5, 0.5], [0.5, 0, 0.5], [1, 0.75, 0.75],
              [0.75, 1, 0.75], [0.75, 0.75, 1], [0.25, 0.25, 0.25], [0.4, 0.4, 0.4],
              [0.6, 0.6, 0.6], [0, 0.2, 0.8], [0.8, 0, 0.2], [0.2, 0.8, 0],
              [0.4, 0.8, 0.4], [0.8, 0.4, 0.4], [0.4, 0.4, 0.8]]



# Convert the list to a NumPy array
color_array = np.array(color_list)


color_palette = color_array[0:len(lang),:]
print(color_palette)


# new_pallete = (np.array(new_pallete) / 255.0).reshape(3,6).T
# print(new_pallete)

[[1.   0.   0.  ]
 [0.   1.   0.  ]
 [0.   0.   1.  ]
 [1.   0.5  0.  ]
 [0.5  1.   0.  ]
 [0.   1.   0.5 ]
 [0.5  0.   1.  ]
 [1.   0.   1.  ]
 [0.7  0.7  0.7 ]
 [0.3  0.3  0.3 ]
 [1.   0.8  0.6 ]
 [0.8  1.   0.8 ]
 [0.6  0.8  1.  ]
 [0.9  0.5  0.  ]
 [0.5  0.9  0.  ]
 [0.   0.6  0.9 ]
 [0.9  0.   0.6 ]
 [1.   1.   0.5 ]
 [0.2  0.5  0.8 ]
 [0.8  0.2  0.5 ]
 [0.5  0.8  0.2 ]
 [0.7  0.4  0.1 ]
 [0.1  0.7  0.4 ]
 [0.4  0.1  0.7 ]
 [0.   1.   1.  ]
 [0.   0.5  0.5 ]
 [0.5  0.   0.5 ]
 [1.   0.75 0.75]
 [0.75 1.   0.75]
 [0.75 0.75 1.  ]
 [0.25 0.25 0.25]
 [0.4  0.4  0.4 ]
 [0.6  0.6  0.6 ]
 [0.   0.2  0.8 ]
 [0.8  0.   0.2 ]
 [0.2  0.8  0.  ]
 [0.4  0.8  0.4 ]
 [0.8  0.4  0.4 ]
 [0.4  0.4  0.8 ]]


In [None]:
# print(len(voxel_grid.values()))
# print(scores_list.shape)
# print(predicts.shape)

443307
(443307, 6)
(443307,)


In [59]:
!pip install open3d==0.18.0



DEPRECATION: pytorch-lightning 1.8.1 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [15]:
colored_voxel_grid = o3d.geometry.VoxelGrid()
colored_voxel_grid.voxel_size = VOXEL_SIZE

# for voxel, mask in zip(voxel_grid.values(), predicts):
#     #print(new_pallete[mask])
#     o3d_voxel = o3d.geometry.Voxel(voxel.coordinates, color_palette[mask])
#     colored_voxel_grid.add_voxel(o3d_voxel)

for coordinate, mask in zip(coordinates, predicts):
    o3d_voxel = o3d.geometry.Voxel(coordinate, color_palette[mask])
    colored_voxel_grid.add_voxel(o3d_voxel)

In [61]:
# o3d.visualization.draw_geometries([voxel_grid])

In [16]:
vis = o3d.visualization.Visualizer()
vis.create_window(visible=True)
# Call only after creating visualizer window.
vis.get_render_option().background_color = [0, 0.3, 0]
vis.add_geometry(colored_voxel_grid)
vis.run()
vis.close()



: 