In [36]:
import torch 
from datasets.liferefer_dataset import LifeReferDataset
from datasets.strefer_dataset import STReferDataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

In [None]:
# ------------------------------------------------------------------------
# BEAUTY DETR
# Copyright (c) 2022 Ayush Jain & Nikolaos Gkanatsios
# Licensed under CC-BY-NC [see LICENSE for details]
# All Rights Reserved
# ------------------------------------------------------------------------
# Parts adapted from Group-Free
# Copyright (c) 2021 Ze Liu. All Rights Reserved.
# Licensed under the MIT License.
# ------------------------------------------------------------------------

import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizerFast

from models.point_backbone_module import Pointnet2Backbone
from models.image_backbone_module import VisualBackbone

from models.modules import (
    PointsObjClsModule, GeneralSamplingModule,
    ClsAgnosticPredictHead, PositionEmbeddingLearned
)
from models.encoder_decoder_layers import (
    BiEncoder, BiEncoderLayer, BiDecoderLayer, MultiCALayer, ImageMultiCALayer
)


class BeaUTyDETR(nn.Module):
    """
    3D language grounder.

    Args:
        num_class (int): number of semantics classes to predict
        num_obj_class (int): number of object classes
        input_feature_dim (int): feat_dim of pointcloud (without xyz)
        num_queries (int): Number of queries generated
        num_decoder_layers (int): number of decoder layers
        self_position_embedding (str or None): how to compute pos embeddings
        contrastive_align_loss (bool): contrast queries and token features
        d_model (int): dimension of features
        fuse_img (bool): use detected box stream
        pointnet_ckpt (str or None): path to pre-trained pp++ checkpoint
        self_attend (bool): add self-attention in encoder
    """

    def __init__(self, args=None, num_class=50,
                 input_feature_dim=3,
                 num_queries=256,
                 num_decoder_layers=6, self_position_embedding='loc_learned',
                 contrastive_align_loss=True,
                 d_model=288, pointnet_ckpt=None, resnet_ckpt=None,
                 self_attend=True,
                 frame_num=2,
                 butd=False,
                 ):
        """Initialize layers."""
        super().__init__()   
        self.args = args     
        self.num_queries = num_queries
        self.num_decoder_layers = num_decoder_layers
        self.self_position_embedding = self_position_embedding
        self.contrastive_align_loss = contrastive_align_loss
        self.butd = butd

        # Visual encoder
        self.point_backbone_net = Pointnet2Backbone(
            input_feature_dim=input_feature_dim,
            width=1
        )
        if input_feature_dim == 3 and pointnet_ckpt is not None:
            self.point_backbone_net.load_state_dict(torch.load(
                pointnet_ckpt
            ), strict=False)
            for p in self.point_backbone_net.parameters():
                p.requires_grad = False

        self.image_backbone_net = VisualBackbone(d_model=d_model)
        if resnet_ckpt is not None:
            self.image_backbone_net.load_state_dict(torch.load(
                resnet_ckpt
            ), strict=False)
            for p in self.image_backbone_net.parameters():
                p.requires_grad = False
        
        # Box encoder
        if self.butd:
            self.box_embeddings = PositionEmbeddingLearned(6, 288)
            
        # MultiFrames Fusion
        self.multi_fuser = nn.ModuleList()
        self.multi_fuser_layers = 1
        for _ in range(self.multi_fuser_layers):
            self.multi_fuser.append(MultiCALayer(
                d_model, n_heads=8, dim_feedforward=256,
                dropout=0.1, activation="relu",
                frame_num=frame_num,
            ))

            
        self.image_multi_fuser = nn.ModuleList()
        for _ in range(self.multi_fuser_layers):
            self.image_multi_fuser.append(ImageMultiCALayer(
                d_model, n_heads=8, dim_feedforward=256,
                dropout=0.1, activation="relu",
                frame_num=frame_num,
            ))
        
        # Text Encoder
        t_type = "roberta-base"
        self.tokenizer = RobertaTokenizerFast.from_pretrained(t_type)
        self.text_encoder = RobertaModel.from_pretrained(t_type)
        for param in self.text_encoder.parameters():
            param.requires_grad = False

        self.text_projector = nn.Sequential(
            nn.Linear(self.text_encoder.config.hidden_size, d_model),
            nn.LayerNorm(d_model, eps=1e-12),
            nn.Dropout(0.1)
        )


        # Cross-encoder (Text-Points)
        self.pos_embed = PositionEmbeddingLearned(3, d_model)
        bi_layer_pc = BiEncoderLayer(
            d_model, dropout=0.1, activation="relu",
            n_heads=8, dim_feedforward=256,
            self_attend_lang=self_attend, self_attend_vis=self_attend,
            use_img_enc_attn=True,
            use_butd_enc_attn=butd
        )
        self.cross_encoder_text_points = BiEncoder(bi_layer_pc, 3)

        # Cross-encoder (Text-Image)
        # bi_layer_img = BiEncoderLayer(
        #     d_model, dropout=0.1, activation="relu",
        #     n_heads=8, dim_feedforward=256,
        #     self_attend_lang=self_attend, self_attend_vis=self_attend,
        #     use_img_enc_attn=False
        # )
        # self.cross_encoder_text_image = BiEncoder(bi_layer_img, 3)

        # Query initialization
        self.points_obj_cls = PointsObjClsModule(d_model)
        self.gsample_module = GeneralSamplingModule()
        self.decoder_query_proj = nn.Conv1d(d_model, d_model, kernel_size=1)

        # Proposal (layer for size and center)
        self.proposal_head = ClsAgnosticPredictHead(
            num_class, 1, num_queries, d_model,
            objectness=False, heading=False,
            compute_sem_scores=True
        )

        # Transformer decoder layers
        self.decoder = nn.ModuleList()
        for _ in range(self.num_decoder_layers):
            self.decoder.append(BiDecoderLayer(
                d_model, n_heads=8, dim_feedforward=256,
                dropout=0.1, activation="relu",
                self_position_embedding=self_position_embedding, butd=self.butd
            ))

        # Prediction heads
        self.prediction_heads = nn.ModuleList()
        for _ in range(self.num_decoder_layers):
            self.prediction_heads.append(ClsAgnosticPredictHead(
                num_class, 1, num_queries, d_model,
                objectness=False, heading=False,
                compute_sem_scores=True
            ))

        # Extra layers for contrastive losses
        if contrastive_align_loss:
            self.contrastive_align_projection_image = nn.Sequential(
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.Linear(d_model, 64)
            )
            self.contrastive_align_projection_text = nn.Sequential(
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.Linear(d_model, d_model),
                nn.ReLU(),
                nn.Linear(d_model, 64)
            )

        # Init
        self.init_bn_momentum()

    def _run_backbones(self, inputs, DEBUG=False):
        """Run visual and text backbones."""
        # Visual encoder
        if DEBUG: 
            import pdb; pdb.set_trace()
        point_clouds = inputs['point_clouds']

        print(f"[DEBUG - before doing anything] {point_clouds.shape}")


        B, K, N, C = point_clouds.shape
        point_clouds = point_clouds.view(B*K, N, C)
        if self.args.lr_backbone > 0:
            end_points = self.point_backbone_net(point_clouds, end_points={})
        else:
            with torch.no_grad():
                end_points = self.point_backbone_net(point_clouds, end_points={})
        if K == 1:
            end_points['seed_inds'] = end_points['fp2_inds']
            end_points['seed_xyz'] = end_points['fp2_xyz']
            end_points['seed_features'] = end_points['fp2_features']
        else:
            seed_inds = end_points['fp2_inds'].view(B, K, -1)
            seed_xyz = end_points['fp2_xyz'].view(B, K, -1, 3)
            seed_features = end_points['fp2_features'].view(B, K, -1, seed_xyz.shape[2])
            end_points['seed_inds'] = seed_inds[:, 0]
            end_points['seed_xyz'] = seed_xyz[:, 0]
            end_points['seed_features'] = seed_features[:, 0]
            end_points['additional_seed_inds'] = seed_inds
            end_points['additional_seed_xyz'] = seed_xyz
            end_points['additional_seed_features'] = seed_features
            end_points['fp2_inds'] = end_points['seed_inds']
            end_points['fp2_xyz'] = end_points['seed_xyz']
            end_points['fp2_features'] = end_points['seed_features']
        

        print(f"[DEBUG - after running pointnet] {end_points['fp2_xyz'].shape}")
        import pdb; pdb.set_trace()
        # Image encoder
        image = inputs['image']
        img_mask = inputs['img_mask']
        B, K, H, W = img_mask.shape
        image = image.view(B*K, -1, H, W)
        img_mask = img_mask.view(B*K, H, W)
        # end_points['image_feature']
        # end_points['img_mask']
        # end_points['img_pos']
        if self.args.lr_backbone > 0:
            end_points = self.image_backbone_net(image, img_mask, end_points=end_points)
        else:
            with torch.no_grad():
                end_points = self.image_backbone_net(image, img_mask, end_points=end_points)
        image_feature = end_points['image_feature'].view(B, K, end_points['image_feature'].shape[-2], end_points['image_feature'].shape[-1])
        image_mask = ~end_points['img_mask'].view(B, K, end_points['image_feature'].shape[-1])
        image_pos = end_points['img_pos'].view(B, K, end_points['img_pos'].shape[-2], end_points['img_pos'].shape[-1])
        end_points['image_feature'] = image_feature[:, 0]
        end_points['img_mask'] = image_mask[:, 0]
        end_points['img_pos'] = image_pos[:, 0]
        end_points['additional_image_feature'] = image_feature
        end_points['additional_img_mask'] = image_mask
        end_points['additional_img_pos'] = image_pos
        
        # Text encoder
        tokenized = self.tokenizer.batch_encode_plus(
            inputs['text'], padding="longest", return_tensors="pt"
        ).to(point_clouds.device)
        
        encoded_text = self.text_encoder(**tokenized)
        text_feats = self.text_projector(encoded_text.last_hidden_state)

        # Invert attention mask that we get from huggingface
        # because its the opposite in pytorch transformer
        text_attention_mask = tokenized.attention_mask.ne(1).bool()
        end_points['text_feats'] = text_feats
        end_points['text_attention_mask'] = text_attention_mask
        end_points['tokenized'] = tokenized
        return end_points

    def _generate_queries(self, xyz, features, end_points):
        # kps sampling
        points_obj_cls_logits = self.points_obj_cls(features)
        end_points['seeds_obj_cls_logits'] = points_obj_cls_logits
        sample_inds = torch.topk(
            torch.sigmoid(points_obj_cls_logits).squeeze(1),
            self.num_queries
        )[1].int()
        xyz, features, sample_inds = self.gsample_module(
            xyz, features, sample_inds
        )
        end_points['query_points_xyz'] = xyz  # (B, V, 3)
        end_points['query_points_feature'] = features  # (B, F, V)
        end_points['query_points_sample_inds'] = sample_inds  # (B, V)
        return end_points

    def forward(self, inputs, DEBUG=True):
        """
        Forward pass.
        Args:
            inputs: dict
                {point_clouds, text}
                point_clouds (tensor): (B, Npoint, 3 + input_channels)
                text (list): ['text0', 'text1', ...], len(text) = B

                more keys if fuse_img is enabled:
                    det_bbox_label_mask
                    det_boxes
                    det_class_ids
        Returns:
            end_points: dict
        """


        if DEBUG: 
            import pdb; pdb.set_trace()
            print("[DEBUG] Forward pass")
        # Within-modality encoding
        end_points = self._run_backbones(inputs, DEBUG)
        
        points_xyz = end_points['fp2_xyz']  # (B, points, 3)
        points_features = end_points['fp2_features']  # (B, F, points)
        points_mask = torch.zeros((len(points_xyz), points_xyz.size(1))).to(points_xyz.device).bool()  # (B, points)
        original_text_feats = end_points['text_feats']  # (B, L, F)
        text_padding_mask = end_points['text_attention_mask']  # (B, L)
        
        # Point Multi-Fuser
        additional_points_xyz = end_points['additional_seed_xyz']
        additional_points_features = end_points['additional_seed_features']
        if DEBUG: 
            import pdb; pdb.set_trace()
        for i in range(self.multi_fuser_layers):
            points_features = self.multi_fuser[i](
                query=points_features.transpose(1, 2).contiguous(),
                key=additional_points_features.transpose(-1, -2).contiguous(),
                value=additional_points_features.transpose(-1, -2).contiguous(),
                query_pos=points_xyz,
                key_pos=additional_points_xyz,
                multi_mask=inputs['dynamic_mask']
            )

        # Image Multi-Fuser
        image_features = end_points['image_feature']  # (B, F, N)
        img_mask = end_points['img_mask']  # (B, N)
        img_pos = end_points['img_pos']    # (B, F, N)
        additional_image_feature = end_points['additional_image_feature']
        additional_image_pos = end_points['additional_img_pos']
        additional_img_mask = end_points['additional_img_mask']

        for i in range(self.multi_fuser_layers):
            image_features = self.image_multi_fuser[i](
                query=image_features.transpose(1, 2).contiguous(),
                key=additional_image_feature.transpose(-1, -2).contiguous(),
                value=additional_image_feature.transpose(-1, -2).contiguous(),
                query_pos=img_pos,
                key_pos=additional_image_pos,
                multi_mask=inputs['dynamic_mask'],
                key_mask=additional_img_mask
            )
        image_features = image_features.transpose(1, 2).contiguous()


        # Box encoding
        if self.butd:
            # attend on those features
            detected_mask = ~inputs['det_bbox_label_mask']      # [111000] -> [000111]

            if DEBUG: 
                import pdb; pdb.set_trace()
            detected_feats =  self.box_embeddings(inputs['det_boxes']).transpose(1, 2).contiguous()
        else:
            detected_mask = None
            detected_feats = None
        
        if DEBUG: 
            import pdb; pdb.set_trace()
        # Cross-modality encoding (Text-Points)
        points_features, text_feats = self.cross_encoder_text_points(
            vis_feats=points_features.transpose(1, 2).contiguous(),
            pos_feats=self.pos_embed(points_xyz).transpose(1, 2).contiguous(),
            padding_mask=points_mask,
            text_feats=original_text_feats,
            text_padding_mask=text_padding_mask,
            end_points=end_points,
            enhanced_feats=image_features,
            enhanced_mask=img_mask,
            detected_feats=detected_feats,
            detected_mask=detected_mask
        )
        
        points_features = points_features.transpose(1, 2)
        points_features = points_features.contiguous()  # (B, F, points)
        end_points["text_memory"] = text_feats
        end_points['seed_features'] = points_features

        if DEBUG: 
            import pdb; pdb.set_trace()
        if self.contrastive_align_loss:
            proj_tokens = F.normalize(
                self.contrastive_align_projection_text(text_feats), p=2, dim=-1
            )
            end_points['proj_tokens'] = proj_tokens
        
        # Query Points Generation
        end_points = self._generate_queries(
            points_xyz, points_features, end_points
        )
        cluster_feature = end_points['query_points_feature']  # (B, F, V)
        cluster_xyz = end_points['query_points_xyz']  # (B, V, 3)
        query = self.decoder_query_proj(cluster_feature)
        query = query.transpose(1, 2).contiguous()  # (B, V, F)

        if self.contrastive_align_loss:
            end_points['proposal_proj_queries'] = F.normalize(
                self.contrastive_align_projection_image(query), p=2, dim=-1
            )

        if DEBUG: 
            import pdb; pdb.set_trace()
        # Proposals (one for each query)
        proposal_center, proposal_size = self.proposal_head(
            cluster_feature,
            base_xyz=cluster_xyz,
            end_points=end_points,
            prefix='proposal_'
        )
        base_xyz = proposal_center.detach().clone()  # (B, V, 3)
        base_size = proposal_size.detach().clone()  # (B, V, 3)
        query_mask = None

        # Decoder
        for i in range(self.num_decoder_layers):
            prefix = 'last_' if i == self.num_decoder_layers-1 else f'{i}head_'

            # Position Embedding for Self-Attention
            if self.self_position_embedding == 'none':
                query_pos = None
            elif self.self_position_embedding == 'xyz_learned':
                query_pos = base_xyz
            elif self.self_position_embedding == 'loc_learned':
                query_pos = torch.cat([base_xyz, base_size], -1)
            else:
                raise NotImplementedError

            # Transformer Decoder Layer
            query = self.decoder[i](
                query, points_features.transpose(1, 2).contiguous(),
                text_feats, query_pos,
                query_mask,
                text_padding_mask,
                detected_feats=(
                    detected_feats if self.butd
                    else None
                ),
                detected_mask=detected_mask if self.butd else None
            )  # (B, V, F)

            if self.contrastive_align_loss:
                end_points[f'{prefix}proj_queries'] = F.normalize(
                    self.contrastive_align_projection_image(query), p=2, dim=-1
                )

            # Prediction
            base_xyz, base_size = self.prediction_heads[i](
                query.transpose(1, 2).contiguous(),  # (B, F, V)
                base_xyz=cluster_xyz,
                end_points=end_points,
                prefix=prefix
            )
            base_xyz = base_xyz.detach().clone()
            base_size = base_size.detach().clone()

        return end_points

    def init_bn_momentum(self):
        """Initialize batch-norm momentum."""
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
                m.momentum = 0.1


In [38]:




def create_dataset(args, split):
    if args.dataset == 'liferefer':
        return LifeReferDataset(args, split)
    elif args.dataset == 'strefer':
        return STReferDataset(args, split)
    else:
        raise ValueError("Wrong Dataset")

from models.bdetr import BeaUTyDETR
# from .ap_helper import APCalculator, parse_predictions, parse_groundtruths
from models.losses import HungarianMatcher, SetCriterion, compute_hungarian_loss

def create_model(args):
    return BeaUTyDETR(
        args=args,
        num_class=args.max_lang_num,
        input_feature_dim=3,
        num_queries=args.num_queries,
        num_decoder_layers=args.num_decoder_layers,
        self_position_embedding='loc_learned',
        contrastive_align_loss=True,
        d_model=288,
        pointnet_ckpt=None,
        resnet_ckpt=None,
        self_attend=True,
        frame_num=args.frame_num,
        butd=args.butd
    )


from types import SimpleNamespace

def config():
    return SimpleNamespace()

args = config()
args.batch_size = 32 
args.butd = False 
args.dataset = 'liferefer'
args.debug = False
args.dynamic = True
args.epochs = 100
args.frame_num = 2
args.img_size = 384
args.lr = 0.0001
args.lr_backbone = 0.001
args.lr_step = [45, 80]
args.max_lang_num = 100
args.max_obj_num = 100
args.num_decoder_layers = 6
args.num_queries = 256
args.num_workers = 8
args.pretrain = 'weights/liferefer_weights.pth'
args.seed = 42
args.text_encoder_lr = 1e-05
args.val_epoch = 1
args.verbose_step = 10
args.warmup_epoch = -1
args.work_dir = 'outputs/debug'

print("Create Dataset")
test_dataset = create_dataset(args, 'test')
generator = torch.Generator()
test_loader = DataLoader(test_dataset, args.batch_size, shuffle=False, num_workers=args.num_workers, generator=generator)

print("Create Model")
model = create_model(args)

# with open("Model_architecture.txt", "w") as f:
#     f.write(str(model))

# import sys;exit()
model.load_state_dict(torch.load(args.pretrain, map_location='cpu')['model'], strict=True)

# model.cuda() 
model.to("cpu")


# Evaluate the model 
model.eval()
loss = 0
total_predict_boxes = []


# test_loader[0]

Create Dataset




Create Model


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
len(test_dataset)

10730

In [40]:
len(test_loader)

336

In [41]:
for batch in test_loader:
    # print(batch)
    break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Traceback (most recent call last):
  File "/home/avishka/anaconda3/envs/wildrefer_env/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/home/avishka/anaconda3/envs/wildrefer_env/lib/python3.8/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/avishka/anaconda3/envs/wildrefer_env/lib/python3.8/multiprocessing/util.py", line 133, in _remove_temp_dir
    rmtree(tempdir)
  File "/home/avishka/anaconda3/envs/wildrefer_env/lib/python3.8/shutil.py", line 722, in rmtree
    onerror(os.rmdir, path, sys.exc_info())
  File "/home/avishka/anaconda3/envs/wildrefer_env/lib/python3.8/shutil.py", line 720, in rmtree
    os.rmdir(path)
OSError: [Errno 39] Directory not empty: '/tmp/pymp-aio9jg3a'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
args.batch_size

32

In [43]:
batch['point_clouds'].shape

torch.Size([32, 2, 30000, 6])

In [44]:
one_test_sample = test_dataset[0]

In [45]:
test_dataset[0]['point_clouds'].shape

(2, 30000, 6)

In [46]:
input_data = one_test_sample
for key in input_data:
    print(key)
    print(type(input_data[key]))
    if isinstance(input_data[key], torch.Tensor):
        print("before", input_data[key].shape)
        input_data[key] = input_data[key].unsqueeze(0).cuda()
        print("after", input_data[key].shape)

point_clouds
<class 'numpy.ndarray'>
text
<class 'str'>
dynamic_mask
<class 'numpy.ndarray'>
image
<class 'numpy.ndarray'>
img_mask
<class 'numpy.ndarray'>
center_label
<class 'numpy.ndarray'>
size_gts
<class 'numpy.ndarray'>
box_label_mask
<class 'numpy.ndarray'>
point_instance_label
<class 'numpy.ndarray'>
sem_cls_label
<class 'numpy.ndarray'>
tokens_positive
<class 'numpy.ndarray'>
positive_map
<class 'numpy.ndarray'>


In [47]:
batch;

In [48]:
one_test_sample_from_batch = dict()
for key in batch:
    print(key)
    if isinstance(batch[key], torch.Tensor):
        one_test_sample_from_batch[key] = batch[key][:1,].cuda()
    else:
        one_test_sample_from_batch[key] = batch[key][:1]


point_clouds
text
dynamic_mask
image
img_mask
center_label
size_gts
box_label_mask
point_instance_label
sem_cls_label
tokens_positive
positive_map


In [49]:
one_test_sample_from_batch['point_clouds'].shape

torch.Size([1, 2, 30000, 6])

In [50]:
one_test_sample_from_batch['text']

['a girl wearing a dark gray jacket and carrying a pink school bag stands by the slide , and she turns her head to look at the slide . not mentioned']

In [51]:
pprint(one_test_sample_from_batch)

{'box_label_mask': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0'),
 'center_label': tensor([[[13.5800,  0.4178, -0.4951],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.000

In [52]:
from pprint import pprint 

pprint(batch.keys())

dict_keys(['point_clouds', 'text', 'dynamic_mask', 'image', 'img_mask', 'center_label', 'size_gts', 'box_label_mask', 'point_instance_label', 'sem_cls_label', 'tokens_positive', 'positive_map'])


In [53]:
batch['point_clouds'].shape

torch.Size([32, 2, 30000, 6])

In [54]:
batch['text']

['a girl wearing a dark gray jacket and carrying a pink school bag stands by the slide , and she turns her head to look at the slide . not mentioned', "a man is sitting and playing on a dinosaur rocker . the little boy does not look in the man's direction . not mentioned", 'this is a sunny day . a girl wearing a white shirt is standing on the escalator .she also wears dark pants and black bag .at the same time , she turns around to look at the camera . not mentioned', 'in a bright room , a girl with long hair is writing on a board . she wears short sleeves and short pants . she is standing . not mentioned', 'this is a sunny day . a middle-aged man is standing and talking to a man standing next to him . he is watching the children who are playing . not mentioned', 'in a bright room , a man wearing shirt and mask is sitting on a chair . he is looking at a loptap . he is surrounded by a grounp of men . not mentioned', 'in a bright room , a girl with long pants is walking to the door . she

In [55]:
len(batch['text'])

32

In [56]:
model.cuda()
with torch.inference_mode():
    end_points = model(one_test_sample_from_batch, DEBUG=False)


  dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)


In [57]:
end_points.keys()

dict_keys(['sa1_inds', 'sa1_xyz', 'sa1_features', 'sa2_inds', 'sa2_xyz', 'sa2_features', 'sa3_xyz', 'sa3_features', 'sa4_xyz', 'sa4_features', 'fp2_features', 'fp2_xyz', 'fp2_inds', 'seed_inds', 'seed_xyz', 'seed_features', 'additional_seed_inds', 'additional_seed_xyz', 'additional_seed_features', 'image_feature', 'img_mask', 'img_pos', 'additional_image_feature', 'additional_img_mask', 'additional_img_pos', 'text_feats', 'text_attention_mask', 'tokenized', 'text_memory', 'proj_tokens', 'seeds_obj_cls_logits', 'query_points_xyz', 'query_points_feature', 'query_points_sample_inds', 'proposal_proj_queries', 'proposal_base_xyz', 'proposal_center', 'proposal_pred_size', 'proposal_sem_cls_scores', '0head_proj_queries', '0head_base_xyz', '0head_center', '0head_pred_size', '0head_sem_cls_scores', '1head_proj_queries', '1head_base_xyz', '1head_center', '1head_pred_size', '1head_sem_cls_scores', '2head_proj_queries', '2head_base_xyz', '2head_center', '2head_pred_size', '2head_sem_cls_scores', '

In [58]:
# Get the keys which are not in the end_points 

set(one_test_sample_from_batch.keys()) - set(end_points.keys())

{'tokens_positive', 'point_instance_label', 'text', 'image', 'point_clouds', 'size_gts', 'box_label_mask', 'sem_cls_label', 'dynamic_mask', 'positive_map', 'center_label'}

In [59]:
for key in one_test_sample_from_batch:
        if key not in end_points:
            end_points[key] = one_test_sample_from_batch[key]

In [60]:
# contrast
pred_center = end_points['last_center'].detach().cpu()
pred_size = end_points["last_pred_size"].detach().cpu()
pred_boxes = torch.concat([pred_center, pred_size], dim=-1).numpy()


In [61]:
pred_boxes.shape

(1, 256, 6)

In [62]:
proj_tokens = end_points['proj_tokens']  # (B, tokens, 64)
proj_queries = end_points['last_proj_queries']  # (B, Q, 64)
sem_scores = torch.matmul(proj_queries, proj_tokens.transpose(-1, -2))
sem_scores_ = sem_scores / 0.07  # (B, Q, tokens)
sem_scores = torch.softmax(sem_scores_, dim=-1)

In [63]:
token = end_points['tokenized']
mask = token['attention_mask'].detach().cpu()
last_pos = mask.sum(1) - 2

In [64]:
model

BeaUTyDETR(
  (point_backbone_net): Pointnet2Backbone(
    (sa1): PointnetSAModuleVotes(
      (grouper): QueryAndGroup()
      (mlp_module): SharedMLP(
        (layer0): Conv2d(
          (conv): Conv2d(6, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (activation): ReLU(inplace=True)
        )
        (layer1): Conv2d(
          (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(
            (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (activation): ReLU(inplace=True)
        )
        (layer2): Conv2d(
          (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(
            (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
     

In [65]:
bs = sem_scores.shape[0]
pred_box = np.zeros((bs, 7))
for i in range(bs):
    sim = 1 - sem_scores[i, :, last_pos[i]]
    max_idx = torch.argmax(sim)
    box = pred_boxes[i, max_idx.item()]
    pred_box[i, :6] = box

In [66]:
total_predict_boxes.append(pred_box)

In [67]:
predict_boxes = np.vstack(total_predict_boxes)
acc25, acc50, m_iou = test_dataset.evaluate(predict_boxes)

100%|██████████| 1/1 [00:00<00:00, 37117.73it/s]


In [68]:
acc50

0.0