In [1]:
import os
import json

import os.path as osp
# Switch the working directory to the parent of the current one; the relative
# paths used in later cells ('data/...', 'datasets/...') are resolved against it.
# NOTE(review): this is not idempotent -- every re-run of this cell climbs one
# more directory level, so run it exactly once per kernel session.
os.chdir(osp.dirname(os.getcwd()))



In [43]:

import pandas as pd 

from collections import Counter

import numpy as np

from loguru import logger
def get_split_list(dataset_name="sr3d", split='train'):
    """Load the scan ids of one of the train/test splits provided by the dataset authors.

    Reads ``data/meta_data/<dataset_name>_<split>_scans.txt`` (relative to the
    current working directory), parses its content as a Python literal
    sequence of scan ids, and logs how many unique ids it contains.

    Args:
        dataset_name: Prefix of the split file name, e.g. ``"sr3d"`` or ``"nr3d"``.
        split: Split part of the file name, e.g. ``"train"`` or ``"test"``.

    Returns:
        set: The unique scan ids listed in the file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        ValueError: If the file content is valid Python but not a plain literal
            (for example a function call).
        SyntaxError: If the file content cannot be parsed at all.
    """
    # Function-scope import so the block stays self-contained.
    import ast

    with open('data/meta_data/%s_%s_scans.txt' % (dataset_name, split)) as f:
        # literal_eval accepts only literals, so a tampered split file cannot
        # execute arbitrary code the way eval() would.
        scan_ids = set(ast.literal_eval(f.read()))
    logger.info(f" len of {dataset_name,split} : {len(scan_ids)}")
    return scan_ids





def get_refer_it_3D(split_name='sr3d'):
    """Load the detailed annotation table published by the dataset authors.

    Reads ``datasets/refer_it_3d/<split_name>.csv`` (relative to the current
    working directory), logs the row count, the column index, the number of
    distinct ``scan_id`` values and the mean / min / max number of rows per
    ``scan_id``, and prints the first row.

    Args:
        split_name: File stem of the CSV to load, e.g. ``"sr3d"`` or ``"nr3d"``.

    Returns:
        pandas.DataFrame: The table exactly as parsed by ``pd.read_csv``.
    """
    csv_path = osp.join("datasets/refer_it_3d", split_name + ".csv")
    data = pd.read_csv(csv_path)

    num_rows = data.shape[0]
    logger.info(f"len of {split_name} : {num_rows}")
    logger.info(f" column of {split_name} : {data.columns}")
    logger.info(f"scene number : {len(set(data['scan_id']))}")

    # Number of annotation rows per scene, held in an array so numpy can
    # compute the aggregate statistics.
    rows_per_scene = np.array(list(Counter(data['scan_id']).values()))
    avg_sample = rows_per_scene.mean()
    min_sample = rows_per_scene.min()
    max_sample = rows_per_scene.max()

    logger.info(f"min sample: {min_sample} \n max sample : {max_sample} \n avg sample: {avg_sample}")

    # Show one example record so the schema is visible in the cell output.
    print(data.iloc[0,:])
    return data
    
      



def generate_NR3D_labeled_scene_txt(labeled_ratio, seed=None):
    """Randomly pick a fraction of the NR3D training scenes and write them to a text file.

    The scene ids come from ``get_split_list(dataset_name='nr3d')``.
    ``int(num_scans * labeled_ratio)`` of them are drawn without replacement
    and written, one id per line, to
    ``data/meta_data/nr3d_train_<labeled_ratio>.txt``.

    Args:
        labeled_ratio: Fraction of scenes to select, in ``[0, 1]``.
        seed: Optional seed for a private random generator. With the default
            ``None`` the global ``np.random`` state is used, so an earlier
            ``np.random.seed(...)`` call is still honoured.

    Raises:
        ValueError: If ``labeled_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= labeled_ratio <= 1:
        raise ValueError(
            'labeled_ratio must be within [0, 1], got {!r}'.format(labeled_ratio))

    # A set has no stable iteration order across interpreter runs (string
    # hashes are randomised), so sort first; otherwise even a fixed seed
    # would pick different scenes on every run.
    scan_names = sorted(get_split_list(dataset_name='nr3d'))

    num_scans = len(scan_names)
    num_labeled_scans = int(num_scans * labeled_ratio)

    rng = np.random if seed is None else np.random.RandomState(seed)
    # Draw num_labeled_scans distinct scene indices out of num_scans.
    choices = rng.choice(num_scans, num_labeled_scans, replace=False)

    labeled_scan_names = [scan_names[i] for i in choices]

    with open('data/meta_data/nr3d_train_{}.txt'.format(labeled_ratio), 'w') as f:
        f.write('\n'.join(labeled_scan_names))

    logger.info('\tSelected {} labeled scans, remained {} unlabeled scans'.format(
        len(labeled_scan_names), num_scans - len(labeled_scan_names)))


    
    
# sr3d_ids = get_split_list()
# sr3d_ids_test = get_split_list(split = 'test')
# nr3d_ids = get_split_list(dataset_name='nr3d')
# nr3d_ids_test = get_split_list(dataset_name='nr3d',split = 'test')



# get_refer_it_3D('sr3d+')
# get_refer_it_3D('sr3d')
# get_refer_it_3D('sr3d_test')
# get_refer_it_3D('sr3d_train')

# Load the NR3D annotation table (this also logs its summary statistics); the
# resulting `nr3d` DataFrame is read by the split-counting cell further down.
nr3d = get_refer_it_3D('nr3d')

2022-10-14 22:48:22.383 | INFO     | __main__:get_refer_it_3D:32 - len of nr3d : 41503
2022-10-14 22:48:22.384 | INFO     | __main__:get_refer_it_3D:34 -  column of nr3d : Index(['assignmentid', 'stimulus_id', 'utterance', 'correct_guess',
       'speaker_id', 'listener_id', 'scan_id', 'instance_type', 'target_id',
       'tokens', 'dataset', 'mentions_target_class', 'uses_object_lang',
       'uses_spatial_lang', 'uses_color_lang', 'uses_shape_lang'],
      dtype='object')
2022-10-14 22:48:22.388 | INFO     | __main__:get_refer_it_3D:35 - scene number : 641
2022-10-14 22:48:22.393 | INFO     | __main__:get_refer_it_3D:45 - min sample: 7 
 max sample : 259 
 avg sample: 64.74726989079564


assignmentid                                                         32618
stimulus_id                             scene0525_00-plant-5-9-10-11-12-62
utterance                The plant at the far right hand side of the bo...
correct_guess                                                         True
speaker_id                                                              47
listener_id                                                            310
scan_id                                                       scene0525_00
instance_type                                                        plant
target_id                                                                9
tokens                   ['the', 'plant', 'at', 'the', 'far', 'right', ...
dataset                                                               nr3d
mentions_target_class                                                 True
uses_object_lang                                                      True
uses_spatial_lang        

In [44]:



# Scene ids of the NR3D train / test splits.
nr3d_train = get_split_list(dataset_name='nr3d')
nr3d_test = get_split_list(dataset_name='nr3d', split='test')

# Count annotation rows per split with vectorised membership tests instead of
# a Python loop that builds a Series for every row via `iloc`.
in_train = nr3d['scan_id'].isin(nr3d_train)
# A row counts as "test" only if its scene really is in the test list;
# previously every non-train row was counted as test and `nr3d_test` was unused.
in_test = ~in_train & nr3d['scan_id'].isin(nr3d_test)

len_train = int(in_train.sum())
len_test = int(in_test.sum())

# Surface rows that belong to neither split rather than hiding them in a count.
num_unassigned = nr3d.shape[0] - len_train - len_test
if num_unassigned:
    logger.warning(f"{num_unassigned} rows have a scan_id that is in neither the train nor the test split")

print(f"len of train : {len_train}, len of test : {len_test}")



2022-10-14 22:49:17.837 | INFO     | __main__:get_split_list:16 -  len of ('nr3d', 'train') : 511
2022-10-14 22:49:17.839 | INFO     | __main__:get_split_list:16 -  len of ('nr3d', 'test') : 130


len of train : 32919, len of test : 8584


In [31]:
# for x in np.linspace(0.1,0.9,9):
#     generate_NR3D_labeled_scene_txt(round(x,1))


In [22]:

def get_scanrefer(split=None):
    """Load a ScanRefer annotation JSON file and log summary statistics.

    Reads ``datasets/scanrefer/ScanRefer_filtered_<split>.json``, or
    ``datasets/scanrefer/ScanRefer_filtered.json`` when ``split`` is ``None``
    (both relative to the current working directory). Logs the number of
    records and the number of distinct ``scene_id``, ``object_id`` and
    ``ann_id`` values, then prints the first record.

    Args:
        split: Split suffix of the file name, or ``None`` for the unsplit file.

    Returns:
        set: The distinct ``scene_id`` values found in the file.
    """
    path = (
        "datasets/scanrefer/ScanRefer_filtered.json"
        if split is None
        else "datasets/scanrefer/ScanRefer_filtered_%s.json" % split
    )

    with open(path, 'r') as f:
        data = json.load(f)

    logger.info(f" len of {split} split : {len(data)}")

    all_scene = {record['scene_id'] for record in data}
    logger.info(f" scene number  of {split} split : {len(all_scene)}")

    # NOTE(review): uniqueness is taken over the whole file, not per scene --
    # confirm this is the intended "object number" / "anno number".
    unique_object_ids = {record['object_id'] for record in data}
    logger.info(f" object number  of {split} split : {len(unique_object_ids)}")

    unique_ann_ids = {record['ann_id'] for record in data}
    logger.info(f" anno number  of {split} split : {len(unique_ann_ids)}")

    # Show one example record so the schema is visible in the cell output.
    print(data[0])

    return all_scene



def generate_scanrefer_labeled_scene_txt(labeled_ratio, seed=None):
    """Randomly pick a fraction of the ScanRefer training scenes and write them to a text file.

    The scene ids come from ``get_scanrefer(split='train')``.
    ``int(num_scans * labeled_ratio)`` of them are drawn without replacement
    and written, one id per line, to
    ``datasets/scanrefer/ScanRefer_filtered_train_<labeled_ratio>.txt``.

    Args:
        labeled_ratio: Fraction of scenes to select, in ``[0, 1]``.
        seed: Optional seed for a private random generator. With the default
            ``None`` the global ``np.random`` state is used, so an earlier
            ``np.random.seed(...)`` call is still honoured.

    Raises:
        ValueError: If ``labeled_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= labeled_ratio <= 1:
        raise ValueError(
            'labeled_ratio must be within [0, 1], got {!r}'.format(labeled_ratio))

    # A set has no stable iteration order across interpreter runs (string
    # hashes are randomised), so sort first; otherwise even a fixed seed
    # would pick different scenes on every run.
    scene_names = sorted(get_scanrefer(split='train'))

    num_scans = len(scene_names)
    num_labeled_scans = int(num_scans * labeled_ratio)

    rng = np.random if seed is None else np.random.RandomState(seed)
    # Draw num_labeled_scans distinct scene indices out of num_scans.
    choices = rng.choice(num_scans, num_labeled_scans, replace=False)

    labeled_scan_names = [scene_names[i] for i in choices]

    with open('datasets/scanrefer/ScanRefer_filtered_train_{}.txt'.format(labeled_ratio), 'w') as f:
        f.write('\n'.join(labeled_scan_names))

    logger.info('\tSelected {} labeled scans, remained {} unlabeled scans'.format(
        len(labeled_scan_names), num_scans - len(labeled_scan_names)))


    
    
        


# get_scanrefer(split='val')
# get_scanrefer(split='test')
# get_scanrefer(split='train')
# get_scanrefer()



In [5]:

# for x in np.linspace(0.1,0.9,9):
#     generate_scanrefer_labeled_scene_txt(round(x,1))