In [1]:
import ast
import json
import os
import os.path as osp
from collections import Counter

import numpy as np
import pandas as pd
from loguru import logger


# Switch the working directory to the parent of the current one so that the
# relative paths used below ('data/meta_data/...', 'datasets/...') resolve there.
# NOTE: this is not idempotent -- every re-run of this cell climbs one more level.
# presumably the notebook sits one level below the project root -- TODO confirm.
os.chdir(osp.dirname(os.getcwd()))

In [2]:

def get_split_list(dataset_name="sr3d", split='train'):
    """Load the scan ids of one predefined train/test split.

    Reads ``data/meta_data/<dataset_name>_<split>_scans.txt`` relative to the
    current working directory. The file has to contain a Python literal
    collection (e.g. a list) of scan ids. It is parsed with
    ``ast.literal_eval`` rather than ``eval`` so that the file content can
    never be executed as code.

    Args:
        dataset_name: Prefix of the split file name, e.g. ``"sr3d"``/``"nr3d"``.
        split: Split part of the file name, e.g. ``"train"`` or ``"test"``.

    Returns:
        set: The unique scan ids listed in the file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        ValueError, SyntaxError: If the file is not a plain Python literal.
    """
    split_path = 'data/meta_data/%s_%s_scans.txt' % (dataset_name, split)
    with open(split_path) as f:
        scan_ids = set(ast.literal_eval(f.read().strip()))
    logger.info(f" len of {dataset_name,split} : {len(scan_ids)}")
    return scan_ids


def get_ratio_split_list(ratio, dataset_name="sr3d", split='train'):
    """Load the entries of a ratio-specific subset file.

    Reads ``data/meta_data/<dataset_name>_<split>_<ratio>.txt`` (ratio rendered
    with one decimal place) relative to the current working directory; the
    file holds one entry per line.

    Blank lines are skipped so that an empty file yields ``[]`` and a trailing
    newline does not add a phantom ``''`` entry to the result and to the
    logged count.

    Args:
        ratio: Labeled ratio; formatted with ``%.1f`` into the file name.
        dataset_name: Prefix of the file name, e.g. ``"sr3d"`` or ``"nr3d"``.
        split: Split part of the file name.

    Returns:
        list[str]: The non-blank lines of the file, in file order.

    Raises:
        FileNotFoundError: If the subset file does not exist.
    """
    ratio_path = 'data/meta_data/%s_%s_%.1f.txt' % (dataset_name, split, ratio)
    with open(ratio_path) as f:
        scan_ids = [line.strip() for line in f.read().split("\n") if line.strip()]
    logger.info(f" len of {dataset_name,split,ratio} : {len(scan_ids)}")
    return scan_ids


def read_txt(file_name):
    """Read a text file that stores one entry per line.

    Blank lines are skipped, so an empty file gives ``[]`` and a trailing
    newline does not produce a phantom ``''`` entry (which would also inflate
    the logged length).

    Args:
        file_name: Path of the text file to read.

    Returns:
        list[str]: The non-blank lines of the file, in file order.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
    """
    with open(file_name, 'r') as f:
        scan_ids = [line.strip() for line in f.read().split("\n") if line.strip()]
    logger.info(f" len of {file_name} : {len(scan_ids)}")
    return scan_ids


def get_refer_it_3D(split_name='sr3d'):
    """Load one ReferIt3D annotation table.

    Reads ``datasets/refer_it_3d/<split_name>.csv`` relative to the current
    working directory (the original note describes it as the detailed
    annotation data provided by the SR3D authors).

    Earlier revisions also computed per-scan sample statistics (min/max/mean)
    that were never used; that dead code is gone, which also means a
    header-only CSV now loads instead of failing inside ``.min()``.

    Args:
        split_name: CSV base name without extension, e.g. ``"sr3d"``,
            ``"sr3d+"`` or ``"nr3d"``.

    Returns:
        pandas.DataFrame: The content of the CSV file.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    refer_it_3d_root = "datasets/refer_it_3d"
    return pd.read_csv(osp.join(refer_it_3d_root, split_name + ".csv"))
    
      

def save_txt(path, data):
    """Write the string ``data`` to ``path``, replacing any existing file."""
    with open(path, mode='w') as handle:
        handle.write(data)
    

def generate_NR3D_labeled_scene_txt(labeled_ratio):
    """Randomly pick a fraction of the NR3D train scans and save their names.

    The chosen scan names are written, one per line, to
    ``data/meta_data/nr3d_train_<labeled_ratio>.txt``.

    Args:
        labeled_ratio: Fraction of the train scans to select;
            ``int(num_scans * labeled_ratio)`` scans are drawn without
            replacement.

    Returns:
        None. The result is written to disk and a summary is logged.
    """
    # The split comes back as a set, whose iteration order for strings can
    # change from one interpreter run to the next. Sorting makes the
    # index -> scan mapping stable, so seeding NumPy's global RNG before the
    # call reproduces the same selection.
    nr3d_ids = sorted(get_split_list(dataset_name='nr3d'))

    num_scans = len(nr3d_ids)
    num_labeled_scans = int(num_scans * labeled_ratio)

    # Draw num_labeled_scans distinct scan indices out of num_scans.
    choices = np.random.choice(num_scans, num_labeled_scans, replace=False)
    labeled_scan_names = [nr3d_ids[i] for i in choices]

    save_txt('data/meta_data/nr3d_train_{}.txt'.format(labeled_ratio), '\n'.join(labeled_scan_names))

    logger.info('\tSelected {} labeled scans, remained {} unlabeled scans'.format(len(labeled_scan_names),num_scans- len(labeled_scan_names)))


    
    
# sr3d_ids = get_split_list()
# sr3d_ids_test = get_split_list(split = 'test')
# nr3d_ids = get_split_list(dataset_name='nr3d')
# nr3d_ids_test = get_split_list(dataset_name='nr3d',split = 'test')

# get_refer_it_3D('sr3d+')
# get_refer_it_3D('sr3d')
# get_refer_it_3D('sr3d_test')
# get_refer_it_3D('sr3d_train')


In [3]:

def statstic_nr3d():
    """Print how many NR3D annotations belong to the train split.

    Every row of the NR3D table whose ``scan_id`` is in the NR3D train split
    counts as train; every other row counts as test.

    Returns:
        None. The two counts are printed.
    """
    nr3d = get_refer_it_3D('nr3d')
    nr3d_train = get_split_list(dataset_name='nr3d')

    # One vectorised membership test instead of a per-row `iloc` lookup.
    in_train = nr3d['scan_id'].isin(nr3d_train)
    len_train = int(in_train.sum())
    len_test = int(len(nr3d) - len_train)
    print(f"len of train : {len_train}, len of test : {len_test}")



def statstic_nr3d_by_ratio(ratio, datasets='nr3d'):
    """Print how many annotations of a dataset fall into a ratio subset.

    Counts the rows of the ``datasets`` annotation table whose ``scan_id``
    appears in the subset file for ``ratio``.

    Args:
        ratio: Labeled ratio identifying the subset file.
        datasets: Dataset name, used both for the annotation CSV and for the
            subset file, e.g. ``"nr3d"`` or ``"sr3d"``.

    Returns:
        None. The count is printed.
    """
    nr3d = get_refer_it_3D(datasets)
    nr3d_ratio = get_ratio_split_list(ratio, dataset_name=datasets)

    # Vectorised membership test; the original scanned a Python list once per
    # row through `iloc`.
    num = int(nr3d['scan_id'].isin(nr3d_ratio).sum())

    print(f"len of {datasets,ratio} : {num}")








In [4]:



def get_scene_data(all_data, scene='scene0525_00'):
    """Collect the annotation rows that belong to one scan.

    Args:
        all_data: DataFrame with a ``scan_id`` column.
        scene: Scan id to select.

    Returns:
        list[pandas.Series]: One row Series (``all_data.iloc[i]``) per row
        whose ``scan_id`` equals ``scene``, in table order.
    """
    # Compare the whole column at once and only build row objects for the
    # matching positions, instead of materialising every row via `iloc`.
    is_scene = (all_data['scan_id'] == scene).to_numpy()
    return [all_data.iloc[pos] for pos, hit in enumerate(is_scene) if hit]


def get_assignment_id(all_data, scene_list):
    """Return the assignment ids of all rows whose scan is in ``scene_list``.

    Args:
        all_data: DataFrame with ``scan_id`` and ``assignmentid`` columns.
        scene_list: Collection (list or set) of scan ids to keep.

    Returns:
        list: The ``assignmentid`` values of the matching rows, in table
        order, as plain Python scalars.
    """
    # Vectorised membership test and column selection instead of building a
    # row Series twice per row through `iloc`.
    selected = all_data['scan_id'].isin(scene_list)
    return all_data.loc[selected, 'assignmentid'].tolist()

# scene_0525  = get_scene_data(nr3d)
# all_assign_ids = get_assignment_id(nr3d)




In [5]:

def split_labeled_according_assignment_id(assign_ids, ratio=0.4):
    """Randomly select a fraction of the given assignment ids.

    Args:
        assign_ids: Sequence of assignment ids to sample from.
        ratio: Fraction to keep; ``int(len(assign_ids) * ratio)`` ids are drawn.

    Returns:
        numpy.ndarray: The selected ids, each input position picked at most
        once.
    """
    length = len(assign_ids)
    # replace=False: `np.random.choice` samples WITH replacement by default,
    # which would repeat ids and leave fewer unique labeled samples than
    # `ratio` asks for. The scene-level generators in this notebook already
    # sample without replacement.
    choices = np.random.choice(length, int(length * ratio), replace=False)

    return np.array(assign_ids)[choices]




# Load the NR3D annotation table and the scan ids of the NR3D train split.
nr3d = get_refer_it_3D('nr3d')
train_split = get_split_list(dataset_name='nr3d')

# Assignment ids of every NR3D row whose scan_id is in the train split.
nr3d_all_assign_ids = get_assignment_id(nr3d,train_split)




2022-10-17 12:09:31.995 | INFO     | __main__:get_split_list:9 -  len of ('nr3d', 'train') : 511


In [10]:


# for ratio in np.linspace(0.1,0.9,9):
#     ratio = round(ratio,1)
#     split_labeled_data = split_labeled_according_assignment_id(nr3d_all_assign_ids,ratio)
#     print(f"length  = {len(split_labeled_data)}")
#     save_txt(os.path.join('data/meta_data/nr3d_train_{}.txt'.format(ratio)),'\n'.join(split_labeled_data.astype(np.str0).tolist()))



# Persist every train-split assignment id, one per line. `str` replaces the
# `np.str0` alias, which was deprecated in NumPy 1.24 and removed in NumPy 2.0.
save_txt('data/meta_data/nr3d_train_all_assignmentid.txt', '\n'.join(np.array(nr3d_all_assign_ids).astype(str).tolist()))

In [10]:

# Count the NR3D rows whose assignment id is listed in the 0.2 subset file and
# whose `mentions_target_class` value is truthy.
data = read_txt('data/meta_data/nr3d_train_{}.txt'.format(0.2))

# Set lookup plus boolean masks replace the per-row `iloc` scan against a list,
# which cost (rows x listed ids) string comparisons.
labeled_assignment_ids = set(data)
in_labeled_subset = nr3d['assignmentid'].astype(str).isin(labeled_assignment_ids)
mentions_target = nr3d['mentions_target_class'].astype(bool)
# Stricter variant kept from the original cell (additionally require a correct guess):
#   in_labeled_subset & mentions_target & nr3d['correct_guess'].astype(bool)
sum_ = int((in_labeled_subset & mentions_target).sum())

print(sum_)

    

    

2022-10-17 12:04:49.376 | INFO     | __main__:read_txt:23 -  len of data/meta_data/nr3d_train_0.2.txt : 6583


5498


In [7]:

def get_scanrefer(split=None):
    """Load a ScanRefer annotation file.

    Args:
        split: Split name such as ``"train"``, ``"val"`` or ``"test"``; reads
            ``datasets/scanrefer/ScanRefer_filtered_<split>.json``. With
            ``None`` the unsplit ``ScanRefer_filtered.json`` is read. Paths
            are relative to the current working directory.

    Returns:
        The parsed JSON content. Other code in this notebook iterates it as a
        sequence of dicts that carry a ``scene_id`` key.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    if split is not None:
        path = "datasets/scanrefer/ScanRefer_filtered_%s.json" % split
    else:
        path = "datasets/scanrefer/ScanRefer_filtered.json"

    with open(path, 'r') as f:
        return json.load(f)

def get_ratio_scanrefer(ratio, split=None):
    """Print how many ScanRefer train annotations fall into a ratio subset.

    Counts the train annotations whose ``scene_id`` is listed in
    ``datasets/scanrefer/ScanRefer_filtered_train_<ratio>.txt`` (ratio rendered
    with one decimal place).

    Args:
        ratio: Labeled ratio identifying the subset file.
        split: Accepted for interface compatibility but not used; the train
            annotations are always loaded.

    Returns:
        None. The count is printed.
    """
    path = "datasets/scanrefer/ScanRefer_filtered_train_%.1f.txt" % (ratio)

    scanrefer = get_scanrefer(split='train')
    # A set gives constant-time membership tests; the original searched the
    # list returned by read_txt once per annotation.
    scan_ids = set(read_txt(path))

    num = sum(1 for refer in scanrefer if refer['scene_id'] in scan_ids)

    print(f"len of scanrefer {ratio} : {num}")
    



def generate_scanrefer_labeled_scene_txt(labeled_ratio):
    """Randomly pick a fraction of the ScanRefer train scenes and save them.

    The chosen scene ids are written, one per line, to
    ``datasets/scanrefer/ScanRefer_filtered_train_<labeled_ratio>.txt``.

    Args:
        labeled_ratio: Fraction of the train scenes to select;
            ``int(num_scans * labeled_ratio)`` scenes are drawn without
            replacement.

    Returns:
        None. The result is written to disk and a summary is logged.
    """
    annotations = get_scanrefer(split='train')
    # get_scanrefer returns annotation records, not scene names. Sampling the
    # records directly (as before) handed dicts to '\n'.join(), which raises
    # TypeError. Reduce them to the unique scene ids first; sorting keeps the
    # index -> scene mapping stable between runs.
    all_scenes = sorted({anno['scene_id'] for anno in annotations})

    num_scans = len(all_scenes)
    num_labeled_scans = int(num_scans * labeled_ratio)

    # Draw num_labeled_scans distinct scene indices out of num_scans.
    choices = np.random.choice(num_scans, num_labeled_scans, replace=False)
    labeled_scan_names = [all_scenes[i] for i in choices]

    with open('datasets/scanrefer/ScanRefer_filtered_train_{}.txt'.format(labeled_ratio), 'w') as f:
        f.write('\n'.join(labeled_scan_names))

    logger.info('\tSelected {} labeled scans, remained {} unlabeled scans'.format(len(labeled_scan_names),num_scans- len(labeled_scan_names)))


    

def generate_ratio_labeled_datasets():
    """Generate the labeled-subset files for every ratio from 0.1 to 0.9 at once.

    All NR3D subsets are produced first, then all ScanRefer subsets.
    """
    ratios = [round(step, 1) for step in np.linspace(0.1, 0.9, 9)]
    for labeled_ratio in ratios:
        generate_NR3D_labeled_scene_txt(labeled_ratio)
    for labeled_ratio in ratios:
        generate_scanrefer_labeled_scene_txt(labeled_ratio)

def stat():
    """Temporary helper: print the subset sizes for every ratio from 0.1 to 0.9.

    Covers sr3d and nr3d first (in that order), then ScanRefer.
    """
    ratios = [round(step, 1) for step in np.linspace(0.1, 0.9, 9)]

    for dataset in ('sr3d', 'nr3d'):
        for ratio in ratios:
            statstic_nr3d_by_ratio(ratio, dataset)

    for ratio in ratios:
        get_ratio_scanrefer(ratio)
        


# get_scanrefer(split='val')
# get_scanrefer(split='test')
# get_scanrefer()
# scanrefer = get_scanrefer(split='train')


