In [1]:
import os
import numpy as np
import pandas as pd

# Dataset folder name: used below to build the dataset paths (which the
# loader functions join onto data_dir) and the output label directories.
base = 'SHREC14'
# Root directory that the dataset-relative paths are joined onto.
data_dir = 'data'

In [2]:
def get_df_sketches(data_dir, sk_path):
    """Index every sketch image found below ``data_dir/sk_path``.

    The directory tree is walked recursively and every file whose name ends
    in ``png`` becomes one row. The name of the folder that holds the file
    is recorded as ``split`` and the name of that folder's parent as ``cat``.

    Args:
        data_dir: Root directory that ``sk_path`` is joined onto.
        sk_path: Sketch folder, relative to ``data_dir``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``cat``, ``split`` and ``id``
        (file name without its extension), indexed by the path
        ``sk_path/cat/split/<file name>``.
    """
    records = []  # one (category, split, id, relative path) tuple per image
    for folder, _, names in os.walk(os.path.join(data_dir, sk_path)):
        parts = folder.split(os.path.sep)
        for name in names:
            if not name.endswith('png'):
                continue
            # Layout is <category>/<split>/<file>: read both from the folder.
            subset, category = parts[-1], parts[-2]
            stem = os.path.splitext(name)[0]
            rel_path = os.path.join(sk_path, category, subset, name)
            records.append((category, subset, stem, rel_path))

    columns = {
        'cat': [rec[0] for rec in records],
        'split': [rec[1] for rec in records],
        'id': [rec[2] for rec in records],
    }
    return pd.DataFrame(data=columns, index=[rec[3] for rec in records])

In [3]:
# Get the labels of the sketches: build the sketch folder path (relative to
# data_dir) and index every sketch image found below it.
sk_path = os.path.join(base, 'SHREC14LSSTB_SKETCHES', 'SHREC14LSSTB_SKETCHES')
df_sk = get_df_sketches(data_dir, sk_path)
print('sk_path: ', sk_path)
print('df_sk: ', df_sk)
print('shape: ', df_sk.shape)
# Disabled debug print, kept for reference (note: .loc['cat'] looks up a row
# label, not the 'cat' column):
# print('df_sk.loc[0]: ', df_sk.loc['cat'])

sk_path:  SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKETCHES
df_sk:                                                              cat  split     id
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...     airplane   test     14
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...     airplane   test     19
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...     airplane   test     23
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...     airplane   test     24
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...     airplane   test     25
...                                                         ...    ...    ...
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...  wrist_watch  train  19911
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...  wrist_watch  train  19913
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...  wrist_watch  train  19916
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...  wrist_watch  train  19918
SHREC14\SHREC14LSSTB_SKETCHES\SHREC14LSSTB_SKET...  wrist_watch  train  

In [4]:
def get_df_models(data_dir, cad_anno, cad_path):
    """Build the label table of the 3D models.

    The annotation file ``data_dir/cad_anno`` is parsed first: its first
    three lines are skipped, a line made of exactly three whitespace-separated
    tokens starts a new category (named by the first token), and every other
    non-blank line is a model key that belongs to the current category.

    Then ``data_dir/cad_path`` is walked recursively and every file whose
    name ends in ``off`` becomes one row. The model id is the file name
    without its extension; the id with its first character dropped is the key
    looked up in the annotation table.

    Directories and files are visited in sorted order so that the row order
    (and therefore any seeded random split computed from it) does not depend
    on the directory-listing order of the file system.

    Args:
        data_dir: Root directory that the two other paths are joined onto.
        cad_anno: Annotation file, relative to ``data_dir``.
        cad_path: Model folder, relative to ``data_dir``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``cat`` and ``id``, indexed
        by the model path relative to ``data_dir`` (sub-folders included).

    Raises:
        KeyError: If a model file has no entry in the annotation file.
    """
    # --- read the annotation file ---
    fpath = os.path.join(data_dir, cad_anno)

    with open(fpath, 'r') as f:
        content = f.readlines()

    labels = {}
    current_cat = ''
    for line in content[3:]:  # the first three lines are skipped as header
        tokens = line.split()
        if len(tokens) == 3:
            # Category header line: the first token is the category name.
            current_cat = tokens[0]
        elif tokens:
            # Any other non-blank line is a model key of the current category.
            labels[line.strip()] = current_cat

    # --- read the model folder ---
    cat = []
    ids = []
    paths = []
    top = os.path.join(data_dir, cad_path)
    for root, dirs, files in os.walk(top):
        dirs.sort()  # in-place sort makes os.walk descend in sorted order
        rel_dir = os.path.relpath(root, top)
        for f in sorted(files):
            if not f.endswith('off'):
                continue
            model_id = os.path.splitext(f)[0]
            key = model_id[1:]  # drop the leading character of the file stem
            if key not in labels:
                raise KeyError(
                    'model %r (key %r) has no category in %s'
                    % (os.path.join(root, f), key, fpath))
            ids.append(model_id)
            cat.append(labels[key])
            # Keep the sub-folder in the stored path so that it points at
            # the real file even when the model folder is nested.
            if rel_dir == os.curdir:
                paths.append(os.path.join(cad_path, f))
            else:
                paths.append(os.path.join(cad_path, rel_dir, f))

    df = pd.DataFrame(data={'cat': cat, 'id': ids},
                      index=paths)
    return df

In [5]:

# Get the labels of the 3D (CAD) models.
# The paths below are relative to data_dir: the model folder, the evaluation
# folder, and the .cla annotation file stored inside the evaluation folder.
cad_path = os.path.join(base, 'SHREC14LSSTB_TARGET_MODELS')
eval_path = os.path.join(base, 'SHREC14_Sketch_Evaluation_CVIU')
cad_anno = os.path.join(eval_path, 'SHREC14_SBR_Model.cla')
print('cad_path: ', cad_path)
print('cad_anno: ', cad_anno)
print('eval_path: ', eval_path)
# Build the model label table from the annotation file and the model folder
df_cad = get_df_models(data_dir, cad_anno, cad_path)
print('df_cad: ', df_cad)

cad_path:  SHREC14\SHREC14LSSTB_TARGET_MODELS
cad_anno:  SHREC14\SHREC14_Sketch_Evaluation_CVIU\SHREC14_SBR_Model.cla
eval_path:  SHREC14\SHREC14_Sketch_Evaluation_CVIU
df_cad:                                                        cat       id
SHREC14\SHREC14LSSTB_TARGET_MODELS\M000001.off  bookshelf  M000001
SHREC14\SHREC14LSSTB_TARGET_MODELS\M000002.off      table  M000002
SHREC14\SHREC14LSSTB_TARGET_MODELS\M000003.off   airplane  M000003
SHREC14\SHREC14LSSTB_TARGET_MODELS\M000004.off      train  M000004
SHREC14\SHREC14LSSTB_TARGET_MODELS\M000005.off    bicycle  M000005
...                                                   ...      ...
SHREC14\SHREC14LSSTB_TARGET_MODELS\M008983.off      chair  M008983
SHREC14\SHREC14LSSTB_TARGET_MODELS\M008984.off    bicycle  M008984
SHREC14\SHREC14LSSTB_TARGET_MODELS\M008985.off      table  M008985
SHREC14\SHREC14LSSTB_TARGET_MODELS\M008986.off      sword  M008986
SHREC14\SHREC14LSSTB_TARGET_MODELS\M008987.off      horse  M008987

[8987 rows x 2 co

In [6]:

# Directory that receives the label files of the full dataset.
# exist_ok=True replaces the separate existence check.
save_dir = os.path.join('labels', base)
os.makedirs(save_dir, exist_ok=True)

# Store the DataFrames in .hdf5 files so that they can be reloaded later.
# To read them back: pd.read_hdf(file_name, key)
sk_h5_path = os.path.join(save_dir, 'sk_orig.hdf5')
cad_h5_path = os.path.join(save_dir, 'cad_orig.hdf5')
# `key` is passed by keyword: passing it positionally is deprecated in
# recent pandas releases.
df_sk.to_hdf(sk_h5_path, key='sk')
df_cad.to_hdf(cad_h5_path, key='cad')
print(sk_h5_path)
print(cad_h5_path)

# Save the paths of the model files, one per line
with open(os.path.join(save_dir, 'cad.txt'), 'w') as f:
    f.writelines('%s\n' % item for item in df_cad.index)

labels\SHREC14\sk_orig.hdf5
labels\SHREC14\cad_orig.hdf5


In [7]:
def split_models(df_sk, df_cad, min_models=50, test_frac=0.2, seed=1234,
                 verbose=True):
    """Keep the well-populated classes and split their 3D models train/test.

    Many classes contain only a few 3D models, which is too little to learn
    from, so only the classes with strictly more than ``min_models`` models
    are kept. Sketches and models are both filtered to those classes. Every
    kept model is first marked ``'train'``; then, for each class,
    ``floor(n * test_frac)`` of its ``n`` models are drawn at random without
    replacement and marked ``'test'``.

    Args:
        df_sk: Sketch table with a ``cat`` column.
        df_cad: Model table with ``cat`` and ``id`` columns.
        min_models: A class is kept when it has more than this many models.
        test_frac: Fraction (between 0 and 1) of each class marked as test.
        seed: Value passed to ``np.random.seed`` before drawing; ``None``
            leaves the global random state untouched.
        verbose: When true, print the intermediate tables (original
            behaviour).

    Returns:
        ``(new_df_sk, new_df_cad)``: filtered copies of the inputs;
        ``new_df_cad`` has an additional ``split`` column. The inputs are not
        modified.

    Raises:
        ValueError: If ``test_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= test_frac <= 1:
        raise ValueError('test_frac must be between 0 and 1, got %r'
                         % (test_frac,))

    # Unique, sorted class labels of the 3D models and their sample counts.
    # vv: de-duplicated class labels
    # cc: number of models in each class
    vv, cc = np.unique(df_cad['cat'], return_counts=True)
    if verbose:
        print('vv: ', vv)
        print('cc: ', cc)

    # Keep only the classes that have more than `min_models` models.
    keep = cc > min_models
    coi = vv[keep]
    n_coi = cc[keep]
    if verbose:
        print('coi: ', coi)
        print('n_coi: ', n_coi)
        print(df_sk['cat'])

    # Select the rows of the kept classes (copies, so inputs stay untouched).
    new_df_sk = df_sk.loc[df_sk['cat'].isin(coi)].copy()
    new_df_cad = df_cad.loc[df_cad['cat'].isin(coi)].copy()
    if verbose:
        print('new_df_cad(before split): ', new_df_cad)

    # Randomly split instances: everything starts as train, then a random
    # share of each class is moved to test.
    if seed is not None:
        np.random.seed(seed)
    new_df_cad['split'] = 'train'
    for c, n in zip(coi, n_coi):
        to_select = int(np.floor(n * test_frac))
        in_class = new_df_cad['cat'] == c
        subset = new_df_cad.loc[in_class, 'id']
        if verbose:
            print('subset lebel: ', c, ': ', subset)
        # replace=False: the same model cannot be drawn twice.
        id_to_select = np.random.choice(subset, size=to_select, replace=False)
        # Restrict the marking to the current class so that an id shared
        # with another class cannot leak extra test rows into that class.
        picked = in_class & new_df_cad['id'].isin(id_to_select)
        new_df_cad.loc[picked, 'split'] = 'test'
    if verbose:
        print('------------------------------------')
        print('new_df_sk(after split): ')
        print(new_df_sk)
        print('------------------------------------')
        print('new_df_cad(after split): ')
        print(new_df_cad)

    return new_df_sk, new_df_cad

In [8]:
# Split the CAD models between train and test,
# following Qi et al. BMVC 2018
new_df_sk, new_df_cad = split_models(df_sk, df_cad)

# Store the label tables of the split dataset under labels/PART-<base>.
# exist_ok=True replaces the separate existence check.
save_dir = os.path.join('labels', 'PART-' + base)
print('save_dir: ', save_dir)
os.makedirs(save_dir, exist_ok=True)

# `key` is passed by keyword: passing it positionally is deprecated in
# recent pandas releases.
new_df_sk.to_hdf(os.path.join(save_dir, 'sk_orig.hdf5'), key='sk')
new_df_cad.to_hdf(os.path.join(save_dir, 'cad_orig.hdf5'), key='cad')

vv:  ['airplane' 'alarm_clock' 'ant' 'apple' 'armchair' 'ashtray' 'axe'
 'banana' 'barn' 'baseball_bat' 'basket' 'bathtub' 'bear_animal' 'bed'
 'bee' 'beer_mug' 'bench' 'bicycle' 'blimp' 'book' 'bookshelf' 'bowl'
 'brain' 'bread' 'bridge' 'bus' 'bush' 'butterfly' 'cabinet' 'cake'
 'camel' 'camera' 'candle' 'cannon' 'car_sedan' 'castle' 'cell_phone'
 'chair' 'chandelier' 'church' 'cigarette' 'computer_monitor' 'couch'
 'cow' 'crab' 'crocodile' 'cup' 'diamond' 'dog' 'dolphin' 'door'
 'door_handle' 'dragon' 'duck' 'ear' 'elephant' 'eyeglasses' 'face' 'fan'
 'fire_hydrant' 'fish' 'floor_lamp' 'flower_with_stem' 'flying_bird'
 'flying_saucer' 'fork' 'frog' 'giraffe' 'grapes' 'guitar' 'hammer' 'hand'
 'hat' 'head' 'helicopter' 'helmet' 'horse' 'hot_air_balloon' 'hourglass'
 'house' 'human_skeleton' 'ice_cream_cone' 'kangaroo' 'key' 'keyboard'
 'knife' 'ladder' 'laptop' 'leaf' 'lightbulb' 'lighter' 'lion' 'mailbox'
 'microphone' 'microscope' 'monkey' 'motorbike' 'mug' 'mushroom' 'octopus'
 'o