In [1]:
import os
import json

class CountDatasets:
    """Compare the images listed in a JSON index with the files on disk.

    On construction the instance lists the directory
    ``<dataset_path>/<file_type>`` and loads
    ``<json_file_path>/<file_type>.json``. Images that appear in the JSON
    ``"images"`` list but have no matching file are collected in
    ``lost_json['images']`` and can be written to
    ``<save_path>/<file_type>_lost.json``.

    A file matches an image when the part of the file name after the last
    ``'_'`` (the whole name if it contains no ``'_'``) equals the image's
    ``"id"`` value.
    """

    def __init__(self,
                 file_type,
                 dataset_path,
                 json_file_path,
                 save_path):
        """Store the locations, then list the dataset folder and load the index.

        Args:
            file_type: Name shared by the dataset sub-directory and the JSON
                file stem (the notebook uses 'train', 'val' and 'test').
            dataset_path: Directory containing the ``<file_type>`` folder.
            json_file_path: Directory containing ``<file_type>.json``.
            save_path: Directory where ``<file_type>_lost.json`` is written.

        Raises:
            OSError: If the dataset folder or the JSON file cannot be read.
            ValueError: If the JSON file is not valid JSON.
        """
        self._file_type = file_type
        self._dataset_path = dataset_path
        self._json_file_path = json_file_path
        self._save_path = save_path
        self.lost_json = {}
        self._get_dataset_list()
        self._get_json_file()

    def _get_dataset_list(self):
        """Store the entry names of ``<dataset_path>/<file_type>`` in ``dataset_list``."""
        self.dataset_list = os.listdir(os.path.join(self._dataset_path, self._file_type))

    def _get_json_file(self):
        """Load ``<json_file_path>/<file_type>.json`` into ``json_file``."""
        filename = os.path.join(self._json_file_path, '%s.json' % self._file_type)
        # Decode as UTF-8 explicitly instead of relying on the platform locale.
        with open(filename, encoding='utf-8') as f:
            json_file = json.load(f)
            print("<%s> is loaded." % filename)
        self.json_file = json_file

    def find_lost(self):
        """Collect index entries that have no matching file on disk.

        The result is stored in ``self.lost_json['images']``. The loaded index
        (``self.json_file``) is only read, never modified.
        """
        # Ids present on disk, taken from the text after the last underscore.
        downloaded_ids = {name.split('_')[-1] for name in self.dataset_list}

        # Keyed by id so that an id repeated in the index is reported once.
        json_id_map = {image.get("id"): image
                       for image in self.json_file.get("images") or []}

        self.lost_json['images'] = [image
                                    for image_id, image in json_id_map.items()
                                    if image_id not in downloaded_ids]

    def get_diff(self):
        """Print index/disk totals and their difference, then the lost count."""
        # Show the difference between the two counts.
        json_file_number = len(self.json_file.get("images") or [])
        dataset_list_number = len(self.dataset_list)
        print("Total of json file:", json_file_number)
        print("Total of dataset file:", dataset_list_number)
        print("Diff between json and download file:", (json_file_number - dataset_list_number))

        self.find_lost()
        print("Total number of lost images:", len(self.lost_json.get('images')))

    def save_lost_file(self):
        """Write ``lost_json`` to ``<save_path>/<file_type>_lost.json``.

        If the lost list has not been computed yet it is computed first, so
        the saved file always contains an ``"images"`` list.
        """
        if 'images' not in self.lost_json:
            self.find_lost()
        filename = os.path.join(self._save_path, '%s_lost.json' % self._file_type)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.lost_json, f)
            print("[ %s ] is saved." % filename)

    def diff_and_save(self):
        """Run ``get_diff`` and then ``save_lost_file``."""
        self.get_diff()
        self.save_lost_file()
        

In [2]:
# Validation split: print the index/disk comparison and write val_lost.json.
val_dataset = CountDatasets(
    file_type='val',
    dataset_path='/tf/imaterialist-product-2019/datasets',
    json_file_path='/tf/imaterialist-product-2019/init_files',
    save_path='/tf/imaterialist-product-2019/preprocess',
)
val_dataset.diff_and_save()  # get_diff() followed by save_lost_file()

</tf/imaterialist-product-2019/init_files/val.json> is loaded.
Total of json file: 10095
Total of dataset file: 10095
Diff between json and download file: 0
Total number of lost images: 0
[ /tf/imaterialist-product-2019/preprocess/val_lost.json ] is saved.


In [3]:
# Training split: print the index/disk comparison and write train_lost.json.
train_dataset = CountDatasets(
    file_type='train',
    dataset_path='/tf/imaterialist-product-2019/datasets',
    json_file_path='/tf/imaterialist-product-2019/init_files',
    save_path='/tf/imaterialist-product-2019/preprocess',
)
train_dataset.diff_and_save()  # get_diff() followed by save_lost_file()

</tf/imaterialist-product-2019/init_files/train.json> is loaded.
Total of json file: 1011532
Total of dataset file: 1008641
Diff between json and download file: 2891
Total number of lost images: 2891
[ /tf/imaterialist-product-2019/preprocess/train_lost.json ] is saved.


In [4]:
# Test split: print the index/disk comparison and write test_lost.json.
test_dataset = CountDatasets(
    file_type='test',
    dataset_path='/tf/imaterialist-product-2019/datasets',
    json_file_path='/tf/imaterialist-product-2019/init_files',
    save_path='/tf/imaterialist-product-2019/preprocess',
)
test_dataset.diff_and_save()  # get_diff() followed by save_lost_file()

</tf/imaterialist-product-2019/init_files/test.json> is loaded.
Total of json file: 90834
Total of dataset file: 90833
Diff between json and download file: 1
Total number of lost images: 1
[ /tf/imaterialist-product-2019/preprocess/test_lost.json ] is saved.
