In [1]:
import concurrent
import concurrent.futures
import json
import os
import sys

import cv2
import numpy as np

# data_loader lives two directories up; put that directory first on the import path.
sys.path.insert(0, '../../')
from data_loader import get_data_to_load, update_data_to_load

# Image/result pairs are kept only when the largest per-channel variance
# of the image is strictly above this value.
MAX_VARIANCE_THRESHOLD = 300

# Options common to both get_data_to_load calls below.
_LOADER_OPTIONS = dict(
  file_location='../01_enriching/.data/',
  image_file_location='../../1_data_collection/.data/',
  from_remote_only=True,
  download_link='env',
  allow_file_location_env=True,
  allow_image_file_location_env=True,
  allow_json_file_location_env=True,
)

files = get_data_to_load(loading_file='../03_mapping/data_list', **_LOADER_OPTIONS)
files_non_mapped = get_data_to_load(loading_file='./pre_filtering_data_list', **_LOADER_OPTIONS)

# Remember the order the loader returned, so it can be restored after filtering.
original_order = files.copy()
original_order_non_mapped = files_non_mapped.copy()

# Sort by file name so that the image list and the JSON list line up index by index.
files.sort()
files_non_mapped.sort()

# Split each listing into its image and JSON halves.
images = [path for path in files if path.endswith('.png')]
images_non_mapped = [path for path in files_non_mapped if path.endswith('.png')]
jsons = [path for path in files if path.endswith('.json')]
jsons_non_mapped = [path for path in files_non_mapped if path.endswith('.json')]

print(len(images))
print(len(images_non_mapped))

# Accumulators filled while the images are processed.
filtered_files, filtered_files_non_mapped = [], []
variance_map, variance_map_non_mapped = {}, {}

def process_image(image_path, json_path):
  """Compute the largest per-channel pixel variance of an image.

  Args:
    image_path: Path of the image file to read.
    json_path: Path of the JSON file paired with the image; passed through
      unchanged so the caller can keep the pair together.

  Returns:
    ``[image_path, json_path, max_variance]``, or ``None`` when the image
    cannot be read (callers skip ``None`` results).
  """
  image = cv2.imread(image_path)

  # cv2.imread signals an unreadable or missing file by returning None instead
  # of raising; report it and let the caller skip this pair rather than
  # aborting the whole run.
  if image is None:
    print("Could not read image:", image_path)
    return None

  # One variance per colour channel. No BGR -> RGB conversion is needed: the
  # maximum over the channels does not depend on their order.
  variance_colors = np.var(image, axis=(0, 1))

  max_variance = np.max(variance_colors)

  return [image_path, json_path, max_variance]

def _filter_by_variance(image_paths, json_paths, variance_by_name, kept_files):
  """Run process_image on every image/JSON pair and keep the pairs above the threshold.

  Args:
    image_paths: Image paths, aligned index by index with ``json_paths``.
    json_paths: JSON paths paired with ``image_paths``.
    variance_by_name: Dict updated in place with image base name -> max variance.
    kept_files: List extended in place with the image and JSON path of every
      pair whose max variance is above MAX_VARIANCE_THRESHOLD.
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    futures = [
      executor.submit(process_image, image_path, json_path)
      for image_path, json_path in zip(image_paths, json_paths)
    ]

    # Consume in submission order so the printed report and kept_files follow
    # the order of the inputs.
    for future in futures:
      outcome = future.result()
      if outcome is None:
        continue

      image_path, json_path, max_variance = outcome
      image_name = os.path.basename(image_path)
      variance_by_name[image_name] = max_variance

      if max_variance > MAX_VARIANCE_THRESHOLD:
        kept_files.extend((image_path, json_path))
      else:
        print("Too low max RGB variance:", image_name)
        print("Corresponding result:", os.path.basename(json_path))
        print("Max RGB variance:", max_variance)


def _restore_order(paths, reference_order):
  """Return ``paths`` sorted by the position each path has in ``reference_order``.

  Positions are looked up in a dict built once, instead of calling
  ``list.index`` for every element, which is quadratic on large listings.
  When a path occurs more than once in ``reference_order`` its first position
  is used, matching ``list.index``.
  """
  first_position = {}
  for position, path in enumerate(reference_order):
    first_position.setdefault(path, position)
  return sorted(paths, key=first_position.__getitem__)


_filter_by_variance(images, jsons, variance_map, filtered_files)
_filter_by_variance(images_non_mapped, jsons_non_mapped, variance_map_non_mapped, filtered_files_non_mapped)

filtered_images = [path for path in filtered_files if path.endswith('.png')]
filtered_images_non_mapped = [path for path in filtered_files_non_mapped if path.endswith('.png')]
print(len(filtered_images))
print(len(filtered_images_non_mapped))

# Persist the per-image variances of both listings as JSON.
with open('variance_map', 'w', encoding='utf8') as f:
  json.dump(variance_map, f, ensure_ascii=False)

with open('variance_map_non_mapped', 'w', encoding='utf8') as f:
  json.dump(variance_map_non_mapped, f, ensure_ascii=False)

# Sort back to original order (some files were removed)
filtered_files = _restore_order(filtered_files, original_order)
filtered_files_non_mapped = _restore_order(filtered_files_non_mapped, original_order_non_mapped)

update_data_to_load(filtered_files, old_loading_file='../03_mapping/data_list', file_location='../01_enriching/data/', image_file_location='../../1_data_collection/data/', from_remote_only=True, download_link='env', allow_file_location_env=True, allow_image_file_location_env=True, allow_json_file_location_env=True)
update_data_to_load(filtered_files_non_mapped, new_loading_file='./updated_data_list_non_mapped', old_loading_file='./pre_filtering_data_list', file_location='../01_enriching/data/', image_file_location='../../1_data_collection/data/', from_remote_only=True, download_link='env', allow_file_location_env=True, allow_image_file_location_env=True, allow_json_file_location_env=True)

Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 274796
All local files: 403361
Filtering out unpaired files
Filtered out 17666 unpaired files
Relevant files: 257130
Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 274796
All local files: 403361
Filtering out unpaired files
Filtered out 17666 unpaired files
Relevant files: 257130
30953
128565
Too low max RGB variance: geoguessr_location_singleplayer_0cZnbBk46V6mYOYU_4.png
Corresponding result: geoguessr_result_singleplayer_0cZnbBk46V6mYOYU_4.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0fPz2yJE6Od1b2xC_1.png
Corresponding result: geoguessr_result_singleplayer_0fPz2yJE6Od1b2xC_1.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0fPz2yJE6Od1b2xC_2.png
Corresponding result: geoguessr_result_singleplayer_0fPz2yJE6Od1b2xC_2.json
Max RGB variance: 0.0
Too 