In [1]:
import concurrent
import concurrent.futures
import json
import os
import sys

import cv2
import numpy as np

sys.path.insert(0, '../../')
from data_loader import get_data_to_load, load_json_files, update_data_to_load

# An image is kept only if its largest per-channel pixel variance exceeds this value.
MAX_VARIANCE_THRESHOLD = 300
# An image is kept only if the variance of its Laplacian response exceeds this value.
LAPLACIAN_VARIANCE_THRESHOLD = 30

# Keyword arguments shared by both get_data_to_load calls below.
_COMMON_LOAD_KWARGS = dict(
  file_location='../01_enriching/.data/',
  image_file_location='../../1_data_collection/.data/',
  from_remote_only=True,
  download_link='default',
  allow_file_location_env=True,
  allow_image_file_location_env=True,
  allow_json_file_location_env=True,
  allow_download_link_env=True,
)

files = get_data_to_load(loading_file='../03_mapping/data_list', **_COMMON_LOAD_KWARGS)
files_non_mapped = get_data_to_load(loading_file='./pre_filtering_data_list', **_COMMON_LOAD_KWARGS)

# Remember the order in which the files were returned before sorting in place.
original_order = files.copy()
original_order_non_mapped = files_non_mapped.copy()

# Sort by file name so the image list and the JSON list below are both
# ordered the same way and can be zipped together.
files.sort()
files_non_mapped.sort()

# Split each collection into its image paths and its JSON paths.
images = [path for path in files if path.endswith('.png')]
images_non_mapped = [path for path in files_non_mapped if path.endswith('.png')]
jsons = [path for path in files if path.endswith('.json')]
jsons_non_mapped = [path for path in files_non_mapped if path.endswith('.json')]

print(len(images))
print(len(images_non_mapped))

# Paths (image followed by its JSON) that pass the variance checks.
filtered_files, filtered_files_non_mapped = [], []

# Image base name -> largest per-channel variance.
variance_map, variance_map_non_mapped = {}, {}

# Image base name -> variance of the Laplacian response.
laplacian_variance_map, laplacian_variance_map_non_mapped = {}, {}

# 3x3 Laplacian kernel used to detect blurry images.
laplacian_kernel = np.array([
  [0, 1, 0],
  [1, -4, 1],
  [0, 1, 0],
])
def process_image(image_path, json_path):
  """Compute the colour-variance and blur statistics of a single image.

  Args:
    image_path: Path of the image file, read with ``cv2.imread``.
    json_path: Path of the JSON file paired with the image; it is returned
      unchanged so the caller can keep the pair together.

  Returns:
    ``[image_path, json_path, max_variance, laplacian_variance]`` where
    ``max_variance`` is the largest of the per-channel pixel variances and
    ``laplacian_variance`` is the variance of the Laplacian-filtered
    grayscale image, both as plain Python floats. Returns ``None`` when the
    image cannot be read or converted.
  """
  image = cv2.imread(image_path)
  # cv2.imread reports failure by returning None rather than raising, so
  # handle that case explicitly instead of relying on cvtColor to fail.
  if image is None:
    print("Error reading image:", os.path.basename(image_path))
    print("cv2.imread returned no data (missing, unreadable or corrupt file)")
    return None

  try:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  except cv2.error as e:
    print("Error reading image:", os.path.basename(image_path))
    print(e)
    return None

  # One variance per colour channel, taken over all pixels.
  variance_colors = np.var(image, axis=(0, 1))

  # Cast to a plain float so both statistics have the same type.
  max_variance = float(np.max(variance_colors))

  # get variance to detect blurry images, use grayscale image
  gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
  laplacian = cv2.filter2D(gray, cv2.CV_32F, laplacian_kernel)

  laplacian_variance = float(np.var(laplacian))

  return [image_path, json_path, max_variance, laplacian_variance]

# Number of non-mapped image/JSON pairs that are analysed.
# NOTE(review): this cap looks like a debugging limit - confirm it is intended.
NON_MAPPED_ANALYSIS_LIMIT = 500


def _filter_by_variance(image_paths, json_paths, variances, laplacian_variances, kept_files):
  """Analyse image/JSON pairs in a thread pool and collect the ones that pass.

  Args:
    image_paths: Image paths, paired by position with ``json_paths``.
    json_paths: JSON paths, paired by position with ``image_paths``.
    variances: Dict updated in place with image base name -> max RGB variance.
    laplacian_variances: Dict updated in place with image base name ->
      Laplacian variance.
    kept_files: List extended in place with the image path followed by the
      JSON path of every pair that exceeds both thresholds.
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    futures = [
      executor.submit(process_image, image_path, json_path)
      for image_path, json_path in zip(image_paths, json_paths)
    ]

    # Consume the futures in submission order so the output order is stable.
    for future in futures:
      outcome = future.result()
      if outcome is None:
        # The image could not be read; process_image already reported it.
        continue

      image_path, json_path, max_variance, laplacian_variance = outcome
      image_name = os.path.basename(image_path)

      variances[image_name] = max_variance
      laplacian_variances[image_name] = laplacian_variance

      if max_variance <= MAX_VARIANCE_THRESHOLD:
        print("Too low max RGB variance:", image_name)
        print("Corresponding result:", os.path.basename(json_path))
        print("Max RGB variance:", max_variance)
      elif laplacian_variance <= LAPLACIAN_VARIANCE_THRESHOLD:
        print("Too low laplacian variance:", image_name)
        print("Corresponding result:", os.path.basename(json_path))
        print("Laplacian variance:", laplacian_variance)
      else:
        kept_files.append(image_path)
        kept_files.append(json_path)


_filter_by_variance(images, jsons, variance_map, laplacian_variance_map, filtered_files)

# The non-mapped results go into their own list; they were previously
# appended to filtered_files, leaving filtered_files_non_mapped empty.
_filter_by_variance(
  images_non_mapped[:NON_MAPPED_ANALYSIS_LIMIT],
  jsons_non_mapped[:NON_MAPPED_ANALYSIS_LIMIT],
  variance_map_non_mapped,
  laplacian_variance_map_non_mapped,
  filtered_files_non_mapped,
)
    
# Count the images currently held in each filtered list.
filtered_images = [path for path in filtered_files if path.endswith('.png')]
filtered_images_non_mapped = [path for path in filtered_files_non_mapped if path.endswith('.png')]
print(len(filtered_images))
print(len(filtered_images_non_mapped))

# Write every statistics map to its own JSON file.
for _map_file, _map_data in (
  ('variance_map', variance_map),
  ('variance_map_non_mapped', variance_map_non_mapped),
  ('laplacian_variance_map', laplacian_variance_map),
  ('laplacian_variance_map_non_mapped', laplacian_variance_map_non_mapped),
):
  with open(_map_file, 'w', encoding='utf8') as f:
    json.dump(_map_data, f, ensure_ascii=False)

# Snapshot of the lists produced by the variance filtering.
current_files = filtered_files[:]
current_files_non_mapped = filtered_files_non_mapped[:]

# NOTE(review): the lists below are rebuilt from `files` / `files_non_mapped`,
# not from `current_files` / `current_files_non_mapped`, so entries rejected
# by the variance checks are included again - confirm this is intended.
images = [path for path in files if path.endswith('.png')]
images_non_mapped = [path for path in files_non_mapped if path.endswith('.png')]
jsons = [path for path in files if path.endswith('.json')]
jsons_non_mapped = [path for path in files_non_mapped if path.endswith('.json')]

# Load the JSON content for every JSON path in each collection.
jsons_data = load_json_files(jsons)
jsons_data_non_mapped = load_json_files(jsons_non_mapped)

def remove_duplicates(current_images, current_jsons, current_jsons_data):
  """Keep only the first image/JSON pair seen for each distinct location.

  Two entries count as the same location when the string form of their
  ``'coordinates'`` value is identical.

  Args:
    current_images: Image paths, paired by position with the other arguments.
    current_jsons: JSON paths, paired by position with the other arguments.
    current_jsons_data: Loaded JSON objects, each with a ``'coordinates'`` key.

  Returns:
    A tuple ``(kept_paths, skipped)``: ``kept_paths`` lists, for every kept
    pair, the image path followed by the JSON path; ``skipped`` is the number
    of pairs dropped as duplicates.
  """
  seen_coordinates = set()
  kept_paths = []
  skipped = 0

  for img, meta_path, meta in zip(current_images, current_jsons, current_jsons_data):
    # The string form serves as a hashable key for the coordinates.
    location_key = str(meta['coordinates'])
    if location_key not in seen_coordinates:
      seen_coordinates.add(location_key)
      kept_paths.extend((img, meta_path))
    else:
      skipped += 1

  return kept_paths, skipped

# Keep only the first image/JSON pair for every distinct location.
filtered_files, duplicate_locations = remove_duplicates(images, jsons, jsons_data)

print("Duplicate locations:", duplicate_locations)

filtered_files_non_mapped, duplicate_locations_non_mapped = remove_duplicates(images_non_mapped, jsons_non_mapped, jsons_data_non_mapped)

print("Duplicate locations non mapped:", duplicate_locations_non_mapped)

# Recount the images from the deduplicated lists; the previous counts were
# taken before deduplication and would be printed unchanged otherwise.
filtered_images = [file for file in filtered_files if file.endswith('.png')]
filtered_images_non_mapped = [file for file in filtered_files_non_mapped if file.endswith('.png')]

print(len(filtered_images))
print(len(filtered_images_non_mapped))

# Sets give constant-time membership tests for the reordering below.
filtered_files_set = set(filtered_files)
filtered_files_non_mapped_set = set(filtered_files_non_mapped)

# Sort back to original order (some files were removed)
filtered_files = [file for file in original_order if file in filtered_files_set]
filtered_files_non_mapped = [file for file in original_order_non_mapped if file in filtered_files_non_mapped_set]

# NOTE(review): these calls use 'data/' directories while the files were
# loaded with '.data/' directories - confirm the difference is intended.
update_data_to_load(filtered_files, old_loading_file='../03_mapping/data_list', file_location='../01_enriching/data/', image_file_location='../../1_data_collection/data/', from_remote_only=True, download_link='default', allow_file_location_env=True, allow_image_file_location_env=True, allow_json_file_location_env=True, allow_download_link_env=True)
update_data_to_load(filtered_files_non_mapped, new_loading_file='./updated_data_list_non_mapped', old_loading_file='./pre_filtering_data_list', file_location='../01_enriching/data/', image_file_location='../../1_data_collection/data/', from_remote_only=True, download_link='default', allow_file_location_env=True, allow_image_file_location_env=True, allow_json_file_location_env=True, allow_download_link_env=True)

All local files: 623972
Filtering out unpaired files
Filtered out 17972 unpaired files
Relevant files: 403796
All local files: 623972
Filtering out unpaired files
Filtered out 17972 unpaired files
Relevant files: 403796
54538
201898
Too low max RGB variance: geoguessr_location_singleplayer_04aIMEDoIsolXMPG_0.png
Corresponding result: geoguessr_result_singleplayer_04aIMEDoIsolXMPG_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_08LAHDNXMTUTEF8o_0.png
Corresponding result: geoguessr_result_singleplayer_08LAHDNXMTUTEF8o_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0M8pcOqBtGPWdghb_0.png
Corresponding result: geoguessr_result_singleplayer_0M8pcOqBtGPWdghb_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0ORjQsJIoYvpalOO_0.png
Corresponding result: geoguessr_result_singleplayer_0ORjQsJIoYvpalOO_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_single