In [4]:
import json
import sys
import os
import cv2
import numpy as np
import concurrent

sys.path.insert(0, '../../')
from data_loader import get_data_to_load, load_json_files, update_data_to_load

# An image is kept only if its largest per-channel pixel variance is above this value.
MAX_VARIANCE_THRESHOLD = 300
# An image is kept only if the variance of its Laplacian response is above this value.
LAPLACIAN_VARIANCE_THRESHOLD = 30
# When True, variance values saved by an earlier run are loaded instead of recomputed.
# Only enable if thresholds and filtering didn't change
USE_PREVIOUS = True

# Keyword arguments shared by all three get_data_to_load calls.
_LOAD_KWARGS = dict(
  file_location='../01_enriching/.data/',
  image_file_location='../../1_data_collection/.data/',
  from_remote_only=True,
  download_link='default',
  allow_file_location_env=True,
  allow_image_file_location_env=True,
  allow_json_file_location_env=True,
  allow_download_link_env=True,
)

files = get_data_to_load(loading_file='../03_mapping/data_list', **_LOAD_KWARGS)
files_more_countries = get_data_to_load(loading_file='../03_mapping/more_data_list', **_LOAD_KWARGS)
files_non_mapped = get_data_to_load(loading_file='./pre_filtering_data_list', **_LOAD_KWARGS)

# Keep the loader's ordering so it can be restored once filtering is done.
original_order = files.copy()
original_order_more_countries = files_more_countries.copy()
original_order_non_mapped = files_non_mapped.copy()

# sort files by file name
for file_list in (files, files_more_countries, files_non_mapped):
  file_list.sort()

# Split every (sorted) list into its image entries and its JSON entries.
images = [name for name in files if name.endswith('.png')]
images_more_countries = [name for name in files_more_countries if name.endswith('.png')]
images_non_mapped = [name for name in files_non_mapped if name.endswith('.png')]
jsons = [name for name in files if name.endswith('.json')]
jsons_more_countries = [name for name in files_more_countries if name.endswith('.json')]
jsons_non_mapped = [name for name in files_non_mapped if name.endswith('.json')]

for image_list in (images, images_more_countries, images_non_mapped):
  print(len(image_list))

# Accumulators for the files that pass the filter.
filtered_files, filtered_files_more_countries, filtered_files_non_mapped = [], [], []

# Image basename -> max per-channel variance.
variance_map, variance_map_more_countries, variance_map_non_mapped = {}, {}, {}

# Image basename -> variance of the Laplacian response.
laplacian_variance_map, laplacian_variance_map_more_countries, laplacian_variance_map_non_mapped = {}, {}, {}

def _load_cached_map(cache_path, fallback):
  """Return the JSON content of cache_path, or fallback if that file does not exist."""
  if not os.path.exists(cache_path):
    return fallback
  with open(cache_path, 'r', encoding='utf8') as f:
    return json.load(f)

# pre load previous variance data if available
if USE_PREVIOUS:
  variance_map = _load_cached_map('variance_map', variance_map)
  variance_map_more_countries = _load_cached_map('variance_map_more', variance_map_more_countries)
  variance_map_non_mapped = _load_cached_map('variance_map_non_mapped', variance_map_non_mapped)
  laplacian_variance_map = _load_cached_map('laplacian_variance_map', laplacian_variance_map)
  laplacian_variance_map_more_countries = _load_cached_map('laplacian_variance_map_more', laplacian_variance_map_more_countries)
  laplacian_variance_map_non_mapped = _load_cached_map('laplacian_variance_map_non_mapped', laplacian_variance_map_non_mapped)

# kernel to detect blurry images: each pixel's four direct neighbours
# minus four times the pixel itself
laplacian_kernel = np.array([
  [0, 1, 0],
  [1, -4, 1],
  [0, 1, 0],
])

def process_image(image_path, json_path):
  """Measure the colour variance and the Laplacian variance of one image.

  Args:
    image_path: Path of the image file, read with cv2.imread.
    json_path: Path of the matching JSON file; passed through unchanged.

  Returns:
    [image_path, json_path, max_variance, laplacian_variance], where
    max_variance is the largest of the per-channel pixel variances and
    laplacian_variance is the variance of the grayscale image filtered with
    laplacian_kernel. Returns None (after printing the error) when the
    colour conversion raises cv2.error.
  """
  loaded = cv2.imread(image_path)
  try:
    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
  except cv2.error as e:
    print("Error reading image:", os.path.basename(image_path))
    print(e)
    return None

  # One variance per colour channel, taken over both pixel axes; keep the largest.
  max_variance = rgb.var(axis=(0, 1)).max()

  # get variance to detect blurry images, use grayscale image
  grayscale = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
  response = cv2.filter2D(grayscale, cv2.CV_32F, laplacian_kernel)
  laplacian_variance = float(response.var())

  return [image_path, json_path, max_variance, laplacian_variance]

def process_all(current_images, current_jsons, current_variance_map, current_laplacian_variance_map, current_filtered_files, max_workers=16):
  """Filter image/JSON pairs by colour variance and Laplacian variance.

  Images and JSON files are paired positionally (zip), so both lists must be
  in matching order. Values already present in both maps are reused; otherwise
  process_image computes them on a thread pool.

  Args:
    current_images: Image paths.
    current_jsons: JSON paths, aligned index by index with current_images.
    current_variance_map: Image basename -> max per-channel variance. Read as
      a cache and updated in place for every image that was measured.
    current_laplacian_variance_map: Image basename -> Laplacian variance. Read
      and updated in place like current_variance_map.
    current_filtered_files: List that accepted image and JSON paths are
      appended to (image first, then its JSON).
    max_workers: Size of the thread pool.

  Returns:
    (current_filtered_files, current_filtered_images): the same list object
    that was passed in, and a new list holding only its '.png' entries.
  """
  # `import concurrent` on its own does not load the `futures` submodule, so
  # import the executor explicitly instead of relying on another module
  # having imported it already.
  from concurrent.futures import ThreadPoolExecutor

  def process_image_with_cache(image_path, json_path):
    # Return cached measurements when both exist, else measure the image.
    image_basename = os.path.basename(image_path)
    max_variance = current_variance_map.get(image_basename)
    laplacian_variance = current_laplacian_variance_map.get(image_basename)

    if max_variance is None or laplacian_variance is None:
      result = process_image(image_path, json_path)
      if result is None:
        return None
      image_path, json_path, max_variance, laplacian_variance = result

    return image_path, json_path, max_variance, laplacian_variance

  with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
      executor.submit(process_image_with_cache, image_path, json_path)
      for image_path, json_path in zip(current_images, current_jsons)
    ]

    # Consume in submission order so the output keeps the input ordering.
    for future in futures:
      outcome = future.result()
      if outcome is None:
        # The image could not be measured; process_image already reported it.
        continue
      image_path, json_path, max_variance, laplacian_variance = outcome

      image_basename = os.path.basename(image_path)
      json_basename = os.path.basename(json_path)

      current_variance_map[image_basename] = max_variance
      current_laplacian_variance_map[image_basename] = laplacian_variance

      if max_variance > MAX_VARIANCE_THRESHOLD:
        if laplacian_variance > LAPLACIAN_VARIANCE_THRESHOLD:
          current_filtered_files.append(image_path)
          current_filtered_files.append(json_path)
        else:
          print("Too low laplacian variance:", image_basename)
          print("Corresponding result:", json_basename)
          print("Laplacian variance:", laplacian_variance)
      else:
        print("Too low max RGB variance:", image_basename)
        print("Corresponding result:", json_basename)
        print("Max RGB variance:", max_variance)

  current_filtered_images = [path for path in current_filtered_files if path.endswith('.png')]

  return current_filtered_files, current_filtered_images
        
# Run the variance / Laplacian filter on each of the three data sets.
filtered_files, filtered_images = process_all(
  images,
  jsons,
  variance_map,
  laplacian_variance_map,
  filtered_files,
)

filtered_files_more_countries, filtered_images_more_countries = process_all(
  images_more_countries,
  jsons_more_countries,
  variance_map_more_countries,
  laplacian_variance_map_more_countries,
  filtered_files_more_countries,
)

filtered_files_non_mapped, filtered_images_non_mapped = process_all(
  images_non_mapped,
  jsons_non_mapped,
  variance_map_non_mapped,
  laplacian_variance_map_non_mapped,
  filtered_files_non_mapped,
)

# Number of images that passed, per data set.
print(len(filtered_images))
print(len(filtered_images_more_countries))
print(len(filtered_images_non_mapped))

def _keep_only(cache, allowed_names):
  """Return a new dict with the entries of cache whose key is in allowed_names."""
  return {name: value for name, value in cache.items() if name in allowed_names}

# remove files from map that are not in the filtered list
filtered_files_basenames_set = {os.path.basename(path) for path in filtered_files}
filtered_files_more_countries_basenames_set = {os.path.basename(path) for path in filtered_files_more_countries}
filtered_files_non_mapped_basenames_set = {os.path.basename(path) for path in filtered_files_non_mapped}

variance_map = _keep_only(variance_map, filtered_files_basenames_set)
variance_map_more_countries = _keep_only(variance_map_more_countries, filtered_files_more_countries_basenames_set)
variance_map_non_mapped = _keep_only(variance_map_non_mapped, filtered_files_non_mapped_basenames_set)
laplacian_variance_map = _keep_only(laplacian_variance_map, filtered_files_basenames_set)
laplacian_variance_map_more_countries = _keep_only(laplacian_variance_map_more_countries, filtered_files_more_countries_basenames_set)
laplacian_variance_map_non_mapped = _keep_only(laplacian_variance_map_non_mapped, filtered_files_non_mapped_basenames_set)

# Write the pruned maps back to the files that the USE_PREVIOUS preload reads.
for cache_path, cache_content in (
  ('variance_map', variance_map),
  ('variance_map_more', variance_map_more_countries),
  ('variance_map_non_mapped', variance_map_non_mapped),
  ('laplacian_variance_map', laplacian_variance_map),
  ('laplacian_variance_map_more', laplacian_variance_map_more_countries),
  ('laplacian_variance_map_non_mapped', laplacian_variance_map_non_mapped),
):
  with open(cache_path, 'w', encoding='utf8') as f:
    json.dump(cache_content, f, ensure_ascii=False)

    
# Work on shallow copies of the lists that survived the variance filter.
current_files = list(filtered_files)
current_files_more_countries = list(filtered_files_more_countries)
current_files_non_mapped = list(filtered_files_non_mapped)

# Split the surviving files into images and JSON files again.
images = [path for path in current_files if path.endswith('.png')]
images_more_countries = [path for path in current_files_more_countries if path.endswith('.png')]
images_non_mapped = [path for path in current_files_non_mapped if path.endswith('.png')]
jsons = [path for path in current_files if path.endswith('.json')]
jsons_more_countries = [path for path in current_files_more_countries if path.endswith('.json')]
jsons_non_mapped = [path for path in current_files_non_mapped if path.endswith('.json')]

# NOTE(review): presumably allow_err=True yields None for files that fail to
# load (remove_duplicates skips None entries) - confirm in data_loader.
jsons_data = load_json_files(jsons, allow_err=True)
jsons_data_more_countries = load_json_files(jsons_more_countries, allow_err=True)
jsons_data_non_mapped = load_json_files(jsons_non_mapped, allow_err=True)

def remove_duplicates(current_images, current_jsons, current_jsons_data):
  """Keep only the first image/JSON pair seen for each set of coordinates.

  The three arguments are walked in lockstep. Entries whose JSON data is None
  are skipped without being counted. Two entries are the same location when
  the string form of their 'coordinates' value is equal.

  Args:
    current_images: Image paths.
    current_jsons: JSON paths, aligned with current_images.
    current_jsons_data: Parsed JSON content (or None), aligned with the paths.

  Returns:
    (kept_files, kept_images, duplicate_count): the kept paths with each image
    followed by its JSON, the '.png' entries of that list, and the number of
    pairs dropped as duplicates.
  """
  seen_locations = set()
  kept_files = []
  duplicate_count = 0

  for image_path, json_path, json_data in zip(current_images, current_jsons, current_jsons_data):
    if json_data is None:
      continue

    # if they have the same coordinates, then they are the same location, only use the first
    location_key = str(json_data['coordinates'])
    if location_key in seen_locations:
      duplicate_count += 1
      continue

    seen_locations.add(location_key)
    kept_files.extend((image_path, json_path))

  kept_images = [path for path in kept_files if path.endswith('.png')]

  return kept_files, kept_images, duplicate_count

# Drop repeated locations from each data set and report how many were removed.
filtered_files, filtered_images, duplicate_locations = remove_duplicates(
  images,
  jsons,
  jsons_data,
)
print("Duplicate locations:", duplicate_locations)

filtered_files_more_countries, filtered_images_more_countries, duplicate_locations_more_countries = remove_duplicates(
  images_more_countries,
  jsons_more_countries,
  jsons_data_more_countries,
)
print("Duplicate locations more countries:", duplicate_locations_more_countries)

filtered_files_non_mapped, filtered_images_non_mapped, duplicate_locations_non_mapped = remove_duplicates(
  images_non_mapped,
  jsons_non_mapped,
  jsons_data_non_mapped,
)
print("Duplicate locations non mapped:", duplicate_locations_non_mapped)

# Number of images left after de-duplication, per data set.
print(len(filtered_images))
print(len(filtered_images_more_countries))
print(len(filtered_images_non_mapped))

# Sets give constant-time membership tests while re-ordering below.
filtered_files_set = set(filtered_files)
filtered_files_more_countries_set = set(filtered_files_more_countries)
filtered_files_non_mapped_set = set(filtered_files_non_mapped)

# Sort back to original order (some files were removed)
filtered_files = [entry for entry in original_order if entry in filtered_files_set]
filtered_files_more_countries = [entry for entry in original_order_more_countries if entry in filtered_files_more_countries_set]
filtered_files_non_mapped = [entry for entry in original_order_non_mapped if entry in filtered_files_non_mapped_set]

# Keyword arguments shared by all three update_data_to_load calls.
# NOTE(review): these locations end in 'data/' while the get_data_to_load calls
# earlier in this cell use '.data/' - confirm the difference is intentional.
_UPDATE_KWARGS = dict(
  file_location='../01_enriching/data/',
  image_file_location='../../1_data_collection/data/',
  from_remote_only=True,
  download_link='default',
  allow_file_location_env=True,
  allow_image_file_location_env=True,
  allow_json_file_location_env=True,
  allow_download_link_env=True,
)

# The first call passes no new_loading_file, so the helper's own default applies.
update_data_to_load(filtered_files, old_loading_file='../03_mapping/data_list', **_UPDATE_KWARGS)
update_data_to_load(filtered_files_more_countries, new_loading_file='./updated_data_list_more', old_loading_file='../03_mapping/more_data_list', **_UPDATE_KWARGS)
update_data_to_load(filtered_files_non_mapped, new_loading_file='./updated_data_list_non_mapped', old_loading_file='./pre_filtering_data_list', **_UPDATE_KWARGS)

Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 705681
All local files: 705681
Filtering out unpaired files
Filtered out 27805 unpaired files
Relevant files: 677876
Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 705681
All local files: 705681
Filtering out unpaired files
Filtered out 27805 unpaired files
Relevant files: 677876
Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 705681
All local files: 705681
Filtering out unpaired files
Filtered out 27805 unpaired files
Relevant files: 677876
95153
82473
338938


libpng error: Read Error


Error reading image: geoguessr_location_singleplayer_03MrvgM0hjFtwQVB_3.png
OpenCV(4.9.0) /Users/xperience/GHA-OpenCV-Python2/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'

Too low max RGB variance: geoguessr_location_singleplayer_00RUSiKypPGLhMjI_0.png
Corresponding result: geoguessr_result_singleplayer_00RUSiKypPGLhMjI_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_04aIMEDoIsolXMPG_0.png
Corresponding result: geoguessr_result_singleplayer_04aIMEDoIsolXMPG_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_08LAHDNXMTUTEF8o_0.png
Corresponding result: geoguessr_result_singleplayer_08LAHDNXMTUTEF8o_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0E2f8UWXhDx2ABZP_0.png
Corresponding result: geoguessr_result_singleplayer_0E2f8UWXhDx2ABZP_0.json
Max RGB variance: 0.0
Too low max R

libpng error: Read Error


Too low max RGB variance: geoguessr_location_singleplayer_00RUSiKypPGLhMjI_0.png
Corresponding result: geoguessr_result_singleplayer_00RUSiKypPGLhMjI_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_04aIMEDoIsolXMPG_0.png
Corresponding result: geoguessr_result_singleplayer_04aIMEDoIsolXMPG_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_08LAHDNXMTUTEF8o_0.png
Corresponding result: geoguessr_result_singleplayer_08LAHDNXMTUTEF8o_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0E2f8UWXhDx2ABZP_0.png
Corresponding result: geoguessr_result_singleplayer_0E2f8UWXhDx2ABZP_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0FWcmlORR9hPRjcS_0.png
Corresponding result: geoguessr_result_singleplayer_0FWcmlORR9hPRjcS_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0J6s8ND11fhb3IqY_0.png
Corresponding result: ge

libpng error: Read Error


Error reading image: geoguessr_location_singleplayer_03MrvgM0hjFtwQVB_3.png
OpenCV(4.9.0) /Users/xperience/GHA-OpenCV-Python2/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'

Too low max RGB variance: geoguessr_location_singleplayer_00RUSiKypPGLhMjI_0.png
Corresponding result: geoguessr_result_singleplayer_00RUSiKypPGLhMjI_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_04aIMEDoIsolXMPG_0.png
Corresponding result: geoguessr_result_singleplayer_04aIMEDoIsolXMPG_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_08LAHDNXMTUTEF8o_0.png
Corresponding result: geoguessr_result_singleplayer_08LAHDNXMTUTEF8o_0.json
Max RGB variance: 0.0
Too low max RGB variance: geoguessr_location_singleplayer_0E2f8UWXhDx2ABZP_0.png
Corresponding result: geoguessr_result_singleplayer_0E2f8UWXhDx2ABZP_0.json
Max RGB variance: 0.0
Too low max R