In [1]:
import requests
import json
import time
import logging
import os
from urllib.parse import urlparse, unquote

In [2]:
def setup_logging():
    """Set up logging to file."""
    logging.basicConfig(filename='geo_boundaries_errors.log', level=logging.ERROR,
                        format='%(asctime)s:%(levelname)s:%(message)s')

def fetch_data(url):
    """Fetch data from the provided URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        logging.error(f"Failed to fetch data from {url}: {e}")
        return None

def download_geojson(url, folder_path):
    """Download and save the GeoJSON file, keeping the original file name from the URL."""
    try:
        file_name = unquote(urlparse(url).path.split('/')[-1])
        response = requests.get(url)
        response.raise_for_status()

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

        print(f"Successfully downloaded and saved {file_name}")
        return True
    except requests.RequestException as e:
        logging.error(f"Failed to fetch or save GeoJSON for {file_name}: {e}")
        return False

def manage_geoboundaries_retrieval(adm_level, delay=3):
    setup_logging()
    
    base_url = f"https://www.geoboundaries.org/api/current/gbOpen/ALL/{adm_level}/"
    folder_name = "country-geojson"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    print(f"Fetching data from {base_url}")
    data = fetch_data(base_url)
    
    if data:
        for item in data:
            simplified_geojson_url = item['simplifiedGeometryGeoJSON']
            print(f"Downloading GeoJSON from {simplified_geojson_url}")
            download_geojson(simplified_geojson_url, folder_name)
            
            # Wait for 3 seconds before making the next API call
            time.sleep(delay)

In [3]:
# manage_geoboundaries_retrieval("ADM2")

In [22]:
def list_geojson_files(directory_path):
    return [f for f in os.listdir(directory_path) if f.endswith('.geojson')]


def separate_files_by_adm_level(geojson_files, directory_path):
    countries = {}
    adm1_subareas = {}
    adm2_subareas = {}

    for file in geojson_files:
        if 'ADM0' in file:
            iso_code = file.split('-')[1]
            countries[iso_code] = os.path.join(directory_path, file)
        elif 'ADM1' in file:
            iso_code = file.split('-')[1]
            if iso_code not in adm1_subareas:
                adm1_subareas[iso_code] = []
            adm1_subareas[iso_code].append(os.path.join(directory_path, file))
        elif 'ADM2' in file:
            iso_code = file.split('-')[1]
            if iso_code not in adm2_subareas:
                adm2_subareas[iso_code] = []
            adm2_subareas[iso_code].append(os.path.join(directory_path, file))
    
    return countries, adm1_subareas, adm2_subareas


def process_adm_files(adm_files, subareas):
    subarea_list = []
    for adm_path in adm_files:
        with open(adm_path, 'r', encoding='utf-8') as sub_file:
            sub_data = json.load(sub_file)
            for sub_feature in sub_data['features']:
                sub_isocode = sub_feature['properties'].get('shapeISO')
                sub_name = sub_feature['properties'].get('shapeName')
                sub_id = sub_feature['properties'].get('shapeID')
                if sub_name:
                    subarea_list.append({
                        'isocode': sub_isocode,
                        'name': sub_name,
                        'id': sub_id
                    })
    return subarea_list


def process_countries(countries, adm1_subareas, adm2_subareas):
    country_list = []
    missing_countries = []

    for iso_code, path in countries.items():
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            valid_entry = False
            for feature in data['features']:
                country_isocode = feature['properties'].get('shapeISO')
                country_name = feature['properties'].get('shapeName')
                if country_isocode and country_name:
                    valid_entry = True
                    country_entry = {
                        'isocode': country_isocode,
                        'name': country_name,
                        'adm1_subareas': process_adm_files(adm1_subareas.get(iso_code, []), 'adm1'),
                        'adm2_subareas': process_adm_files(adm2_subareas.get(iso_code, []), 'adm2')
                    }
                    country_list.append(country_entry)

            if not valid_entry:
                missing_countries.append({
                    'filename': os.path.basename(path),
                    'reason': 'Invalid ADM0 file content'
                })
    
    return country_list, missing_countries


def process_missing_countries(adm1_subareas, adm2_subareas, existing_countries):
    missing_countries = []

    for iso_code, adm1_files in adm1_subareas.items():
        if iso_code not in existing_countries:
            country_entry = {
                'isocode': iso_code,
                'name': 'Unknown',
                'adm1_subareas': process_adm_files(adm1_files, 'adm1'),
                'adm2_subareas': process_adm_files(adm2_subareas.get(iso_code, []), 'adm2')
            }
            missing_countries.append(country_entry)
    
    return missing_countries


def write_json_files(country_list, missing_countries):
    with open('countries.json', 'w') as file:
        json.dump(country_list, file, indent=4)

    with open('missing_countries.json', 'w') as file:
        json.dump(missing_countries, file, indent=4)


def create_country_list_json(directory_path='country-geojson'):
    geojson_files = list_geojson_files(directory_path)
    countries, adm1_subareas, adm2_subareas = separate_files_by_adm_level(geojson_files, directory_path)
    
    country_list, initial_missing_countries = process_countries(countries, adm1_subareas, adm2_subareas)
    additional_missing_countries = process_missing_countries(adm1_subareas, adm2_subareas, countries)
    
    missing_countries = initial_missing_countries + additional_missing_countries
    
    write_json_files(country_list, missing_countries)

    print(f"Number of countries: {len(country_list)}")
    print(f"Number of missing entries: {len(missing_countries)}")

    return 'country_list.json', 'missing_countries.json'

In [23]:
output_file, missing_file = create_country_list_json()
print(f"Data has been successfully saved to {output_file}")
print(f"Missing countries have been saved to {missing_file}")

Number of countries: 230
Number of missing entries: 0
Data has been successfully saved to country_list.json
Missing countries have been saved to missing_countries.json
