In [1]:
import requests
import os
import concurrent.futures
import numpy as np
import re
import random
import time

In [2]:
def check_files_bboxdir(bbox_path):
    """List the regular files stored directly inside a bbox directory.

    Args:
        bbox_path: Path of the directory to inspect.

    Returns:
        A list with the names (not the full paths) of the entries of
        ``bbox_path`` that are regular files, in the order reported by
        ``os.listdir``; sub-directories are left out. ``None`` is returned
        when ``bbox_path`` is not an existing directory.
    """
    # Guard clause: a missing directory is signalled with None, not with [].
    if not os.path.isdir(bbox_path):
        return None

    file_names = []
    for entry in os.listdir(bbox_path):
        if os.path.isfile(os.path.join(bbox_path, entry)):
            file_names.append(entry)
    return file_names

def extract_number_from_string(s : str) -> int:
    """Return the first run of decimal digits found in ``s`` as an integer.

    Args:
        s: Text to scan, e.g. a file name.

    Returns:
        The integer value of the leftmost group of consecutive digits.

    Raises:
        IndexError: If ``s`` does not contain any digit.
    """
    digit_runs = re.findall(r'\d+', s)
    # Only the leftmost run matters; indexing raises IndexError when there is none.
    leftmost_run = digit_runs[0]
    return int(leftmost_run)

def first_discontinuity_index(list_pages):
    """Locate the first gap in a collection of page numbers.

    The pages are sorted in ascending order and every element is compared
    with its successor.

    Args:
        list_pages: Iterable of integer page numbers (any order).

    Returns:
        The position, in the sorted sequence, of the first element that is
        not exactly one more than its predecessor, or -1 when no such
        element exists (this includes empty and single-element inputs).
        The returned value is a position, not a page number: the two only
        coincide when the sorted pages start at 0 and contain no duplicates.
    """
    ordered_pages = sorted(list_pages)

    # Walk over (page, next page) pairs; `position` is the index of the second item.
    consecutive_pairs = zip(ordered_pages, ordered_pages[1:])
    for position, (current, following) in enumerate(consecutive_pairs, start=1):
        if following != current + 1:
            return position

    # Every element follows its predecessor by exactly one.
    return -1


# Example usage
#directory = "./gpx_traces/bbox_3"
#files = check_files_bboxdir(directory)
#if files is not None:
#    pages = {extract_number_from_string(s) for s in files}
#    # print(pages)
#    print(first_discontinuity_index(pages))
#else:
#    print("Directory does not exist.")

### Parallel batch-continuous version

In [3]:
def _cell_starts(lower : float, upper : float, step : float) -> list[float]:
    """Return the lower edges of the cells of width ``step`` that tile [lower, upper)."""
    # Edges are built from an integer cell index rather than np.arange with a float step,
    # because floating-point rounding can make np.arange emit one extra value at or
    # past `upper`, e.g. np.arange(1.0, 1.3, 0.1) has 4 elements.
    n_cells = int(np.ceil((upper - lower) / step))
    candidates = (float(lower + i * step) for i in range(n_cells))

    # Drop any edge that coincides with (or overshoots) the upper bound up to rounding
    # noise: it would produce an empty or inverted cell.
    tolerance = step * 1e-9
    return [start for start in candidates if upper - start > tolerance]


def compute_bounding_boxes(bbox : tuple[float,float,float,float], step : float) -> list[str]:
    """Split a bounding box into a grid of smaller bounding boxes.

    Args:
        bbox: The area to partition as (min_lon, min_lat, max_lon, max_lat).
        step: Maximum width and height of each cell, in the same unit as ``bbox``.

    Returns:
        A list of strings formatted as 'min_lon,min_lat,max_lon,max_lat', one per
        cell. Cells are emitted column by column: longitude is the outer loop and
        latitude the inner one. Cells on the far edges are clipped to the bounds of
        ``bbox``. The list is empty when ``bbox`` has no extent along an axis.

    Raises:
        ValueError: If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError(f'step must be strictly positive, got {step}')

    min_lon, min_lat, max_lon, max_lat = bbox[:4]

    # The latitude edges do not depend on the longitude: compute them once.
    lat_starts = _cell_starts(min_lat, max_lat, step)

    list_bboxes = []
    for curr_lon in _cell_starts(min_lon, max_lon, step):
        for curr_lat in lat_starts:
            list_bboxes.append(f'{curr_lon},{curr_lat},{min(curr_lon + step, max_lon)},{min(curr_lat + step, max_lat)}')

    return list_bboxes

In [4]:
def fetch_page(bbox, page, timeout=120.0):
    """Download one page of trackpoints for a bounding box, retrying on errors.

    The request URL is built from the module-level ``api_url``, which is read when
    the function is called and must therefore be defined beforehand.

    Args:
        bbox: Bounding box, already formatted as the value of the ``bbox`` query parameter.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up on one attempt
            (forwarded to ``requests.get``). Without it a stalled connection
            would block the worker thread forever.

    Returns:
        A ``(page, response)`` tuple. The HTTP status code is NOT checked here:
        the caller is responsible for inspecting ``response``.
    """
    url = f'{api_url}?bbox={bbox}&page={page}'
    print(f'Sending HTTP request to {url}...')

    # Retry until a response is obtained; there is deliberately no upper bound on the attempts.
    while True :
        try :
            # Pause before every attempt to throttle the requests sent to the server.
            time.sleep(2.0)
            response = requests.get(url, timeout=timeout)
            return page, response

        # If an exception occurs, e.g., malformed response or timeout, we resubmit the request after a small pause.
        except Exception as err:
            print(f'Exception occurred for request {url}, trying again...')
            # Show the cause so that persistent failures can be diagnosed.
            print(f'    cause: {err!r}')
            time.sleep(2.0)

In [None]:
# Partition the overall bbox into smaller bboxes so that each API request is accepted by OSM.
# NOTE: limit_deg should be 0.25, but beyond 0.06 the OSM API doesn't seem to work properly. Probably there's another limit in place too.
bbox_NY = (-74.259,40.477,-73.700,40.918)
limit_deg = 0.06
list_bboxes = compute_bounding_boxes(bbox_NY, limit_deg)
print(f'Number of bounding boxes to download: {len(list_bboxes)}')


# OSM API endpoint for GPS trackpoints
api_url = 'https://api.openstreetmap.org/api/0.6/trackpoints'


# Directory to save the downloaded GPX files
output_dir = 'gpx_traces'
os.makedirs(output_dir, exist_ok=True)


# Number of worker threads, which is also the number of pages requested at the same time.
max_workers = 16
# Index of the first bbox to process: raise it to skip the leading bboxes.
# The bbox indices (and thus the directory names) are not shifted by this setting.
start_idx_bbox = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    for idx_bbox, bbox in enumerate(list_bboxes[start_idx_bbox:], start=start_idx_bbox):
        print(f'Processing {idx_bbox}-th bbox: {bbox}')
        bbox_dir = os.path.join(output_dir, f'bbox_{idx_bbox}')

        # Check if the directory for this bbox already exists and, if this is true, which pages have already been downloaded.
        files = check_files_bboxdir(bbox_dir)
        next_page = 0
        if files is not None:
            # Only the .gpx page files are parsed: any other file (e.g. one created by the OS)
            # might contain no digits and make extract_number_from_string fail.
            pages = {extract_number_from_string(s) for s in files if s.endswith('.gpx')}
            idx_check = first_discontinuity_index(pages)

            # first_discontinuity_index only compares neighbouring pages, so it cannot notice
            # that page 0 itself is missing: in that case restart from the very first page.
            if pages and min(pages) != 0:
                idx_check = 0

            print(f'Bbox dir detected, discontinuity check: {idx_check}')
            if idx_check == -1 :
                # No discontinuities detected: assume that this bbox's data has already been completely downloaded.
                # NOTE(review): this is a heuristic. A run interrupted while the saved pages were contiguous,
                # or before any page was saved, is indistinguishable from a complete download.
                print('We have already downloaded all the data for this bbox, go to the next one')
                continue

            # Discontinuity detected: restart downloading the bbox's data from idx_check.
            # The pages start at 0 and are unique, hence the index of the gap is also the missing page number.
            next_page = idx_check

        else:
            print(f'Bbox dir not detected, download everything.')
            os.makedirs(bbox_dir, exist_ok=False)


        ### Start downloading the pages. ###
        # Submit an initial batch of tasks equal to max_workers
        future_to_page = {executor.submit(fetch_page, bbox, p): p for p in range(next_page, next_page + max_workers)}
        next_page += max_workers
        while future_to_page:
            # Iterate over a snapshot of the pending futures; the futures submitted in the
            # meantime are picked up by the next iteration of the enclosing while loop.
            for future in concurrent.futures.as_completed(list(future_to_page)):

                p, response = future.result()

                # Case 1 - we have successfully downloaded a page.
                if response.status_code == 200 and '<trkpt' in response.text:
                    gpx_filename = os.path.join(bbox_dir, f'trackpoints_page_{p}.gpx')
                    with open(gpx_filename, 'w', encoding='utf-8') as file:
                        file.write(response.text)
                    print(f'Saved: {gpx_filename}')

                    # Submit a new task to keep the pool full.
                    future_to_page[executor.submit(fetch_page, bbox, next_page)] = next_page
                    next_page += 1


                # Case 2 - timeout or a page does not exist.
                else:
                    # Case 2.1 - Page does not exist because the bbox's data has been exhausted. Let the worker finish its execution.
                    if response.status_code == 200 :
                        print(f'No trace data available for page {p}, response code: {response.status_code}.')

                    # Case 2.2 - timeout because bbox is too big or the server is under heavy load. In this case: retry!
                    #            NOTE: If the bbox is too big then reduce its size, or the server might always go in timeout.
                    #            NOTE(review): every non-200 status is retried without limit, including
                    #            errors that can never succeed; consider capping the retries.
                    else :
                        print(f'Timeout for page {p}, response code: {response.status_code}, retry.')
                        # Resubmit the task that went in timeout.
                        future_to_page[executor.submit(fetch_page, bbox, p)] = p


                # Remove the completed future from the dictionary.
                del future_to_page[future]

Number of bounding boxes to download: 80
Processing 0-th bbox: -74.259,40.477,-74.199,40.537
Bbox dir detected, discontinuity check: -1
We have already downloaded all the data for this bbox, go to the next one
Processing 1-th bbox: -74.259,40.537,-74.199,40.597
Bbox dir detected, discontinuity check: -1
We have already downloaded all the data for this bbox, go to the next one
Processing 2-th bbox: -74.259,40.597,-74.199,40.657000000000004
Bbox dir detected, discontinuity check: -1
We have already downloaded all the data for this bbox, go to the next one
Processing 3-th bbox: -74.259,40.657000000000004,-74.199,40.717000000000006
Bbox dir detected, discontinuity check: -1
We have already downloaded all the data for this bbox, go to the next one
Processing 4-th bbox: -74.259,40.717000000000006,-74.199,40.77700000000001
Bbox dir detected, discontinuity check: 30
Sending HTTP request to https://api.openstreetmap.org/api/0.6/trackpoints?bbox=-74.259,40.717000000000006,-74.199,40.777000000000