In [1]:
import json

# Read the tile index (JSON) from disk
file_path = 'assets/index.json'

with open(file_path, 'r') as index_handle:
    data = json.load(index_handle)

# One list of file records per dataset; a dataset without a 'files' key counts as empty
file_lists = [entry.get('files', []) for entry in data.get('datasets', [])]

# Number of file records in the index and the sum of their 'size' fields
total_files = sum(len(records) for records in file_lists)
total_size = sum(int(record['size']) for records in file_lists for record in records)

total_files, total_size


(35860, 3120908957711)

In [2]:
# Create a dictionary for quick lookup from the JSON data (much quicker than recursively looking up in the JSON)

def create_lookup_dict(json_data):
    """
    Flatten the index into a ``{file name: (size, timestamp)}`` mapping.

    Every file record of every dataset under ``json_data['datasets']`` contributes
    one entry; when a name occurs more than once, the last record seen wins.
    Datasets without a 'files' key are treated as empty.
    """
    return {
        record['name']: (record['size'], record['timestamp'])
        for group in json_data.get('datasets', [])
        for record in group.get('files', [])
    }

# Build the name -> (size, timestamp) lookup once from the loaded index.
# Later cells read `lookup_dict` as a module-level global (calculate_size's default
# argument and calculate_zone_size), so this cell must run before they are defined/called.
lookup_dict = create_lookup_dict(data)

In [3]:
def parse_coordinates_from_filename(filename):
    """
    Read the easting and northing encoded in a tile file name.

    The name is split on '_'. Fields 1 and 2 are concatenated to form the easting
    and field 3 is the northing, e.g. '3dm_32_280_5656_1_nw.laz' -> (32280, 5656).
    Returns ``(easting, northing)`` as integers.
    """
    fields = filename.split('_')
    return int(fields[1] + fields[2]), int(fields[3])

def calculate_zone_files_and_size(files, max_easting, max_northing):
    """
    Count the files and add up their sizes in the four quadrants around a split point.

    Quadrant membership of a file, from the coordinates parsed out of its name:
        zone_1: easting <  max_easting and northing <  max_northing
        zone_2: easting <  max_easting and northing >= max_northing
        zone_3: easting >= max_easting and northing >= max_northing
        zone_4: easting >= max_easting and northing <  max_northing

    ``files`` is an iterable of records with 'name' and 'size' keys. Returns a dict
    mapping each zone key to ``{'count': <number of files>, 'size': <summed size>}``.
    """
    zones = {f'zone_{number}': {'count': 0, 'size': 0} for number in range(1, 5)}

    for record in files:
        record_size = int(record['size'])
        easting, northing = parse_coordinates_from_filename(record['name'])

        below_easting_limit = easting < max_easting
        below_northing_limit = northing < max_northing

        if below_easting_limit:
            key = 'zone_1' if below_northing_limit else 'zone_2'
        else:
            key = 'zone_4' if below_northing_limit else 'zone_3'

        bucket = zones[key]
        bucket['count'] += 1
        bucket['size'] += record_size

    return zones

def print_zone_results(zones):
    """
    Print one summary line per zone: its file count and its size expressed in GB.

    ``zones`` maps a zone name to a dict with 'count' and 'size' keys; 'size' is
    divided by 1024**3 (bytes -> gigabytes) and shown with two decimals.
    """
    bytes_per_gb = 1024**3
    for label, stats in zones.items():
        gigabytes = stats['size'] / bytes_per_gb
        print(f"{label.upper()}: Number of files = {stats['count']}, Total size = {gigabytes:.2f} GB")


In [4]:
def calculate_size(filenames, lookup_dict = lookup_dict):
    """
    Count how many of ``filenames`` are known to ``lookup_dict`` and total their sizes.

    Names that are missing from the lookup are ignored. ``lookup_dict`` defaults to
    the module-level lookup as it existed when this function was defined.

    Returns ``(number of files found, cumulated size in GB rounded to 2 decimals)``.
    """
    known = [info for info in map(lookup_dict.get, filenames) if info]
    size_in_bytes = sum(int(info[0]) for info in known)  # item 0 of each entry is the size

    return len(known), round(size_in_bytes / (1024**3), 2)

In [5]:
def find_file_info(filename, json_data):
    """
    Scan the index for ``filename`` and return ``(size, timestamp)`` of the first match.

    The datasets are walked in order, and within each dataset its 'files' list.
    Returns ``False`` when no record carries that name.
    """
    matches = (
        (record['size'], record['timestamp'])
        for group in json_data.get('datasets', [])
        for record in group.get('files', [])
        if record['name'] == filename
    )
    return next(matches, False)

In [6]:
# Find min max easting and northing for a list of names

def find_minmax_EN(filenames):
    """
    Return the bounding box of the coordinates encoded in a list of file names.

    ``filenames`` must be indexable and non-empty: its first element seeds the
    search, so an empty list raises IndexError.

    Returns ``(min_easting, max_easting, min_northing, max_northing)``.
    """
    first_easting, first_northing = parse_coordinates_from_filename(filenames[0])
    eastings = [first_easting]
    northings = [first_northing]

    for name in filenames:
        easting, northing = parse_coordinates_from_filename(name)
        eastings.append(easting)
        northings.append(northing)

    return min(eastings), max(eastings), min(northings), max(northings)

def segregate_files_into_main_zones(files, max_easting, max_northing):
    """
    Segregate files into four zones based on the given limits and return their filenames.

    Zone of a file, from the coordinates parsed out of its name:
        zone_1: easting <  max_easting and northing <  max_northing
        zone_2: easting <  max_easting and northing >= max_northing
        zone_3: easting >= max_easting and northing >= max_northing
        zone_4: easting >= max_easting and northing <  max_northing

    ``files`` is an iterable of records with a 'name' key. Returns a dict mapping
    'zone_1' .. 'zone_4' to the list of file names that fall in that zone.
    """
    zones = {'zone_1': [], 'zone_2': [], 'zone_3': [], 'zone_4': []}

    for record in files:
        filename = record['name']
        easting, northing = parse_coordinates_from_filename(filename)

        if easting < max_easting:
            target = 'zone_1' if northing < max_northing else 'zone_2'
        else:
            target = 'zone_3' if northing >= max_northing else 'zone_4'

        zones[target].append(filename)

    return zones

def segregate_files_into_sub_zones(files, easting_range, northing_range, best_easting, best_northing, idx):
    """
    Split the files lying inside a rectangle into four subzones around a split point.

    The rectangle spans ``easting_range.start .. easting_range.stop`` and
    ``northing_range.start .. northing_range.stop`` (both ends inclusive). Files whose
    coordinates fall outside it are left out of the result entirely.

    Subzones of main zone ``idx``:
        zone_<idx>.1: easting <  best_easting and northing <  best_northing
        zone_<idx>.2: easting <  best_easting and northing >= best_northing
        zone_<idx>.3: easting >= best_easting and northing >= best_northing
        zone_<idx>.4: easting >= best_easting and northing <  best_northing

    Returns a dict mapping each subzone name to its list of file names.
    """
    first, second, third, fourth = (f'zone_{idx}.{number}' for number in range(1, 5))
    zones = {first: [], second: [], third: [], fourth: []}

    for record in files:
        filename = record['name']
        easting, northing = parse_coordinates_from_filename(filename)

        if easting_range.start <= easting < best_easting:
            # Lower-easting half of the rectangle
            if northing_range.start <= northing < best_northing:
                zones[first].append(filename)
            elif best_northing <= northing <= northing_range.stop:
                zones[second].append(filename)
        elif best_easting <= easting <= easting_range.stop:
            # Upper-easting half of the rectangle
            if best_northing <= northing <= northing_range.stop:
                zones[third].append(filename)
            elif northing_range.start <= northing < best_northing:
                zones[fourth].append(filename)

    return zones

def write_zone_files(zones):
    """
    Save each zone's file names to '<zone>_files.txt', one name per line.

    ``zones`` maps a zone name to an iterable of file names. An existing file of the
    same name is overwritten.
    """
    for label, names in zones.items():
        with open(f'{label}_files.txt', 'w') as listing:
            listing.writelines(name + '\n' for name in names)

def calculate_zone_size(files, easting_start, easting_limit, northing_start, northing_limit):
    """
    Estimate the size of a rectangular zone by sampling tiles on a grid of step 10.

    For every (easting, northing) pair on the grid
    ``range(easting_start, easting_limit, 10) x range(northing_start, northing_limit, 10)``
    the matching tile name is rebuilt and, if present in the module-level
    ``lookup_dict``, its size is added to the total.

    NOTE(review): ``files`` is accepted but never read; sizes always come from the
    global ``lookup_dict``.
    """
    sampled_names = (
        f'3dm_{str(easting)[0:2]}_{str(easting)[2:]}_{str(northing)}_1_nw.laz'
        for easting in range(easting_start, easting_limit, 10)
        for northing in range(northing_start, northing_limit, 10)
    )

    return sum(int(lookup_dict[name][0]) for name in sampled_names if name in lookup_dict)

def find_balanced_zones(files, easting_range, northing_range):
    """
    Search for the split point that best balances the sizes of the four quadrants.

    Every (easting, northing) pair from the two ranges is tried as a split point. For
    each, the sizes of the four quadrants it creates inside the rectangle
    ``[easting_range.start, easting_range.stop) x [northing_range.start, northing_range.stop)``
    are obtained from ``calculate_zone_size`` and scored as ``max - min``. The first
    split point with the lowest score wins.

    Returns ``(best_easting, best_northing, best_balance)``; if either range is empty
    this is ``(None, None, inf)``.
    """
    best = (None, None, float('inf'))

    for easting_cut in easting_range:
        for northing_cut in northing_range:
            # Quadrant bounds in the order zone 1, 2, 3, 4
            quadrant_bounds = (
                (easting_range.start, easting_cut, northing_range.start, northing_cut),
                (easting_range.start, easting_cut, northing_cut, northing_range.stop),
                (easting_cut, easting_range.stop, northing_cut, northing_range.stop),
                (easting_cut, easting_range.stop, northing_range.start, northing_cut),
            )
            sizes = [calculate_zone_size(files, *bounds) for bounds in quadrant_bounds]

            spread = max(sizes) - min(sizes)  # Example balance metric

            if spread < best[2]:
                best = (easting_cut, northing_cut, spread)

    return best


# Divide into zones and subzones

In [7]:
# Bounding box of every tile in the index, then the candidate split lines inside it

min_easting, max_easting, min_northing, max_northing = find_minmax_EN(list(lookup_dict))

easting_range = range(min_easting, max_easting, 10) # a step of 10 speeds up finding the best division
northing_range = range(min_northing, max_northing, 10)

print('Finding the 4 balanced main zones...')

# Flatten the per-dataset file records into a single list
all_files = [record for entry in data.get('datasets', []) for record in entry.get('files', [])]

best_easting, best_northing, _ = find_balanced_zones(all_files, easting_range, northing_range)

print(f"Best balance at Easting: {best_easting}, Northing: {best_northing} for the 4 main zones")
print('\n')

# Assign every file to one of the 4 main zones around the chosen split point

main_zones = segregate_files_into_main_zones(all_files, best_easting, best_northing)

# Accumulators for the whole run.
# idx is the 1-based number of the main zone being processed; sub_zones[idx-1] is its subzone dict.
idx = 1
sub_zones = []
number_of_files = 0
# NOTE(review): this reuses the name `total_size` from the first cell (sum of raw sizes);
# from here on it holds a sum of GB values instead.
total_size = 0
coordinates = []

for zone in main_zones: # for each main zone

  # Find the best division in 4 subzones

  # main_zones[zone] is a list of file names, which is what find_minmax_EN expects
  min_easting, max_easting, min_northing, max_northing = find_minmax_EN(main_zones[zone])

  easting_range = range(min_easting, max_easting, 10)
  northing_range = range(min_northing, max_northing, 10)

  print(f'Finding balanced subzones of main zone {idx}...')

  # The first argument is not used by the size computation (calculate_zone_size reads lookup_dict)
  best_easting, best_northing, _ = find_balanced_zones(main_zones[zone], easting_range, northing_range)

  print(f'For main zone {zone}, the best balance is obtained for: Easting: {best_easting}, Northing: {best_northing}')

  # Create the subzones and write files

  # all_files is passed on purpose: segregate_files_into_sub_zones keeps only the records
  # whose coordinates lie inside this main zone's easting/northing ranges
  sub_zones.append(segregate_files_into_sub_zones(all_files, easting_range, northing_range, best_easting, best_northing, idx))

  subtotal_number_of_files = 0
  subtotal_size = 0

  # List of coordinates for generating a KML or SHP
  # Each tuple is (min_easting, max_easting, min_northing, max_northing) of one subzone
  coordinates.append((easting_range.start, best_easting, northing_range.start, best_northing)) # subzone x.1
  coordinates.append((easting_range.start, best_easting, best_northing, northing_range.stop)) # subzone x.2
  coordinates.append((best_easting, easting_range.stop, best_northing, northing_range.stop)) # subzone x.3
  coordinates.append((best_easting, easting_range.stop, northing_range.start, best_northing)) # subzone x.4

  for j in range(4): # For each of the 4 subzones

    # j-th subzone of the current main zone, in insertion order (.1, .2, .3, .4)
    files_number = len(list(sub_zones[idx-1].values())[j])
    # calculate_size returns (count, size in GB rounded to 2 decimals); keep the size
    sub_size = calculate_size(list(sub_zones[idx-1].values())[j])[1]

    subtotal_number_of_files += files_number
    subtotal_size += sub_size

    print(f'{str(files_number)} files in {list(sub_zones[idx-1].keys())[j]}, size = {str(sub_size)} GB')

  number_of_files += subtotal_number_of_files
  # Sum of already-rounded GB values, so it can differ slightly from the exact total printed below
  total_size += subtotal_size

  #write_zone_files(sub_zones[idx-1]) # uncomment to write new txt files

  idx += 1

  print('\n')

# Compare what was distributed over the subzones with what the index contains.
# `total_files` comes from the first cell; the reference size is recomputed from lookup_dict.
print('- Verification - ')
print('\n')
print(f'Total number of files after processing: {number_of_files}, cumulated size: {total_size}')
print(f'Comparison basis from original index file: {total_files}, {str(round(sum(int(file_info[0]) for file_info in lookup_dict.values()) / (1024**3), 2))} GB')

Finding the 4 balanced main zones...
Best balance at Easting: 32400, Northing: 5686 for the 4 main zones


Finding balanced subzones of main zone 1...
For main zone zone_1, the best balance is obtained for: Easting: 32330, Northing: 5636
1692 files in zone_1.1, size = 287.82 GB
1996 files in zone_1.2, size = 140.43 GB
3500 files in zone_1.3, size = 327.18 GB
2416 files in zone_1.4, size = 272.47 GB


Finding balanced subzones of main zone 2...
For main zone zone_2, the best balance is obtained for: Easting: 32378, Northing: 5726
3012 files in zone_2.1, size = 194.44 GB
3396 files in zone_2.2, size = 140.96 GB
1572 files in zone_2.3, size = 69.91 GB
880 files in zone_2.4, size = 123.63 GB


Finding balanced subzones of main zone 3...
For main zone zone_3, the best balance is obtained for: Easting: 32470, Northing: 5726
2800 files in zone_3.1, size = 303.22 GB
4828 files in zone_3.2, size = 212.89 GB
3972 files in zone_3.3, size = 261.87 GB
1680 files in zone_3.4, size = 116.03 GB


Find

In [9]:
# Display the file names assigned to subzone 'zone_2.3' (sub_zones[1] holds main zone 2)
sub_zones[1]['zone_2.3']

['3dm_32_378_5726_1_nw.laz',
 '3dm_32_378_5727_1_nw.laz',
 '3dm_32_378_5728_1_nw.laz',
 '3dm_32_378_5729_1_nw.laz',
 '3dm_32_378_5730_1_nw.laz',
 '3dm_32_378_5731_1_nw.laz',
 '3dm_32_378_5732_1_nw.laz',
 '3dm_32_378_5733_1_nw.laz',
 '3dm_32_378_5734_1_nw.laz',
 '3dm_32_378_5735_1_nw.laz',
 '3dm_32_378_5736_1_nw.laz',
 '3dm_32_378_5737_1_nw.laz',
 '3dm_32_378_5738_1_nw.laz',
 '3dm_32_378_5739_1_nw.laz',
 '3dm_32_378_5740_1_nw.laz',
 '3dm_32_378_5741_1_nw.laz',
 '3dm_32_378_5742_1_nw.laz',
 '3dm_32_378_5743_1_nw.laz',
 '3dm_32_378_5744_1_nw.laz',
 '3dm_32_378_5745_1_nw.laz',
 '3dm_32_378_5746_1_nw.laz',
 '3dm_32_378_5747_1_nw.laz',
 '3dm_32_378_5748_1_nw.laz',
 '3dm_32_378_5749_1_nw.laz',
 '3dm_32_378_5750_1_nw.laz',
 '3dm_32_378_5751_1_nw.laz',
 '3dm_32_378_5752_1_nw.laz',
 '3dm_32_378_5753_1_nw.laz',
 '3dm_32_378_5754_1_nw.laz',
 '3dm_32_378_5755_1_nw.laz',
 '3dm_32_378_5756_1_nw.laz',
 '3dm_32_378_5757_1_nw.laz',
 '3dm_32_378_5758_1_nw.laz',
 '3dm_32_378_5759_1_nw.laz',
 '3dm_32_378_5

In [129]:
# Shell command: add every .txt file in the working directory to results.zip
!zip results.zip *.txt

  adding: zone_1.1_files.txt (deflated 90%)
  adding: zone_1.2_files.txt (deflated 90%)
  adding: zone_1.3_files.txt (deflated 90%)
  adding: zone_1.4_files.txt (deflated 90%)
  adding: zone_2.1_files.txt (deflated 90%)
  adding: zone_2.2_files.txt (deflated 89%)
  adding: zone_2.3_files.txt (deflated 90%)
  adding: zone_2.4_files.txt (deflated 90%)
  adding: zone_3.1_files.txt (deflated 90%)
  adding: zone_3.2_files.txt (deflated 90%)
  adding: zone_3.3_files.txt (deflated 90%)
  adding: zone_3.4_files.txt (deflated 90%)
  adding: zone_4.1_files.txt (deflated 90%)
  adding: zone_4.2_files.txt (deflated 91%)
  adding: zone_4.3_files.txt (deflated 91%)
  adding: zone_4.4_files.txt (deflated 90%)


# Generate a KML file showing our division into subzones

In [13]:
import simplekml
import utm

def utm_to_latlon(x, y):
    """
    Convert a tile coordinate pair to latitude/longitude via ``utm.to_latlon``.

    ``x`` has 32000 subtracted from it and both values are multiplied by 1000 before
    the conversion, which is done for UTM zone number 32, zone letter 'U'.
    (For an easting such as 32280, as parsed from a tile name, this yields 280000.)

    Returns ``(lat, lon)``.
    """
    utm_easting = (x - 32000) * 1000
    utm_northing = y * 1000

    latitude, longitude = utm.to_latlon(utm_easting, utm_northing, 32, 'U')

    return latitude, longitude

def create_kml_with_squares(coordinates, kml_filename):
    """
    Write a KML file containing one polygon per bounding box in ``coordinates``.

    Each item of ``coordinates`` is ``(min_easting, max_easting, min_northing,
    max_northing)``. Only the two opposite corners are converted with
    ``utm_to_latlon``; the remaining two corners are built by combining their
    latitudes and longitudes. Every polygon is named "Square" and gets a red
    outline of width 3 and a green fill with alpha 100.
    """
    document = simplekml.Kml()

    for e_min, e_max, n_min, n_max in coordinates:
        # Opposite corners of the box
        lat_low, lon_low = utm_to_latlon(e_min, n_min)
        lat_high, lon_high = utm_to_latlon(e_max, n_max)

        # Closed ring of (lon, lat) pairs: last point repeats the first
        ring = [
            (lon_low, lat_low),
            (lon_low, lat_high),
            (lon_high, lat_high),
            (lon_high, lat_low),
            (lon_low, lat_low),
        ]

        square = document.newpolygon(name="Square", outerboundaryis=ring)
        square.style.linestyle.color = simplekml.Color.red
        square.style.linestyle.width = 3
        square.style.polystyle.color = simplekml.Color.changealphaint(100, simplekml.Color.green)

    document.save(kml_filename)


In [14]:
# Write the subzone bounding boxes collected in `coordinates` to NRW_division.kml
create_kml_with_squares(coordinates, 'NRW_division.kml')

# Download

In [10]:
# Base URL of the LAZ tiles; a tile's URL is this prefix followed by its file name
url_prefix = 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/'

# URLs of every tile in subzone 2.3
download_list = [url_prefix + tile_name for tile_name in sub_zones[1]['zone_2.3']]

In [9]:
# Display the generated URLs.
# NOTE(review): this prints the whole list; a slice such as download_list[:10] keeps the output short.
download_list

['https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5652_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5653_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5654_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5655_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5656_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5657_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5658_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5659_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5660_1_nw.laz',
 'https://www.opengeodata.nrw.de/produkte/geobasis/hm/3dm_l_las/3dm_l_las/3dm_32_280_5661_1

In [12]:
import requests
from time import sleep
import os
from tqdm import tqdm

def download_file(url, directory, attempt=1, max_attempts=5):
    """
    Download ``url`` into ``directory``, retrying on request errors.

    The target name is the last '/'-separated segment of the URL. If a file already
    exists at the target path it is left untouched and nothing is downloaded.

    The response body is streamed to '<target>.part' and renamed to the final name
    only after it has been written completely. An interrupted or failed transfer
    therefore never leaves a truncated file under the final name, which a later run
    would otherwise skip as "already downloaded".

    Parameters:
        url: address of the file to fetch.
        directory: existing directory to save the file into.
        attempt: number of the first attempt; used in messages and counted
            against ``max_attempts``.
        max_attempts: highest attempt number tried before giving up.

    Returns None. Only ``requests.RequestException`` is handled: each failure is
    reported with ``print`` and, after the last attempt, the function returns
    without raising.
    """
    file_name = url.split('/')[-1]
    file_path = os.path.join(directory, file_name)  # Combine the directory with the file name

    # Check if the file already exists
    if os.path.exists(file_path):
        return

    partial_path = file_path + '.part'

    while True:
        try:
            # stream=True avoids holding a whole (potentially very large) tile in memory
            with requests.get(url, timeout=10, stream=True) as response:
                response.raise_for_status()  # Raise an error for bad status codes

                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)

            # Publish the file under its final name only once it is complete
            os.replace(partial_path, file_path)
            return

        except requests.RequestException:
            # Drop whatever was written before the failure
            if os.path.exists(partial_path):
                os.remove(partial_path)

            if attempt >= max_attempts:
                print(f"Failed to download {url} after {max_attempts} attempts.")
                return

            print(f"Attempt {attempt} failed for {url}. Retrying...")
            sleep(1)  # Wait for 1 second before retrying
            attempt += 1

# NOTE(review): absolute, machine-specific path; consider moving it to a configuration cell.
download_directory = "/Volumes/SSD2/Split_NRW/zone_2.3/"

# Create the directory (and any missing parents) if it doesn't exist;
# exist_ok=True replaces the separate existence check and its check-then-create race
os.makedirs(download_directory, exist_ok=True)

# Download each file
for url in tqdm(download_list):
    download_file(url, download_directory)

100%|██████████| 1572/1572 [5:09:34<00:00, 11.82s/it]  


# Create a download list for DEM files

In [8]:
def generate_url(filename):
    """
    Build the WCS GetCoverage URL of the DEM tile matching a LAZ tile file name.

    Fields 2 and 3 of the '_'-separated name give the lower x and y bounds; the upper
    bounds are each one greater, zero-padded to the width of the lower bound. All four
    get '000' appended in the SUBSET parameters. The OUTFILE parameter is
    'dgm1_32_<x>_<y>_1_nw'.
    """
    fields = filename.split('_')
    x_low, y_low = fields[2], fields[3]

    # Upper bounds: lower bound + 1, padded back to the original number of digits
    x_high = str(int(x_low) + 1).zfill(len(x_low))
    y_high = str(int(y_low) + 1).zfill(len(y_low))

    return f"https://www.wcs.nrw.de/geobasis/wcs_nw_dgm?REQUEST=GetCoverage&SERVICE=WCS&VERSION=2.0.1&COVERAGEID=nw_dgm&FORMAT=image/tiff&SUBSET=x({x_low}000,{x_high}000)&SUBSET=y({y_low}000,{y_high}000)&OUTFILE=dgm1_32_{x_low}_{y_low}_1_nw&APP=timonline"



In [13]:
# Example usage: build and display the DEM URL for one LAZ tile name.
# Note that this rebinds the module-level names `filename` and `url`.
filename = '3dm_32_286_5630_1_nw.laz'
url = generate_url(filename)
url


In [9]:
def write_dem_download_lists(zones):
    """
    Write one DEM download list per zone to './download_lists/<zone>_DEM_files.txt'.

    ``zones`` maps a zone name to an iterable of LAZ tile file names. Each name is
    turned into its DEM URL with ``generate_url`` and written on its own line.
    The './download_lists' directory is created if it does not exist yet, so the
    function no longer fails with FileNotFoundError on a fresh checkout.
    """
    import os  # local import so this cell does not depend on a later import cell

    os.makedirs('./download_lists', exist_ok=True)

    for zone, filenames in zones.items():
        with open(f'./download_lists/{zone}_DEM_files.txt', 'w') as file:
            for filename in filenames:
                url = generate_url(filename)
                file.write(url + '\n')

In [17]:
# Write the DEM download lists for the four subzones of main zone 1 (sub_zones[0])
write_dem_download_lists(sub_zones[0])

# Download DEM files

In [10]:
def load_file_to_list(filename):
    """
    Read a text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from every
    line; blank lines are kept as empty strings.
    """
    with open(filename, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [13]:
# Load the DEM URLs previously written for subzone 1.4, one URL per list item
dem_download_list = load_file_to_list('./download_lists/zone_1.4_DEM_files.txt')

In [14]:
import requests
from time import sleep
import os
from tqdm import tqdm

def find_filename(url):
    """
    Derive the local file name for a download from the URL's OUTFILE parameter.

    The URL is split on '&'; the first piece starting with 'OUTFILE=' supplies the
    name (the text between its first and second '='), to which '.tif' is appended.
    Returns None when no piece starts with 'OUTFILE='.
    """
    outfile_params = [piece for piece in url.split('&') if piece.startswith('OUTFILE=')]

    if not outfile_params:
        return None

    return outfile_params[0].split('=')[1] + '.tif'

def download_file(url, directory, attempt=1, max_attempts=5):
    """Download a file with retries on failure and save it to a specified directory.

    The local file name comes from ``find_filename(url)`` (the URL's OUTFILE
    parameter plus '.tif'). It is resolved before any network traffic, and a URL
    without OUTFILE raises ValueError immediately instead of failing after the
    download with an opaque TypeError.

    The response body is streamed to '<target>.part' and renamed to the final name
    only after it has been written completely, so a failed or interrupted transfer
    never leaves a truncated file under the final name. An existing file of the same
    name is replaced.

    NOTE(review): this definition shares its name with the earlier LAZ downloader in
    this notebook and replaces it once this cell has run.

    Parameters:
        url: address of the file to fetch; must carry an OUTFILE parameter.
        directory: existing directory to save the file into.
        attempt: number of the first attempt, counted against ``max_attempts``.
        max_attempts: highest attempt number tried before giving up.

    Returns None. ``requests.RequestException`` is retried silently; after the last
    attempt a failure message is printed and the function returns without raising.

    Raises:
        ValueError: if no file name can be derived from ``url``.
    """
    file_name = find_filename(url)
    if file_name is None:
        raise ValueError(f"No OUTFILE parameter found in URL, cannot name the download: {url}")

    file_path = os.path.join(directory, file_name)  # Combine the directory with the file name
    partial_path = file_path + '.part'

    while True:
        try:
            with requests.get(url, timeout=10, stream=True) as response:
                response.raise_for_status()  # Raise an error for bad status codes

                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)

            # Publish the file under its final name only once it is complete
            os.replace(partial_path, file_path)
            return

        except requests.RequestException:
            # Drop whatever was written before the failure
            if os.path.exists(partial_path):
                os.remove(partial_path)

            if attempt >= max_attempts:
                print(f"Failed to download {url} after {max_attempts} attempts.")
                return

            sleep(1)  # Wait for 1 second before retrying
            attempt += 1

# NOTE(review): absolute, machine-specific path; consider moving it to a configuration cell.
download_directory = "/Volumes/SSD2/Split_NRW/zone_1.4_DEM/"

# Create the directory (and any missing parents) if it doesn't exist;
# exist_ok=True replaces the separate existence check and its check-then-create race
os.makedirs(download_directory, exist_ok=True)

# Download each file
for url in tqdm(dem_download_list):
    download_file(url, download_directory)

  0%|          | 0/2416 [00:00<?, ?it/s]

100%|██████████| 2416/2416 [44:49<00:00,  1.11s/it]
