# Notebook 1: Data Collection and Inventory

Purpose: download and organize all 14 datasets required for the analysis.



Datasets to collect:

1. Wind Speed (Global Wind Atlas)

2. Bathymetry (GEBCO 2020)

3. Power Grid (World Bank/OSM)

4. Ports (Morocco ANP or Natural Earth)

5. Shoreline (GSHHG)

6. Tourism POIs (OpenStreetMap)

7. Airports (OpenStreetMap)

8. Sediment Thickness (NOAA)

9. Submarine Cables (TeleGeography)

10. Shipping Routes (EMODnet)

11. Protected Areas (UNEP-WCMC)

12. EEZ Boundaries (Marine Regions)

13. Blue Flag Beaches (Blue Flag Global)

14. Bird Migration Routes (Movebank)



Expected duration: 2-3 hours, depending on download speed.

## Setup and Imports

In [None]:
import os

from pathlib import Path

import requests

from tqdm import tqdm

import pandas as pd

from datetime import datetime



# Locate the project relative to where the notebook is running: the project
# root is taken to be the parent of the current working directory.
# NOTE(review): this presumes the notebook is launched from a first-level
# subfolder of the project (e.g. notebooks/) - confirm.
PROJECT_ROOT = Path.cwd().parent
RAW_DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")

# Create data/raw together with any missing parents; harmless if it exists.
RAW_DATA_DIR.mkdir(exist_ok=True, parents=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data directory: {RAW_DATA_DIR}")

## Data Inventory Checklist

In [None]:
# Dataset catalog, keyed by display name.
#
# Each entry records:
#   file            - filename expected inside the raw data directory
#   source          - provider of the data
#   url             - landing page to download from
#   format          - expected file format(s)
#   manual_download - True when the file must be fetched by hand
#   direct_url      - (only for auto-downloadable entries) direct file URL
DATASETS = {
    "Wind Speed": dict(
        file="wind_speed_100m.tif",
        source="Global Wind Atlas 3.0",
        url="https://globalwindatlas.info/",
        format="GeoTIFF",
        manual_download=True,
    ),
    "Bathymetry": dict(
        file="bathymetry_gebco.nc",
        source="GEBCO 2020",
        url="https://www.gebco.net/data_and_products/gridded_bathymetry_data/",
        format="NetCDF",
        manual_download=True,
    ),
    "Power Grid": dict(
        file="power_grid.shp",
        source="OpenStreetMap",
        url="https://download.geofabrik.de/africa/morocco.html",
        format="Shapefile",
        manual_download=True,
    ),
    "Ports": dict(
        file="ports.csv",
        source="Morocco ANP or Natural Earth",
        url="https://www.naturalearthdata.com/downloads/10m-cultural-vectors/ports/",
        format="CSV or Shapefile",
        manual_download=True,
    ),
    "Shoreline": dict(
        file="shoreline_morocco.shp",
        source="GSHHG v2.3.7",
        url="https://www.soest.hawaii.edu/pwessel/gshhg/",
        format="Shapefile",
        manual_download=True,
    ),
    "Tourism": dict(
        file="tourism_morocco.shp",
        source="OpenStreetMap",
        url="https://download.geofabrik.de/africa/morocco.html",
        format="Shapefile",
        manual_download=True,
    ),
    "Airports": dict(
        file="airports_morocco.shp",
        source="OpenStreetMap",
        url="https://download.geofabrik.de/africa/morocco.html",
        format="Shapefile",
        manual_download=True,
    ),
    "Sediment Thickness": dict(
        file="sediment_thickness.nc",
        source="NOAA Geophysics",
        url="https://www.ngdc.noaa.gov/mgg/sedthick/",
        format="NetCDF or GeoTIFF",
        manual_download=True,
    ),
    "Submarine Cables": dict(
        file="submarine_cables.geojson",
        source="TeleGeography",
        url="https://github.com/telegeography/www.submarinecablemap.com",
        format="GeoJSON",
        manual_download=False,
        direct_url="https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/web/public/api/v3/cable/cable-geo.json",
    ),
    "Shipping Routes": dict(
        file="shipping_density.tif",
        source="EMODnet",
        url="https://www.emodnet-humanactivities.eu/view-data.php",
        format="GeoTIFF",
        manual_download=True,
    ),
    "Protected Areas": dict(
        file="protected_areas_morocco.shp",
        source="UNEP-WCMC",
        url="https://www.protectedplanet.net/en",
        format="Shapefile",
        manual_download=True,
    ),
    "EEZ": dict(
        file="morocco_eez.shp",
        source="Marine Regions",
        url="https://www.marineregions.org/downloads.php",
        format="Shapefile",
        manual_download=True,
    ),
    "Blue Flag Beaches": dict(
        file="blue_flag_beaches.csv",
        source="Blue Flag Global",
        url="https://www.blueflag.global/all-bf-sites",
        format="CSV",
        manual_download=True,
    ),
    "Bird Migration": dict(
        file="bird_migration_routes.shp",
        source="Movebank or BirdLife",
        url="https://www.movebank.org/",
        format="Shapefile",
        manual_download=True,
    ),
}

print(f"Total datasets to collect: {len(DATASETS)}")

## Check Current Data Status

In [None]:
def check_data_status(datasets=None, raw_dir=None):
    """Report which catalog datasets are present on disk.

    Only the primary file named in each catalog entry is checked; companion
    files (for example shapefile sidecars) are not verified.

    Args:
        datasets: Mapping of dataset name to an info dict providing at least
            the "file" and "source" keys. Defaults to the module-level
            ``DATASETS`` catalog.
        raw_dir: Directory expected to contain the files. Defaults to the
            module-level ``RAW_DATA_DIR``.

    Returns:
        A pandas DataFrame with one row per dataset and the columns
        "Dataset", "Status" ("OK" or "MISSING"), "Filename", "Size"
        (e.g. "1.25 MB", or "-" when missing) and "Source". The columns are
        present even when the catalog is empty.
    """
    # Resolve defaults at call time so the function always sees the current
    # catalog and directory, and so callers can inspect other locations.
    datasets = DATASETS if datasets is None else datasets
    raw_dir = RAW_DATA_DIR if raw_dir is None else Path(raw_dir)

    columns = ["Dataset", "Status", "Filename", "Size", "Source"]
    status_list = []
    for name, info in datasets.items():
        file_path = raw_dir / info["file"]
        if file_path.exists():
            size_mb = file_path.stat().st_size / (1024 * 1024)
            size_str = f"{size_mb:.2f} MB"
            status_icon = "OK"
        else:
            size_str = "-"
            status_icon = "MISSING"
        status_list.append({
            "Dataset": name,
            "Status": status_icon,
            "Filename": info["file"],
            "Size": size_str,
            "Source": info["source"],
        })

    # Passing the columns explicitly keeps the frame's schema stable for an
    # empty catalog, so lookups such as df["Status"] never raise KeyError.
    return pd.DataFrame(status_list, columns=columns)



# Build the inventory table and print it in full
status_df = check_data_status()
print("Data collection status:\n")
print(status_df.to_string(index=False))

# Summarize: rows marked "OK" out of all catalog rows
total = status_df.shape[0]
downloaded = status_df["Status"].eq("OK").sum()
progress = (downloaded / total) * 100
print(f"\nProgress: {downloaded}/{total} ({progress:.1f}%) datasets collected")

## Dataset 1: Wind Speed (Global Wind Atlas)



Manual download instructions:

1. Go to https://globalwindatlas.info/

2. Choose Download, Wind Speed at 100 m

3. Region bounds: north 36, south 21, west -20, east -1

4. Format: GeoTIFF

5. Save to data/raw/wind_speed_100m.tif

In [None]:
# Sanity-check the manually downloaded wind speed raster
wind_speed_path = RAW_DATA_DIR.joinpath("wind_speed_100m.tif")

if not wind_speed_path.exists():
    print("Wind speed data not found. Please download manually.")
else:
    print("Wind speed data found.")

    # rasterio is imported here so it is only required once the file exists.
    import rasterio

    # Print the basic georeferencing metadata rasterio exposes for the file.
    with rasterio.open(wind_speed_path) as src:
        print(f"  CRS: {src.crs}")
        print(f"  Shape: {src.shape}")
        print(f"  Bounds: {src.bounds}")
        print(f"  NoData: {src.nodata}")

## Dataset 2: Bathymetry (GEBCO 2020)



Manual download instructions:

1. Go to https://www.gebco.net/data_and_products/gridded_bathymetry_data/

2. Select GEBCO 2020 grid

3. Region bounds: north 36, south 21, west -20, east -1

4. Format: NetCDF (GeoTIFF is also offered, but this notebook saves and verifies the grid as a `.nc` file)

5. Save to data/raw/bathymetry_gebco.nc

In [None]:
# Verify bathymetry file: confirm it opens and list what it contains
bathymetry_path = RAW_DATA_DIR / "bathymetry_gebco.nc"

if bathymetry_path.exists():
    print("Bathymetry data found.")

    # xarray is imported here so it is only required once the file exists.
    import xarray as xr

    # The context manager closes the file even if one of the prints raises,
    # which the previous open()/close() pair did not guarantee.
    with xr.open_dataset(bathymetry_path) as ds:
        print(f"  Variables: {list(ds.data_vars)}")
        print(f"  Coordinates: {list(ds.coords)}")
        # Dataset.sizes is the name -> length mapping; converting it to a
        # plain dict gives a clean printout across xarray versions.
        print(f"  Dimensions: {dict(ds.sizes)}")
else:
    print("Bathymetry data not found. Please download manually.")

## Dataset 9: Submarine Cables (auto-download)



This dataset can be fetched directly from the TeleGeography repository.

In [None]:
def download_submarine_cables():
    """Download the submarine cable GeoJSON listed in the dataset catalog.

    The URL and target filename are read from the "Submarine Cables" entry
    of ``DATASETS`` so they are defined in one place only. The file is saved
    into ``RAW_DATA_DIR``.

    The download is streamed to a temporary ``.part`` file that is renamed to
    the final name only after the transfer completes. A failed or interrupted
    download therefore never leaves a truncated file that would later be
    mistaken for a finished one.

    Returns:
        None. Progress and errors are reported with ``print``; errors are
        not raised, so the notebook keeps running if the download fails.
    """
    entry = DATASETS["Submarine Cables"]
    url = entry["direct_url"]
    output_path = RAW_DATA_DIR / entry["file"]

    if output_path.exists():
        print("Submarine cables data already exists.")
        return

    print("Downloading submarine cables data...")
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        # Using the response as a context manager releases the connection
        # even when the transfer fails part-way through.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            # total=None tells tqdm the size is unknown, instead of drawing
            # a bar against a total of zero when the header is absent.
            with open(partial_path, "wb") as f, tqdm(
                total=total_size or None,
                unit="B",
                unit_scale=True,
                desc="Submarine cables",
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))
        # Publish the file under its final name only once it is complete.
        partial_path.replace(output_path)
    except Exception as exc:
        # Deliberately broad: this is a best-effort notebook step, so any
        # failure is reported and the remaining cells can still run.
        print(f"Error downloading submarine cables: {exc}")
        try:
            partial_path.unlink()
        except OSError:
            # Nothing was written yet, or the leftover could not be removed;
            # either way the final filename was never created.
            pass
    else:
        print(f"Downloaded to {output_path}")



# Run the download now. The function returns early without downloading when
# the submarine cables file is already present in the raw data directory.

download_submarine_cables()

## Final Data Inventory Summary

In [None]:
# Re-scan the raw data directory so the summary reflects any new files
final_status_df = check_data_status()

# Banner and full table
print("=" * 60)
print("FINAL DATA COLLECTION STATUS")
print("=" * 60)
print(final_status_df.to_string(index=False))

# Overall progress: rows marked "OK" out of all catalog rows
total = final_status_df.shape[0]
downloaded = final_status_df["Status"].eq("OK").sum()
progress = (downloaded / total) * 100
print(f"\nCollection progress: {downloaded}/{total} ({progress:.1f}%)")

# List whatever is still outstanding, or give the go-ahead
missing = final_status_df[final_status_df["Status"] != "OK"]
if not missing.empty:
    print("Missing datasets:")
    for dataset_name, filename in zip(missing["Dataset"], missing["Filename"]):
        print(f"  - {dataset_name} ({filename})")
else:
    print("All datasets collected. Proceed to preprocessing.")

# Persist the inventory table next to the raw data
inventory_path = RAW_DATA_DIR.joinpath("_data_inventory.csv")
final_status_df.to_csv(inventory_path, index=False)
print(f"Inventory saved to {inventory_path}")

## Manual Download Checklist



Copy and check off items as you download them:



- [ ] Wind Speed (Global Wind Atlas)

- [ ] Bathymetry (GEBCO 2020)

- [ ] Power Grid (OpenStreetMap)

- [ ] Ports (ANP or Natural Earth)

- [ ] Shoreline (GSHHG)

- [ ] Tourism POIs (OpenStreetMap)

- [ ] Airports (OpenStreetMap)

- [ ] Sediment Thickness (NOAA)

- [x] Submarine Cables (auto-download)

- [ ] Shipping Routes (EMODnet)

- [ ] Protected Areas (UNEP-WCMC)

- [ ] EEZ Boundaries (Marine Regions)

- [ ] Blue Flag Beaches (Blue Flag Global)

- [ ] Bird Migration Routes (Movebank)

## Next Steps



After collecting all datasets:

1. Run 02_data_preprocessing.ipynb to standardize CRS, clip to EEZ, and resample.

2. Proceed to criteria mapping, fuzzy AHP, suitability analysis, and visualization.