# Copernicus data
In this notebook we download observation data from Copernicus for some chosen stations.

### Load packages

In [4]:
import copernicusmarine
from pprint import pprint
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import re
import os

### Define parameters

In [5]:
username = 'mjessen'
password = 'gM-Rzb4cS4Lwudk'
dataset_id = 'cmems_obs-ins_glo_phybgcwav_mynrt_na_irr'
part = 'latest'
params_cat = 'waves'
feature = 'TS'

start_date = '2022-01-01'
end_date = '2023-12-31'

north = 56
south = 50
west = -2
east = 10

### Read index file

In [22]:
df = pd.read_csv("copernicus-data/index_monthly.txt", skiprows=5)
print(df.shape)
df.columns

(542559, 12)


Index(['# product_id', 'file_name', 'geospatial_lat_min', 'geospatial_lat_max',
       'geospatial_lon_min', 'geospatial_lon_max', 'time_coverage_start',
       'time_coverage_end', 'institution', 'date_update', 'data_mode',
       'parameters'],
      dtype='object')

### Filter stations by location and parameters

In [23]:
# Filter by location and parameters
filtered_df = df[(df['geospatial_lat_min'] > south) &
                 (df['geospatial_lat_max'] < north) &
                 (df['geospatial_lon_min'] > west) &
                 (df['geospatial_lon_max'] < east) &
                 (df['geospatial_lon_max'] == df['geospatial_lon_min'] ) &
                 (df['geospatial_lat_max'] == df['geospatial_lat_min'] ) &
                 (df['parameters'].str.contains("VHM0", na=False)) &
                 #(df['parameters'].str.contains("VTZA", na=False)) &
                 #(df['parameters'].str.contains("VTM01", na=False)) | (df['parameters'].str.contains("VTM02", na=False))
                 (df['file_name'].str.contains('202301'))
                 #& (df['file_name'].str.contains('Q1'))
                ]

print(filtered_df.shape)

# Copy df for later use
file_df = filtered_df[['file_name']].copy()

# Print basenames of filenames
file_names = file_df['file_name']
for name in file_names:
    basename = os.path.basename(name)
    print(basename)

(113, 12)
NO_TS_MO_6200041_202301.nc
NO_TS_MO_6200042_202301.nc
NO_TS_MO_6200170_202301.nc
NO_TS_MO_6200304_202301.nc
NO_TS_MO_6200305_202301.nc
NO_TS_MO_6201008_202301.nc
NO_TS_MO_6201009_202301.nc
NO_TS_MO_6201010_202301.nc
NO_TS_MO_6201011_202301.nc
NO_TS_MO_6201012_202301.nc
NO_TS_MO_6201013_202301.nc
NO_TS_MO_6201014_202301.nc
NO_TS_MO_6201015_202301.nc
NO_TS_MO_6201017_202301.nc
NO_TS_MO_6201018_202301.nc
NO_TS_MO_6201019_202301.nc
NO_TS_MO_6201045_202301.nc
NO_TS_MO_6201046_202301.nc
NO_TS_MO_6201047_202301.nc
NO_TS_MO_6201050_202301.nc
NO_TS_MO_6201051_202301.nc
NO_TS_MO_6201052_202301.nc
NO_TS_MO_6201059_202301.nc
NO_TS_MO_6201067_202301.nc
NO_TS_MO_6201068_202301.nc
NO_TS_MO_6201082_202301.nc
NO_TS_MO_6201083_202301.nc
NO_TS_MO_6202109_202301.nc
NO_TS_MO_6202110_202301.nc
NO_TS_MO_6202112_202301.nc
NO_TS_MO_6202600_202301.nc
NO_TS_MO_6202601_202301.nc
NO_TS_MO_6202603_202301.nc
NO_TS_MO_A122_202301.nc
NO_TS_MO_A2Buoy_202301.nc
NO_TS_MO_AWG_202301.nc
NO_TS_MO_AkkaertSouthwestB

### Add to stations.txt file

In [24]:
with open("stations.txt", "w") as f:
    f.write('6200145\n')
    f.write('6200293\n')
    f.write('6201045\n')
    f.write('6201050\n')
    f.write('6201058\n')
    f.write('6200288\n')
    f.write('6200044\n')
    f.write('6201059\n')
    f.write('6200144\n')
    f.write('6201047\n')
    f.write('A121_\n')
    f.write('AkkaertSouthwestBuoy\n')
    f.write('Europlatform2\n')
    f.write('Europlatform3\n')
    f.write('F3platform\n')
    f.write('IJmuidenMunitiestort_\n')
    f.write('J61\n')
    f.write('K13a_\n')
    f.write('K141_\n')
    f.write('KeetenBoei\n')
    f.write('KwintebankBuoy\n')
    f.write('L91_\n')
    f.write('LichteilandGoeree1_\n')
    f.write('MaeslantkeringZeezijdeNoordMeetpaal\n')
    f.write('MaeslantkeringZeezijdeZuidMeetpaal\n')
    f.write('NieuwpoortBuoy\n')
    f.write('Nymindegab\n')
    f.write('Oosterschelde11\n')
    f.write('OstendEasternPalisadeBuoy\n')
    f.write('OverloopVanValkenisse\n')
    f.write('PasVanTerneuzenBoei\n')
    f.write('Q1_\n')
    f.write('WaddenEierlandseGat\n')
    f.write('WesthinderBuoy\n')
    f.write('ZeebruggeZandopvangkadeBuoy\n')
    f.write('ZwinBuoy\n')

In [25]:
with open("stations_new.txt", "w") as f:
    f.write('6200288\n')
    f.write('6200044\n')
    f.write('6201059\n')
    f.write('6200144\n')
    f.write('6201047\n')

### Functions

In [26]:
def download_copernicus_data(filename, csv_path, output_directory, username, password, dataset_id):

    with open (filename, "r") as f:
        station_files = [line.strip() for line in f if line.strip()]
        print(station_files)
    
    # Number of stations
    n_stations = len(station_files)
    print(f"Number of stations: {n_stations}")

    # Modify station files
    pattern = "|".join(re.escape(s) for s in station_files)  # escape in case of special characters

    # Filter by location and parameters
    filtered_df = df[(df['file_name'].str.contains(pattern, na=False)) & (df['file_name'].str.contains('202301')) ]
    print(filtered_df.shape)

    # Copy df for later use
    file_df = filtered_df[['file_name']].copy()

    # Print basenames of filenames
    file_names = file_df['file_name']
    for name in file_names:
        basename = os.path.basename(name)
        print(basename)
    
    # Update stations.csv file
    stations_data = []

    for station in station_files:
        for name in filtered_df['file_name']:
            if station in name:
                row = filtered_df[filtered_df['file_name'] == name]
                lat, lon = row['geospatial_lat_min'].values[0], row['geospatial_lon_min'].values[0]
                stations_data.append({'station': station, 'lat': lat, 'lon': lon})

    station_df = pd.DataFrame(stations_data)

    # Keep only unique rows
    df_unique = station_df.drop_duplicates(subset='station', keep='first')

    # If file exists, load it and update
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        # Concatenate and drop duplicates
        updated_df = pd.concat([existing_df, df_unique], ignore_index=True)
        updated_df = updated_df.drop_duplicates(subset='station', keep='first')
    else:
        updated_df = df_unique


    # Save back
    updated_df.to_csv(csv_path, index=False)
    print(f"Saved stations to {csv_path}")

    # Filter by location and parameters
    filtered_df = df[(df['file_name'].str.contains(pattern, na=False))
                    & (df['file_name'].str.contains('2023')) 
                    ]

    print(filtered_df.shape)

    # Copy df for later use
    file_df = filtered_df[['file_name']].copy()

    # Create output txt file with files to download
    with open("output.txt", "w") as f:
        for filename in file_df["file_name"]:
            f.write(str(filename) + "\n")

    # Define list with files to download
    file_list = 'output.txt'

    # Download data from Copernicus
    print("Starting download...")
    copernicusmarine.get(
        username=username,
        password=password,
        dataset_id=dataset_id,
        index_parts=False,
        file_list = file_list,
        output_directory=output_directory,
        no_directories=True
    )


In [27]:
def convert_nc_to_csv(station_file, filename, obs_fldr, csv_fldr):
    print("Converting .nc files to .csv files...")

    with open(station_file, "r") as f:
        station_names = [line.strip() for line in f if line.strip()]

    print(station_names)

    # Get full file names from output.txt
    with open(filename, "r") as f:
        file_names = [line.strip() for line in f if line.strip()]
    
    print(file_names)

    for station in station_names:
        stations_data = []

        for name in file_names:
            if station in name:
                # Load the NetCDF file
                basename = os.path.basename(name)
                ds = xr.open_dataset(os.path.join(obs_fldr, basename))

                # Convert to dataframe
                df = ds.to_dataframe().reset_index()

                # Extract the wanted variables
                wanted = ['TIME', 'VHM0', 'VTZA', 'VMDR', 'VTPK']
                cols = [col for col in wanted if col in df.columns]

                df = df[cols].copy()
                df = df.dropna()

                stations_data.append(df)
        
        #print(f"Processed {basename} for station {station}")
        
        if stations_data:
            station_df = pd.concat(stations_data, ignore_index=True)
            output_csv = os.path.join(csv_fldr, f"{station}.csv")
            station_df.to_csv(output_csv, index=False)
            print(f"Saved data for station {station} to {output_csv}")
        else:
            print(f"No data found for station {station}")

    


### Download copernicus data

In [11]:
with open("output.txt") as f:
    file_list = [line.strip() for line in f if line.strip()]  # removes empty lines


file_list[0]

'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202310/NO_TS_MO_6200044_202310.nc'

In [8]:
import os

In [14]:
# Define list with files to download


copernicusmarine.get(
        username=username,
        password=password,
        dataset_id=dataset_id,
        index_parts=False,
        file_list = 'output.txt',
        output_directory="./raw-data",
        no_directories=True)

INFO - 2025-09-15T14:02:58Z - Selected dataset version: "202311"
INFO - 2025-09-15T14:02:58Z - Selected dataset part: "latest"
Downloading files: 100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


ResponseGet(files=[FileGet(s3_url='s3://mdl-native-01/native/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202310/NO_TS_MO_6200044_202310.nc', https_url='https://s3.waw3-1.cloudferro.com/mdl-native-01/native/INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202310/NO_TS_MO_6200044_202310.nc', file_size=0.08996295928955078, last_modified_datetime='2025-09-04T21:19:37+00:00', etag='"1ef8c1d049a0f21645f15061bdb9941b"', file_format='.nc', output_directory=WindowsPath('raw-data'), filename='NO_TS_MO_6200044_202310.nc', file_path=WindowsPath('raw-data/NO_TS_MO_6200044_202310.nc'), file_status='DOWNLOADED')], files_deleted=None, files_not_found=None, number_of_files_to_download=1, total_size=0.08996295928955078, status='000', message='The request was successful.')

In [28]:
download_copernicus_data(filename="stations.txt", csv_path='../observations/stations.csv', output_directory='./raw-data', username=username, password=password, dataset_id=dataset_id)

['6200145', '6200293', '6201045', '6201050', '6201058', '6200288', '6200044', '6201059', '6200144', '6201047', 'A121_', 'AkkaertSouthwestBuoy', 'Europlatform2', 'Europlatform3', 'F3platform', 'IJmuidenMunitiestort_', 'J61', 'K13a_', 'K141_', 'KeetenBoei', 'KwintebankBuoy', 'L91_', 'LichteilandGoeree1_', 'MaeslantkeringZeezijdeNoordMeetpaal', 'MaeslantkeringZeezijdeZuidMeetpaal', 'NieuwpoortBuoy', 'Nymindegab', 'Oosterschelde11', 'OstendEasternPalisadeBuoy', 'OverloopVanValkenisse', 'PasVanTerneuzenBoei', 'Q1_', 'WaddenEierlandseGat', 'WesthinderBuoy', 'ZeebruggeZandopvangkadeBuoy', 'ZwinBuoy']
Number of stations: 36
(33, 12)
NO_TS_MO_6200144_202301.nc
NO_TS_MO_6200145_202301.nc
NO_TS_MO_6201045_202301.nc
NO_TS_MO_6201047_202301.nc
NO_TS_MO_6201050_202301.nc
NO_TS_MO_6201059_202301.nc
NO_TS_MO_AkkaertSouthwestBuoy_202301.nc
NO_TS_MO_Europlatform2_202301.nc
NO_TS_MO_Europlatform3_202301.nc
NO_TS_MO_F3platform_202301.nc
NO_TS_MO_IJmuidenMunitiestort_202301.nc
NO_TS_MO_J61_202301.nc
NO_TS_

INFO - 2025-09-15T13:47:32Z - Selected dataset version: "202311"
INFO - 2025-09-15T13:47:32Z - Selected dataset part: "latest"


KeyboardInterrupt: 

### Convert nc data file to csv files

In [None]:
convert_nc_to_csv(station_file='stations.txt',filename="output.txt", obs_fldr = 'raw-data', csv_fldr='../observations')

Converting .nc files to .csv files...
['6200288', '6200044', '6201059', '6200144', '6201047']
['INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202201/NO_TS_MO_6200144_202201.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202202/NO_TS_MO_6200144_202202.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202203/NO_TS_MO_6200144_202203.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202204/NO_TS_MO_6200144_202204.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202205/NO_TS_MO_6200144_202205.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202206/NO_TS_MO_6200144_202206.nc', 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_ph