# Copernicus data
In this notebook we download observation data from Copernicus for some chosen stations.

### Load packages

In [1]:
import copernicusmarine
from pprint import pprint
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import re
import os

### Define parameters

In [2]:
# Credentials are read from the environment so that no secret is stored in the notebook.
# If a variable is unset the value is None; copernicusmarine then performs its own
# credential lookup (environment variables / credentials file created by
# `copernicusmarine.login()`).
# NOTE(review): the password that used to be hardcoded here should be considered
# compromised and rotated.
username = os.environ.get('COPERNICUSMARINE_SERVICE_USERNAME')
password = os.environ.get('COPERNICUSMARINE_SERVICE_PASSWORD')

# Dataset selection
dataset_id = 'cmems_obs-ins_glo_phybgcwav_mynrt_na_irr'
part = 'latest'
params_cat = 'waves'
feature = 'TS'

# Period of interest
start_date = '2022-01-01'
end_date = '2022-12-31'

# Bounding box used when filtering the index (compared against the
# geospatial_lat_* / geospatial_lon_* index columns)
north = 56
south = 50
west = -2
east = 10

### Read index file

In [3]:
# Load the monthly index file; the first five lines are skipped before parsing,
# so the column names come from the first line that follows them.
df = pd.read_csv(
    "copernicus-data/index_monthly.txt",
    skiprows=5,
)
print(df.shape)
df.columns

(542559, 12)


Index(['# product_id', 'file_name', 'geospatial_lat_min', 'geospatial_lat_max',
       'geospatial_lon_min', 'geospatial_lon_max', 'time_coverage_start',
       'time_coverage_end', 'institution', 'date_update', 'data_mode',
       'parameters'],
      dtype='object')

### Filter stations by location and parameters

In [9]:
# Build the selection step by step so every criterion has a name.

# Rows whose extent lies strictly inside the bounding box
inside_box = (
    (df['geospatial_lat_min'] > south)
    & (df['geospatial_lat_max'] < north)
    & (df['geospatial_lon_min'] > west)
    & (df['geospatial_lon_max'] < east)
)

# Rows with zero spatial extent (min == max for both longitude and latitude)
single_position = (
    (df['geospatial_lon_max'] == df['geospatial_lon_min'])
    & (df['geospatial_lat_max'] == df['geospatial_lat_min'])
)

# Rows whose parameter list mentions VHM0
# (additional checks on VTZA / VTM01 / VTM02 and a 'Q1' file-name filter are
# currently disabled)
has_vhm0 = df['parameters'].str.contains("VHM0", na=False)

# Rows whose file name contains '202201'
is_202201 = df['file_name'].str.contains('202201')

filtered_df = df[inside_box & single_position & has_vhm0 & is_202201]

print(filtered_df.shape)

# Keep a separate copy of the file-name column for later use
file_df = filtered_df[['file_name']].copy()

# Show only the base name of every selected file
file_names = file_df['file_name']
for name in file_names:
    basename = os.path.basename(name)
    print(basename)

(110, 12)
NO_TS_MO_6200041_202201.nc
NO_TS_MO_6200042_202201.nc
NO_TS_MO_6200170_202201.nc
NO_TS_MO_6200304_202201.nc
NO_TS_MO_6200305_202201.nc
NO_TS_MO_6201008_202201.nc
NO_TS_MO_6201009_202201.nc
NO_TS_MO_6201010_202201.nc
NO_TS_MO_6201011_202201.nc
NO_TS_MO_6201012_202201.nc
NO_TS_MO_6201013_202201.nc
NO_TS_MO_6201014_202201.nc
NO_TS_MO_6201015_202201.nc
NO_TS_MO_6201017_202201.nc
NO_TS_MO_6201018_202201.nc
NO_TS_MO_6201019_202201.nc
NO_TS_MO_6201045_202201.nc
NO_TS_MO_6201047_202201.nc
NO_TS_MO_6201050_202201.nc
NO_TS_MO_6201051_202201.nc
NO_TS_MO_6201052_202201.nc
NO_TS_MO_6201059_202201.nc
NO_TS_MO_6201067_202201.nc
NO_TS_MO_6201068_202201.nc
NO_TS_MO_6201082_202201.nc
NO_TS_MO_6201083_202201.nc
NO_TS_MO_6202108_202201.nc
NO_TS_MO_6202110_202201.nc
NO_TS_MO_6202112_202201.nc
NO_TS_MO_6202600_202201.nc
NO_TS_MO_6202601_202201.nc
NO_TS_MO_6202602_202201.nc
NO_TS_MO_6202603_202201.nc
NO_TS_MO_A121_202201.nc
NO_TS_MO_A122_202201.nc
NO_TS_MO_A2Buoy_202201.nc
NO_TS_MO_AWG_202201.nc
NO

### Add to stations.txt file

In [None]:
# Station name fragments, one per line in stations.txt.
stations_txt_lines = [
    'AkkaertSouthwestBuoy',
    'Europlatform2',
    'Europlatform3',
    'F3platform',
    'IJmuidenMunitiestort_',
    'J61',
    'K13a_',
    'K141_',
    'KeetenBoei',
    'KwintebankBuoy',
    'L91_',
    'LichteilandGoeree1_',
    'MaeslantkeringZeezijdeNoordMeetpaal',
    'MaeslantkeringZeezijdeZuidMeetpaal',
    'NieuwpoortBuoy',
    'Nymindegab',
    'Oosterschelde11',
    'OstendEasternPalisadeBuoy',
    'OverloopVanValkenisse',
    'PasVanTerneuzenBoei',
    'Q1_',
    'WaddenEierlandseGat',
    'WesthinderBuoy',
    'ZeebruggeZandopvangkadeBuoy',
    'ZwinBuoy',
]

# Overwrite stations.txt with every entry followed by a newline
with open("stations.txt", "w") as f:
    f.write("".join(entry + "\n" for entry in stations_txt_lines))

In [None]:
# Station identifiers, one per line in stations_new.txt.
stations_new_txt_lines = [
    '6200145',
    '6200293',
    '6201045',
    '6201050',
    '6201058',
    'A121_',
]

# Overwrite stations_new.txt with every entry followed by a newline
with open("stations_new.txt", "w") as f:
    f.write("".join(entry + "\n" for entry in stations_new_txt_lines))

### Functions

In [57]:
def download_copernicus_data(filename, csv_path, output_directory, username, password, dataset_id,
                             index_df=None, file_list_path="output.txt",
                             reference_month="202201", year="2022"):
    """Download the Copernicus files belonging to the stations listed in a text file.

    The function
      1. reads the station name fragments from ``filename`` (one per line),
      2. looks up, per station, the first index row of ``reference_month`` whose
         file name contains the fragment and stores its position in ``csv_path``
         (columns ``station``, ``lat``, ``lon``; existing rows are kept),
      3. writes every index file name that contains a station fragment and
         ``year`` to ``file_list_path``,
      4. passes that list to ``copernicusmarine.get``.

    Parameters
    ----------
    filename : str
        Text file with one station name fragment per line; blank lines are ignored.
    csv_path : str
        CSV file holding station positions; created or updated.
    output_directory : str
        Directory handed to ``copernicusmarine.get`` for the downloaded files.
    username, password : str or None
        Credentials handed to ``copernicusmarine.get``.
    dataset_id : str
        Dataset identifier handed to ``copernicusmarine.get``.
    index_df : pandas.DataFrame, optional
        Index table with the columns ``file_name``, ``geospatial_lat_min`` and
        ``geospatial_lon_min``. Defaults to the notebook-level ``df``.
    file_list_path : str, optional
        Where the list of files to download is written.
    reference_month : str, optional
        Literal text a file name must contain to be used for the station position.
    year : str, optional
        Literal text a file name must contain to be downloaded.

    Raises
    ------
    ValueError
        If ``filename`` lists no stations. An empty list would produce an empty
        search pattern, which matches every file in the index.
    """
    with open(filename, "r") as f:
        station_files = [line.strip() for line in f if line.strip()]
    print(station_files)

    if not station_files:
        raise ValueError(f"No stations listed in {filename}")

    # Number of stations
    n_stations = len(station_files)
    print(f"Number of stations: {n_stations}")

    # Fall back to the notebook-level index table when none is passed explicitly
    index = df if index_df is None else index_df

    # One alternation of all (escaped) station fragments
    pattern = "|".join(re.escape(s) for s in station_files)
    index_names = index['file_name']
    station_mask = index_names.str.contains(pattern, na=False)

    # NOTE(review): reference_month and year are matched as plain substrings of the
    # whole file name, so they can also hit digits of a station identifier - confirm
    # this is acceptable for the station set in use.
    reference_df = index[station_mask & index_names.str.contains(reference_month, na=False, regex=False)]
    print(reference_df.shape)

    # Print basenames of the reference files
    for name in reference_df['file_name']:
        print(os.path.basename(name))

    # Position of each station: first reference row whose file name contains it
    records = []
    for station in dict.fromkeys(station_files):  # unique, order preserved
        rows = zip(reference_df['file_name'],
                   reference_df['geospatial_lat_min'],
                   reference_df['geospatial_lon_min'])
        for name, lat, lon in rows:
            if station in name:
                records.append({'station': station, 'lat': lat, 'lon': lon})
                break

    # Explicit columns keep the frame well-formed when nothing matched
    df_unique = pd.DataFrame(records, columns=['station', 'lat', 'lon'])

    # If the file exists, load it and add the stations that are not in it yet
    if os.path.exists(csv_path):
        # Read station names as text so purely numeric IDs still compare equal
        existing_df = pd.read_csv(csv_path, dtype={'station': str})
        frames = [frame for frame in (existing_df, df_unique) if not frame.empty]
        updated_df = pd.concat(frames, ignore_index=True) if frames else df_unique
        updated_df = updated_df.drop_duplicates(subset='station', keep='first')
    else:
        updated_df = df_unique

    # Save back
    updated_df.to_csv(csv_path, index=False)
    print(f"Saved stations to {csv_path}")

    # All files of the selected stations for the requested year
    download_df = index[station_mask & index_names.str.contains(year, na=False, regex=False)]
    print(download_df.shape)

    # Create output txt file with files to download
    with open(file_list_path, "w") as f:
        for file_name in download_df["file_name"]:
            f.write(str(file_name) + "\n")

    if download_df.empty:
        print("No files matched the selected stations; nothing to download.")
        return

    # Download data from Copernicus
    print("Starting download...")
    copernicusmarine.get(
        username=username,
        password=password,
        dataset_id=dataset_id,
        index_parts=False,
        file_list=file_list_path,
        output_directory=output_directory,
        no_directories=True
    )


In [68]:
def convert_nc_to_csv(station_file, filename, obs_fldr, csv_fldr):
    """Combine the NetCDF files of each station into one CSV file per station.

    Parameters
    ----------
    station_file : str
        Text file with one station name fragment per line; blank lines are ignored.
    filename : str
        Text file with one data file path per line. Only the base name of each
        path is used; the file itself is read from ``obs_fldr``.
    obs_fldr : str
        Folder containing the NetCDF files.
    csv_fldr : str
        Folder receiving ``<station>.csv``; created if it does not exist.

    For every station, all listed files whose path contains the station fragment
    are opened, reduced to the columns TIME, VHM0, VTZA, VMDR and VTPK (those that
    are present), stripped of rows with missing values and concatenated in list
    order. Stations without any matching file are reported and skipped.
    """
    print("Converting .nc files to .csv files...")

    with open(station_file, "r") as f:
        station_names = [line.strip() for line in f if line.strip()]

    print(station_names)

    # Get full file names from the file list
    with open(filename, "r") as f:
        file_names = [line.strip() for line in f if line.strip()]

    print(file_names)

    # Variables to extract; columns missing from a file are skipped
    wanted = ['TIME', 'VHM0', 'VTZA', 'VMDR', 'VTPK']

    for station in station_names:
        stations_data = []

        for name in file_names:
            if station not in name:
                continue

            basename = os.path.basename(name)

            # The context manager closes the file handle once the data has been
            # copied into a DataFrame
            with xr.open_dataset(os.path.join(obs_fldr, basename)) as ds:
                frame = ds.to_dataframe().reset_index()

            cols = [col for col in wanted if col in frame.columns]
            # Drop every row that has a missing value in any selected column
            stations_data.append(frame[cols].dropna())

        if stations_data:
            station_df = pd.concat(stations_data, ignore_index=True)
            if csv_fldr:
                os.makedirs(csv_fldr, exist_ok=True)
            output_csv = os.path.join(csv_fldr, f"{station}.csv")
            station_df.to_csv(output_csv, index=False)
            print(f"Saved data for station {station} to {output_csv}")
        else:
            print(f"No data found for station {station}")

    


### Download copernicus data

In [59]:
# Download the files of the stations listed in stations_new.txt
download_copernicus_data(
    filename="stations_new.txt",
    csv_path='../observations/stations_new.csv',
    output_directory='./raw-data',
    username=username,
    password=password,
    dataset_id=dataset_id,
)

['6200145', '6200293', '6201045', '6201050', '6201058', 'A121']
Number of stations: 6
(5, 12)
NO_TS_MO_6200145_202201.nc
NO_TS_MO_6201045_202201.nc
NO_TS_MO_6201050_202201.nc
NO_TS_MO_A121_202201.nc
NO_TS_TG_A121TG_202201.nc
Saved stations to ../observations/stations_new.csv
(52, 12)
Starting download...


INFO - 2025-09-04T09:06:19Z - Selected dataset version: "202311"
INFO - 2025-09-04T09:06:19Z - Selected dataset part: "latest"
Downloading files: 100%|██████████| 52/52 [00:37<00:00,  1.37it/s]


### Convert NetCDF (.nc) data files to CSV files

In [None]:
# Convert the files listed in output.txt into one CSV file per station
convert_nc_to_csv(
    station_file='stations_new.txt',
    filename="output.txt",
    obs_fldr='raw-data',
    csv_fldr='../observations',
)