# Preprocess stations
In this notebook we load the NetCDF (`.nc`) files for the stations listed in `stations.txt` and convert them to CSV files.

### Load packages

In [None]:
import pandas as pd
import xarray as xr
import os

### Open and read stations in txt file

In [None]:
# Collect the station identifiers: one per line, surrounding whitespace
# removed, blank lines ignored
with open("stations.txt", "r") as f:
    stripped_lines = (raw.strip() for raw in f)
    station_files = [entry for entry in stripped_lines if entry]

# Preview the first few station identifiers
station_files[:5]

['6200144', '6200145', '6201045', '6201047', '6201050']

### Open and read file names in txt file

In [None]:
# Collect the full file names listed in output.txt: one per line,
# surrounding whitespace removed, blank lines ignored
with open("output.txt", "r") as f:
    stripped_lines = (raw.strip() for raw in f)
    file_names = [entry for entry in stripped_lines if entry]

# Preview the first few file names
file_names[:3]

['INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202201/NO_TS_MO_6200144_202201.nc',
 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202202/NO_TS_MO_6200144_202202.nc',
 'INSITU_GLO_PHYBGCWAV_DISCRETE_MYNRT_013_030/cmems_obs-ins_glo_phybgcwav_mynrt_na_irr_202311/monthly/MO/202203/NO_TS_MO_6200144_202203.nc']

### Convert nc files to csv files

In [None]:
# Define observation folder (holds the downloaded NetCDF files)
obs_fldr = 'raw-data'

# Variables copied to the CSV when a file provides them; TIME is always kept
wanted_vars = ['VHM0', 'VTZA', 'VMDR', 'VTPK']

for station in station_files:
    csv_path = f"../observations/{station}.csv"

    # Check up front so an already-converted station does not pay for
    # opening and converting every one of its NetCDF files again
    if os.path.exists(csv_path):
        print(f"Skipped saving {station}.csv (file already exists)")
        continue

    stations_data = []

    for name in file_names:
        # NOTE(review): this is a substring match on the whole path, so a
        # short station id could also match files of another station whose
        # name contains it - confirm the ids in stations.txt are unambiguous
        if station not in name:
            continue

        # Only the basename is used: files are looked up directly in obs_fldr
        file_path = os.path.join(obs_fldr, os.path.basename(name))
        if not os.path.exists(file_path):
            continue

        # The context manager closes the file once its content has been
        # copied into the dataframe, so handles do not pile up over the loop
        with xr.open_dataset(file_path) as ds:
            df = ds.to_dataframe().reset_index()

        # Keep TIME plus whichever wanted variables this file contains,
        # then drop the rows where any of the kept columns is missing
        cols = ['TIME'] + [var for var in wanted_vars if var in df.columns]
        stations_data.append(df[cols].dropna())

    # pd.concat raises ValueError on an empty list; report and move on so a
    # station without files does not abort the remaining stations
    if not stations_data:
        print(f"Skipped saving {station}.csv (no files found in {obs_fldr})")
        continue

    # Convert to csv
    full_df = pd.concat(stations_data)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    full_df.to_csv(csv_path, index=False)
    print(f"Saved {station}.csv")

Skipped saving 6200144.csv (file already exists)
Skipped saving 6200145.csv (file already exists)
Skipped saving 6201045.csv (file already exists)
Skipped saving 6201047.csv (file already exists)
Skipped saving 6201050.csv (file already exists)
Skipped saving 6201059.csv (file already exists)
Skipped saving A121_.csv (file already exists)
Skipped saving AkkaertSouthwestBuoy.csv (file already exists)
Skipped saving Europlatform2.csv (file already exists)
Skipped saving Europlatform3.csv (file already exists)
Skipped saving F3platform.csv (file already exists)
Skipped saving IJmuidenMunitiestort_.csv (file already exists)
Skipped saving J61.csv (file already exists)
Skipped saving K13a_.csv (file already exists)
Skipped saving K141_.csv (file already exists)
Skipped saving KeetenBoei.csv (file already exists)
Skipped saving KwintebankBuoy.csv (file already exists)
Skipped saving L91_.csv (file already exists)
Skipped saving LichteilandGoeree1_.csv (file already exists)
Skipped saving Mae