In [2]:
import xarray as xr
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os

In [3]:
# Open the NetCDF profile file and print the names of every variable it holds.
ds = xr.open_dataset('7902287_prof.nc')
print("Variables in dataset:", [*ds.variables])

Variables in dataset: ['DATA_TYPE', 'FORMAT_VERSION', 'HANDBOOK_VERSION', 'REFERENCE_DATE_TIME', 'DATE_CREATION', 'DATE_UPDATE', 'PLATFORM_NUMBER', 'PROJECT_NAME', 'PI_NAME', 'STATION_PARAMETERS', 'CYCLE_NUMBER', 'DIRECTION', 'DATA_CENTRE', 'DC_REFERENCE', 'DATA_STATE_INDICATOR', 'DATA_MODE', 'PLATFORM_TYPE', 'FLOAT_SERIAL_NO', 'FIRMWARE_VERSION', 'WMO_INST_TYPE', 'JULD', 'JULD_QC', 'JULD_LOCATION', 'LATITUDE', 'LONGITUDE', 'POSITION_QC', 'POSITIONING_SYSTEM', 'PROFILE_PRES_QC', 'PROFILE_TEMP_QC', 'PROFILE_PSAL_QC', 'VERTICAL_SAMPLING_SCHEME', 'CONFIG_MISSION_NUMBER', 'PRES', 'PRES_QC', 'PRES_ADJUSTED', 'PRES_ADJUSTED_QC', 'PRES_ADJUSTED_ERROR', 'TEMP', 'TEMP_QC', 'TEMP_ADJUSTED', 'TEMP_ADJUSTED_QC', 'TEMP_ADJUSTED_ERROR', 'PSAL', 'PSAL_QC', 'PSAL_ADJUSTED', 'PSAL_ADJUSTED_QC', 'PSAL_ADJUSTED_ERROR', 'PARAMETER', 'SCIENTIFIC_CALIB_EQUATION', 'SCIENTIFIC_CALIB_COEFFICIENT', 'SCIENTIFIC_CALIB_COMMENT', 'SCIENTIFIC_CALIB_DATE', 'HISTORY_INSTITUTION', 'HISTORY_STEP', 'HISTORY_SOFTWARE', 'H

In [6]:
# Pull the array values of the position, time and measurement variables
# out of the dataset in a single unpacking step.
lat, lon, juld, pres, temp, psal = (
    ds[var_name].values
    for var_name in ("LATITUDE", "LONGITUDE", "JULD", "PRES", "TEMP", "PSAL")
)

In [8]:
# Convert JULD into timestamps.
# `time` is a pandas DatetimeIndex on both paths below, with NaT marking
# entries that could not be converted, so downstream code sees one type.
ref_date = datetime(1950, 1, 1)  # epoch for numeric JULD ("days since 1950-01-01")

# Numeric JULD values at or above this threshold are rejected as invalid.
# NOTE(review): presumably this exists to drop the file's fill value — confirm.
MAX_VALID_JULD = 100000

if np.issubdtype(juld.dtype, np.datetime64):
    # JULD is already datetime64; wrap it without further conversion.
    time = pd.to_datetime(juld)
else:
    # Otherwise treat as "days since 1950-01-01"
    juld = juld.astype(float)  # force float so NaN can mark rejected entries
    juld = np.where(np.isfinite(juld) & (juld < MAX_VALID_JULD), juld, np.nan)  # filter invalid
    # Vectorised conversion: NaN day counts become NaT, and the result has the
    # same type as the datetime64 branch instead of a list of mixed objects.
    time = pd.Timestamp(ref_date) + pd.to_timedelta(juld, unit="D")

In [10]:
# PRES is two-dimensional; unpack its shape as (profiles, levels).
n_prof, n_levels = pres.shape

# Repeat each per-profile value n_levels times so it lines up with the
# flattened 2-D measurement arrays below.
lat_expanded, lon_expanded, time_expanded = (
    np.repeat(per_profile, n_levels) for per_profile in (lat, lon, time)
)

# One row per (profile, level) sample.
df = pd.DataFrame(
    data={
        "time": time_expanded,
        "latitude": lat_expanded,
        "longitude": lon_expanded,
        "pressure": pres.flatten(),
        "temperature": temp.flatten(),
        "salinity": psal.flatten(),
    }
)

In [None]:
# Discard every row that has a missing value in any column, then write the
# remaining rows to a CSV file without the index column.
df = df.dropna(axis=0, how="any")

df.to_csv(path_or_buf="sample_cleaned.csv", index=False)

In [12]:
# Show the first five rows of the frame as plain text.
print(df.head(n=5))

                 time  latitude  longitude  pressure  temperature   salinity
0 2025-03-11 05:49:55  8.983333       68.0       0.3    29.635000  34.337002
1 2025-03-11 05:49:55  8.983333       68.0       1.1    29.479000  34.409000
2 2025-03-11 05:49:55  8.983333       68.0       1.9    29.441000  34.431000
3 2025-03-11 05:49:55  8.983333       68.0       2.9    29.433001  34.431000
4 2025-03-11 05:49:55  8.983333       68.0       3.8    29.427000  34.431999
