In [10]:
from pathlib import Path
import xarray as xr
import numpy as np

# in and out file paths 
original_data_dir = Path('data/original_data')

in_file = original_data_dir / 'CTD_all_1876-2019.nc'

transformed_data_dir = Path('data/transformed_data')

out_file = transformed_data_dir / 'CTD_all_1876-2019.nc'


#load in dataset
xrds = xr.open_dataset(in_file)

print(xrds)

<xarray.Dataset> Size: 13GB
Dimensions:      (PRES: 7225, TIME: 76708, CALIBRATION: 2)
Coordinates:
  * PRES         (PRES) float64 58kB 1.0 2.0 3.0 ... 7.224e+03 7.225e+03
  * TIME         (TIME) datetime64[ns] 614kB 1876-07-10T00:00:01.000004608 .....
    CALIBRATION  (CALIBRATION, TIME) float64 1MB ...
Data variables:
    STATION      (TIME) |S25 2MB ...
    LATITUDE     (TIME) float64 614kB ...
    LONGITUDE    (TIME) float64 614kB ...
    TEMP         (TIME, PRES) float64 4GB ...
    PSAL         (TIME, PRES) float64 4GB ...
    CNDC         (TIME, PRES) float64 4GB ...
    FDEP         (TIME) float64 614kB ...
    SHIP         (TIME) |S52 4MB ...
    SHIPID       (TIME) float32 307kB ...
    OWNER        (TIME) |S31 2MB ...
    CRUISE       (TIME) |S47 4MB ...
    TYPE         (TIME) |S47 4MB ...
Attributes: (12/38)
    cruise_ID:                     
    instrument:                    Shipborne observation
    institution:                   The University Centre in Svalbard, Nor

In [11]:


# Generate a unique integer CAST value for each time point
cast_values = np.arange(1, len(xrds['TIME']) + 1)

# Create a new CAST dimension with integer values
xrds = xrds.rename({'TIME': 'CAST'})

# Move the original TIME to a data variable that is CAST dependent
xrds['TIME'] = (('CAST',), xrds['CAST'].values)

# Assign integer CAST values
xrds['CAST'] = (('CAST',), cast_values)

# Modify the dimensions of data variables to be dependent on 'CAST' instead of 'TIME'
data_vars_to_reshape = [var for var in xrds.data_vars if 'CAST' in xrds[var].dims]

for var in data_vars_to_reshape:
    xrds[var] = xrds[var].rename({'CAST': 'CAST'})

# Save the modified dataset to a new NetCDF file
# xrds.to_netcdf('output.nc')
print(xrds['CAST'])
print(cast_values[:10])
print("The NetCDF file has been successfully modified and saved as 'output.nc'.")

<xarray.DataArray 'CAST' (CAST: 76708)> Size: 614kB
array([    1,     2,     3, ..., 76706, 76707, 76708])
Coordinates:
  * CAST     (CAST) int64 614kB 1 2 3 4 5 6 ... 76704 76705 76706 76707 76708
[ 1  2  3  4  5  6  7  8  9 10]
The NetCDF file has been successfully modified and saved as 'output.nc'.


In [12]:
for cast in range(1,11):
    time_value = xrds['TIME'].sel(CAST=cast).values
    print(time_value)

1876-07-10T00:00:01.000004608
1880-07-08T00:00:01.000004608
1890-07-02T00:00:01.000004608
1890-07-05T00:00:01.000004608
1890-07-08T00:00:01.000004608
1890-07-08T00:00:01.000004608
1890-07-08T00:00:01.000004608
1890-07-10T00:00:01.000004608
1890-07-10T00:00:01.000004608
1890-07-21T00:00:01.000004608


In [13]:
from check_for_met_ACDD import missing_MET_ACDD_attributes

transformed_data_dir = Path('data/transformed_data')

out_file = transformed_data_dir / 'CTD_all_1876-2019.nc'

missing_MET_ACDD_attributes(out_file)

data/transformed_data/CTD_all_1876-2019.nc is missing or has empty the following required attributes:
  - publisher_name: Required if not hosted by MET (MISSING)
  - publisher_email: Required if not hosted by MET (MISSING)
  - publisher_url: Required if not hosted by MET (MISSING)


In [14]:


missing_MET_ACDD_attributes('/home/alessioc/UNIS_hydrographic_data_to_cfnetcdf/data/transformed_data/CTD_1980-1989/UNIS_HD_19800106T034000_79_2.nc')

/home/alessioc/UNIS_hydrographic_data_to_cfnetcdf/data/transformed_data/CTD_1980-1989/UNIS_HD_19800106T034000_79_2.nc is missing or has empty the following required attributes:
  - publisher_name: Required if not hosted by MET (MISSING)
  - publisher_email: Required if not hosted by MET (MISSING)
  - publisher_url: Required if not hosted by MET (MISSING)


## Checking transformed_data

In [22]:
import os
import subprocess

subdirs = [d for d in transformed_data_dir.iterdir() if d.is_dir()]


unexpected_error_files = []

# Loop through each subdirectory
for subdir in subdirs:
    print(f"Subdirectory: {subdir.name}")
    
    # List .nc files in the current subdirectory
    nc_files = list(subdir.glob('*.nc'))
    
    tests_passed = 'All tests passed!'


    
    # Loop through each .nc file
    for nc_file in nc_files:
        # print(f"  .nc file: {nc_file.name}")
        # print(str(nc_file.name).split('_'))
        command = f"compliance-checker --test=cf:1.7 {nc_file}"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        # print(f'Files with unexpected errors: {len(unexpected_error_files)}', end='\r')

        # Check if the output matches the expected output
        if tests_passed != result.stdout.strip().split('\n')[-1]:
            print(f'ERRORS FOUND in file {nc_file}!')
            unexpected_error_files.append(nc_file.name)
        

Subdirectory: CTD_2000-2009
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!
All tests passed!


KeyboardInterrupt: 