In [10]:
# Import libraries

import boto3
import numpy as np

In [2]:
# Initialize the S3 client used by the listing cells below.
# NOTE(review): credentials/region presumably come from the default boto3 configuration — confirm.
s3_client = boto3.client('s3')

# Name of the S3 bucket that gets listed. The per-stage base directories
# are defined in the cells that use them, not here.
bucket_name = 'wecc-historical-wx'

In [11]:
# Text file that lists the network names, one per line
file_path = '../networks-input.dat'

# Keep every non-blank line, with surrounding whitespace removed
with open(file_path, 'r') as f:
    network_names = [stripped for stripped in map(str.strip, f) if stripped]

# Dictionary holding the cleaned and qaqc'ed file counts: each network
# starts with its own empty list that the counting cells append to
processed_files = {network_name: [] for network_name in network_names}

In [6]:
# Clean base dir
base_directory = '2_clean_wx/'

# Read network names from the file
file_path = '../networks-input.dat'
with open(file_path, 'r') as f:
    network_names = [line.strip() for line in f if line.strip()]

# A single list_objects_v2 call returns at most 1000 keys, so walk every
# page of the listing instead of trusting one response.
clean_paginator = s3_client.get_paginator('list_objects_v2')

# Iterate over each network and count the .nc files
for network in network_names:
    prefix = f"{base_directory}{network}/"

    nc_files = []
    for page in clean_paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is missing from a page that holds no objects
        nc_files.extend(
            obj['Key'] for obj in page.get('Contents', []) if obj['Key'].endswith('.nc')
        )

    if nc_files:
        print(f"Network: {network}, .nc files count: {len(nc_files)}")
    else:
        print(f"Network: {network}, No .nc files found.")

    # Always record a count (0 when nothing was found) so the clean count
    # stays in the first position of every network's list. This cell appends,
    # so run it once after the cell that initialises processed_files.
    processed_files[network].append(len(nc_files))

Network: ASOSAWOS, .nc files count: 455
Network: CAHYDRO, .nc files count: 106
Network: CDEC, .nc files count: 8
Network: CIMIS, .nc files count: 260
Network: CNRFC, .nc files count: 693
Network: CRN, .nc files count: 105
Network: HNXWFO, .nc files count: 28
Network: HOLFUY, .nc files count: 12
Network: HPWREN, .nc files count: 36
Network: LOXWFO, .nc files count: 55
Network: MAP, .nc files count: 138
Network: MARITIME, .nc files count: 80
Network: MTRWFO, .nc files count: 25
Network: NCAWOS, .nc files count: 38
Network: NDBC, .nc files count: 79
Network: NOS-NWLON, .nc files count: 38
Network: NOS-PORTS, .nc files count: 21
Network: OtherISD, .nc files count: 463
Network: SCAN, .nc files count: 91
Network: SGXWFO, .nc files count: 267
Network: SHASAVAL, .nc files count: 6
Network: SNOTEL, .nc files count: 866
Network: VCAPCD, .nc files count: 6


In [8]:
# QAQC base dir
base_prefix = '3_qaqc_wx/'

# Read the network names from the input file
with open("../networks-input.dat", "r") as f:
    networks = [line.strip() for line in f if line.strip()]

# One paginator serves every network; no need to rebuild it per iteration
paginator = s3_client.get_paginator("list_objects_v2")

# Count .zarr/ folders for each network
for network in networks:
    prefix = f"{base_prefix}{network}/"

    # Accumulate across ALL pages: assigning the result inside the page loop
    # would keep only the last page's folders.
    zarr_folders = set()
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/"):
        # 'CommonPrefixes' is absent when a page has no sub-folders
        for common_prefix in page.get('CommonPrefixes', []):
            folder = common_prefix['Prefix']
            if folder.endswith(".zarr/"):
                zarr_folders.add(folder)

    # Sorted, de-duplicated folder names (kept under the original name `files`)
    files = sorted(zarr_folders)
    print(f"{network}: {len(files)} .zarr folders")

    # This cell appends, so run it once, after the clean-count cell, to keep
    # the QAQC count in the second position of every network's list.
    processed_files[network].append(len(files))

ASOSAWOS: 452 .zarr folders
CAHYDRO: 106 .zarr folders
CDEC: 8 .zarr folders
CIMIS: 259 .zarr folders
CNRFC: 693 .zarr folders
CRN: 85 .zarr folders
HNXWFO: 13 .zarr folders
HOLFUY: 12 .zarr folders
HPWREN: 36 .zarr folders
LOXWFO: 54 .zarr folders
MAP: 138 .zarr folders
MARITIME: 80 .zarr folders
MTRWFO: 25 .zarr folders
NCAWOS: 31 .zarr folders
NDBC: 79 .zarr folders
NOS-NWLON: 32 .zarr folders
NOS-PORTS: 21 .zarr folders
OtherISD: 459 .zarr folders
SCAN: 86 .zarr folders
SGXWFO: 267 .zarr folders
SHASAVAL: 6 .zarr folders
SNOTEL: 860 .zarr folders
VCAPCD: 3 .zarr folders


## Result:

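For each network, display the number of `.nc` files found in the clean folder (`2_clean_wx/`) next to the number of `.zarr` folders found in the QAQC folder (`3_qaqc_wx/`), as `[clean, QAQC]`.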

In [9]:
# Each value lists the counts in the order the counting cells appended them;
# when the cells are run top to bottom that is [clean .nc files, QAQC .zarr folders].
processed_files

{'ASOSAWOS': [455, 452],
 'CAHYDRO': [106, 106],
 'CDEC': [8, 8],
 'CIMIS': [260, 259],
 'CNRFC': [693, 693],
 'CRN': [105, 85],
 'HNXWFO': [28, 13],
 'HOLFUY': [12, 12],
 'HPWREN': [36, 36],
 'LOXWFO': [55, 54],
 'MAP': [138, 138],
 'MARITIME': [80, 80],
 'MTRWFO': [25, 25],
 'NCAWOS': [38, 31],
 'NDBC': [79, 79],
 'NOS-NWLON': [38, 32],
 'NOS-PORTS': [21, 21],
 'OtherISD': [463, 459],
 'SCAN': [91, 86],
 'SGXWFO': [267, 267],
 'SHASAVAL': [6, 6],
 'SNOTEL': [866, 860],
 'VCAPCD': [6, 3]}