# Analyze pcluster QAQC run for historical-obs-platform 

This notebook analyzes the results of the [QAQC](https://github.com/Eagle-Rock-Analytics/historical-obs-platform/tree/main/test_platform/scripts/3_qaqc_data)
run for the [historical-obs-platform](https://github.com/Eagle-Rock-Analytics/historical-obs-platform).

It reads from the S3 bucket to count how many files each network has in the clean folder and how many are in the QAQC folder (i.e., how many were processed successfully).

It also reads the output files from the pcluster runs (`hist-obs_*_output.txt`) to extract the running time for each network. This part must either be run on the pcluster or after downloading the output files locally.

## Count files in the clean/qaqc folders

In [None]:
# Import libraries

import boto3
import numpy as np
import pandas as pd
import re
import glob

In [None]:
# Name of the S3 bucket that every listing call below targets
bucket_name = "wecc-historical-wx"

# Shared boto3 S3 client, reused by all the cells that list objects
s3_client = boto3.client("s3")

In [None]:
# Location of the text file that lists one network name per line
file_path = "../networks-input.dat"

# Load the network names, dropping surrounding whitespace and blank lines
with open(file_path, "r") as f:
    network_names = [name for name in (raw.strip() for raw in f) if name]

# Map each network to an (initially empty) list that the cells below
# fill with the cleaned and QAQC'ed file counts
processed_files = {network: [] for network in network_names}

### Clean folder

In [None]:
# Clean base dir
base_directory = "2_clean_wx/"

# Read network names from the file
file_path = "../networks-input.dat"
with open(file_path, "r") as f:
    network_names = [line.strip() for line in f if line.strip()]

# A single list_objects_v2 call returns at most 1000 keys, so page through
# the listing to avoid undercounting networks with many stations.
paginator = s3_client.get_paginator("list_objects_v2")

# Iterate over each network and count the .nc files
for network in network_names:
    prefix = f"{base_directory}{network}/"

    found_objects = False
    nc_files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is absent from the response when nothing matches the prefix
        contents = page.get("Contents", [])
        found_objects = found_objects or bool(contents)
        nc_files.extend(obj["Key"] for obj in contents if obj["Key"].endswith(".nc"))

    if found_objects:
        print(f"Network: {network}, .nc files count: {len(nc_files)}")
    else:
        print(f"Network: {network}, No .nc files found.")

    # Always record a count (0 when the prefix is empty) so that every
    # network's list keeps the clean count first and the QAQC count second.
    processed_files[network].append(len(nc_files))

### QAQC folder

In [None]:
# Top-level S3 prefix holding the QAQC'ed output
base_prefix = "3_qaqc_wx/"

# Load the network names (one per line, blank lines skipped)
with open("../networks-input.dat", "r") as f:
    networks = [name for name in (raw.strip() for raw in f) if name]

# One paginator can serve every network, so build it once up front
paginator = s3_client.get_paginator("list_objects_v2")

# Tally the .zarr/ "folders" directly under each network prefix
for network in networks:
    prefix = f"{base_prefix}{network}/"

    # With Delimiter="/" the immediate sub-prefixes are reported under
    # 'CommonPrefixes'; that key is missing when a network has no entries,
    # hence the empty-list default.
    zarr_folders = {
        entry["Prefix"]
        for page in paginator.paginate(
            Bucket=bucket_name, Prefix=prefix, Delimiter="/"
        )
        for entry in page.get("CommonPrefixes", [])
        if entry["Prefix"].endswith(".zarr/")
    }

    count = len(zarr_folders)
    print(f"{network}: {count} .zarr folders")
    processed_files[network].append(count)

### Display count

Display how many files were in the clean folder versus how many there are in the QAQC folder.

In [None]:
# Bare expression so the notebook renders the dict:
# network name -> list of file counts appended by the cells above
processed_files

## Analyze pcluster output files

### Read output files

In [None]:
# Collect all files matching the pattern (sorted so the resulting rows
# come out in a reproducible order, since glob order is arbitrary)
files = sorted(glob.glob("../hist-obs_*_output.txt"))

# Regular expressions
stations_re = re.compile(r"Running a sample of (\d+) files")
network_re = re.compile(r"Network:\s+([^\n\r]+)")
time_re = re.compile(r"Job completed in (\d+) seconds")


def parse_run_log(content):
    """Extract the run summary from the text of one pcluster output file.

    Parameters
    ----------
    content : str
        Full text of a ``hist-obs_*_output.txt`` file.

    Returns
    -------
    tuple or None
        ``(network_name, stations, seconds, minutes)``. ``stations`` is
        ``np.nan`` when the file did not record the number of processed
        stations. ``None`` is returned when the network name or the elapsed
        time is missing, because no meaningful row can be built without them.
    """
    network_match = network_re.search(content)
    time_match = time_re.search(content)
    if network_match is None or time_match is None:
        return None

    # Some files did not record the number of processed stations
    stations_match = stations_re.search(content)
    stations = int(stations_match.group(1)) if stations_match else np.nan

    seconds = int(time_match.group(1))
    return (network_match.group(1).strip(), stations, seconds, seconds / 60)


data = []

for filename in files:
    with open(filename, "r") as f:
        record = parse_run_log(f.read())

    if record is None:
        # Report the unusable file and skip it, rather than appending
        # values left over from the previously parsed file.
        print(filename)
        continue

    data.append(record)

### Create pandas dataframe with run results

In [None]:
# Column labels, in the same order as the fields of each tuple in `data`
run_columns = [
    "Network",
    "Number of stations",
    "Elapsed time seconds",
    "Elapsed time minutes",
]

# Build the results table: one row per parsed output file
df = pd.DataFrame(data, columns=run_columns)

print(df)

### Sort by network for analysis

In [None]:
# Display the run results ordered by network name; sort_values returns a
# new frame here, so `df` itself is left unchanged.
df.sort_values(by="Network")