# Notebook to verify UCLA SWE data

In [1]:
import pandas as pd
import xarray as xr 
from snowML.datapipe.utils import data_utils as du 
from snowML.Scripts.load_hucs import load_huc_splits as lhs

# Load an example file 

In [2]:
# Fetch one example file through du.s3_to_df, then show its row count
# and its first rows.
example_bucket = "snowml-gold"
example_file = "mean_swe_ucla_2_in_171100050805.csv"
df = du.s3_to_df(example_file, example_bucket)
print(len(df))
df.head()

13514


Unnamed: 0,day,SWE_Post
0,1984-10-01,0.0
1,1984-10-02,0.0
2,1984-10-03,0.0
3,1984-10-04,0.0
4,1984-10-05,2.5e-05


In [3]:
# Show the last five rows of the example frame.
df.tail(n=5)

Unnamed: 0,day,SWE_Post
13509,2021-09-26,0.012049
13510,2021-09-27,0.011895
13511,2021-09-28,0.012089
13512,2021-09-29,0.012306
13513,2021-09-30,0.012108


In [4]:
def check_day_issues(huc_list, bucket="snowml-gold", verbose=False):
    """Check each HUC's SWE file for missing or duplicated days.

    For every HUC, the file ``mean_swe_ucla_2_in_{huc}.csv`` is requested
    from ``bucket`` through ``du.s3_to_df``. Its ``day`` column is parsed as
    datetimes and compared with the daily calendar running from the earliest
    to the latest day found in the file.

    Parameters
    ----------
    huc_list : iterable
        HUC identifiers (str or int); each one is interpolated into the
        file name.
    bucket : str, optional
        Bucket name handed to ``du.s3_to_df``. Defaults to "snowml-gold".
    verbose : bool, optional
        When True, print the exception behind every HUC that ends up in
        ``hucs_w_no_df_found`` or ``hucs_w_unspecified_error``.

    Returns
    -------
    dict
        Four lists of HUC identifiers, keyed by
        ``hucs_with_missing_days``, ``hucs_with_duplicate_days``,
        ``hucs_w_no_df_found`` (the load call raised) and
        ``hucs_w_unspecified_error`` (the day checks raised).
        A HUC can appear under both the missing and the duplicate key.

    Notes
    -----
    Only ``Exception`` subclasses are recorded; ``KeyboardInterrupt`` and
    other ``BaseException`` signals propagate so a long scan can be stopped.
    """
    hucs_with_missing_days = []
    hucs_with_duplicate_days = []
    hucs_w_no_df_found = []
    hucs_w_unspecified_error = []

    for huc in huc_list:
        file_name = f"mean_swe_ucla_2_in_{huc}.csv"
        try:
            df = du.s3_to_df(file_name, bucket)
        except Exception as exc:
            hucs_w_no_df_found.append(huc)
            if verbose:
                print(f"{huc}: could not load {file_name}: {exc!r}")
            continue

        try:
            # pd.to_datetime returns a new Series, so df itself is untouched.
            days = pd.to_datetime(df["day"])

            has_duplicates = bool(days.duplicated().any())

            # Compare by membership rather than by length: a length check
            # lets an undated (NaT) row cancel out a genuine calendar gap.
            full_range = pd.date_range(start=days.min(), end=days.max())
            has_undated_rows = bool(days.isna().any())
            has_gap = not bool(full_range.isin(days).all())
            has_missing = has_undated_rows or has_gap
        except Exception as exc:
            hucs_w_unspecified_error.append(huc)
            if verbose:
                print(f"{huc}: day check failed: {exc!r}")
        else:
            if has_missing:
                hucs_with_missing_days.append(huc)
            if has_duplicates:
                hucs_with_duplicate_days.append(huc)

    return {
        "hucs_with_missing_days": hucs_with_missing_days,
        "hucs_with_duplicate_days": hucs_with_duplicate_days,
        "hucs_w_no_df_found": hucs_w_no_df_found,
        "hucs_w_unspecified_error": hucs_w_unspecified_error,
    }



In [5]:
# Spot check on two ids before scanning the full list.
spot_check_hucs = [171100050805, 54]
check_day_issues(spot_check_hucs)

{'hucs_with_missing_days': [],
 'hucs_with_duplicate_days': [],
 'hucs_w_no_df_found': [54],
 'hucs_w_unspecified_error': []}

# Run on all HUCs

In [6]:
f = "../../src/snowML/datapipe/huc_lists/MarMultiSplits.json"

In [7]:
m1, m2, m3  = lhs.huc_split(f)

In [8]:
all = m1+m2+m3

In [9]:
issues_dict = check_day_issues(all)
issues_dict

{'hucs_with_missing_days': [],
 'hucs_with_duplicate_days': [],
 'hucs_w_no_df_found': [],
 'hucs_w_unspecified_error': ['171100080201', '171100050602']}

In [10]:
# Number of HUCs recorded under 'hucs_w_no_df_found'.
no_df_hucs = issues_dict["hucs_w_no_df_found"]
len(no_df_hucs)

0

In [11]:
# Error list from gold update:
# ['171100080201', '171100050602']
#  '171100050602' - continued errors 

