In [1]:
import time
import pandas as pd
from pathlib import Path
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
DATA_DIR = '../../../data/Streamflow/fpe_stations'

In [3]:
# List fpe sites
sites = sorted([
    f.stem
    for f in Path(DATA_DIR).iterdir()
    if f.is_dir() and not f.stem.startswith('.') and not f.stem in ['FOXRC', 'GREENR', 'LANESV', 'WESTKILL', 'VLYB', 'AVERYBB']
])

In [4]:
# Check if all images were downloaded for each site
for site in sites:
    # Define the path to the imagesets folder
    imagesets_path = Path(f'{DATA_DIR}/{site}/FLOW_CFS/imagesets')

    # If the imagesets folder doesn't exist, print a message and return
    if not imagesets_path.exists():
        print(f'{site}: no imagesets folder found')
        continue

    # Count the number of .JPG files in the imagesets folder
    num_images_downloaded1 = len(list(imagesets_path.rglob('*.JPG')))

    # Wait for 5 seconds
    time.sleep(5)

    # Count the number of .JPG files in the imagesets folder again
    num_images_downloaded2 = len(list(imagesets_path.rglob('*.JPG')))

    # If the number of .JPG files has changed, print a message and return
    if num_images_downloaded1 != num_images_downloaded2:
        print(f'{site}: images are still downloading...')
        continue

    # Read the images.csv file
    images = pd.read_csv(imagesets_path.parent / 'images.csv')

    # Compare the number of .JPG files with the number of rows in the images.csv file
    num_rows = len(images)
    print(f'{site}: [# images downloaded/ # rows in images.csv]: [{num_images_downloaded1}/{num_rows}]')

    # Check if all images were downloaded
    pbar = tqdm(images['filename'].values, desc='Images', position=1, bar_format='\t{l_bar}{bar}{r_bar}')
    for filename in pbar:
        if not (imagesets_path.parent / filename).exists():
            pbar.write(f'\t{filename} not downloaded')
    pbar.close()



Avery Brook_Bridge_01171000: no imagesets folder found
Avery Brook_River Left_01171000: images are still downloading...
Avery Brook_River Right_01171000: images are still downloading...
Avery Brook_Side_01171000: [# images downloaded/ # rows in images.csv]: [70591/70592]


	Images: 100%|██████████| 70592/70592 [00:00<00:00, 97844.37it/s]


Sanderson Brook_01171010: [# images downloaded/ # rows in images.csv]: [85194/85195]


	Images: 100%|██████████| 85195/85195 [00:00<00:00, 96988.36it/s]


West Branch Swift River_01174565: [# images downloaded/ # rows in images.csv]: [43440/43440]


	Images: 100%|██████████| 43440/43440 [00:00<00:00, 95508.03it/s]


West Brook 0_01171100: no imagesets folder found
West Brook Lower_01171090: images are still downloading...
West Brook Reservoir_01171020: [# images downloaded/ # rows in images.csv]: [68685/68686]


	Images: 100%|██████████| 68686/68686 [00:00<00:00, 96134.61it/s]


West Brook Upper_01171030: [# images downloaded/ # rows in images.csv]: [62078/62079]


	Images: 100%|██████████| 62079/62079 [00:00<00:00, 95935.66it/s]


West Whately_01171005: [# images downloaded/ # rows in images.csv]: [87016/87016]


	Images: 100%|██████████| 87016/87016 [00:00<00:00, 96792.59it/s]


In [5]:
# Check that images all have flow values
for site in sites:
    # Read the images.csv file
    images = pd.read_csv(f'{DATA_DIR}/{site}/FLOW_CFS/images.csv')
    # Count rows in images.csv
    num_rows = len(images)
    # Count NaNs in value column
    num_nans_value = images['value'].isna().sum()
    # Count NaNs in interp_value column
    num_nans_interp_value = images['interp_value'].isna().sum()
    print(f'{site} [{num_rows} rows]: {num_nans_value} NaNs in "value" column, {num_nans_interp_value} NaNs in "interp_value" column')

Avery Brook_Bridge_01171000 [73876 rows]: 1369 NaNs in "value" column, 0 NaNs in "interp_value" column
Avery Brook_River Left_01171000 [84661 rows]: 3761 NaNs in "value" column, 0 NaNs in "interp_value" column
Avery Brook_River Right_01171000 [85474 rows]: 3373 NaNs in "value" column, 0 NaNs in "interp_value" column
Avery Brook_Side_01171000 [70592 rows]: 597 NaNs in "value" column, 0 NaNs in "interp_value" column
Sanderson Brook_01171010 [85195 rows]: 22676 NaNs in "value" column, 19851 NaNs in "interp_value" column
West Branch Swift River_01174565 [43440 rows]: 370 NaNs in "value" column, 0 NaNs in "interp_value" column
West Brook 0_01171100 [54480 rows]: 2990 NaNs in "value" column, 0 NaNs in "interp_value" column
West Brook Lower_01171090 [102150 rows]: 30265 NaNs in "value" column, 27476 NaNs in "interp_value" column
West Brook Reservoir_01171020 [68686 rows]: 21647 NaNs in "value" column, 18830 NaNs in "interp_value" column
West Brook Upper_01171030 [62079 rows]: 62079 NaNs in "v

In [6]:
# Check that annotations true_rank and rank columns do not have NaNs
for site in sites:
    # Read the annotations.csv file
    annots = pd.read_csv(f'{DATA_DIR}/{site}/FLOW_CFS/annotations.csv')
    # Count NaNs in true_rank column
    num_nans_true_rank = annots['true_rank'].isna().sum()
    pct_nans_true_rank = num_nans_true_rank / len(annots) * 100
    # Count NaNs in rank column
    num_nans_rank = annots['rank'].isna().sum()
    pct_nans_rank = num_nans_rank / len(annots) * 100
    print(f'{site} [{len(annots)} rows]: {num_nans_true_rank} NaNs in "true_rank" column ({pct_nans_true_rank:.2f}%), {num_nans_rank} NaNs in "rank" column ({pct_nans_rank:.2f}%)')

Avery Brook_Bridge_01171000 [5148 rows]: 235 NaNs in "true_rank" column (4.56%), 0 NaNs in "rank" column (0.00%)
Avery Brook_River Left_01171000 [2516 rows]: 245 NaNs in "true_rank" column (9.74%), 0 NaNs in "rank" column (0.00%)
Avery Brook_River Right_01171000 [2500 rows]: 207 NaNs in "true_rank" column (8.28%), 0 NaNs in "rank" column (0.00%)
Avery Brook_Side_01171000 [2500 rows]: 52 NaNs in "true_rank" column (2.08%), 0 NaNs in "rank" column (0.00%)
Sanderson Brook_01171010 [5000 rows]: 1358 NaNs in "true_rank" column (27.16%), 0 NaNs in "rank" column (0.00%)
West Branch Swift River_01174565 [5084 rows]: 66 NaNs in "true_rank" column (1.30%), 0 NaNs in "rank" column (0.00%)
West Brook 0_01171100 [10495 rows]: 1443 NaNs in "true_rank" column (13.75%), 0 NaNs in "rank" column (0.00%)
West Brook Lower_01171090 [2620 rows]: 1077 NaNs in "true_rank" column (41.11%), 0 NaNs in "rank" column (0.00%)
West Brook Reservoir_01171020 [2500 rows]: 1034 NaNs in "true_rank" column (41.36%), 0 NaN

In [7]:
data = pd.read_csv(f'{DATA_DIR}/West Whately_01171005/FLOW_CFS/images.csv')
annots = pd.read_csv(f'{DATA_DIR}/West Whately_01171005/FLOW_CFS/annotations.csv')
rows_with_nan_true_rank = annots[annots['true_rank'].isna()]
for idx, row in rows_with_nan_true_rank.iterrows():
    print(idx)
    # check which values are NaN
    if pd.isna(row['left.value']):
        # left.value is NaN
        left_val = data[data['filename'] == row['left.filename']]['value'].values[0]
        left_interp_val = data[data['filename'] == row['left.filename']]['interp_value'].values[0]
        print(f'{left_val} {left_interp_val}')
    elif pd.isna(row['right.value']):
        # right.value is NaN
        right_val = data[data['filename'] == row['right.filename']]['value'].values[0]
        right_interp_val = data[data['filename'] == row['right.filename']]['interp_value'].values[0]
        print(f'{right_val} {right_interp_val}')

    

3
nan nan
9
nan 0.7767759336542568
22
nan nan
30
nan 0.9971326611169522
100
nan 0.925921415466428
110
nan 0.6938621861058486
121
nan 0.4406385809133543
134
nan 0.4152416619009256
136
nan 0.869151278481805
149
nan 0.5076170067284739
151
nan nan
152
nan 0.491930674397268
153
nan nan
155
nan nan
164
nan nan
168
nan 0.821594302684022
174
nan 0.7844946051188184
176
nan nan
177
nan 0.9958877141065392
179
nan nan
184
nan nan
187
nan nan
194
nan nan
196
nan nan
198
nan nan
200
nan 1.0227788461863547
222
nan nan
224
nan nan
227
nan nan
228
nan nan
229
nan nan
234
nan nan
237
nan 0.4194744817363304
240
nan nan
241
nan nan
250
nan nan
255
nan nan
256
nan nan
262
nan nan
263
nan nan
265
nan 0.7536199192605718
282
nan 1.0954804317357894
284
nan nan
288
nan 0.968747869279532
290
nan nan
293
nan 1.06286641657655
302
nan nan
303
nan nan
307
nan nan
310
nan nan
315
nan nan
324
nan nan
328
nan nan
329
nan nan
334
nan 0.9675029222691188
336
nan 0.8064059491569814
338
nan 0.879359843967193
348
nan nan
354