In [16]:
import os
import pathlib
import pandas as pd
import requests
from tqdm import tqdm
from multiprocessing.pool import ThreadPool

In [17]:
# Root folder that holds all Streamflow data, relative to this notebook.
dataset_root = '../../../data/Streamflow/'

# Site name -> dataset export folder (relative to dataset_root).
# Every site uses the same 'fpe_stations/<SITE>/<SITE>-20230829' layout.
dataset_dirs = {
    site: f'fpe_stations/{site}/{site}-20230829'
    for site in ('FOXRC', 'GREENR', 'LANESV', 'VLYB', 'WESTKILL')
}

In [18]:
# Load the four CSV tables of every site into site_data[site_name][table_name].
site_data = {}
for site_name, dataset_dir in tqdm(dataset_dirs.items()):
    data_dir = os.path.join(dataset_root, dataset_dir, 'data')
    tables = {
        table: pd.read_csv(os.path.join(data_dir, f'{table}.csv'))
        for table in ('flow-images', 'flow-images-train', 'annotations', 'annotations-train')
    }
    # Only the two image tables get their 'timestamp' column parsed to datetimes.
    for table in ('flow-images', 'flow-images-train'):
        tables[table]['timestamp'] = pd.to_datetime(tables[table]['timestamp'])
    site_data[site_name] = tables


100%|██████████| 5/5 [00:01<00:00,  3.74it/s]


# Check that all images are downloaded

In [19]:
def download_image_from_url(entry, timeout=30):
    """Download image from url and save to path.

    Args:
        entry (tuple): (url, path); path may be a str or a path-like object.
        timeout (float): seconds to wait on the server before giving up.

    Returns:
        path: path to saved image (the same object that was passed in).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Nothing is written to disk in that case.
    """
    url, path = entry
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors. Otherwise the error body would be written to
    # disk as the "image", and a later existence check would treat the file as
    # successfully downloaded.
    response.raise_for_status()
    parent = os.path.dirname(path)
    if parent:  # os.makedirs('') raises, so skip it for bare filenames
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(response.content)
    return path


In [20]:
for site_name in site_data:
    flow_images = site_data[site_name]['flow-images']

    # find the expected number of images
    n_images = len(flow_images)
    print(f'{site_name}: {n_images} images')

    # find the missing images; filenames are resolved against the parent
    # folder of the site's dataset directory
    image_root = pathlib.Path(
        os.path.join(dataset_root, dataset_dirs[site_name])
    ).parent
    missing = []
    # iterate the two needed columns directly instead of building a Series per row
    for file, url in tqdm(zip(flow_images['filename'], flow_images['url']), total=n_images):
        image_path = image_root / file
        if not image_path.exists():
            missing.append((url, image_path))
    print(f'{len(missing)} images missing for {site_name}')

    # download any missing images
    if len(missing) == 0:
        continue
    ## https://www.markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/
    print('Downloading missing images...')
    # The context manager tears the pool down once all results are consumed,
    # so worker threads do not accumulate from one site to the next.
    with ThreadPool(8) as pool:
        results = pool.imap_unordered(download_image_from_url, missing)
        downloaded_results = list(tqdm(results, total=len(missing)))
    print('Done.\n'+('-'*40))




FOXRC: 3513 images


100%|██████████| 3513/3513 [00:00<00:00, 15794.22it/s]


0 images missing for FOXRC
GREENR: 29333 images


100%|██████████| 29333/29333 [00:01<00:00, 16476.35it/s]


0 images missing for GREENR
LANESV: 5536 images


100%|██████████| 5536/5536 [00:00<00:00, 16462.28it/s]


0 images missing for LANESV
VLYB: 27758 images


100%|██████████| 27758/27758 [00:01<00:00, 16726.44it/s]


0 images missing for VLYB
WESTKILL: 20837 images


100%|██████████| 20837/20837 [00:01<00:00, 16764.00it/s]

0 images missing for WESTKILL





# Check for duplicate entries

In [40]:
# Every flow-images row must carry its own url, filename and image_id.
for site_name in site_data:
    flow_images = site_data[site_name]['flow-images']
    n_rows = len(flow_images)
    n_unique_urls, n_unique_filenames, n_unique_ids = (
        flow_images[column].nunique() for column in ('url', 'filename', 'image_id')
    )
    assert n_rows == n_unique_urls == n_unique_filenames == n_unique_ids, (
        f'{site_name}: {n_rows} rows, {n_unique_urls} unique urls, {n_unique_filenames} unique filenames, {n_unique_ids} unique ids'
    )

# Timespan and # observations for each site

In [21]:
# One summary record per site: covered date range, image count, annotation count.
data = []
for site_name in site_data:
    flow_images = site_data[site_name]['flow-images']
    timestamps = flow_images['timestamp']
    data.append({
        'site': site_name,
        'start_date': timestamps.min().date(),
        'end_date': timestamps.max().date(),
        'n_obs': len(flow_images),
        'n_ann': len(site_data[site_name]['annotations']),
    })
pd.DataFrame(data)


Unnamed: 0,site,start_date,end_date,n_obs,n_ann
0,FOXRC,2023-06-14,2023-07-20,3513,3022
1,GREENR,2022-09-29,2023-08-10,29333,13426
2,LANESV,2022-10-05,2023-05-24,5536,9079
3,VLYB,2022-10-12,2023-07-28,27758,8776
4,WESTKILL,2022-10-13,2023-06-28,20837,9184


# Check for NaN values

In [26]:
# Report, per site and table, the columns that contain at least one NaN.
for site_name, tables in site_data.items():
    for table in ('flow-images', 'flow-images-train', 'annotations', 'annotations-train'):
        has_nan = tables[table].isna().any()
        site_table_nan_cols = has_nan.index[has_nan].tolist()
        if site_table_nan_cols:
            print(f'{site_name}, {table}: {site_table_nan_cols}')
    
    

FOXRC, flow-images: ['flow_cfs']
FOXRC, flow-images-train: ['flow_cfs']
FOXRC, annotations: ['comment', 'left.attributes', 'right.attributes', 'left.flow_cfs', 'right.flow_cfs', 'delta_flow_cfs', 'avg_flow_cfs', 'rel_delta_flow_cfs', 'true_rank']
FOXRC, annotations-train: ['comment', 'left.attributes', 'right.attributes', 'left.flow_cfs', 'right.flow_cfs', 'delta_flow_cfs', 'avg_flow_cfs', 'rel_delta_flow_cfs', 'true_rank']
GREENR, annotations: ['comment', 'left.attributes', 'right.attributes']
GREENR, annotations-train: ['comment', 'left.attributes', 'right.attributes']
LANESV, annotations: ['comment', 'left.attributes', 'right.attributes']
LANESV, annotations-train: ['comment', 'left.attributes', 'right.attributes']
VLYB, flow-images: ['flow_cfs']
VLYB, flow-images-train: ['flow_cfs']
VLYB, annotations: ['comment', 'left.attributes', 'right.attributes', 'left.flow_cfs', 'left.url', 'left.filename', 'right.flow_cfs', 'right.url', 'right.filename', 'delta_flow_cfs', 'avg_flow_cfs', 're

# Check that annotations can be matched back to images

In [67]:
# For each site, count the distinct image references in the annotations
# (by id, url and filename) that cannot be found in the flow-images table.
for site_name in site_data:
    annotations = site_data[site_name]['annotations']
    images = site_data[site_name]['flow-images']
    print(f'{site_name}, annotations:')
    for ann_field, image_col, label in [
        ('imageId', 'image_id', 'image ids'),
        ('url', 'url', 'urls'),
        ('filename', 'filename', 'filenames'),
    ]:
        # left and right references are checked together, as one pool of values
        referenced = pd.concat(
            [annotations[f'left.{ann_field}'], annotations[f'right.{ann_field}']]
        )
        # Vectorised membership test instead of scanning the whole image column
        # once per annotation row. dropna() keeps a missing reference
        # "unmatched" even if the image column itself contained NaN.
        unmatched = referenced[~referenced.isin(images[image_col].dropna())]
        # dropna=False: all missing references together count as one distinct value
        print(f'\t{unmatched.nunique(dropna=False)} {label} not in flow-images')


FOXRC, annotations:
	0 image ids not in flow-images
	0 urls not in flow-images
	0 filenames not in flow-images
GREENR, annotations:
	0 image ids not in flow-images
	0 urls not in flow-images
	0 filenames not in flow-images
LANESV, annotations:
	0 image ids not in flow-images
	0 urls not in flow-images
	0 filenames not in flow-images
VLYB, annotations:
	68 image ids not in flow-images
	1 urls not in flow-images
	1 filenames not in flow-images
WESTKILL, annotations:
	0 image ids not in flow-images
	0 urls not in flow-images
	0 filenames not in flow-images


In [83]:
# Cross-check for VLYB: count the distinct annotation image ids (left or right)
# that do not occur in the site's flow-images table.
vlyb = pd.read_csv('../../../data/Streamflow/fpe_stations/VLYB/VLYB-20230829/data/flow-images.csv')
vlyb_ann = pd.read_csv('../../../data/Streamflow/fpe_stations/VLYB/VLYB-20230829/data/annotations.csv')
known_ids = vlyb['image_id'].values
rid = vlyb_ann.loc[~vlyb_ann['right.imageId'].isin(known_ids), 'right.imageId'].values
lid = vlyb_ann.loc[~vlyb_ann['left.imageId'].isin(known_ids), 'left.imageId'].values
print(len(set(rid) | set(lid)))

68


# Check that the "train" files are subsets of the non-"train" files

In [25]:
# image_ids must be unique in both tables, and every train id must also be
# present in the full table.
for site in site_data:
    all_images = site_data[site]['flow-images']['image_id']
    all_ids = set(all_images)
    assert len(all_ids) == len(all_images), f'{site} has duplicate image_ids in flow-images'
    train_images = site_data[site]['flow-images-train']['image_id']
    train_ids = set(train_images)
    assert len(train_ids) == len(train_images), f'{site} has duplicate train image_ids in flow-images-train'
    assert train_ids <= all_ids, f'{site} has images in flow-images-train that are absent from flow-images'

In [2]:
# Dataset folder used by the single-site cells below; switch sites by
# (un)commenting the desired line.
# dataset_dir = '../../../data/Streamflow/WESTKILL-20230804/WESTKILL'
dataset_dir = '../../../data/Streamflow/FOXRC-20230803/FOXRC'


In [3]:
# Read the full and the train image tables of the selected site and parse
# their 'timestamp' columns to datetimes.
data_dir = os.path.join(dataset_dir, 'data')
flow_img = pd.read_csv(os.path.join(data_dir, 'flow-images.csv')).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
flow_img_train = pd.read_csv(os.path.join(data_dir, 'flow-images-train.csv')).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

print(len(flow_img), len(flow_img_train))

3058 1554


In [6]:
# Preview the first three rows, then return the (earliest, latest) timestamp
# pair as the cell result.
display(flow_img.head(3))
flow_img['timestamp'].min(), flow_img['timestamp'].max()

Unnamed: 0,station_name,station_id,imageset_id,image_id,timestamp,filename,url,flow_cfs
0,Fox River above Cedars Dam,182,1932,2673121,2023-06-14 15:30:00+00:00,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,
1,Fox River above Cedars Dam,182,1932,2673122,2023-06-14 15:45:00+00:00,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,
2,Fox River above Cedars Dam,182,1932,2673123,2023-06-14 16:00:00+00:00,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,


(Timestamp('2023-06-14 15:30:00+0000', tz='UTC'),
 Timestamp('2023-07-16 03:45:00+0000', tz='UTC'))

In [24]:
# Look up one specific image_id in the image table.
flow_img[flow_img['image_id']==2677862]

Unnamed: 0,station_name,station_id,imageset_id,image_id,timestamp,filename,url,flow_cfs


In [8]:
# Read the full and the train annotation tables of the selected site.
annotations, annotations_train = (
    pd.read_csv(os.path.join(dataset_dir, 'data', f'{stem}.csv'))
    for stem in ('annotations', 'annotations-train')
)

print(len(annotations), len(annotations_train))

1300 999


In [22]:
# Preview the annotations table.
display(annotations.head(3))
# Number of annotations whose rank is one of SAME / LEFT / RIGHT.
print(len(annotations[annotations["rank"].isin(["SAME", "LEFT", "RIGHT"])]))
# annotations['left.url'].isna().sum(), annotations['right.url'].isna().sum()
# Annotations with no left.url, ordered by the left image id.
annotations[annotations['left.url'].isna()].sort_values(by='left.imageId')


Unnamed: 0,annotation_id,user_id,station_id,station_name,duration_sec,n,url,rank,comment,left.imageId,...,left.flow_cfs,left.url,left.filename,right.flow_cfs,right.url,right.filename,delta_flow_cfs,avg_flow_cfs,rel_delta_flow_cfs,true_rank
0,641,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,334.004,20,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,RIGHT,,2673134,...,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
1,641,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,334.004,20,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,SAME,,2677403,...,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,,,,
2,641,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,334.004,20,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,RIGHT,,2678094,...,,,,,,,,,,


1076


Unnamed: 0,annotation_id,user_id,station_id,station_name,duration_sec,n,url,rank,comment,left.imageId,...,left.flow_cfs,left.url,left.filename,right.flow_cfs,right.url,right.filename,delta_flow_cfs,avg_flow_cfs,rel_delta_flow_cfs,true_rank
1016,628,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,1058.536,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,LEFT,,2677862,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
466,650,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,379.934,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,SAME,,2677865,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
341,649,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,923.837,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,LEFT,,2677865,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
614,651,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,391.035,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,LEFT,,2677867,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
647,651,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,391.035,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,LEFT,,2677875,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,650,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,379.934,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,SAME,,2678531,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
22,642,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,227.132,20,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,SAME,,2678536,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
1261,638,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,181.651,50,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,UNKNOWN,,2678539,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,
870,653,532fe3b5-dfd7-4235-b2c8-1739e6221cd1,182,Fox River above Cedars Dam,386.635,100,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,RIGHT,,2678539,...,,,,,https://usgs-chs-conte-prod-fpe-storage.s3.ama...,imagesets/5d05c303-6d30-4966-a25d-48a95e1ae447...,,,,


In [85]:
# Count the annotations whose left AND right filenames both appear in the
# train image table.
(
    annotations['left.filename'].isin(flow_img_train['filename']) &
    annotations['right.filename'].isin(flow_img_train['filename'])
).sum()

258

In [45]:
# Keep only the annotations where neither the left nor the right image has
# any attributes recorded.
unflagged = annotations[['left.attributes', 'right.attributes']].isna().all(axis=1)
no_flags = annotations.loc[unflagged]

In [68]:
left_right = no_flags[['left.imageId','right.imageId']]
# Collect every (left, right) pair once so a reversed pair is found with a
# set lookup instead of re-filtering the whole frame for each row.
pairs = list(zip(left_right['left.imageId'], left_right['right.imageId']))
seen_pairs = set(pairs)
for left, right in pairs:
    if left == right:
        print('Left and right image have the same id')
    elif (right, left) in seen_pairs:
        # the same two images were also annotated with the sides swapped
        print('Left and right image are reversed in another pair')

In [70]:
# Compare the number of unflagged annotations with the number of distinct
# (left.imageId, right.imageId) pairs; equal counts mean no pair is repeated.
print(len(no_flags))
print(len(left_right.drop_duplicates()))

7842
7842


In [79]:
# Show the right image url of the first unflagged annotation ranked 'UNKNOWN'.
no_flags[no_flags['rank']=='UNKNOWN'].iloc[0]['right.url']#.value_counts(dropna=False)

'https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/imagesets/fad43511-aa8f-4569-a513-7cbc36bfdba8/images/RCNX0714.JPG'

In [110]:
# Collect the local path of every image listed in flow_img that is not on disk.
missing = []
for filename in flow_img['filename']:
    path = os.path.join(dataset_dir, filename)
    if os.path.exists(path):
        continue
    missing.append(path)


In [105]:
def download_image_from_url(url, path=None, timeout=30):
    """Download an image from a url and save it to a local path.

    This definition replaces the earlier tuple-based function of the same name
    in this notebook, so it accepts both call forms:
    ``download_image_from_url(url, path)`` and
    ``download_image_from_url((url, path))``.

    Args:
        url (str | tuple): image url, or a (url, path) tuple when ``path`` is omitted.
        path: destination file path (str or path-like).
        timeout (float): seconds to wait on the server before giving up.

    Returns:
        path: path to saved image.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Nothing is written to disk in that case.
    """
    if path is None:
        # single-argument form: the argument is a (url, path) pair
        url, path = url
    response = requests.get(url, timeout=timeout)
    # Do not save an HTTP error body as if it were the image.
    response.raise_for_status()
    parent = os.path.dirname(path)
    if parent:  # os.makedirs('') raises, so skip it for bare filenames
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(response.content)
    return path

# Download every path in `missing` that is still absent from disk.
base_url = 'https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/'
for m in missing:
    if os.path.exists(m):
        continue
    # The paths in `missing` were built as os.path.join(dataset_dir, filename),
    # so the part relative to dataset_dir is what gets appended to base_url.
    # This works for whichever site dataset_dir points at, not only WESTKILL.
    m_path = os.path.relpath(m, dataset_dir).replace(os.sep, '/')
    url = base_url + m_path
    download_image_from_url(url, m)


In [113]:
# NOTE: despite their names, left_id / right_id hold image urls, not image ids.
left_id = annotations.iloc[0]['left.url']
print(left_id)
right_id = annotations.iloc[0]['right.url']
# Index label of the first flow_img row whose url equals the left image's url
# (raises IndexError when no row matches).
left = flow_img[flow_img['url'] == left_id].index[0]
left

https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/imagesets/fad43511-aa8f-4569-a513-7cbc36bfdba8/images/RCNX0399.JPG


6872

In [114]:
# Positional lookup using `left`, which was obtained as an index label.
# NOTE(review): this only agrees with the label-based .loc lookup if flow_img
# has the default 0..n-1 index — confirm.
flow_img.iloc[left]['url']

'https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/imagesets/fad43511-aa8f-4569-a513-7cbc36bfdba8/images/RCNX0399.JPG'

In [116]:
# Label-based lookup of the same row; `left` is an index label of flow_img.
flow_img.loc[left]['url']

'https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/imagesets/fad43511-aa8f-4569-a513-7cbc36bfdba8/images/RCNX0399.JPG'