In [1]:
# import copy
# from argparse import Namespace

import os
import sys
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

PROJECT_ROOT = os.path.abspath(os.path.join(sys.path[0], os.pardir))
sys.path.append(PROJECT_ROOT)
from src.utils import parse_configargparse_args

  from .autonotebook import tqdm as notebook_tqdm


# Summarize Ranking Model Training Experiment Results

In [2]:
exp_folders = [
    os.path.join(PROJECT_ROOT, 'results', 'train_ranking_model', f)
    for f in os.listdir(os.path.join(PROJECT_ROOT, 'results', 'train_ranking_model'))
    ]

df = pd.DataFrame(
    columns = [
        'site', 'folder', 'data_file', 'min_month', 'max_month', 'min_hour', 'max_hour', 'num_train_pairs', 'num_eval_pairs', 'margin', 'margin_mode', 'augment', 'normalize', 'random_seed', 'min_val_loss', 'min_val_loss_epoch'
    ]
)

for exp_folder in tqdm(exp_folders):
    exp_params_file = os.path.join(exp_folder, 'params.txt')
    exp_args = parse_configargparse_args(exp_params_file)
    exp_metrics_file = os.path.join(
        exp_folder, 
        f'metrics_per_epoch_ranking_margin_{exp_args["margin"]}_randompairs_{exp_args["num_train_pairs"]}_{exp_args["site"]}_' + \
            ('augment_' if exp_args['augment'] else '') + \
            ('normalize_' if exp_args['normalize'] else '') + \
            f'{exp_args["random_seed"]}.json'
    )
    exp_metrics = json.load(open(exp_metrics_file, 'r'))
    min_val_loss = np.min(exp_metrics['val_loss'])
    min_val_loss_epoch = np.argmin(exp_metrics['val_loss'])
    df = pd.concat(
        [df, pd.DataFrame([exp_args | {'folder': exp_folder, 'min_val_loss': min_val_loss, 'min_val_loss_epoch': min_val_loss_epoch}])],
        ignore_index=True
    )


100%|██████████| 4/4 [00:00<00:00, 324.07it/s]


In [3]:
df.sort_values(by=['site', 'min_val_loss'])#[['site', 'folder', 'margin', 'min_val_loss', 'min_val_loss_epoch']]


Unnamed: 0,site,folder,data_file,min_month,max_month,min_hour,max_hour,num_train_pairs,num_eval_pairs,margin,...,min_val_loss_epoch,c,image_root_dir,gpu,output_root_dir,epochs,col_timestamp,batch_size,lr,unfreeze_after
0,AVERYBB,/home/amritagupta/ssdprivate/repos/fpe-model/r...,../data/raw/Avery_Brook_Bridge_01171000/flow-i...,4,11,7,18,5000,1000,0.1,...,16,../conf/ranking.yml,../data/raw/Avery_Brook_Bridge_01171000/images/,1.0,../results/train_ranking_model,30.0,timestamp,64.0,0.001,2.0
3,AVERYBB,/home/amritagupta/ssdprivate/repos/fpe-model/r...,../data/raw/Avery_Brook_Bridge_01171000/flow-i...,4,11,7,18,5000,1000,0.0,...,14,../conf/ranking.yml,../data/raw/Avery_Brook_Bridge_01171000/images/,4.0,../results/train_ranking_model,30.0,timestamp,64.0,0.001,2.0
1,WESTB0,/home/amritagupta/ssdprivate/repos/fpe-model/r...,../data/raw/West_Brook_0_Master_01171100/flow-...,4,11,7,18,5000,1000,0.1,...,17,../conf/ranking.yml,../data/raw/West_Brook_0_Master_01171100/images/,1.0,../results/train_ranking_model,30.0,timestamp,64.0,0.001,2.0
2,WESTB0,/home/amritagupta/ssdprivate/repos/fpe-model/r...,../data/raw/West_Brook_0_Master_01171100/flow-...,4,11,7,18,5000,1000,0.0,...,7,../conf/ranking.yml,../data/raw/West_Brook_0_Master_01171100/images/,4.0,../results/train_ranking_model,30.0,timestamp,64.0,0.001,2.0


In [6]:
from PIL import Image
img_path = '../data/raw/Avery_Brook_Bridge_01171000/images/Avery Bridge Downstream__2021-04-21__19-15-00(1).JPG'
img = Image.open(img_path)
img.show()

Error: no "view" mailcap rules found for type "image/png"


# Load a table of flow images

In [2]:
df = pd.read_csv(
    '../../../data/Streamflow/sites/Avery_Brook_Bridge_01171000/flow-images.csv'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44832 entries, 0 to 44831
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_name  44832 non-null  object 
 1   station_id    44832 non-null  int64  
 2   imageset_id   44832 non-null  int64  
 3   image_id      44832 non-null  int64  
 4   timestamp     44832 non-null  object 
 5   filename      44832 non-null  object 
 6   url           44832 non-null  object 
 7   flow_cfs      44832 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 2.7+ MB


In [3]:
def load_data(col_timestamp='timestamp'):
    df = pd.read_csv(
        '../../../data/Streamflow/sites/Avery_Brook_Bridge_01171000/flow-images.csv'
    )
    df[col_timestamp] = pd.to_datetime(df[col_timestamp])
    df.sort_values(by=col_timestamp, inplace=True, ignore_index=True)
    return df

def convert_timezone(df, time_zone, col_timestamp='timestamp'):
    df[col_timestamp] = df[col_timestamp].dt.tz_convert(tz=time_zone)
    return df

def filter_detections(detection_results, confidence_threshold: float, categories=[]):
    """Filter detections by confidence threshold and category.

    Args:
        detection_results: A dict containing MegaDetector v5 results.
        confidence_threshold: A float representing the confidence below
          which detections should be filtered out.
        categories: A list of categories of detections to return. Detections
          in other categories will be filtered out.

    Returns:
        A dict containing only MegaDetector v5 detection results above the
        specified confidence threshold and belonging to the specified
        categories.

    Raises:

    """
    filtered_results = copy.deepcopy(detection_results)
    for image in tqdm(filtered_results["images"]):
        # keep only detections above confidence_threshold
        # and of the specified categories
        image["detections"] = [
            det
            for det in image["detections"]
            if (det["conf"] >= confidence_threshold) and (det["category"] in categories)
        ]
        image["max_detection_conf"] = (
            max([det["conf"] for det in image["detections"]])
            if len(image["detections"]) > 0
            else 0.0
        )

    # keep only images that have at least 1 detection after filtering
    filtered_results["images"] = [
        image for image in filtered_results["images"] if len(image["detections"]) > 0
    ]
    return filtered_results

def drop_by_col_val(df, col_val, col_name='filename'):
    df = df[~df[col_name].isin(col_val)]
    return df

def filter_by_hour(df, min_hour=7, max_hour=18, col_timestamp='timestamp'):
    df = df[
        df[col_timestamp].dt.hour.between(min_hour, max_hour)
    ]
    return df

def filter_by_month(df, min_month=4, max_month=11, col_timestamp='timestamp'):
    df = df[
        df[col_timestamp].dt.month.between(min_month, max_month)
    ]
    return df

# def filter_by_date(self, start_date, end_date, mode="exclude"):
#     if mode == "exclude":
#         before_start_date = self.data[self.col_timestamp] < start_date
#         after_end_date = self.data[self.col_timestamp] > end_date
#         outside_two_dates = before_start_date | after_end_date
#         filtered_dates = self.data.loc[outside_two_dates].copy()
#         self.data = filtered_dates
#     else:
#         raise NotImplementedError(
#             'Please select "exclude" mode and provide date range to exclude.'
#         )


In [4]:
df = load_data().pipe(convert_timezone, 'US/Eastern')

pii_detection_results = json.load(
    open('../results/pii_detection/md_v5a_Avery_Brook_Bridge_01171000_output.json', "r")
)
pii_detections = filter_detections(pii_detection_results, 0.2, ["2", "3"])["images"]
pii_files = pd.DataFrame(pii_detections)["file"].tolist()
df = df.pipe(drop_by_col_val, pii_files, col_name='filename')

df = (
    df.pipe(filter_by_hour, min_hour=6, max_hour=18)
    .pipe(filter_by_month, min_month=3, max_month=11)
).reset_index()
print(len(df))
# df.head(3)

100%|██████████| 44828/44828 [00:00<00:00, 1392891.61it/s]

17553





In [13]:
df.index.values[-5:]

array([17548, 17549, 17550, 17551, 17552])

In [2]:
args = Namespace(
    data_file = '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/flow-images.csv',
    image_dir = '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/images',
    normalize = False,
    augment = True,
    batch_size = 64,
    pii_detection_results  = '/home/amritagupta/ssdprivate/repos/fpe-model/results/pii_detection/md_v5a_Avery_Brook_Bridge_01171000_output.json',
    seed = 939
)
print("args:")
for arg in vars(args):
    print(f"  {arg}: {getattr(args, arg)}")

args:
  data_file: /home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/flow-images.csv
  image_dir: /home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/images
  normalize: False
  augment: True
  batch_size: 64
  pii_detection_results: /home/amritagupta/ssdprivate/repos/fpe-model/results/pii_detection/md_v5a_Avery_Brook_Bridge_01171000_output.json
  seed: 939


In [9]:
def _load_data_file(filepath, pii_detections):
    # logger.info(f"load dataset: {filepath}")
    df = pd.read_csv(filepath, dtype={"flow_cfs": np.float32})
    df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.tz_convert(tz="US/Eastern")
    df.sort_values(by="timestamp", inplace=True, ignore_index=True)

    # filter by hour
    min_hour = 6
    max_hour = 18
    # logger.info(f"filter(hour): {min_hour} to {max_hour}")
    df = df[df["timestamp"].dt.hour.between(min_hour, max_hour)]

    # filter by month
    min_month = 3
    max_month = 11
    # logger.info(f"filter(month): {min_month} to {max_month}")
    df = df[df["timestamp"].dt.month.between(min_month, max_month)]

    # filter by pii detections
    pii_results = json.load(open(args.pii_detection_results, "r"))
    pii_detections = filter_detections(pii_results, 0.2, ["2", "3"])["images"]
    pii_files = pd.DataFrame(pii_detections)["file"].tolist()
    df = df[~df["filename"].isin(pii_files)]

    # logger.info(
    #     f"dataset loaded\n  rows: {len(df)}\n  flow: {df.flow_cfs.mean():>.2f} cfs"
    # )
    return df

In [4]:
# # DATALOADING OPTS
# def data_args(parser):
#     group = parser.add_argument_group(
#         "Data", "Arguments control Data and loading for training"
#     )
#     group.add_argument(
#         "--data-file",
#         type=str,
#         required=True,
#         help="path to CSV file with linked images and flows",
#     )
#     group.add_argument(
#         "--image-dir",
#         type=str,
#         required=True,
#         help="path to folder containing images listed in data-file",
#     )
#     # group.add_argument(
#     #     "--split-idx",
#     #     type=int,
#     #     required=True,
#     #     help="index specifying which of 5 train/val splits to use",
#     # )
#     group.add_argument(
#         "--normalize",
#         type=bool,
#         default=True,
#         help="whether to normalize image inputs to model",
#     )
#     group.add_argument(
#         "--augment",
#         type=bool,
#         default=True,
#         help="whether to use image augmentation during training",
#     )
#     # group.add_argument(
#     #     "--crop-to-bbox",
#     #     type=bool,
#     #     default=False,
#     #     help="whether to crop images to bounding boxes before training",
#     # )
#     group.add_argument(
#         "--batch-size", type=int, default=64, help="batch size of the train loader"
#     )

# def get_args():
#     parser = argparse.ArgumentParser(description="")
#     parser.add_argument("--seed", default=939, type=int, help="random seed")
#     data_args(parser)
#     # add temporary arguments that should be refactored out
#     parser.add_argument("--pii-detection-results", default=None)
#     args = parser.parse_args()
#     return args

In [5]:
args = args #get_args()

# Load data
df = _load_data_file(args.data_file, args.pii_detection_results)
# NOTE: Dataset Splitter class is incomplete
# NOTE: Fragile to return indices of dataset copy sorted in function call
train_inds, val_inds, test_inds = RandomStratifiedWeeklyFlow().split(
    df, 0.8, 0.1, 0.1
)
train_df, val_df, test_df = (
    df.iloc[train_inds],
    df.iloc[val_inds],
    df.iloc[test_inds],
)

# Create PyTorch Datsets
train_ds_tmp = FlowPhotoDataset(train_df, os.path.dirname(args.image_dir))
train_mean, train_std = train_ds_tmp.compute_mean_std()
image = train_ds_tmp.get_image(0)
aspect = image.shape[2] / image.shape[1]

train_transforms = [Resize([480, np.int32(480 * aspect)])]
if args.augment:
    train_transforms.append(RandomCrop([384, np.int32(384 * aspect)]))
    train_transforms.append(RandomHorizontalFlip())
    train_transforms.append(RandomRotation(10))
    train_transforms.append(ColorJitter())
else:
    train_transforms.append(CenterCrop([384, np.int32(384 * aspect)]))
# train_transforms.append(ToTensor())
if args.normalize:
    train_transforms.append(Normalize(train_mean, train_std))
train_transform = Compose(train_transforms)
train_ds = FlowPhotoDataset(
    train_df, os.path.dirname(args.image_dir), transform=train_transform
)

100%|██████████| 44828/44828 [00:00<00:00, 776553.58it/s]
100%|██████████| 1000/1000 [01:26<00:00, 11.61it/s]


In [6]:
image = train_ds.get_image(12302)

def show_image(image):
    image_pil = ToPILImage()(image)
    display(image_pil)
    
show_image(image)

Could not read image index 12302


UnboundLocalError: local variable 'image' referenced before assignment

In [10]:
train_ds.table.iloc[12302].url

'https://usgs-chs-conte-prod-fpe-storage.s3.amazonaws.com/imagesets/b9f642d4-027d-4d11-acde-1ae889fd2ecb/images/Avery Brook Bridge__2022-10-28__08-15-00(1).JPG'

In [11]:
im, label = train_ds[12302]
show_image(im)

RuntimeError: Image is incomplete or truncated

In [34]:

# stat = ImageStat.Stat(ToPILImage()(image))
# print(np.array(stat.mean) / 255.0)
# print(np.array(stat.stddev) / 255.0)    

[0.49695178 0.50054886 0.47154651]
[0.34803897 0.346689   0.36952244]


In [20]:
pilimage.max()

AttributeError: max

In [3]:
def filter_detections(
    detection_results,
    confidence_threshold: float,
    categories = []
):
    """Filter detections by confidence threshold and category.
    
    Args:
        detection_results: A dict containing MegaDetector v5 results.
        confidence_threshold: A float representing the confidence below
          which detections should be filtered out.
        categories: A list of categories of detections to return. Detections
          in other categories will be filtered out.

    Returns:
        A dict containing only MegaDetector v5 detection results above the
        specified confidence threshold and belonging to the specified 
        categories.

    Raises:
        
    """
    filtered_results = copy.deepcopy(detection_results)
    for image in tqdm(filtered_results['images']):
        # keep only detections above confidence_threshold
        # and of the specified categories
        image['detections'] = [
            det for det in image['detections']
            if (det['conf'] >= confidence_threshold)
            and (det['category'] in categories)
        ]
        image['max_detection_conf'] = max([
            det['conf'] for det in image['detections']
        ]) if len(image['detections']) > 0 else 0.0
    
    # keep only images that have at least 1 detection after filtering
    filtered_results['images'] = [
        image for image in filtered_results['images']
        if len(image['detections']) > 0
    ]
    return filtered_results
pii_results = json.load(open(args.pii_filename, 'r'))
pii_detections = filter_detections(pii_results, 0.2, ['2', '3'])['images']
pii_files = pd.DataFrame(pii_detections)['file'].tolist()

100%|██████████| 44828/44828 [00:00<00:00, 587540.18it/s]


In [4]:
# load raw data file
site_df = pd.read_csv(os.path.join(args.data_dir, args.filename), dtype={"flow_cfs": np.float32})
site_df["timestamp"] = pd.to_datetime(site_df["timestamp"]).dt.tz_convert(tz="US/Eastern")
print(len(site_df))

# Until a standard filtering procedure is decided upon, filtering by
#  pii detection/month of year/hour of day/specific date ranges/etc.
#  should be done prior to initializing the dataset (for transparency)

# filter out pii
site_df = site_df[~site_df['filename'].isin(pii_files)]
print(len(site_df))

# filter by hour
min_hour = 7
max_hour = 18
# logger.info(f"filter(hour): {min_hour} to {max_hour}")
site_df = site_df[site_df["timestamp"].dt.hour.between(min_hour, max_hour)]
print(len(site_df))

# min_month = 4
# max_month = 11
# # logger.info(f"filter(month): {min_month} to {max_month}")
# site_df = site_df[site_df["timestamp"].dt.month.between(min_month, max_month)]

print(f"dataset loaded\n  rows: {len(site_df)}\n  flow: {site_df.flow_cfs.mean():>.2f} cfs")

44832
44777
22399
dataset loaded
  rows: 22399
  flow: 5.61 cfs


In [5]:
splitter = RandomStratifiedWeeklyFlow()
train_inds, val_inds, test_inds = splitter.split(site_df, 0.8, 0.1, 0.1)
train_df = site_df.iloc[train_inds]
val_df = site_df.iloc[val_inds]
test_df = site_df.iloc[test_inds]

assert set(train_inds).intersection(set(test_inds)) == set(), 'train and test contain overlapping samples'
assert set(train_inds).intersection(set(val_inds)) == set(), 'train and val contain overlapping samples'
assert set(test_inds).intersection(set(val_inds)) == set(), 'test and val contain overlapping samples'

In [6]:
# create transforms
from torchvision.io import read_image
from torchvision import transforms
from PIL import ImageStat

# compute mean and std from train set
train_ds = FlowPhotoDataset(train_df, '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000')

def compute_mean_stddev(dataset, max_sample_size=1000):
    means = np.zeros((3))
    stddevs = np.zeros((3))
    sample_size = min(len(train_ds.table), max_sample_size)
    sample_indices = np.random.choice(
        len(dataset.table), size=sample_size, replace=False
    )
    for index in tqdm(sample_indices):
        image, _ = dataset[index]
        image = transforms.functional.to_pil_image(image)
        stat = ImageStat.Stat(image)
        means += np.array(stat.mean) / 255.0
        stddevs += np.array(stat.stddev) / 255.0
    means = means / sample_size
    stddevs = stddevs / sample_size
    return means, stddevs

means, stddevs = compute_mean_stddev(train_ds)

# compute aspect ratio
image, _ = train_ds[0]
aspect = image.shape[2] / image.shape[1]


    
# img_path = os.path.join(args.data_dir, train_df['filename'].iloc[0])
# # img_dir = os.path.join(data_dir, "images")
# # img_path = os.path.join(img_dir, train_df["filename"].iloc[0])
# print(f"loading first image: {img_path}")
# img = read_image(img_path)
# aspect = img.shape[2] / img.shape[1]
# print(aspect)
# # train_ds = FlowPhotoRegressionDataset(train_df, '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000')
# train_transforms = [

# ]
# # read first image
# # m, s = train_ds.compute_mean_std()

100%|██████████| 1000/1000 [01:05<00:00, 15.29it/s]


In [7]:
DATA_AUGMENTATION = False
NORMALIZATION = True

train_transforms = [
    transforms.Resize([480, np.int32(480 * aspect)]),
]
if DATA_AUGMENTATION:
    train_transforms.append(transforms.RandomCrop([384, np.int32(384 * aspect)]))
    train_transforms.append(transforms.RandomHorizontalFlip())
    train_transforms.append(transforms.RandomRotation(10))
    train_transforms.append(transforms.ColorJitter())
else:
    train_transforms.append(transforms.CenterCrop([384, np.int32(384 * aspect)]))
train_transforms.append(transforms.ToTensor())
if NORMALIZATION:
    train_transforms.append(transforms.Normalize(means, stddevs))
train_transform = transforms.Compose(train_transforms)

eval_transforms = [
    transforms.Resize([480, np.int32(480 * aspect)]),
    transforms.CenterCrop([384, np.int32(384 * aspect)]),
    transforms.ToTensor()
]
if NORMALIZATION:
    eval_transforms.append(transforms.Normalize(means, stddevs))
eval_transform = transforms.Compose(eval_transforms)


In [8]:
train_ds = FlowPhotoDataset(
    train_df,
    '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/',
    transform=train_transform
)

In [9]:
train_ds[0]

TypeError: Unexpected type <class 'numpy.ndarray'>

In [16]:
from PIL import Image

image_path = os.path.join(
    '/home/amritagupta/ssdprivate/data/Streamflow/sites/Avery_Brook_Bridge_01171000/images',
    train_df['filename'].iloc[0]
)
image = Image.open(image_path)
image.mode


'RGB'

In [13]:
np.asarray(image)

array([[[252, 252, 164],
        [214, 211, 116],
        [200, 190,  77],
        ...,
        [233, 252, 248],
        [207, 243, 255],
        [138, 180, 205]],

       [[246, 255, 162],
        [215, 220, 126],
        [205, 200, 108],
        ...,
        [242, 255, 255],
        [194, 225, 230],
        [139, 175, 187]],

       [[210, 225, 134],
        [201, 210, 127],
        [184, 187, 116],
        ...,
        [127, 141, 142],
        [121, 141, 139],
        [138, 163, 157]],

       ...,

       [[254, 255, 253],
        [254, 255, 253],
        [254, 255, 253],
        ...,
        [254, 255, 253],
        [254, 255, 253],
        [254, 255, 253]],

       [[254, 255, 253],
        [254, 255, 253],
        [254, 255, 253],
        ...,
        [254, 255, 253],
        [254, 255, 253],
        [254, 255, 253]],

       [[254, 255, 253],
        [254, 255, 253],
        [254, 255, 253],
        ...,
        [254, 255, 253],
        [254, 255, 253],
        [254, 255, 253]]