In [None]:
import datetime
import math
import os
import sys
from collections import defaultdict
from typing import Union

import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from benthicnet.io import sanitize_filename, sanitize_filename_series
from IPython.display import display
from tqdm.auto import tqdm

sys.path.append("..")
from pangaea_downloader.tools import checker

In [None]:
# Directory from which the per-dataset CSV files are loaded
dirname = "../query-outputs_2022-01-01"
# Pangaea benthic image dataset file listing the filtered dataset IDs
pangaea_file = "../full-dataset/pangaea_2022-01-24_filtered.csv"
pangaea_df = pd.read_csv(pangaea_file)
# Distinct values of the "dataset" column; one entry per dataset to process
ds_ids = pangaea_df["dataset"].unique()
print("Total {} datasets to process.".format(len(ds_ids)))

## 1. Load datasets

In [None]:
def get_dataset_url(ds_id: Union[str, int]) -> str:
    """Return the Pangaea DOI URL for a dataset.

    Parameters
    ----------
    ds_id : str or int
        The dataset ID. Accepted forms are an integer (built-in or NumPy
        integer scalar), a bare ID string, a name starting with ``"pangaea"``
        whose last dash-separated field is the ID (e.g. ``"pangaea-123456"``),
        or any of the string forms followed by a ``".csv"`` extension.

    Returns
    -------
    str
        URL of the form ``https://doi.pangaea.de/10.1594/PANGAEA.<id>``.
    """
    # Coerce every non-string input, not only built-in ``int``: IDs taken from
    # a pandas/NumPy column are e.g. ``np.int64``, which is not an ``int``
    # subclass and has no string methods.
    if not isinstance(ds_id, str):
        ds_id = str(ds_id)
    if ds_id.startswith("pangaea"):
        ds_id = ds_id.split("-")[-1]
    if ds_id.endswith(".csv"):
        # Strip exactly one trailing extension
        ds_id = ds_id[: -len(".csv")]
    return f"https://doi.pangaea.de/10.1594/PANGAEA.{ds_id}"

In [None]:
def find_url_column(df: pd.DataFrame) -> Union[str, None]:
    """Find the name of the column that holds image URLs.

    Column names are compared after lower-casing and removing spaces, dashes,
    underscores and dots. Candidates are tried in priority order, and a
    candidate is accepted only if at least one of its values passes
    ``checker.is_url``.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to search.

    Returns
    -------
    str or None
        The original (un-normalized) label of the first matching column, or
        ``None`` if no candidate column contains a URL.
    """
    # Standardize column names; str() keeps non-string labels from raising
    clean_cols = [
        str(col)
        .lower()
        .replace(" ", "")
        .replace("-", "")
        .replace("_", "")
        .replace(".", "")
        for col in df.columns
    ]
    # Ordered list of priorities
    # Exclude url meta/ref/source which are not links to images
    candidates = [
        "urlimage",
        "urlraw",
        "urlfile",
        "url",
        "urlgraphic",
        "urlthumb",
        "urlthumbnail",
        "image",
        "imagery",
    ]
    # Find and return the first match
    for candidate in candidates:
        if candidate not in clean_cols:
            continue
        # Map the normalized name back to the label actually used in `df`
        col = df.columns[clean_cols.index(candidate)]
        if any(df[col].apply(checker.is_url)):
            return col
    # No candidate column contained a URL
    return None

In [None]:
# Number of valid datasets in which each normalized column name occurs
column_count = defaultdict(int)
# Key = normalized column name : value = file names of datasets with that column
column_examples = defaultdict(list)

# Files with URL issues
files_without_url = []
files_with_repeat_urls = []

# Counts
n_total = 0
n_valid = 0

verbose = False

for dataset_name in tqdm(ds_ids):
    # Load dataset; the file is named after the second dash-separated field
    ds_id = dataset_name.split("-")[1]
    fname = f"{ds_id}.csv"
    f_path = os.path.join(dirname, fname)
    df = pd.read_csv(f_path, low_memory=False)
    n_total += 1

    # Any column names with URL or Image?
    # (datasets failing this check are counted in n_total only)
    if not checker.has_url_col(df):
        continue
    # Extract the column name
    url_col = find_url_column(df)

    # No URL column found
    if not url_col:
        if verbose:
            print(f"No url column for {fname} with columns\n{df.columns}")
        files_without_url.append(fname)
        continue

    # URL column found!
    n_valid += 1
    for col in df.columns:
        col = col.lower().strip()
        column_count[col] += 1
        column_examples[col].append(fname)

    # Empty CSV cells are read as NaN, not "", and NaNs compare equal when
    # looking for duplicates. Drop both kinds of missing value so that a file
    # with several missing URLs is not reported as having repeated URLs.
    urls = df[url_col].dropna()
    urls = urls[urls != ""]
    if urls.duplicated().any():
        files_with_repeat_urls.append(fname)

In [None]:
# Headline numbers from the dataset scan
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs"
    " (possibly multiple annotations)\n"
)
print(f"There are {len(column_count)} unique column names:\n")

# Most frequent column names first
sorted_column_count = dict(
    sorted(column_count.items(), key=lambda pair: pair[1], reverse=True)
)
for name, n_files in sorted_column_count.items():
    # Pad the name with dots so that the counts line up in one column
    print(f"{name + ' ':.<35s} {n_files:4d}")

## 2. Examine each of the columns of interest
- Depth water
- Bathy depth
- Depth bot & depth top

In [None]:
# Print every recorded column name that contains "depth"
depth_columns = [name for name in column_examples if "depth" in name]
for name in depth_columns:
    print(name)

### 2.1 Depth water
**Observations:**
- ***Depth water*** values in ALL datasets are positive.
- Therefore it is reasonable to assume that ***depth water*** represents the absolute distance of the camera vehicle below mean sea level.

In [None]:
def value_near_zero(value, tolerance=0.5) -> bool:
    """Tell whether ``value`` lies within ``tolerance`` of zero (bounds included)."""
    lower, upper = -tolerance, tolerance
    return bool(lower <= value <= upper)

In [None]:
# Column to find
key = "depth water"

# Dataset URL -> (mean, sd, min, max, start, end) for datasets whose first or
# last value of the column is near zero
val_exception = {}
for i, file in enumerate(column_examples[key]):
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    # (lower-cased AND stripped); otherwise `df[key]` can raise KeyError.
    df.columns = [col.lower().strip() for col in df.columns]
    depth = df[key]
    # Extract info
    mean = depth.mean()
    sd = depth.std()
    min_ = depth.min()
    max_ = depth.max()
    url = get_dataset_url(file)
    # First and last recorded values, to check whether the series starts or
    # ends near zero
    start, end = depth.iloc[0], depth.iloc[-1]
    # Show
    print(
        f"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}, Depth start: {start}, end: {end}"
    )
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(depth, label=key)
    # Inverted axis: larger depth values are drawn lower on the figure
    ax.invert_yaxis()
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel=key)
    plt.show()
    plt.close(fig)
    # Datasets that defy column value norms
    if value_near_zero(start) or value_near_zero(end):
        print("\tStart or End near zero.")
        val_exception[url] = (mean, sd, min_, max_, start, end)

In [None]:
# Show the flagged datasets: dataset URL -> (mean, sd, min, max, start, end)
val_exception

### 2.2 Bathy depth
**Observations:**
- ***Bathy depth*** values in ALL datasets are positive.
- It is reasonable to assume that bathymetry depth refers to the distance from mean sea level to the ocean floor.

In [None]:
# Column to find
key = "bathy depth"

# Dataset URL -> (mean, sd, min, max) for datasets containing negative values
val_exception = {}
for i, file in enumerate(column_examples[key]):
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    # (lower-cased AND stripped); otherwise `df[key]` can raise KeyError.
    df.columns = [col.lower().strip() for col in df.columns]
    bathy = df[key]
    # Extract info
    mean = bathy.mean()
    sd = bathy.std()
    min_ = bathy.min()
    max_ = bathy.max()
    url = get_dataset_url(file)
    # Show
    print(f"[{i}] Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}")
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(bathy, label=key)
    # Inverted axis: larger depth values are drawn lower on the figure
    ax.invert_yaxis()
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel=key)
    plt.show()
    plt.close(fig)
    # A negative minimum is enough: a negative maximum implies a negative minimum
    if min_ < 0:
        print("\tDoes not satisfy column value norms.")
        val_exception[url] = (mean, sd, min_, max_)

In [None]:
# Show the flagged datasets: dataset URL -> (mean, sd, min, max)
val_exception

### 2.3 Depth top & depth bot
**Observations:**
- Common sense dictates that ***depth top*** and ***depth bot*** should mean the depth of the top and bottom of the camera vehicle.
- With this assumption, we would expect the difference between top and bot depth to be constant.
- In cases where the difference varies this is likely due to the rotation of the camera vehicle.

In [None]:
# Columns to find
keys = ["depth top", "depth bot"]

# Keep only the files that contain BOTH columns (in their original order);
# a file with just one of them would raise KeyError below.
# `.get` avoids inserting empty entries into the defaultdict.
files_with_bot = set(column_examples.get(keys[1], []))
files_with_both = [f for f in column_examples.get(keys[0], []) if f in files_with_bot]

for i, file in enumerate(files_with_both):
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    # (lower-cased AND stripped); otherwise `df[key]` can raise KeyError.
    df.columns = [col.lower().strip() for col in df.columns]
    for key in keys:
        # Extract info
        mean = df[key].mean()
        sd = df[key].std()
        min_ = df[key].min()
        max_ = df[key].max()
        # Show
        print(
            f"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}"
        )
    fig, ax = plt.subplots(figsize=(16, 4))
    for key in keys:
        ax.plot(df[key], label=key)
    # Absolute row-wise difference between the two columns
    ax.plot((df[keys[0]] - df[keys[1]]).abs(), label="diff", linestyle=":")
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel="depth")
    ax.legend()
    plt.show()
    plt.close(fig)

## 3. Explore relation between depth columns

In [None]:
# Number of datasets containing each depth-related column name.
# `.get` is used instead of indexing because indexing a defaultdict inserts an
# empty entry for every name that is absent, which would then show up as a
# (non-existent) column when `column_examples` is listed again.
for name in ("depth", "depth water", "bathy depth", "bathy depth_2", "bathy_depth"):
    print(len(column_examples.get(name, [])))

## 3.1 Datasets with `depth`, `depth water` and `bathy depth` columns
This section looks at datasets in which `depth` co-occurs with both `bathy depth` and `depth water`.

In [None]:
# One set of file names per column of interest
depth_set, depth_water_set, bathy_set = (
    set(column_examples[name]) for name in ("depth", "depth water", "bathy depth")
)
# Files that contain all three columns
intersect = depth_set & depth_water_set & bathy_set

print(f"depth_set : {len(depth_set)}")
print(f"depth_water_set : {len(depth_water_set)}")
print(f"bathy_set : {len(bathy_set)}")
print(f"# of files with all: {len(intersect)}")

## 3.2 Datasets with `depth` and `bathy depth` columns

In [None]:
depth_set = set(column_examples["depth"])
bathy_set = set(column_examples["bathy depth"])
# Files that contain both columns
intersect = depth_set.intersection(bathy_set)

print("depth_set :", len(depth_set))
print("bathy_set :", len(bathy_set))
print("# of files with both:", len(intersect))

keys = ["depth", "bathy depth"]
# `enumerate` gives this loop its own index (previously `i` was a stale value
# left over from an earlier cell); `sorted` makes the order reproducible,
# since set iteration order is not.
for i, file in enumerate(sorted(intersect)):
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    df.columns = [col.lower().strip() for col in df.columns]
    for key in keys:
        # Extract info
        mean = df[key].mean()
        sd = df[key].std()
        min_ = df[key].min()
        max_ = df[key].max()
        # Show
        print(
            f"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}"
        )
    # Plot
    fig, ax = plt.subplots(figsize=(16, 4))
    for key in keys:
        # Values are negated, so larger depths are drawn lower on the figure
        ax.plot(-df[key], label=key.capitalize())
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel="negated depth")
    ax.legend()
    plt.show()
    plt.close(fig)

**NOTE:** for datasets with both depth and bathy depth, the bathy depth seems to be the depth of the sea floor (relative to mean sea level) and depth seems to be the depth of the camera vehicle.

## 3.3 Datasets with `depth water` and `bathy depth` columns

In [None]:
depth_water_set = set(column_examples["depth water"])
bathy_set = set(column_examples["bathy depth"])
# Files that contain both columns
intersect = depth_water_set.intersection(bathy_set)

print("depth_water_set :", len(depth_water_set))
print("bathy_set :", len(bathy_set))
print("# of files with both:", len(intersect))

keys = ["depth water", "bathy depth"]
# `enumerate` gives this loop its own index (previously `i` was a stale value
# left over from an earlier cell); `sorted` makes the order reproducible,
# since set iteration order is not.
for i, file in enumerate(sorted(intersect)):
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    df.columns = [col.lower().strip() for col in df.columns]
    for key in keys:
        # Extract info
        mean = df[key].mean()
        sd = df[key].std()
        min_ = df[key].min()
        max_ = df[key].max()
        # Show
        print(
            f"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}"
        )
    # Plot
    fig, ax = plt.subplots(figsize=(16, 4))
    for key in keys:
        # Values are negated, so larger depths are drawn lower on the figure
        ax.plot(-df[key], label=key.capitalize())
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel="negated depth")
    ax.legend()
    plt.show()
    plt.close(fig)

**NOTE:**<br>
- Depth water is the depth of the craft below mean sea level.
- Bathy depth is the bathymetry depth or the depth of the sea floor.

## 3.4 Datasets with two `bathy depth` columns

In [None]:
bathy_set = set(column_examples["bathy depth"])
bathy_set2 = set(column_examples["bathy depth_2"])
# Files that contain both columns
intersect = bathy_set.intersection(bathy_set2)

print("bathy_set :", len(bathy_set))
print("bathy_set2 :", len(bathy_set2))
print("# of files with both:", len(intersect))

keys = ["bathy depth", "bathy depth_2"]
# `enumerate` gives this loop its own index (previously `i` was a stale value
# left over from an earlier cell); `sorted` makes the order reproducible,
# since set iteration order is not.
for i, file in enumerate(sorted(intersect)):
    # low_memory=False for consistency with every other read in this notebook
    df = pd.read_csv(os.path.join(dirname, file), low_memory=False)
    # Normalize headers the same way the keys of `column_examples` were built
    df.columns = [col.lower().strip() for col in df.columns]
    # First value of the "doi" column, printed to identify the dataset
    print(df["doi"].iloc[0])
    for key in keys:
        # Extract info
        mean = df[key].mean()
        sd = df[key].std()
        min_ = df[key].min()
        max_ = df[key].max()
        # Show
        print(
            f"[{i}] '{key}' Mean: {mean:.2f} ± {sd:.2f} Range: {min_:.2f} to {max_:.2f}"
        )
    # Plot
    fig, ax = plt.subplots(figsize=(16, 4))
    for key in keys:
        ax.plot(df[key], label=key.capitalize())
    ax.set(title=f"[{i}] {file}", xlabel="row", ylabel="bathy depth")
    ax.legend()
    plt.show()
    plt.close(fig)

**NOTE:** Upon checking the dataset webpages we see that the two bathy depth columns correspond to the original collection and recollection sites.