In [None]:
import datetime
import os
from collections import defaultdict

import dateutil.parser
import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm

from pangaea_downloader.tools import checker

In [None]:
dirname = "../query-outputs2"

## Load datasets and check distribution of column frequencies

In [None]:
def find_url_column(df):
    """
    Find the column which holds the image URLs of a dataset.

    Column names are compared after lower-casing and removing spaces,
    hyphens, underscores and full stops.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to search.

    Returns
    -------
    str or None
        Name of the highest-priority candidate column in which at least one
        value passes ``checker.is_url``, or ``None`` if there is no such column.
    """
    # Ordered by priority.
    # url meta/ref/source are excluded, as they are not links to images.
    priority = (
        "urlimage",
        "urlraw",
        "urlfile",
        "url",
        "urlgraphic",
        "urlthumb",
        "urlthumbnail",
        "image",
        "imagery",
    )

    # Normalised name -> first original column carrying that name
    by_clean_name = {}
    for original in df.columns:
        key = original.lower()
        for padding in (" ", "-", "_", "."):
            key = key.replace(padding, "")
        by_clean_name.setdefault(key, original)

    for wanted in priority:
        if wanted in by_clean_name:
            original = by_clean_name[wanted]
            if any(df[original].apply(checker.is_url)):
                return original
    return None

In [None]:
# Tally how often each (lower-cased, stripped) column name occurs across the
# datasets that have a usable URL column, and remember which files use it.
column_count = defaultdict(int)
column_examples = defaultdict(list)
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0

# os.listdir() returns entries in arbitrary order. Sort them so that the
# positional lookups used further down (e.g. column_examples[...][0]) pick
# the same file on every run.
for fname in tqdm(sorted(os.listdir(dirname))):
    df = pd.read_csv(os.path.join(dirname, fname))
    n_total += 1
    if not checker.has_url_col(df):
        continue
    url_col = find_url_column(df)
    if not url_col:
        print(f"No url column for {fname} with columns\n{df.columns}")
        files_without_url.append(fname)
        continue
    n_valid += 1
    for col in df.columns:
        col = col.lower().strip()
        column_count[col] += 1
        column_examples[col].append(fname)
    # Only real URLs can be "repeated": missing (NaN) and empty entries are
    # excluded so that several rows without a URL are not reported as repeats.
    urls = df[url_col].dropna()
    urls = urls[urls != ""]
    if urls.duplicated().any():
        files_with_repeat_urls.append(fname)

In [None]:
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()
print(f"There are {len(column_count)} unique column names:")
print()

for col, count in dict(
    sorted(column_count.items(), key=lambda item: item[1], reverse=True)
).items():
    c = col + " "
    print(f"{c:.<35s} {count:4d}")

### Examining columns to find out what their contents are

In [None]:
df = pd.read_csv(os.path.join(dirname, column_examples["resolution"][0]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[0])

In [None]:
urls = []
n_datasets = 0
for fname in column_examples["seagr cov"]:
    df = pd.read_csv(os.path.join(dirname, fname))
    url_column = find_url_column(df)
    if url_column:
        urls.append(df[url_column])
        n_datasets += 1

In [None]:
len(pd.concat(urls).unique())

In [None]:
n_datasets

In [None]:
# Load up largest dataset, containing long cruises
df = pd.read_csv(os.path.join(dirname, "882349.csv"))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[0])

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(12, 8))
plt.plot(-df["Depth water"], label="Depth water")
plt.plot(df["Elevation"], label="Elevation")
plt.legend()
plt.show()
plt.figure(figsize=(12, 8))
plt.plot(df["Depth water"] + df["Elevation"])
plt.show()

In [None]:
len(column_examples["seagr cov"])

In [None]:
len(set(column_examples["seagr cov"]))

In [None]:
coverage_columns = [k for k in column_examples.keys() if k.endswith(" cov")]

In [None]:
coverage_datasets = set()
for k in coverage_columns:
    if k == "ice cov":
        continue
    print()
    print(k)
    print(len(coverage_datasets))
    coverage_datasets.update(column_examples[k])
    print(len(coverage_datasets))

In [None]:
len(coverage_datasets)

In [None]:
urls = []
n_datasets = 0
for fname in coverage_datasets:
    df = pd.read_csv(os.path.join(dirname, fname))
    url_column = find_url_column(df)
    if url_column:
        urls.append(df[url_column])
        n_datasets += 1

In [None]:
len(pd.concat(urls).unique())

In [None]:
df = pd.read_csv(os.path.join(dirname, column_examples["ice cov"][1]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[0])

In [None]:
df = pd.read_csv(os.path.join(dirname, column_examples["ph"][0]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[0])

In [None]:
df = pd.read_csv(os.path.join(dirname, column_examples["elevation"][-1]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[0])

In [None]:
df = pd.read_csv(os.path.join(dirname, column_examples["doi"][0]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])

In [None]:
df = pd.read_csv(os.path.join(dirname, files_with_repeat_urls[3]))
display(df)
print(df.columns)
url_column = find_url_column(df)
print(df[url_column].iloc[0])
print(df["dataset_title"].iloc[-1])

## Implement dataset cleaning functions

In [None]:
# Taxonomic ranks from broadest to narrowest; each entry lists the accepted
# column-name synonyms for that rank.
# NOTE(review): there is no "Class" rank between Phylum and Order - confirm
# whether that omission is intentional.
TAXONOMY_RANKS = [
    ["Kingdom", "Regnum"],
    ["Phylum", "Division"],
    ["Ordo", "Order"],
    ["Familia", "Family"],
    ["Genus"],
    ["Species"],
]


def row2taxonomy(row):
    """
    Build a taxonomy string from the rank columns of a record.

    Ranks are read in the order given by ``TAXONOMY_RANKS``. For each rank the
    first synonym (as written, or lower-cased) holding a usable value is taken.
    The walk stops at the first rank without a usable value, so the result is
    always an unbroken lineage starting at the broadest rank.

    Parameters
    ----------
    row : dict or pandas.Series
        A record which may have taxonomic rank fields such as ``"Kingdom"``.

    Returns
    -------
    str
        The rank values joined with ``" > "``; empty if the first rank is missing.
    """

    def usable(value):
        # Missing cells read by pandas arrive as NaN (or None/NA). NaN is
        # truthy, so it must be rejected explicitly or it would end up in the
        # parts list and make the final str.join raise a TypeError.
        if value is None:
            return False
        if not isinstance(value, str) and pd.isna(value):
            return False
        return bool(value) and value != "-"

    parts = []
    for rank_synonyms in TAXONOMY_RANKS:
        for col in rank_synonyms:
            # Try the name as written first, then its lower-case form
            key = next(
                (k for k in (col, col.lower()) if k in row.keys() and usable(row[k])),
                None,
            )
            if key is not None:
                parts.append(str(row[key]))
                break
        else:
            # No synonym of this rank has a value: the lineage ends here
            break
    return " > ".join(parts)

In [None]:
def check_title(title):
    """
    Decide whether a dataset should be kept, judging by its title.

    Parameters
    ----------
    title : str
        The title of the dataset. Other values are converted with ``str``.

    Returns
    -------
    bool
        ``False`` if the title starts with ``"Meteorological observations"``
        or ``"Sea ice conditions"``, or contains ``"topsoil"`` in any case;
        ``True`` otherwise.
    """
    text = str(title)

    rejected_prefixes = ("Meteorological observations", "Sea ice conditions")
    if text.startswith(rejected_prefixes):
        return False

    # Titles mentioning "core" are accepted: the check which used to
    # reject them is disabled.
    return "topsoil" not in text.lower()


def add_file_extension(row):
    """
    Return the image filename of a record, with a file extension appended if needed.

    Parameters
    ----------
    row : dict
        A dict record which may have fields ``"image"``, ``"File format"``, ``"File type"``.

    Returns
    -------
    fname : str
        File name with extension included. Empty if the record has no
        ``"image"`` field, or the field is empty or not a string.
    """
    if "image" not in row.keys():
        return ""
    fname = row["image"]
    if not fname or not isinstance(fname, str):
        return ""

    suffix = os.path.splitext(fname)[-1]
    recognised = (
        checker.VALID_IMG_EXTENSIONS
        + checker.INVALID_FILE_EXTENSIONS
        + checker.COMPRESSED_FILE_EXTENSIONS
    )
    if suffix.lower() in recognised:
        # The name already ends in an extension we know about
        return fname

    # Otherwise take the extension from the first format/type field which
    # holds a non-empty string.
    for col in ("File format", "File type"):
        if col not in row.keys():
            continue
        declared = row[col]
        if not declared or not isinstance(declared, str):
            continue
        declared = "." + declared.strip().lstrip(".")
        if suffix == declared:
            return fname
        return fname + declared

    return fname


def datetime2timestamp(ts):
    """
    Convert a datetime string to a timestamp.

    Parameters
    ----------
    ts : str
        Datetime string, in a format understood by ``dateutil.parser``.
        Any non-string value is passed through untouched.

    Returns
    -------
    float
        Timestamp; number of seconds since Unix epoch. When the parsed
        datetime carries no timezone, ``datetime.timestamp()`` interprets it
        as local time of the machine running the code.
    """
    if not isinstance(ts, str):
        return ts
    parsed = dateutil.parser.parse(ts)
    return parsed.timestamp()


def reformat_df(df, remove_duplicate_columns=True):
    """
    Reformat/clean pangaea dataset.

    Rename columns to canonical names, drop superfluous columns, and add the
    derived ``image``, ``timestamp`` and ``taxonomy`` columns where possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Original dataset. It is not modified.
    remove_duplicate_columns : bool, default True
        Whether to drop columns whose name is the name of another column
        followed by ``" "`` or ``"_"`` and a single digit 1-9
        (e.g. ``"Depth_2"`` when ``"Depth"`` is also present).

    Returns
    -------
    df : pandas.DataFrame or None
        Cleaned dataset, or ``None`` if the dataset is invalid (its title is
        rejected by ``check_title``, or it has no URL column).
    """

    if (
        "dataset_title" in df
        and len(df) > 0
        and df.iloc[0]["dataset_title"]
        and not check_title(df.iloc[0]["dataset_title"])
    ):
        return None

    # Make a copy of the dataframe so we can't overwrite the input
    df = df.copy()

    # Remove bad columns
    df.drop(labels=["-"], axis="columns", inplace=True, errors="ignore")
    # Remove duplicately named columns
    if remove_duplicate_columns:
        duplicate_cols = [
            col
            for col in df.columns
            if len(col) >= 2
            and col[-2] in " _"
            and col[-1] in "123456789"
            and col[:-2] in df.columns
        ]
        df.drop(labels=duplicate_cols, axis="columns", inplace=True)

    # Find the correct URL column, and drop other columns containing "url"
    col_url = find_url_column(df)
    if col_url is None:
        # Without a URL column there is nothing to rename to "url"; putting
        # None in the mapping would only make the rename below raise KeyError.
        return None
    mapping = {col_url: "url"}
    cols_to_drop = [
        col for col in df.columns if col != col_url and "url" in col.lower()
    ]

    # Search for matches to canonical columns.
    # Each entry in desired_columns is a key, value pair where the key
    # is the output column name, and the value is a list of search names
    # in order of priority. The first match will be kept and others discarded.
    desired_columns = {
        "dataset": ["ds_id", "dataset", "Campaign", "campaign"],
        "site": ["Event", "event", "Site", "site", "deployment"],
        "image": ["image", "filename"],
        "timestamp": ["Timestamp"],
        "latitude": ["Latitude", "latitude", "lat", "latitude+"],
        "longitude": ["Longitude", "longitude", "lon", "long", "longitude+"],
        "x_pos": [],
        "y_pos": [],
        "altitude": ["altitude", "height"],
        "depth": [
            "depthwater",
            "bathydepth",
            "bathymetry",
            "bathy",
            "depth",
            "elevation",
        ],
        "backscatter": [],
        "temperature": ["temperature", "temp"],
        "salinity": ["salinity", "sal"],
        "chlorophyll": [],
        "acidity": ["pH"],
        "doi": ["DOI", "doi"],
    }
    raw_cols = list(df.columns)
    # Lower-cased column names with padding characters (space, hyphen,
    # underscore, full stop) removed
    clean_cols = [
        col.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "")
        for col in raw_cols
    ]

    # Search for matching column names
    for canon, searches in desired_columns.items():
        # Candidate columns from most to least strict comparison:
        # 1. search name equals the raw column name
        # 2. search name equals the cleaned column name
        # 3. lower-cased search name equals the cleaned column name
        matches = [search for search in searches if search in raw_cols]
        matches += [
            raw_cols[clean_cols.index(search)]
            for search in searches
            if search in clean_cols
        ]
        matches += [
            raw_cols[clean_cols.index(search.lower())]
            for search in searches
            if search.lower() in clean_cols
        ]

        found = False
        for col in matches:
            if col == col_url:
                # The URL column (which may itself be called "image") is
                # already spoken for and must keep its mapping to "url".
                continue
            if not found:
                found = True
                mapping[col] = canon
                # A pre-existing column with the canonical name would clash
                # with the renamed one; drop it, unless it is the URL column,
                # which is renamed out of the way instead.
                if col != canon and canon in df.columns and canon != col_url:
                    cols_to_drop.append(canon)
            elif col not in mapping and col not in cols_to_drop:
                cols_to_drop.append(col)

    # Remove superfluous columns
    df.drop(labels=cols_to_drop, axis="columns", inplace=True)
    # Rename columns to canonical names
    df.rename(columns=mapping, inplace=True, errors="raise")

    # Add file extension to image
    df["image"] = df.apply(add_file_extension, axis=1)
    if "timestamp" not in df.columns and "Date/Time" in df.columns:
        df["timestamp"] = df["Date/Time"].apply(datetime2timestamp)

    # Collapse taxonomic rank columns into a single taxonomy column.
    # Rank columns are recognised as written in TAXONOMY_RANKS or lower-cased,
    # which are the two spellings row2taxonomy looks up.
    taxonomy_names = {name for synonyms in TAXONOMY_RANKS for name in synonyms}
    taxonomy_names |= {name.lower() for name in taxonomy_names}
    taxonomy_cols = [col for col in df.columns if col in taxonomy_names]
    if any(col.lower() in ("kingdom", "phylum", "genus") for col in taxonomy_cols):
        # Blank out missing cells and force strings, so that only text is
        # ever joined into the taxonomy string.
        taxa = df[taxonomy_cols].fillna("").astype(str)
        df["taxonomy"] = taxa.apply(row2taxonomy, axis=1)
        df.drop(labels=taxonomy_cols, axis="columns", inplace=True)

    cols_to_drop = [
        "File format",
        "File type",
        "File size",
        "Date/Time",
        "Date/time end",
    ]
    df.drop(labels=cols_to_drop, axis="columns", inplace=True, errors="ignore")

    return df

## Load data with dataset cleaning functions applied

In [None]:
# Same tally as before, but over the cleaned column names, while also
# collecting the cleaned datasets themselves.
column_count = defaultdict(int)
column_examples = defaultdict(list)
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0
dfs = []
dfs_fnames = []
url_col = "url"  # name of the URL column after cleaning

# os.listdir() returns entries in arbitrary order. Sort them so that the
# order of `dfs` (and hence which row a later drop_duplicates keeps) is the
# same on every run.
for fname in tqdm(sorted(os.listdir(dirname))):
    ds_id = os.path.splitext(fname)[0]
    df = pd.read_csv(os.path.join(dirname, fname))
    n_total += 1
    if not checker.has_url_col(df):
        continue
    if not find_url_column(df):
        files_without_url.append(fname)
        continue
    df["ds_id"] = f"pangaea-{ds_id}"
    df = reformat_df(df)
    if df is None:
        continue
    n_valid += 1
    dfs.append(df)
    dfs_fnames.append(fname)
    for col in df.columns:
        column_count[col] += 1
        column_examples[col].append(fname)
    # Only real URLs can be "repeated": missing (NaN) and empty entries are
    # excluded so that several rows without a URL are not reported as repeats.
    urls = df[url_col].dropna()
    urls = urls[urls != ""]
    if urls.duplicated().any():
        files_with_repeat_urls.append(fname)

In [None]:
# Report how many datasets survived cleaning, then the cleaned column names
# from most to least frequent.
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()
print(f"There are {len(column_count)} unique column names:")
print()

for col, count in sorted(
    column_count.items(), key=lambda item: item[1], reverse=True
):
    # Pad the name with dots up to a fixed width so the counts line up
    label = col + " "
    print(f"{label:.<35s} {count:4d}")

### Merge datasets together

In [None]:
select_cols = {
    "dataset",
    "site",
    "url",
    "image",
    "timestamp",
    "latitude",
    "longitude",
    "x_pos",
    "y_pos",
    "altitude",
    "depth",
    "backscatter",
    "temperature",
    "salinity",
    "chlorophyll",
    "acidity",
}

df_all = pd.concat([df[df.columns.intersection(select_cols)] for df in dfs])

In [None]:
df_all

In [None]:
# Save all records
df_all.to_csv(
    f"../pangaea_{datetime.datetime.today().strftime('%Y-%m-%d')}.csv", index=False
)

In [None]:
# Filter down to only valid URLs
df_all = df_all[df_all["url"].apply(checker.is_url)]

In [None]:
len(df_all)

In [None]:
unique_url_bases = sorted(
    df_all["url"].apply(lambda x: "/".join(x.split("/")[:4])).unique()
)

In [None]:
len(unique_url_bases)

In [None]:
url_base = unique_url_bases[0]
df_all[df_all["url"].str.startswith(url_base)].iloc[[0, -1]]

In [None]:
# For every URL base print how many URLs it has and a few sample URLs
# (first, second, middle, second to last, last - as far as available).
# Counting starts at 1 so the "i/total" header matches the later listings.
for i, url_base in enumerate(unique_url_bases, start=1):
    print()
    sdf = df_all[df_all["url"].str.startswith(url_base)]
    print(
        "{:3d}/{} ({:7d} URLs), base {}".format(
            i, len(unique_url_bases), len(sdf), url_base
        )
    )
    print(sdf["url"].iloc[0])
    if len(sdf) > 2:
        print(sdf["url"].iloc[1])
    if len(sdf) > 4:
        print(sdf["url"].iloc[len(sdf) // 2])
    if len(sdf) > 3:
        print(sdf["url"].iloc[-2])
    if len(sdf) > 1:
        print(sdf["url"].iloc[-1])

In [None]:
# Check how many rows have lat & lon
sum(~df_all["latitude"].isna() & ~df_all["longitude"].isna())

In [None]:
len(df_all["site"].unique())

In [None]:
# Remove duplicate URLs
df_all = df_all.drop_duplicates(subset="url")

In [None]:
len(df_all)

In [None]:
# Check how many rows have lat & lon
sum(~df_all["latitude"].isna() & ~df_all["longitude"].isna())

In [None]:
len(df_all["site"].unique())

In [None]:
is_image = df_all["url"].apply(
    lambda x: checker.has_img_extension(x.rstrip("/"))
) | df_all["image"].apply(lambda x: checker.has_img_extension(x.rstrip("/")))
df_all = df_all[is_image]

In [None]:
len(df_all)

In [None]:
sum(~df_all["latitude"].isna() & ~df_all["longitude"].isna())

In [None]:
len(df_all["site"].unique())

In [None]:
unique_url_bases = sorted(
    df_all["url"].apply(lambda x: "/".join(x.split("/")[:4])).unique()
)

In [None]:
len(unique_url_bases)

In [None]:
for i, url_base in enumerate(unique_url_bases):
    print()
    sdf = df_all[df_all["url"].str.startswith(url_base)]
    print(
        "{:3d}/{} ({:7d} URLs), base {}".format(
            i + 1, len(unique_url_bases), len(sdf), url_base
        )
    )
    print(sdf["url"].iloc[0])
    if len(sdf) > 2:
        print(sdf["url"].iloc[1])
    if len(sdf) > 4:
        print(sdf["url"].iloc[len(sdf) // 2])
    if len(sdf) > 12:
        print(sdf["url"].iloc[9])
    if len(sdf) > 102:
        print(sdf["url"].iloc[99])
    if len(sdf) > 1002:
        print(sdf["url"].iloc[999])
    if len(sdf) > 10002:
        print(sdf["url"].iloc[9999])
    if len(sdf) > 3:
        print(sdf["url"].iloc[-2])
    if len(sdf) > 1:
        print(sdf["url"].iloc[-1])

In [None]:
def check_subdomain(url):
    """
    Check whether a URL lies outside the blacklisted locations.

    Parameters
    ----------
    url : str
        The URL to check.

    Returns
    -------
    bool
        ``False`` if the URL starts with one of the blacklisted prefixes,
        ``True`` otherwise.
    """
    blocked_prefixes = (
        "https://doi.org/10.1594/PANGAEA",
        "http://epic.awi.de/",
        "https://epic.awi.de/",
        "http://hdl.handle.net/10013/",
        "http://library.ucsd.edu/dc/object/",
        "https://hs.pangaea.de/Maps/",
        "https://hs.pangaea.de/Movies/",
        "https://hs.pangaea.de/Projects/",
        "https://hs.pangaea.de/bathy/",
        "https://hs.pangaea.de/fishsounder/",
        "https://hs.pangaea.de/mag/",
        "https://hs.pangaea.de/model/",
        "https://hs.pangaea.de/nav/",
        "https://hs.pangaea.de/palaoa/",
        "https://hs.pangaea.de/para/",
        "https://hs.pangaea.de/reflec/",
        "https://hs.pangaea.de/sat/",
        "https://prr.osu.edu/collection/object/",
        "https://store.pangaea.de/Projects/",  # Not all bad, but mostly
        "https://store.pangaea.de/Publications/",  # Not all bad, but mostly
        "https://store.pangaea.de/software/",
        "https://www.ngdc.noaa.gov/geosamples/",
    )
    # str.startswith accepts a tuple and is true if any prefix matches
    return not url.startswith(blocked_prefixes)

In [None]:
# Remove bad URLs based on their subdomain
df_all = df_all[df_all["url"].apply(check_subdomain)]

In [None]:
len(df_all)

In [None]:
unique_url_bases = sorted(
    df_all["url"].apply(lambda x: "/".join(x.split("/")[:5])).unique()
)

In [None]:
len(unique_url_bases)

In [None]:
for i, url_base in enumerate(unique_url_bases):
    print()
    sdf = df_all[df_all["url"].str.startswith(url_base)]
    print(
        "{:3d}/{} ({:7d} URLs), base {}".format(
            i + 1, len(unique_url_bases), len(sdf), url_base
        )
    )
    print(sdf["url"].iloc[0])
    if len(sdf) > 2:
        print(sdf["url"].iloc[1])
    if len(sdf) > 4:
        print(sdf["url"].iloc[len(sdf) // 2])
    if len(sdf) > 12:
        print(sdf["url"].iloc[9])
    if len(sdf) > 102:
        print(sdf["url"].iloc[99])
    if len(sdf) > 1002:
        print(sdf["url"].iloc[999])
    if len(sdf) > 10002:
        print(sdf["url"].iloc[9999])
    if len(sdf) > 3:
        print(sdf["url"].iloc[-2])
    if len(sdf) > 1:
        print(sdf["url"].iloc[-1])

In [None]:
import re

In [None]:
if re.search(
    "(?<![A-Za-z])map(?![A-Za-z])",
    "https://hs.pangaea.de/Images/ROV/M/M114/GeoB19346-1/data_publish/config/divemap/Dive360_map.jpg",
):
    print("true")

In [None]:
if re.search(
    "(?<![A-Za-z])map(?![A-Za-z])",
    "https://hs.pangaea.de/Images/ROV/M/M114/GeoB19346-1/data_publish/config/divemap/Dive360mapy.jpg",
):
    print("true")

In [None]:
def check_subdomain(url):
    """
    Check whether a URL is acceptable, going by its location and wording.

    This redefinition extends the prefix blacklist and adds keyword checks.

    Parameters
    ----------
    url : str
        The URL to check.

    Returns
    -------
    bool
        ``False`` if the URL starts with a blacklisted prefix, contains a
        banned word not adjoined by letters, refers to a core without
        containing ``"SUR"``, or contains ``"not_available"``;
        ``True`` otherwise.
    """
    blocked_prefixes = (
        "https://doi.org/10.1594/PANGAEA",
        "http://epic.awi.de/",
        "https://epic.awi.de/",
        "http://hdl.handle.net/10013/",
        "http://library.ucsd.edu/dc/object/",
        "https://app.geosamples.org/uploads/UHM",
        "https://hs.pangaea.de/Images/Linescan",
        "https://hs.pangaea.de/Maps/",
        "https://hs.pangaea.de//Maps",
        "https://hs.pangaea.de/Movies/",
        "https://hs.pangaea.de/Projects/",
        "https://hs.pangaea.de/bathy/",
        "https://hs.pangaea.de/fishsounder/",
        "https://hs.pangaea.de/mag/",
        "https://hs.pangaea.de/model/",
        "https://hs.pangaea.de/nav/",
        "https://hs.pangaea.de/palaoa/",
        "https://hs.pangaea.de/pasata/",
        "https://hs.pangaea.de/para/",
        "https://hs.pangaea.de/polar",
        "https://hs.pangaea.de/reflec/",
        "https://hs.pangaea.de/sat/",
        "https://prr.osu.edu/collection/object/",
        "https://store.pangaea.de/Projects/",  # Not all bad, but mostly
        "https://store.pangaea.de/Publications/",  # Not all bad, but mostly
        "https://store.pangaea.de/software/",
        "https://www.ngdc.noaa.gov/geosamples/",
        "https://hs.pangaea.de/Images/Airphoto",
        # "https://hs.pangaea.de/Images/Cores" is not listed: some of these are okay
        "https://hs.pangaea.de/Images/Documentation",
        "https://hs.pangaea.de/Images/Maps",
        "https://hs.pangaea.de/Images/MMT/",
        "https://hs.pangaea.de/Images/Plankton",
        # The GeoB19346-1 dataset contains .bmp images of the ROV's sonar
        "https://hs.pangaea.de/Images/ROV/M/M114/GeoB19346-1/data_publish/data/sonar/",
        "https://hs.pangaea.de/Images/Satellite",
        "https://hs.pangaea.de/Images/SeaIce",
        "https://hs.pangaea.de/Images/Water",
        "https://store.pangaea.de/Images/Airphoto",
        "https://store.pangaea.de/Images/Documentation",
    )
    if url.startswith(blocked_prefixes):
        return False

    url_lower = url.lower()

    # Banned words only count when they are not embedded in a longer run of
    # letters, e.g. "Dive360_map.jpg" is rejected but "Dive360mapy.jpg" is not.
    banned_words = ("map", "divemap", "dredge_photos", "dredgephotograph")
    if any(
        re.search("(?<![A-Za-z])" + word + "(?![A-Za-z])", url_lower)
        for word in banned_words
    ):
        return False

    # Images of cores must contain "SURFACE", or the shorthand "SUR".
    # Only upper-case occurrences are kept, because those experiments are
    # in-situ surface photos, whereas the lower-case ones are not.
    if "SUR" not in url and re.search("(?<![a-z])core(?![a-rty])", url_lower):
        return False

    return "not_available" not in url

In [None]:
# Remove bad URLs based on their subdomain (again), and remove maps
df_all = df_all[df_all["url"].apply(check_subdomain)]

In [None]:
len(df_all)

In [None]:
unique_url_bases = sorted(
    df_all["url"].apply(lambda x: "/".join(x.split("/")[:5])).unique()
)

In [None]:
len(unique_url_bases)

In [None]:
for i, url_base in enumerate(unique_url_bases):
    print()
    sdf = df_all[df_all["url"].str.startswith(url_base)]
    print(
        "{:3d}/{} ({:7d} URLs), base {}".format(
            i + 1, len(unique_url_bases), len(sdf), url_base
        )
    )
    print(sdf["url"].iloc[0])
    if len(sdf) > 2:
        print(sdf["url"].iloc[1])
    if len(sdf) > 4:
        print(sdf["url"].iloc[len(sdf) // 2])
    if len(sdf) > 12:
        print(sdf["url"].iloc[9])
    if len(sdf) > 102:
        print(sdf["url"].iloc[99])
    if len(sdf) > 1002:
        print(sdf["url"].iloc[499])
        print(sdf["url"].iloc[999])
    if len(sdf) > 10002:
        print(sdf["url"].iloc[4999])
        print(sdf["url"].iloc[9999])
    if len(sdf) > 3:
        print(sdf["url"].iloc[-2])
    if len(sdf) > 1:
        print(sdf["url"].iloc[-1])

In [None]:
df_all[df_all["url"].apply(lambda x: "mosaic" in x)]

In [None]:
# Drop mosaic images
df_all = df_all[~df_all["url"].apply(lambda x: "mosaic" in x)]

In [None]:
unique_extensions = sorted(
    df_all["url"].apply(lambda x: os.path.splitext(x)[1]).unique()
)

In [None]:
unique_extensions

In [None]:
for i, ext in enumerate(unique_extensions):
    print()
    sdf = df_all[df_all["url"].str.endswith(ext)]
    print(
        "{:3d}/{} ({:7d} URLs), extension {}".format(
            i + 1, len(unique_extensions), len(sdf), ext
        )
    )
    print(sdf["url"].iloc[0])
    if len(sdf) > 2:
        print(sdf["url"].iloc[1])
    if len(sdf) > 4:
        print(sdf["url"].iloc[len(sdf) // 2])
    if len(sdf) > 12:
        print(sdf["url"].iloc[9])
    if len(sdf) > 102:
        print(sdf["url"].iloc[99])
    if len(sdf) > 1002:
        print(sdf["url"].iloc[499])
        print(sdf["url"].iloc[999])
    if len(sdf) > 10002:
        print(sdf["url"].iloc[4999])
        print(sdf["url"].iloc[9999])
    if len(sdf) > 3:
        print(sdf["url"].iloc[-2])
    if len(sdf) > 1:
        print(sdf["url"].iloc[-1])

In [None]:
df_all

## Save unlabelled dataset

In [None]:
df_all.to_csv(
    f"../pangaea_{datetime.datetime.today().strftime('%Y-%m-%d')}_filtered.csv",
    index=False,
)

# Labelled data

## Percent Coverage

In [None]:
# Load, clean and filter every dataset which has a coverage column.
cov_dfs = []
# coverage_datasets is a set of strings, whose iteration order can differ
# between Python processes. Sort it so that cov_dfs (and the CSV files
# written from it) come out in the same order on every run.
for fname in tqdm(sorted(coverage_datasets)):
    ds_id = os.path.splitext(fname)[0]
    df = pd.read_csv(os.path.join(dirname, fname))
    if not checker.has_url_col(df):
        continue
    url_col = find_url_column(df)
    if not url_col:
        print(f"No url column for {fname} with columns\n{df.columns}")
        continue
    df["ds_id"] = f"pangaea-{ds_id}"
    # Numbered columns are kept here: the coverage labels use them
    # (e.g. "Acropora cov_2")
    df = reformat_df(df, remove_duplicate_columns=False)
    if df is None or len(df) == 0:
        continue
    # Apply the filters one at a time, giving up as soon as no row is left,
    # so that no filter is ever evaluated on an empty frame.
    df = df[df["url"].notna()]
    if len(df) == 0:
        continue
    df = df[df["url"].apply(check_subdomain)]
    if len(df) == 0:
        continue
    # Keep rows where either the URL or the image name has an image extension
    is_image = df["url"].apply(
        lambda x: checker.has_img_extension(x.rstrip("/"))
    ) | df["image"].apply(lambda x: checker.has_img_extension(x.rstrip("/")))
    df = df[is_image]
    if len(df) == 0:
        continue
    cov_dfs.append(df)

In [None]:
df_cov_all = pd.concat(cov_dfs)

In [None]:
display(df_cov_all)

In [None]:
# Count the coverage datasets, collect their DOIs, and note which of them
# contain the same URL more than once.
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0

dois = []
url_col = "url"

for idx, df in enumerate(cov_dfs):
    n_total += 1
    if url_col not in df.columns:
        print(f"Missing url column with columns\n{df.columns}")
        continue
    n_valid += 1
    dois.append(df.iloc[0]["doi"])
    subdf = df[df[url_col] != ""]
    if subdf[url_col].duplicated().any():
        # Identify the dataset by its own id column. There is no file name in
        # this loop; `fname` would be a leftover from an earlier cell.
        if "dataset" in df.columns:
            ds_label = df["dataset"].iloc[0]
        else:
            ds_label = f"cov_dfs[{idx}]"
        files_with_repeat_urls.append(ds_label)

for doi in sorted(dois):
    print(doi)

In [None]:
# Report the number of coverage datasets, then the column names found in
# them from most to least frequent.
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()

# Tally the columns of the coverage datasets themselves. `column_count`
# still holds the tally over all cleaned datasets and is left untouched.
cov_column_count = defaultdict(int)
for cov_df in cov_dfs:
    for col in cov_df.columns:
        cov_column_count[col] += 1

print(f"There are {len(cov_column_count)} unique column names:")
print()

for col, count in sorted(
    cov_column_count.items(), key=lambda item: item[1], reverse=True
):
    # Pad the name with dots up to a fixed width so the counts line up
    label = col + " "
    print(f"{label:.<35s} {count:4d}")

### Labelled data coverage columns (A) PANGAEA > Roelfsema et al.

Formed from the following 4 dataset "publication series":

- https://doi.pangaea.de/10.1594/PANGAEA.891711 (CC-BY-3.0)
- https://doi.pangaea.de/10.1594/PANGAEA.891736 (CC-BY-3.0)
- https://doi.pangaea.de/10.1594/PANGAEA.892623 (CC-BY-3.0)
- https://doi.pangaea.de/10.1594/PANGAEA.894801 (CC-BY-4.0)

All datasets have the same 61 coverage column labels:
```
    Acropora cov
    Acropora cov_2
    Acroporidae cov
    Acroporidae cov_2
    Acroporidae cov_3
    Acroporidae cov_4
    Montipora cov
    Montipora cov_2
    Acropora cov_3
    Acropora cov_4
    Acropora cov_5
    Acropora cov_6
    Favia cov
    Favia cov_2
    Coral indet cov
    Coral indet cov_2
    Coral indet cov_3
    Coral indet cov_4
    Pocilloporidae cov
    Pocilloporidae cov_2
    P. cylindrica cov
    P. cylindrica cov_2
    P. lichen cov
    P. lichen cov_2
    P. lobata cov
    P. lobata cov_2
    Coral indet cov_5
    Coral indet cov_6
    Coral indet cov_7
    Coral indet cov_8
    Alcyoniidae cov
    Alcyoniidae cov_2
    Gorgonia cov
    Gorgonia cov_2
    A. planci cov
    Invertebrata cov
    Soft corals oth cov
    Soft corals oth cov_2
    Invertebrata cov_2
    Sand cov
    Other cov
    Other cov_2
    Other cov_3
    Other cov_4
    Background cov
    Algae cov
    Benth microalgae cov
    Corall algae cov
    Corall algae cov_2
    Caulerpa sp. cov
    Chlorodesmis sp. cov
    Cyanobact cov
    Dictyota sp. cov
    Epith algal matrix cov
    Epith algal matrix cov_2
    Lobophora cov
    Halimeda sp. cov
    Sargassum sp. cov
    Padina sp. cov
    Turbinaria sp. cov
    Seagr cov
```
And all have metadata columns:
```
    image
    url
    latitude
    longitude
    Cover branch cor
    Cover branch cor_2
    site
    dataset_title
    doi
    dataset
    timestamp
```

Note that not all of the labels are manually generated. Abstract for one of them:
A subset of photoquadrats were uploaded to the CoralNet machine learning interface (https://coralnet.ucsd.edu/) and manually labelled for coral, algae or substrate type using 50 points per quadrat. Follow training of the machine, this enabled automatic annotation of all unclassified field images: the remaining field photos were uploaded to the database and 50 annotation points were overlaid on each of the images. Every point was assigned a benthic cover category from a label list automatically by the program. The resulting benthic cover data of each photo was linked to gps coordinates, saved as an ArcMap point shapefile, and projected to Universal Transverse Mercator WGS84 Zone 55 South.

In [None]:
# Select the coverage datasets of series (A): those with both a
# "seagr cov" and an "acropora cov" column (compared in lower case).
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0

dois = []

first = True

cov_dfs_sub = []
# Start from None so a value left over from an earlier cell is never
# reported below when no dataset is selected here.
last_df = None
url_col = "url"

for idx, df in enumerate(cov_dfs):
    lower_cols = [c.lower() for c in df.columns]
    if "seagr cov" not in lower_cols or "acropora cov" not in lower_cols:
        continue
    n_total += 1
    if url_col not in df.columns:
        print(f"Missing url column with columns\n{df.columns}")
        continue
    n_valid += 1
    dois.append(df.iloc[0]["doi"])
    cov_dfs_sub.append(df)
    subdf = df[df[url_col] != ""]
    if subdf[url_col].duplicated().any():
        # Identify the dataset by its own id column. There is no file name in
        # this loop; `fname` would be a leftover from an earlier cell.
        if "dataset" in df.columns:
            ds_label = df["dataset"].iloc[0]
        else:
            ds_label = f"cov_dfs[{idx}]"
        files_with_repeat_urls.append(ds_label)
    if first:
        # Show the columns of the first selected dataset:
        # metadata columns first, then coverage columns
        for c in df.columns:
            if " cov" not in c:
                print(c)
        for c in df.columns:
            if " cov" in c:
                print(c)
        first = False
    last_df = df

for doi in sorted(dois):
    print(doi)

# Show the columns of the last selected dataset, for comparison with the first
if last_df is not None:
    for c in last_df.columns:
        if " cov" not in c:
            print(c)
    for c in last_df.columns:
        if " cov" in c:
            print(c)

In [None]:
# Report the number of selected coverage datasets, then the column names
# found in them from most to least frequent.
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()

# Tally the columns of the selected datasets themselves. `column_count`
# still holds the tally over all cleaned datasets and is left untouched.
cov_column_count = defaultdict(int)
for cov_df in cov_dfs_sub:
    for col in cov_df.columns:
        cov_column_count[col] += 1

print(f"There are {len(cov_column_count)} unique column names:")
print()

for col, count in sorted(
    cov_column_count.items(), key=lambda item: item[1], reverse=True
):
    # Pad the name with dots up to a fixed width so the counts line up
    label = col + " "
    print(f"{label:.<35s} {count:4d}")

In [None]:
cov_df_sub = pd.concat(cov_dfs_sub)

cov_df_sub.to_csv(
    f"../pangaea_coverage-a_{datetime.datetime.today().strftime('%Y-%m-%d')}.csv",
    index=False,
)

display(cov_df_sub)

### Labelled data coverage columns (B) PANGAEA > Roelfsema et al.

All datasets in the publication series
https://doi.pangaea.de/10.1594/PANGAEA.846147 (CC-BY-3.0)

Sited at Eastern Banks, Moreton Bay.

Published in https://doi.org/10.1038/sdata.2015.40

Note
The 2015 data was additional data that was collected in the same manner as the other years. Although it was not used in the publication, we believed that it should be added to the data set as it expands the collection for anyone wanting to use the data in their own way.

In [None]:
# Subset (B): keep the datasets in `cov_dfs` that have a "seagr cov" column
# but NO "acropora cov" column (compared case-insensitively).
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0

dois = []
cov_dfs_sub = []

url_col = "url"
first_df = None
last_df = None

for df in cov_dfs:
    lower_cols = [c.lower() for c in df.columns]
    if "seagr cov" not in lower_cols or "acropora cov" in lower_cols:
        continue
    n_total += 1
    # Identify the dataset by the DOI stored in its first row.
    doi = df.iloc[0]["doi"]
    # Check that the column really exists; testing the truthiness of the
    # constant column name could never detect a missing column.
    if url_col not in df.columns:
        print(f"Missing url column with columns\n{df.columns}")
        files_without_url.append(doi)
        continue
    n_valid += 1
    dois.append(doi)
    cov_dfs_sub.append(df)
    # NOTE(review): missing (NaN) URLs pass the `!= ""` filter and count as
    # duplicates of each other -- confirm this is intended.
    nonempty_urls = df.loc[df[url_col] != "", url_col]
    if nonempty_urls.duplicated().any():
        # Record this dataset's own identifier; `fname` is not bound by this
        # loop and would refer to whatever an earlier cell left behind.
        files_with_repeat_urls.append(doi)
    if first_df is None:
        first_df = df
    last_df = df

# Columns of the first matching dataset: non-coverage columns, then coverage.
if first_df is not None:
    for want_cov in (False, True):
        for c in first_df.columns:
            if (" cov" in c) == want_cov:
                print(c)

for doi in sorted(dois):
    print(doi)

# Columns of the last matching dataset, in the same two groups.
# Guarded so that an empty subset neither raises nor prints another cell's frame.
if last_df is not None:
    for want_cov in (False, True):
        for c in last_df.columns:
            if (" cov" in c) == want_cov:
                print(c)

In [None]:
# Summary of subset (B). Reads n_valid, n_total, files_with_repeat_urls and
# cov_dfs_sub as left by the preceding filter cell.
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()

# Tally column names over this subset only. The notebook-wide `column_count`
# is not updated by the subset filter loop, so reporting it here would show
# stale figures unrelated to the datasets counted above.
subset_column_count = defaultdict(int)
for sub_df in cov_dfs_sub:
    for col in sub_df.columns:
        subset_column_count[col.lower().strip()] += 1

print(f"There are {len(subset_column_count)} unique column names:")
print()

# Most frequent columns first; names are dot-padded to 35 characters.
for col, count in sorted(
    subset_column_count.items(), key=lambda item: item[1], reverse=True
):
    label = col + " "
    print(f"{label:.<35s} {count:4d}")

In [None]:
# Stack every dataset of subset (B) into one table.
cov_df_sub = pd.concat(cov_dfs_sub)

# Write the table next to the repository root, stamped with today's date
# (YYYY-MM-DD); the row index is not written.
export_stamp = datetime.datetime.today().strftime("%Y-%m-%d")
export_path = f"../pangaea_coverage-b_{export_stamp}.csv"
cov_df_sub.to_csv(export_path, index=False)

display(cov_df_sub)

### Labelled data coverage columns (C) Other

The rest: 3 datasets from Antarctica, the Arctic, and Chile.

- https://doi.pangaea.de/10.1594/PANGAEA.839225 (CC-BY-3.0)
- https://doi.pangaea.de/10.1594/PANGAEA.841459 (CC-BY-3.0)
- https://doi.pangaea.de/10.1594/PANGAEA.897047 (CC-BY-4.0)
    - https://aslopubs.onlinelibrary.wiley.com/doi/10.1002/lno.11187

They are all interested in Bryozoa, but otherwise there isn't any overlap in labels.

In [None]:
# Subset (C): keep the datasets in `cov_dfs` that have NO "seagr cov" column
# (compared case-insensitively). Presence of "acropora cov" does not affect
# the selection.
files_without_url = []
files_with_repeat_urls = []
n_total = 0
n_valid = 0

dois = []
cov_dfs_sub = []

url_col = "url"
first_df = None
last_df = None

for df in cov_dfs:
    lower_cols = [c.lower() for c in df.columns]
    if "seagr cov" in lower_cols:
        continue
    n_total += 1
    # Identify the dataset by the DOI stored in its first row.
    doi = df.iloc[0]["doi"]
    # Check that the column really exists; testing the truthiness of the
    # constant column name could never detect a missing column.
    if url_col not in df.columns:
        print(f"Missing url column with columns\n{df.columns}")
        files_without_url.append(doi)
        continue
    n_valid += 1
    dois.append(doi)
    cov_dfs_sub.append(df)
    # NOTE(review): missing (NaN) URLs pass the `!= ""` filter and count as
    # duplicates of each other -- confirm this is intended.
    nonempty_urls = df.loc[df[url_col] != "", url_col]
    if nonempty_urls.duplicated().any():
        # Record this dataset's own identifier; `fname` is not bound by this
        # loop and would refer to whatever an earlier cell left behind.
        files_with_repeat_urls.append(doi)
    if first_df is None:
        first_df = df
    last_df = df

# Columns of the first matching dataset: non-coverage columns, then coverage.
if first_df is not None:
    for want_cov in (False, True):
        for c in first_df.columns:
            if (" cov" in c) == want_cov:
                print(c)

for doi in sorted(dois):
    print(doi)

# Columns of the last matching dataset, in the same two groups.
# Guarded so that an empty subset neither raises nor prints another cell's frame.
if last_df is not None:
    for want_cov in (False, True):
        for c in last_df.columns:
            if (" cov" in c) == want_cov:
                print(c)

In [None]:
# Summary of subset (C). Reads n_valid, n_total, files_with_repeat_urls and
# cov_dfs_sub as left by the preceding filter cell.
print(f"There are {n_valid} valid (of {n_total}) total datasets")
print(
    f"Of which {len(files_with_repeat_urls)} have repeated URLs (possibly multiple annotations)"
)
print()

# Tally column names over this subset only. The notebook-wide `column_count`
# is not updated by the subset filter loop, so reporting it here would show
# stale figures unrelated to the datasets counted above.
subset_column_count = defaultdict(int)
for sub_df in cov_dfs_sub:
    for col in sub_df.columns:
        subset_column_count[col.lower().strip()] += 1

print(f"There are {len(subset_column_count)} unique column names:")
print()

# Most frequent columns first; names are dot-padded to 35 characters.
for col, count in sorted(
    subset_column_count.items(), key=lambda item: item[1], reverse=True
):
    label = col + " "
    print(f"{label:.<35s} {count:4d}")

In [None]:
# Stack every dataset of subset (C) into one table.
cov_df_sub = pd.concat(cov_dfs_sub)

# Write the table next to the repository root, stamped with today's date
# (YYYY-MM-DD); the row index is not written.
export_stamp = datetime.datetime.today().strftime("%Y-%m-%d")
export_path = f"../pangaea_coverage-c_{export_stamp}.csv"
cov_df_sub.to_csv(export_path, index=False)

display(cov_df_sub)