In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append("..")
from pangaea_downloader.tools import datasets, eda

## Load files

In [None]:
# Directory holding the CSV files produced by the queries (relative to this notebook).
data_dir = "../query-outputs/"

# os.listdir() returns entries in arbitrary order, so sort the file names:
# this keeps df_list indices (and the tie-breaking inside sorted_dfs below)
# reproducible across machines and kernel restarts.
files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))
assert files, f"No CSV files found in directory '{data_dir}'."

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
df_list = [pd.read_csv(os.path.join(data_dir, f), low_memory=False) for f in files]

print(f"Total {len(df_list)} files loaded.")
# Largest tables first. sorted() is stable, so equally sized tables keep file-name order.
sorted_dfs = sorted(df_list, key=len, reverse=True)

## Check if these labeled datasets have already been downloaded

In [None]:
# Landing pages of datasets that are already known to carry labels.
known_labeled_datasets = [
    "https://doi.pangaea.de/10.1594/PANGAEA.846264",
    "https://doi.pangaea.de/10.1594/PANGAEA.846142",
    "https://doi.pangaea.de/10.1594/PANGAEA.846143",
    "https://doi.pangaea.de/10.1594/PANGAEA.846144",
    "https://doi.pangaea.de/10.1594/PANGAEA.846146",
    "https://doi.pangaea.de/10.1594/PANGAEA.846185",
    "https://doi.pangaea.de/10.1594/PANGAEA.846186",
    "https://doi.pangaea.de/10.1594/PANGAEA.846266",
    "https://doi.pangaea.de/10.1594/PANGAEA.867188",
]

# The dataset ID is whatever follows the last "." of the URL.
known_labeled_dsids = []
for url in known_labeled_datasets:
    known_labeled_dsids.append(url.rsplit(".", 1)[-1])

# List the ID/URL pairs, numbered from 1 for readability.
for i, lab_ds in enumerate(known_labeled_datasets):
    lab_ds_id = known_labeled_dsids[i]
    print(i + 1, f"Dataset ID: '{lab_ds_id}', URL: {lab_ds}")

We have to convert the DOIs/URLs to the same format before comparing them.

In [None]:
def url_from_doi(doi: str) -> str:
    """
    Convert a ``doi.org`` link into the equivalent ``doi.pangaea.de`` URL.

    Parameters
    ----------
    doi : str
        Link to a dataset, e.g. ``"https://doi.org/10.1594/PANGAEA.884805"``.
        Strings that already contain ``".pangaea.de"`` are returned unchanged.

    Returns
    -------
    str
        ``doi`` with its first ``".org"`` replaced by ``".pangaea.de"``.

    Raises
    ------
    ValueError
        If ``doi`` contains neither ``".pangaea.de"`` nor ``".org"``.
    """
    # Already in desired format
    if ".pangaea.de" in doi:
        return doi
    # Split on the first ".org" only, so a later ".org" in the path cannot
    # break the conversion.
    start, sep, end = doi.partition(".org")
    if not sep:
        raise ValueError(
            f"Cannot convert DOI to a PANGAEA URL (no '.org' found): {doi!r}"
        )
    return start + ".pangaea.de" + end

In [None]:
# Smoke-test url_from_doi on one real DOI taken from the first loaded table.
first_df = df_list[0]
# Some tables name the column "DOI" instead of "doi"; accept either spelling.
doi_col = "doi" if "doi" in first_df.columns else "DOI"
# Drop missing entries so that a leading NaN is never passed to url_from_doi.
test = first_df[doi_col].dropna().iloc[0]
print("DOI:", test)
print("URL:", url_from_doi(test))

Iterate over the downloaded datasets, reduce each DOI to its dataset ID (the part after the last `.`), and match it against the IDs of the known labeled datasets.

In [None]:
# errors : table index -> AttributeError raised because the "doi" column is missing
# results: dataset ID  -> True for every known labeled dataset that was downloaded
errors = dict()
results = dict()
datasets_cheked = []  # NOTE(review): never populated in this notebook; kept for compatibility
for i, df in enumerate(df_list):
    # Some tables name the column "DOI" instead of "doi".
    try:
        doi = df.doi.dropna().iloc[0]
    except AttributeError as a:
        errors[i] = a
        doi = df.DOI.dropna().iloc[0]
    # Runs only once a DOI was actually found. Doing this in a `finally`
    # clause would also execute when both lookups fail, i.e. with a stale or
    # undefined `doi`.
    ds_id = doi.split(".")[-1]
    # Compare dataset IDs (the part of the DOI after the last ".")
    if ds_id in known_labeled_dsids:
        results[ds_id] = True
results

In [None]:
# Collect the dataset ID of every downloaded table once, instead of
# re-extracting all of them for each known labeled dataset.
downloaded_dsids = set()
for df in sorted_dfs:
    # Some tables name the column "DOI" instead of "doi".
    try:
        doi = df.doi.dropna().iloc[0]
    except AttributeError:
        doi = df.DOI.dropna().iloc[0]
    # Outside of any `finally`, so it only runs when a DOI was actually found.
    downloaded_dsids.add(doi.split(".")[-1])

# Known labeled dataset URL -> True, for those that have been downloaded
# (listed in the order of known_labeled_datasets).
results = {
    lab_ds: True
    for lab_ds_id, lab_ds in zip(known_labeled_dsids, known_labeled_datasets)
    if lab_ds_id in downloaded_dsids
}
results

## Identify labeled datasets

Automated process for finding datasets with label columns in the format `species_cov` or `species cov`

In [None]:
# Column names that contain "cov" but are not label columns.
# Exact, case-sensitive matches:
EXCLUDED_EXACT = (
    "Cov",  # Covariance
    "Cov std e",  # Coverage standard error
)
# Exact matches after lower-casing:
EXCLUDED_LOWER = (
    "coverage",  # Time coverage
    "recov time",  # Recover time
    "ipc-cov",
    "cov std dev",
    "recov std dev",
)
# Substring matches after lower-casing:
EXCLUDED_SUBSTRINGS = (
    "recovery",  # Recovery time
    "ice cov",  # Ice cover (sea surface not seafloor)
    "canopy cover",
)
# NOTE(review): this dataset is skipped explicitly; the reason is not
# documented in the notebook.
SKIPPED_DOI = "https://doi.org/10.1594/PANGAEA.884805"


def is_label_col(col: str) -> bool:
    """Return True if ``col`` contains "cov" and matches none of the exclusions above."""
    lowered = col.lower()
    if "cov" not in lowered:
        return False
    if col in EXCLUDED_EXACT or lowered in EXCLUDED_LOWER:
        return False
    return not any(part in lowered for part in EXCLUDED_SUBSTRINGS)


# errors               : table index -> AttributeError from the primary column names
# labelled_datasets    : doi -> [n_images, title, label_cols, url_col]
# labelled_dataset_idxs: positions (in sorted_dfs) of the labelled datasets
errors = dict()
labelled_datasets = dict()
labelled_dataset_idxs = []
for i, df in enumerate(sorted_dfs):
    try:  # Extract info
        title = df.dataset_title.iloc[0]
        doi = df.doi.iloc[0]
    except AttributeError as a:
        # Fall back to the alternative column names
        errors[i] = a
        title = df.Dataset.iloc[0]
        doi = df.DOI.iloc[0]

    # The rest of the loop body is deliberately NOT in a `finally` clause.
    # There it would also run with stale values after a failed lookup, and a
    # `continue` inside `finally` discards the pending exception (and is a
    # SyntaxError before Python 3.8).
    if doi == SKIPPED_DOI:
        continue

    # Track labeled datasets with label columns in the format: `species_cov` or `species cov`
    label_cols = [col for col in df.columns if is_label_col(col)]
    if not label_cols:
        continue

    # Show data
    url_col = datasets.get_url_col(df)
    n_images = len(df[url_col].dropna())

    print(f"[{str(i).zfill(4)}] {title}")
    print(
        f"N images: {n_images} (col: '{url_col}'); N label columns: {len(label_cols)}. DOI: {doi}"
    )
    print("Label columns:", label_cols)
    labelled_datasets[doi] = [n_images, title, label_cols, url_col]
    labelled_dataset_idxs.append(i)
    print()

In [None]:
# Share of tables for which the primary column names raised an AttributeError.
# The ratio is scaled by 100 so that it matches the "%" sign in the message,
# and an empty table list does not trigger a division by zero.
n_tables = len(sorted_dfs)
mismatch_pct = round(100 * len(errors) / n_tables, 2) if n_tables else 0.0
print(f"[DEBUG] N column name mismatches: {len(errors)}, ({mismatch_pct}%)")
print(f"[INFO] N labelled datasets: {len(labelled_datasets)}")
# The first entry of each labelled_datasets value is the image count.
print(f"[INFO] N labelled images: {sum(v[0] for v in labelled_datasets.values())}")

Identify ice coverage datasets and check if they also have biota label columns. If not, they can be discarded.

In [None]:
# DOIs of the tables that have at least one "ice cov" column; `i` numbers them.
ice_cov_ds = []
i = 0
for df in sorted_dfs:
    # Some tables name the column "DOI" instead of "doi".
    try:
        doi = df.doi.iloc[0]
    except AttributeError:
        doi = df.DOI.iloc[0]

    ice_cov_cols = [name for name in df.columns if "ice cov" in name.lower()]
    if not ice_cov_cols:
        continue

    # Print every column so the presence of other label columns can be checked by eye.
    print(f"[{i}] {doi} : {df.columns}")
    ice_cov_ds.append(doi)
    i += 1

## Results

In [None]:
# Flatten the {doi: [n_images, title, label_cols, url_col]} mapping into one
# row per labelled dataset.
entries = list(labelled_datasets.items())
unsorted_results = pd.DataFrame(
    {
        "doi": [doi for doi, _ in entries],
        "n_images": [info[0] for _, info in entries],
        "dataset_title": [info[1] for _, info in entries],
        "url_col": [info[3] for _, info in entries],
        "label_cols": [info[2] for _, info in entries],
    }
)
# Datasets with the most images first, with a fresh 0..n-1 index.
results = unsorted_results.sort_values(by="n_images", ascending=False)
results = results.reset_index(drop=True)
print(f"Total {results.n_images.sum()} labelled images")
results

In [None]:
# Write the summary table to an Excel workbook, leaving out the row index.
output_path = "../pangaea-labelled-datasets.xlsx"
results.to_excel(output_path, index=False)

## Manual Process
- Analyze column names: We begin by analyzing the dataset columns.
- We also look at the column descriptions on the dataset webpage by clicking the DOI link.

In [None]:
errors = dict()
labaled_datasets = dict()  # NOTE(review): never filled in this cell
for i, df in enumerate(sorted_dfs):
    # Title and DOI of the table, falling back to the alternative column names.
    try:
        title = df.dataset_title.iloc[0]
        doi = df.doi.iloc[0]
    except AttributeError:
        title = df.Dataset.iloc[0]
        doi = df.DOI.iloc[0]

    # Tables picked up by the automated search are flagged with a leading ">>>".
    suffix = ">>>" if i in labelled_dataset_idxs else ""
    n_rows, n_cols = df.shape
    # One print call: three info lines followed by an empty separator line.
    print(
        f"{suffix}[{i}] {title}",
        f"Columns: {list(df.columns)}",
        f"Row: {n_rows}; Columns: {n_cols}. DOI: {doi}",
        "",
        sep="\n",
    )