# Download datasets using `PanQuery` search

In [None]:
import os
import sys

import pandas as pd
import pangaeapy

sys.path.append("..")
from pangaea_downloader import utilz

## 1. Search Pangaea

In [None]:
# Make sure output directory exists.
# The path is relative to the current working directory; exist_ok=True makes
# this cell safe to re-run when the directory is already there.
out_dir = "../query-outputs"
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Pages through the search so that more results can be collected than a single
# query returns (the original note mentions a 500-result limit per query).
def search_pangaea(query="seabed photographs", n_results=999):
    """Collect every search result for ``query`` by paging with an offset.

    Each request is a ``pangaeapy.PanQuery`` whose ``offset`` is advanced by
    the number of results received so far, until the number collected reaches
    the ``totalcount`` reported by the query.

    Parameters
    ----------
    query : str
        Search string passed to ``PanQuery``.
    n_results : int
        Passed to ``PanQuery`` as ``limit``, i.e. the number of results
        requested per page.

    Returns
    -------
    list
        The concatenated ``result`` entries of every page.

    Raises
    ------
    AssertionError
        If the number of collected results differs from ``totalcount``.
    """
    offset = 0
    results = []
    while True:
        pq = pangaeapy.PanQuery(query=query, limit=n_results, offset=offset)
        page = pq.result
        results.extend(page)
        offset += len(page)
        # An empty page means the offset can no longer advance; without this
        # guard the loop would repeat the same request forever.
        if not page or len(results) >= pq.totalcount:
            break
    # Sanity check (raised explicitly so it is not stripped under `python -O`)
    if len(results) != pq.totalcount:
        raise AssertionError(
            f"Collected {len(results)} results but the search reported "
            f"{pq.totalcount}"
        )
    return results

In [None]:
# Run search (the arguments match the function's defaults).
# `results` is the list that the following cells analyze and process.
results = search_pangaea(query="seabed photographs", n_results=999)
print("Number of results returned:", len(results))

## 2. Analyze results
Before fetching and processing each of the result datasets, we will first try to analyze them without loading them into memory.

#### 2.1 Check if there are any duplicate results

In [None]:
# Get DOIs for each result
result_dois = [result["URI"] for result in results]
# Every DOI should appear exactly once, so the list and the set of its
# unique values must have the same length.
n_duplicates = len(result_dois) - len(set(result_dois))
if n_duplicates == 0:
    print("NO DUPLICATES!")
else:
    # Report the failure case too; staying silent hides duplicated results.
    print(f"[WARNING] {n_duplicates} duplicate result(s) found!")

#### 2.2 Detect the type of each result dataset
- The `size` description of each dataset can be extracted from the `result['html']` field.
- We can determine the type of the dataset (parent, child, video, paginated) from this size description.

In [None]:
def ds_type(size: str) -> str:
    """Classify a dataset by the wording of its size description.

    Checks are applied in a fixed order and the first match wins:
    a description containing "bytes" is a "Video", the exact string
    "unknown" is "Paginated", one containing "datasets" is a "Parent" and
    one containing "data points" is a "Child". Any other description
    yields ``None``.
    """
    if "bytes" in size:
        return "Video"
    if size == "unknown":
        return "Paginated"
    # Remaining types are recognised by a substring, tested in this order.
    for marker, label in (("datasets", "Parent"), ("data points", "Child")):
        if marker in size:
            return label
    return None

In [None]:
# Test: classify the first search result.
# Index 2 of the tuple returned by get_result_info is the size description.
ds_type(utilz.get_result_info(results[0])[2])

#### 2.3 Make a dataframe describing each of the search results

In [None]:
# Column order of the summary table. Passing it explicitly keeps the "size"
# column present even when the search returned no results, so the type
# lookup below does not fail on an empty frame.
dataset_columns = ["doi", "citation", "size", "is_parent"]
dataset_records = []
for result in results:
    # The url is not needed for the summary table
    citation, _, size, is_parent = utilz.get_result_info(result)
    dataset_records.append(
        {
            "doi": result["URI"],
            "citation": citation,
            "size": size,
            "is_parent": is_parent,
        }
    )
datasets = pd.DataFrame(dataset_records, columns=dataset_columns)
# Derive the dataset type from each size description
datasets["type"] = datasets["size"].apply(ds_type)

In [None]:
# Show preview: rows with index labels 270 through 280, all columns
# (.loc slices by label and includes both endpoints)
datasets.loc[270:280, :]

## 3. Process result datasets

### 3.1 Process and examine 1 sample dataset

In [None]:
# Arbitrary sample: the index is hard-coded, not randomly drawn
idx = 34
# Fetch the sample dataset by its result URI and show its citation
ds = pangaeapy.PanDataSet(results[idx]["URI"])
print(ds.citation)

### 3.2 Process all search result datasets

In [None]:
# Loader for each dataset type that yields a dataframe. Video datasets and
# unrecognised types have no entry here and are skipped in the loop.
loaders = {
    "Paginated": utilz.scrape_images,  # paginated images: scrape urls and metadata
    "Parent": utilz.fetch_child_datasets,  # parent dataset: fetch child datasets
    "Child": utilz.fetch_dataset,  # tabular dataset
}

n_downloads = 0
result_data = []
for i, result in enumerate(results):
    # Extract result information
    citation, url, size, is_parent = utilz.get_result_info(result)
    print(f"[{i+1}] Loading dataset: '{citation}'")

    # ------------- ASSESS DATASET TYPE ------------- #
    df = None
    typ = ds_type(size)

    # Video dataset (ignore)
    if typ == "Video":
        print("\t[WARNING] VIDEO dataset. Skipping...")
        continue

    loader = loaders.get(typ)
    if loader is not None:
        df = loader(url)

    # Nothing was loaded (unrecognised type, or the loader returned None)
    if df is None:
        continue

    # --------------- RECORD SUMMARY ---------------- #
    # NOTE(review): no file is written in this cell, although a later section
    # reads files from the output directory - confirm where they get saved.
    result_data.append(
        {
            "doi": result["URI"],
            "citation": citation,
            "size": size,
            "is_parent": is_parent,
            # Total count of NaN cells across the whole dataframe
            "missing_values": df.isna().sum().sum(),
        }
    )
    n_downloads += 1
print(f"COMPLETE! Total files processed: {n_downloads}")

In [None]:
# Turn the collected per-dataset summaries (a list of dicts) into a DataFrame
result_data = pd.DataFrame(result_data)
# Bare expression on the last line so the table is displayed as cell output
result_data

## 4. Check that all saved files have the desired image URL column

In [None]:
# Load all files in output directory. Entries that are not regular files
# (e.g. sub-directories) cannot be parsed by read_csv, so they are skipped.
files = [
    name
    for name in os.listdir(out_dir)
    if os.path.isfile(os.path.join(out_dir, name))
]
dfs = [pd.read_csv(os.path.join(out_dir, file)) for file in files]
# Check if they have the desired column
if not dfs:
    # all() of an empty sequence is True, which would wrongly report success
    print("No files found in output directory!")
elif all(utilz.has_url_col(df) for df in dfs):
    print("All downloaded files have URL column")
else:
    print("Some files are missing URL column!")