# Download datasets using `PanQuery` search

In [None]:
import os
import sys

import pandas as pd
import pangaeapy

sys.path.append("..")
from pangaea_downloader import utilz

## 1. Search Pangaea

In [None]:
# Location that receives everything this notebook writes. Creating it here
# (a no-op when it already exists) lets later cells assume it is present.
out_dir = "../query-outputs"
os.makedirs(name=out_dir, exist_ok=True)

In [None]:
# Pages through the search so more results can be collected than a single
# request returns (the original note mentions a 500-result limit).
def search_pangaea(query="seabed photographs", n_results=999):
    """Collect all search results for *query* by paging with ``offset``.

    Repeatedly issues ``pangaeapy.PanQuery`` requests, advancing ``offset`` by
    the number of results received, until the number collected reaches the
    ``totalcount`` reported by the query.

    Args:
        query: Search string passed to ``PanQuery``.
        n_results: Value passed as ``limit`` on every request, i.e. the
            requested page size (not a cap on the total returned).

    Returns:
        A list with the items of every page's ``PanQuery.result``, in the
        order they were received.

    Raises:
        AssertionError: If the number of collected results differs from the
            reported ``totalcount``.
    """
    offset = 0
    results = []
    while True:
        pq = pangaeapy.PanQuery(query=query, limit=n_results, offset=offset)
        page = pq.result
        results.extend(page)
        offset += len(page)
        if len(results) >= pq.totalcount:
            break
        # An empty page means the offset can no longer advance; without this
        # exit the same request would be repeated forever.
        if not page:
            break
    # Sanity check
    assert len(results) == pq.totalcount, (
        f"Collected {len(results)} results but the search reports "
        f"{pq.totalcount}"
    )
    return results

In [None]:
# Query for seabed photograph datasets and report how many came back
results = search_pangaea(query="seabed photographs", n_results=999)
n_found = len(results)
print("Number of results returned:", n_found)

## 2. Analyze results

**Check if there are any duplicate results**

In [None]:
# Get DOIs for each result
result_dois = [result["URI"] for result in results]
# Every DOI should appear exactly once, so the list and its set of unique
# values should have the same length; any difference is the duplicate count.
n_duplicates = len(result_dois) - len(set(result_dois))
if n_duplicates == 0:
    print("There are no duplicate results.")
else:
    # Report the failure explicitly so a silent cell is never mistaken for a pass
    print(f"[WARNING] Found {n_duplicates} duplicate results.")

## 3. Fetch result datasets

In [None]:
# Number of datasets that reached the save step. It has to be initialised
# before the loop because it is incremented with `+=` and printed afterwards.
n_downloads = 0
for i, result in enumerate(results):
    # Extract result information
    citation, url, size, is_parent = utilz.get_result_info(result)
    # The dataset ID is whatever follows "PANGAEA." in the result URI
    ds_id = result["URI"].split("PANGAEA.")[-1]
    print(f"[{i+1}] Loading dataset: '{citation}'")

    # ------------- ASSESS DATASET TYPE ------------- #
    # The `size` string decides how (and whether) the dataset is fetched.
    df = None
    # Video dataset (ignore)
    if "bytes" in size:
        print("\t[WARNING] VIDEO dataset. Skipping...")
        continue

    # Paginated images (scrape urls and metadata)
    elif size == "unknown":
        df = utilz.scrape_images(url)

    # Parent dataset (fetch child datasets)
    elif "datasets" in size:
        df = utilz.fetch_child_datasets(url)

    # Tabular dataset (fetch and save)
    elif "data points" in size:
        df = utilz.fetch_dataset(url)

    # ----------------- SAVE TO FILE ----------------- #
    # Nothing to save: unrecognised size string, or the fetch helper
    # returned None.
    if df is None:
        continue

    f_name = ds_id + ".csv"
    path = os.path.join(out_dir, f_name)
    # NOTE(review): the write below is disabled, so nothing is actually
    # written even though the message says "Saved" -- confirm whether it
    # should be re-enabled.
    # df.to_csv(path, index=False)
    print(f"\t[INFO] Saved to '{path}'")
    n_downloads += 1
print(f"COMPLETE! Total files downloaded: {n_downloads}")

## 4. Check that all saved files have the desired image URL column

In [None]:
# Load all CSV files in output directory. Other entries (sub-directories,
# stray non-CSV files) are skipped because `pd.read_csv` would fail on them.
files = [file for file in sorted(os.listdir(out_dir)) if file.endswith(".csv")]
dfs = [pd.read_csv(os.path.join(out_dir, file)) for file in files]
# Check if they have the desired column, remembering which files do not
missing = [file for file, df in zip(files, dfs) if not utilz.has_url_col(df)]
if not files:
    # Avoid reporting success when there was nothing to check
    print(f"No CSV files found in '{out_dir}'")
elif missing:
    print(f"[WARNING] {len(missing)} file(s) without URL column: {missing}")
else:
    print("All downloaded files have URL column")