# Download datasets using `PanQuery` search

In [None]:
import os

import pandas as pd
import pangaeapy
from utilz import fetch_child_datasets, has_url_col

Make sure output directory exists

In [None]:
# Directory (relative to the notebook) that receives the query outputs
out_dir = "../query-outputs"
# exist_ok=True keeps re-running this cell from failing once the directory exists
os.makedirs(name=out_dir, exist_ok=True)

## Make search query

In [None]:
# Maximum number of search results to request
n_results = 25
# Free-text search string sent to the search
query = "seabed photographs"

In [None]:
# Run the search with the query string and result limit defined above
pq = pangaeapy.PanQuery(query=query, limit=n_results)

# Show the query URL with the spaces of the search string replaced by "+"
request_url = pq.PANGAEA_QUERY_URL + pq.query.replace(" ", "+")
print("Requested URL:", request_url)

# Results actually returned versus the total count reported by the query object
print("Number of results returned:", len(pq.result))
print("Total search results", pq.totalcount)

## Fetch datasets from search results

In [None]:
# Collect the "URI" entry (the DOI) of every search result
result_dois = [item["URI"] for item in pq.result]
# Build one PanDataSet per DOI, in the same order as the search results
datasets = [pangaeapy.PanDataSet(uri) for uri in result_dois]

**Note:** Using a list comprehension to build the list of `PanDataSet` objects is fine for a small number of datasets. For larger result sets, however, holding every dataset in memory at once may raise a `MemoryError`, so it's better to fetch and process each dataset one at a time in a loop instead.

### View characteristics of datasets

In [None]:
# Summarise the fetched datasets: one row per dataset, one column per attribute
data = dict(
    dataset=[d.title for d in datasets],
    doi=result_dois,
    is_parent=[d.isParent for d in datasets],
    data_shape=[d.data.shape for d in datasets],
    access=[d.loginstatus for d in datasets],
)
# Last expression of the cell, so the notebook renders the table
pd.DataFrame(data)

In [None]:
# Save every accessible dataset in `datasets` as "<dataset id>.csv" in `out_dir`.
# Parent datasets are expanded through `fetch_child_datasets`; standalone
# datasets get dataset/event metadata columns added to their own data table.
print("[INFO] Processing each dataset...")
for i, ds in enumerate(datasets):
    # Dataset ID = the part of the DOI after its last "."
    ds_id = ds.doi.split(".")[-1]
    print(f"[{i+1}] Dataset ID: {ds_id}. Title: '{ds.title}'")

    # Restricted datasets cannot be downloaded: report and move on
    if ds.loginstatus != "unrestricted":
        print(f"\t[ERROR] Dataset access restricted: '{ds.loginstatus}'")
        continue

    if ds.isParent:
        # Dataset has child datasets
        # NOTE(review): assumes fetch_child_datasets returns a DataFrame — confirm
        df = fetch_child_datasets(ds)
    else:
        # Does not have child datasets. The metadata below is taken from the
        # first event, so anything other than exactly one event is an error.
        # An explicit raise is used because `assert` is stripped under -O.
        n_events = len(ds.events)
        if n_events != 1:
            raise ValueError(
                f"\t[ERROR] Dataset has {n_events} events, expected exactly 1!"
            )
        # Add metadata
        ds.data["Dataset"] = ds.title
        ds.data["DOI"] = ds.doi
        ds.data["Campaign"] = ds.events[0].campaign.name
        ds.data["Site"] = ds.events[0].label
        df = ds.data

    # Actually write the file, so the "Saved to" message below is true and the
    # output directory has content for later inspection
    file = os.path.join(out_dir, ds_id + ".csv")
    df.to_csv(file, index=False)
    print(f"\t[INFO] Saved to '{file}'")

## Check if all saved files have the desired image URL column

In [None]:
# Load all files in output directory
files = os.listdir(out_dir)
dfs = [pd.read_csv(os.path.join(out_dir, file)) for file in files]
# Check if they have the desired column
all([has_url_col(df) for df in dfs])