# Exploring the Pangaea Python API

In [None]:
from pprint import pprint

import pangaeapy

## Checking out the `PanDataSet` class

In [None]:
# Load a dataset by passing its DOI string to PanDataSet
ds = pangaeapy.PanDataSet("10.1594/PANGAEA.618833")
print(ds.citation)

# Unpack the data table's dimensions once for the summary line
n_rows, n_cols = ds.data.shape
print(f"\nData shape: {n_rows} rows x {n_cols} columns")

# Last expression of the cell: preview the first rows of the data table
ds.data.head()

#### `PanDataSet` object attributes
- The campaign/cruise of the dataset is `ds.events[0].campaign.name`

- The site/event/station/deployment is `ds.events[0].label`

In [None]:
# Print each dataset attribute as "Dataset <attribute name>: <value>".
# Attributes are read lazily, one per print, in the listed order.
for attr in ("id", "uri", "doi", "title", "year"):
    print(f"Dataset {attr}:", getattr(ds, attr))

# Authors are objects, so show their full names instead of the raw objects
print("Dataset authors:", [author.fullname for author in ds.authors])

for attr in (
    "mintimeextent",
    "maxtimeextent",
    "loginstatus",
    "isParent",
    "children",
    "moratorium",
    "datastatus",
    "registrystatus",
):
    print(f"Dataset {attr}:", getattr(ds, attr))

#### Dataset `PanEvent` object attributes
Most datasets have a single event in their events list.

In [None]:
# Display the dataset's list of events
ds.events

In [None]:
# Inspect the first event of the dataset; bind it once instead of
# indexing ds.events for every attribute.
event = ds.events[0]

print("Event basis:", event.basis)
print("Event datetime:", event.datetime)
print("Event datetime2:", event.datetime2)
print("Event device:", event.device)
print("Event label:", event.label)
print("Event location:", event.location)
print("Event latitude:", event.latitude)
print("Event latitude2:", event.latitude2)
print("Event longitude:", event.longitude)
print("Event longitude2:", event.longitude2)
print("Event elevation:", event.elevation)

#### Dataset event `PanCampaign` object attributes

In [None]:
# Campaign attached to the first event; it may be None
campaign = ds.events[0].campaign

if campaign is not None:
    print("Event campaign BSHID:", campaign.BSHID)
    print("Event campaign start:", campaign.start)
    print("Event campaign startlocation:", campaign.startlocation)
    print("Event campaign end:", campaign.end)
    print("Event campaign endlocation:", campaign.endlocation)
    print("Event campaign expeditionprogram:", campaign.expeditionprogram)
    print("Event campaign name:", campaign.name)
    print("Event campaign URI:", campaign.URI)
else:
    # Nothing to report when the event carries no campaign
    print("NO campaign info!")

#### Dataset `PanParam` object attributes

In [None]:
# Display the dataset's parameters mapping (parameter key -> parameter object)
ds.params

In [None]:
# DataSet Parameters
# Walk the key/parameter pairs directly instead of iterating the keys and
# looking the same parameter up again for every attribute.
for key, param in ds.params.items():
    # Header line: the key followed by the parameter object's string form
    print(f"'{key}': {param}")
    # One tab-indented line per parameter attribute
    print(f"\t{key} id:", param.id)
    print(f"\t{key} name:", param.name)
    print(f"\t{key} shortName:", param.shortName)
    print(f"\t{key} synonym:", param.synonym)
    print(f"\t{key} type:", param.type)
    print(f"\t{key} source:", param.source)
    print(f"\t{key} unit:", param.unit)
    print(f"\t{key} format:", param.format)
    print(f"\t{key} terms:", param.terms)

#### Checking another dataset using only the dataset ID (the full DOI isn't needed)

In [None]:
# Load a dataset by passing its numeric dataset ID to PanDataSet
ds = pangaeapy.PanDataSet(371064)
print(ds.citation)

# Unpack the data table's dimensions once for the summary line
n_rows, n_cols = ds.data.shape
print(f"\nData shape: {n_rows} rows x {n_cols} columns")

# Last expression of the cell: preview the first rows of the data table
ds.data.head()

In [None]:
# Display the dataset's isParent attribute
ds.isParent

In [None]:
# Display the whole data table rather than only its first rows
ds.data

In [None]:
# Print the DOI of the dataset that was fetched by its numeric ID
print(ds.doi)

The dataframe for this dataset is empty, and if we check the dataset on the Pangaea website, we can see that the dataset size is 'unknown'. After clicking 'Download dataset', we can see that the images are hosted on the website across several pages, rather than the image URLs being provided in a table format.

## Checking out the `PanQuery` class

In [None]:
# Run a free-text search against Pangaea, requesting up to 999 results
search_terms = "seafloor images"
query = pangaeapy.PanQuery(query=search_terms, limit=999)

# Compare the number of matches reported with the number actually returned
print("Total search results:", query.totalcount)
returned = query.result
print("Results returned:", len(returned))

As we can see, despite 596 results being available, `PanQuery` returns a maximum of 500 results.

In [None]:
# List the keys available on the first search result
query.result[0].keys()

### Examining result items
Each of the items in the results list is a dictionary containing:
- dataset **URI** (doi)
- TF-IDF **score** (match with query string)
- **type** which is either 'parent' or 'child'
- **position**, the index of the result
- **html** with other useful info such as the dataset size, citation etc.

In [None]:
# Take the first search result and display it in full
result = query.result[0]
result

In [None]:
# Horizontal rule printed after every result
separator = "-" * 125

for result in query.result:
    # One-line summary of the result's position, DOI, type and score
    summary = ", ".join(
        (
            f"Index: [{result['position']}]",
            f"DOI: {result['URI']}",
            f"Type: {result['type']}",
            f"Score: {result['score']}",
        )
    )
    print(summary)
    # Pretty-print the result's "html" entry
    pprint(result["html"])
    print(separator)