# Notebook for ingesting data from BPA REST API

BPA API is built on CKAN - docs: https://docs.ckan.org/en/latest/api/

Base URL for BPA is https://data.bioplatforms.com/api/3

Ingest using wildcard search: [`https://data.bioplatforms.com/api/3/action/package_search?q=*:*&rows=1000`](https://data.bioplatforms.com/api/3/action/package_search?q=*:*&rows=1000)

Note script will need to paginate 1000 records at a time (48,438 results)

### To Do

- [x] Add ignore list for JSON data to skip

- [x] Map to DwC and extendions

In [19]:
import requests
import pandas as pd
from pandas.io.json import json_normalize 
import numpy as nmp
# from tqdm.notebook import tqdm_notebook
from tqdm import tqdm

url = "https://data.bioplatforms.com/api/3/action/package_search?q=*:*&rows="
pagesize = 1000
total = requests.get(f"{url}0").json()['result']['count']
totalPages = int(nmp.ceil(total / pagesize)) + 1 # range is "exclusive" so needs an extra 1
print("Total:", total, totalPages)
api_dataset = requests.get(f"{url}{pagesize}").json()['result']['results']

for page in tqdm(range(1, totalPages), desc = "JSON data pagination"):
    # print("Paginating:", page, "start=", (page * pagesize))
    response = requests.get(f"{url}{pagesize}&start={(page * pagesize)}").json()['result']['results']
    api_dataset.extend(response)

print("api_dataset size", len(api_dataset))
df = json_normalize(api_dataset) # transformation to dataframe via normalize function

df.info()

Total: 50160 52


JSON data pagination: 100%|██████████| 51/51 [01:20<00:00,  1.58s/it]
  df = json_normalize(api_dataset) # transformation to dataframe via normalize function


api_dataset size 50160
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Columns: 992 entries, access_control_date to license_url
dtypes: bool(3), int64(2), object(987)
memory usage: 378.6+ MB


In [20]:
df.head()

Unnamed: 0,access_control_date,access_control_mode,access_control_reason,amplicon,analytical_platform,archive_ingestion_date,author,author_email,creator_user_id,data_generated,...,rs_version,sequencer_run_id,smrt_cell_id,data_homepage,data_portal,doi,language,theme,update_frequency,license_url
0,2023-01-23,date,,ITS,MiSeq,2022-10-25,,,c9507d6a-2f85-4b89-9f2e-3910aac83870,2022-10-25,...,,,,,,,,,,
1,2023-01-23,date,,ITS,MiSeq,2022-10-25,,,c9507d6a-2f85-4b89-9f2e-3910aac83870,2022-10-25,...,,,,,,,,,,
2,2023-01-23,date,,ITS,MiSeq,,,,c9507d6a-2f85-4b89-9f2e-3910aac83870,,...,,,,,,,,,,
3,2023-01-23,date,,ITS,MiSeq,,,,c9507d6a-2f85-4b89-9f2e-3910aac83870,,...,,,,,,,,,,
4,2023-01-23,date,,ITS,MiSeq,,,,c9507d6a-2f85-4b89-9f2e-3910aac83870,,...,,,,,,,,,,


In [21]:
# Get a list of the fields with most number of rows containing data
# df.notna().sum().sort_values(ascending=False)

In [22]:
fields_to_keep = [
    "access_control_date",
    "access_control_mode",
    "access_control_reason",
    "access_rights",
    "ala_specimen_url",
    "amplicon",
    "analysis_software_version",
    "associated_media",
    "barcode_id",
    "base_url",
    "biotic_relationship",
    "birth_date",
    "bpa_dataset_id",
    "bpa_library_id",
    "bpa_sample_id",
    "citation",
    "class",
    "coastal_id",
    "collection_date",
    "collection_method",
    "collector",
    "collector_sample_id",
    "color",
    "comments",
    "common_name",
    "coord_uncertainty_metres",
    "country",
    "creator_user_id",
    "custodian",
    "dataset_id",
    "dataset_url",
    "data_custodian",
    "data_generated",
    "data_type",
    "date_data_published",
    "date_of_transfer",
    "date_of_transfer_to_archive",
    "date_since_change_in_land_use",
    "ddrad_dataset_ids",
    "death_date",
    "description",
    "dna_extraction_date",
    "dna_treatment",
    "download",
    "facility",
    "facility_sample_id",
    "family",
    "fire",
    "fire_intensity_if_known",
    "flooding",
    "flow_id",
    "folder_name",
    "fouling_organisms",
    "funding_agency",
    "genome_data",
    "genome_dataset_ids",
    "genomic_material_associated_references",
    "genus",
    "geo_loc_name",
    "grazing_number",
    "groups",
    "herbarium_code",
    "host_state",
    "host_type",
    "id",
    "identified_by",
    "id_vetting_by",
    "imos_site_code",
    "information",
    "institution_name",
    "isopen",
    "latitude",
    "lat_lon",
    "license_id",
    "license_title",
    "life_stage",
    "living_collections_catalog_number",
    "living_collections_material_sample_rna",
    "living_collections_recorded_by",
    "living_collections_record_number",
    "location_text",
    "longitude",
    "metadata_created",
    "metadata_modified",
    "nagoya_protocol_permit_number",
    "name",
    "ncbi_bioproject_accession",
    "ncbi_biosample_accession",
    "notes",
    "nrs_sample_code",
    "nrs_trip_code",
    "num_resources",
    "num_tags",
    "order",
    "organization.approval_status",
    "organization.created",
    "organization.description",
    "organization.id",
    "organization.image_url",
    "organization.is_organization",
    "organization.name",
    "organization.revision_id",
    "organization.state",
    "organization.title",
    "organization.type",
    "owner_org",
    "phylum",
    "plant_id",
    "private",
    "project_aim",
    "reads",
    "read_length",
    "relationships_as_object",
    "relationships_as_subject",
    "resources",
    "resource_permissions",
    "revision_id",
    "sample_attribution",
    "sample_extraction_id",
    "sample_id",
    "sample_name",
    "sample_submission_date",
    "sample_submitter",
    "sample_type",
    "scientific_name",
    "scientific_name_authorship",
    "scientific_name_notes",
    "sequencer",
    "sequence_data_type",
    "sequence_length",
    "sequencing_facility",
    "sequencing_platform",
    "sequencing_run_number",
    "sex",
    "spatial",
    "species",
    "species_name",
    "specific_host",
    "state",
    "state_or_region",
    "subspecies_or_variant",
    "synonyms",
    "tags",
    "target",
    "taxonomic_group",
    "taxon_id",
    "texture",
    "ticket",
    "tissue_collection",
    "tissue_preservation",
    "tissue_type",
    "title",
    "touching_organisms",
    "type",
    "type_status",
    "url",
    "utc_time_sampled",
    "vegetation_type",
    "voucher_herbarium_catalog_number",
    "voucher_herbarium_collector_id",
    "voucher_herbarium_event_date",
    "voucher_herbarium_recorded_by",
    "voucher_herbarium_record_number",
    "voucher_number",
    "voucher_or_tissue_number",
    "voyage_code",
    "voyage_survey_link",
    "wild_captive",
]
df.drop(columns=df.columns.difference(fields_to_keep), inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Columns: 166 entries, access_control_date to sequencing_run_number
dtypes: bool(3), int64(2), object(161)
memory usage: 62.5+ MB


In [23]:

# Cleanup field names and map to DwC where possible
field_mapping = {
    # DwC fields
    'access_rights': 'accessRights',
    'date_of_transfer': 'eventDate',
    'base_url': 'geneticAccessionURI', 
    'citation': 'MaterialCitation',
    'collection_method': 'measurementType',
    'collector': 'recordedBy',
    'collector_sample_id': 'materialSampleID',
    'common_name': 'vernacularName',
    'coord_uncertainty_metres': 'coordinateUncertaintyInMeters',
    'dataset_id': 'datasetID',
    'geo_loc_name': 'locality',
    'id': 'occurrenceID',
    'identified_by': 'identifiedBy',
    'latitude': 'decimalLatitude',
    'life_stage': 'lifeStage',
    'living_collections_catalog_number': 'otherCatalogNumbers',
    'living_collections_recorded_by': 'recordedBy',
    'longitude': 'decimalLongitude',
    'resources': 'associatedSequences',
    'sample_id': 'materialSampleID',
    # 'scientific_name': 'acceptedScientificName',
    'scientific_name_authorship': 'scientificNameAuthorship',
    'species': 'specificEpithet',
    # 'species_name': 'scientificiName',
    'state_or_region': 'stateProvince',
    'taxon_id': 'taxonID',
    'tissue_preservation': 'preparations',
    'type_status': 'typeStatus',
    'voucher_herbarium_catalog_number': 'otherCatalogNumbers',
    'voucher_herbarium_record_number': 'dwciri:recordNumber',
    'voucher_herbarium_recorded_by': 'dwciri:recordedBy',
    'wild_captive': 'degreeOfEstablishment',
    # Unmapped fields
    'access_control_date': 'bpa_access_control_date',
    'access_control_mode': 'bpa_access_control_mode',
    'access_control_reason': 'bpa_access_control_reason',
    'ala_specimen_url': 'bpa_ala_specimen_url',
    'amplicon': 'bpa_amplicon',
    'analysis_software_version': 'bpa_analysis_software_version',
    'associated_media': 'bpa_associated_media',
    'barcode_id': 'bpa_barcode_id',
    'base_url': 'bpa_base_url',
    'biotic_relationship': 'bpa_biotic_relationship',
    'birth_date': 'bpa_birth_date',
    'bpa_dataset_id': 'bpa_bpa_dataset_id',
    'bpa_library_id': 'bpa_bpa_library_id',
    'bpa_sample_id': 'bpa_bpa_sample_id',
    'coastal_id': 'bpa_coastal_id',
    'herbarium_code': 'bpa_herbarium_code',
    'collection_date': 'bpa_collection_date',
    'color': 'bpa_color',
    'comments': 'bpa_comments',
    'creator_user_id': 'bpa_creator_user_id',
    'custodian': 'bpa_custodian',
    'data_custodian': 'bpa_data_custodian',
    'data_generated': 'bpa_data_generated',
    'data_type': 'bpa_data_type',
    'dataset_url': 'bpa_dataset_url',
    'date_data_published': 'bpa_date_data_published',
    'date_of_transfer_to_archive': 'bpa_date_of_transfer_to_archive',
    'date_since_change_in_land_use': 'bpa_date_since_change_in_land_use',
    'ddrad_dataset_ids': 'bpa_ddrad_dataset_ids',
    'death_date': 'bpa_death_date',
    'description': 'bpa_description',
    'dna_extraction_date': 'bpa_dna_extraction_date',
    'dna_treatment': 'bpa_dna_treatment',
    'download': 'bpa_download',
    'facility': 'bpa_facility',
    'facility_sample_id': 'bpa_facility_sample_id',
    'fire': 'bpa_fire',
    'fire_intensity_if_known': 'bpa_fire_intensity_if_known',
    'flooding': 'bpa_flooding',
    'flow_id': 'bpa_flow_id',
    'folder_name': 'bpa_folder_name',
    'fouling_organisms': 'bpa_fouling_organisms',
    'funding_agency': 'bpa_funding_agency',
    'genome_data': 'bpa_genome_data',
    'genome_dataset_ids': 'bpa_genome_dataset_ids',
    'genomic_material_associated_references': 'bpa_genomic_material_associated_references',
    'grazing_number': 'bpa_grazing_number',
    'groups': 'bpa_groups',
    'host_state': 'bpa_host_state',
    'host_type': 'bpa_host_type',
    'id_vetting_by': 'bpa_id_vetting_by',
    'imos_site_code': 'bpa_imos_site_code',
    'information': 'bpa_information',
    'institution_name': 'bpa_institution_name',
    'isopen': 'bpa_isopen',
    'lat_lon': 'bpa_lat_lon',
    'license_id': 'bpa_license_id',
    'license_title': 'bpa_license_title',
    'living_collections_material_sample_rna': 'bpa_living_collections_material_sample_rna',
    'living_collections_record_number': 'bpa_living_collections_record_number',
    'location_text': 'bpa_location_text',
    'metadata_created': 'bpa_metadata_created',
    'metadata_modified': 'bpa_metadata_modified',
    'nagoya_protocol_permit_number': 'bpa_nagoya_protocol_permit_number',
    'name': 'bpa_name',
    'ncbi_bioproject_accession': 'ncbi_bioproject',
    'ncbi_biosample_accession': 'ncbi_biosample',
    'notes': 'bpa_notes',
    'nrs_sample_code': 'bpa_nrs_sample_code',
    'nrs_trip_code': 'bpa_nrs_trip_code',
    'num_resources': 'bpa_num_resources',
    'num_tags': 'bpa_num_tags',
    'order': 'bpa_order',
    'organization.approval_status': 'bpa_organization_approval_status',
    'organization.created': 'bpa_organization_created',
    'organization.description': 'bpa_organization_description',
    'organization.id': 'bpa_organization_id',
    'organization.image_url': 'bpa_organization_image_url',
    'organization.is_organization': 'bpa_organization_is_organization',
    'organization.name': 'bpa_organization_name',
    'organization.revision_id': 'bpa_organization_revision_id',
    'organization.state': 'bpa_organization_state',
    'organization.title': 'bpa_organization_title',
    'organization.type': 'bpa_organization_type',
    'owner_org': 'bpa_owner_org',
    'plant_id': 'bpa_plant_id',
    'private': 'bpa_private',
    'project_aim': 'bpa_project_aim',
    'read_length': 'bpa_read_length',
    'reads': 'bpa_reads',
    'relationships_as_object': 'bpa_relationships_as_object',
    'relationships_as_subject': 'bpa_relationships_as_subject',
    'resource_permissions': 'bpa_resource_permissions',
    'revision_id': 'bpa_revision_id',
    'sample_attribution': 'bpa_sample_attribution',
    'sample_extraction_id': 'bpa_sample_extraction_id',
    'sample_name': 'bpa_sample_name',
    'sample_submission_date': 'bpa_sample_submission_date',
    'sample_submitter': 'bpa_sample_submitter',
    'sample_type': 'bpa_sample_type',
    'scientific_name_notes': 'bpa_scientific_name_notes',
    'sequence_data_type': 'bpa_sequence_data_type',
    'sequence_length': 'bpa_sequence_length',
    'sequencer': 'bpa_sequencer',
    'sequencing_facility': 'bpa_sequencing_facility',
    'sequencing_platform': 'bpa_sequencing_platform',
    'sequencing_run_number': 'bpa_sequencing_run_number',
    'spatial': 'bpa_spatial',
    'species': 'bpa_species',
    'state_or_region': 'bpa_state_or_region',
    'subspecies_or_variant': 'bpa_subspecies_or_variant',
    'synonyms': 'bpa_synonyms',
    'tags': 'bpa_tags',
    'target': 'bpa_target',
    'taxonomic_group': 'bpa_taxonomic_group',
    'texture': 'bpa_texture',
    'ticket': 'bpa_ticket',
    'tissue_collection': 'bpa_tissue_collection',
    'tissue_type': 'bpa_tissue_type',
    'title': 'bpa_title',
    'touching_organisms': 'bpa_touching_organisms',
    'type': 'bpa_type',
    'url': 'bpa_url',
    'utc_time_sampled': 'bpa_utc_time_sampled',
    'vegetation_type': 'bpa_vegetation_type',
    'voucher_herbarium_collector_id': 'bpa_voucher_herbarium_collector_id',
    'voucher_herbarium_event_date': 'bpa_voucher_herbarium_event_date',
    'voucher_number': 'bpa_voucher_number',
    'voucher_or_tissue_number': 'bpa_voucher_or_tissue_number',
    'voyage_code': 'bpa_voyage_code',
    'voyage_survey_link': 'bpa_voyage_survey_link',
}

# copy these fields first
df['bpa_id'] = df['id'] 
df['bpa_specific_host'] = df['specific_host'] 
df['bpa_species'] = df['species'] 
df['bpa_collection_date'] = df['collection_date'] 
df['bpa_date_of_transfer'] = df['date_of_transfer'] 
# df['acceptedScientificName'] = nmp.where(df.scientific_name.notna(), df.scientific_name, df.species_name)

# Attempt to find a scientificName value 
def mergeNames(row):
  name = nmp.nan
  if (pd.notnull(row['scientific_name'])):
    name = row['scientific_name']
  elif (pd.notnull(row['species_name'])):
    name = row['species_name']
  elif (pd.notnull(row['genus']) and pd.notnull(row['species'])):
    name = str(row['genus']) + ' ' + str(row['species'])
  elif (pd.notnull(row['genus'])):
    name = str(row['genus'])
  elif (pd.notnull(row['species'])):
    name = str(row['species'])
  elif (pd.notnull(row['common_name'])):
    name = row['common_name']
  
  return name

df['scientificName'] = df.apply(mergeNames, axis=1)

# do rename
df.rename(
  columns = field_mapping, 
  inplace = True
)

df.info(verbose = True, null_counts = True)
df['associatedSequences'].head()

  df.info(verbose = True, null_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Data columns (total 172 columns):
 #    Column                                      Non-Null Count  Dtype 
---   ------                                      --------------  ----- 
 0    bpa_access_control_date                     47649 non-null  object
 1    bpa_access_control_mode                     47649 non-null  object
 2    bpa_access_control_reason                   47649 non-null  object
 3    bpa_amplicon                                37368 non-null  object
 4    bpa_creator_user_id                         50160 non-null  object
 5    bpa_data_generated                          35302 non-null  object
 6    bpa_data_type                               47884 non-null  object
 7    bpa_dataset_url                             26492 non-null  object
 8    eventDate                                   47887 non-null  object
 9    bpa_description                             46124 non-null  object
 10   bpa_flow

0    [{'cache_last_updated': None, 'cache_url': Non...
1    [{'amplicon': 'ITS', 'cache_last_updated': Non...
2    [{'cache_last_updated': None, 'cache_url': Non...
3    [{'amplicon': 'ITS', 'cache_last_updated': Non...
4    [{'amplicon': 'ITS', 'cache_last_updated': Non...
Name: associatedSequences, dtype: object

In [24]:
# df['associatedSequences2'] = df['associatedSequences'].apply(lambda x: x[0].to_json())
import json 
# convert to true JSON - replace `None` with `null` and `True` with `true`
df['associatedSequences'] = df['associatedSequences'].apply(lambda x: json.dumps(x))
df['bpa_tags'] = df['bpa_tags'].apply(lambda x: json.dumps(x))
df['associatedSequences'].head()

0    [{"cache_last_updated": null, "cache_url": nul...
1    [{"amplicon": "ITS", "cache_last_updated": nul...
2    [{"cache_last_updated": null, "cache_url": nul...
3    [{"amplicon": "ITS", "cache_last_updated": nul...
4    [{"amplicon": "ITS", "cache_last_updated": nul...
Name: associatedSequences, dtype: object

In [25]:
print("occurrenceID empty count", len(df[df['occurrenceID'].isna()]))
print("scientificName", len(df[df['scientificName'].notna()]))
print("scientific_name", len(df[df['scientific_name'].notna()]))
print("species_name", len(df[df['species_name'].notna()]))
# print("acceptedScientificName", len(df[df['acceptedScientificName'].notna()]))
df.info(verbose = True, null_counts = True)

occurrenceID empty count 0
scientificName 8592
scientific_name 5211
species_name 2029


  df.info(verbose = True, null_counts = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Data columns (total 172 columns):
 #    Column                                      Non-Null Count  Dtype 
---   ------                                      --------------  ----- 
 0    bpa_access_control_date                     47649 non-null  object
 1    bpa_access_control_mode                     47649 non-null  object
 2    bpa_access_control_reason                   47649 non-null  object
 3    bpa_amplicon                                37368 non-null  object
 4    bpa_creator_user_id                         50160 non-null  object
 5    bpa_data_generated                          35302 non-null  object
 6    bpa_data_type                               47884 non-null  object
 7    bpa_dataset_url                             26492 non-null  object
 8    eventDate                                   47887 non-null  object
 9    bpa_description                             46124 non-null  object
 10   bpa_flow

In [26]:
df.info()
df.to_csv("/data/arga-data/bpa_export.csv", index=False)
# dr18544 in http://collections-test.ala.org.au

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Columns: 172 entries, bpa_access_control_date to scientificName
dtypes: bool(3), int64(2), object(167)
memory usage: 64.8+ MB
