In [1]:
import json

import pydantic
import requests

In [2]:
sample_id = "SAMEA111457860"

In [3]:
# request_uri = f"https://www.ebi.ac.uk/biosamples/structureddata/SAMEA14448373"

In [4]:
# NOTE(review): `request_uri` is only assigned in the commented-out line of the
# previous cell, so on a fresh kernel this raises NameError. Confirm which URL
# was intended and restore the assignment before this call.
r = requests.get(request_uri)

In [5]:
r.status_code

200

In [34]:
import sys
from datetime import datetime
from typing import List, Dict, Optional

import requests
from pydantic import BaseModel


class Relationship(BaseModel):
    """One entry of a sample record's ``relationships`` list.

    All three fields are declared as plain strings without defaults.
    NOTE(review): ``source`` and ``target`` are presumably sample accessions
    and ``type`` the kind of link between them — confirm against the API docs.
    """

    source: str
    type: str
    target: str
        

class Characteristic(BaseModel):
    """One value of a sample characteristic.

    Attributes:
        text: The value itself; present on every entry.
        tag: Optional tag such as ``"attribute"``; absent on some entries.
        unit: Optional unit such as ``"DD"``; absent on most entries.
    """

    text: str
    # Explicit ``None`` defaults: many entries carry only ``text``. pydantic v1
    # implied this default for ``Optional`` fields, but pydantic v2 treats an
    # ``Optional`` field without a default as required and would reject them.
    tag: Optional[str] = None
    unit: Optional[str] = None
    

class BioSample(BaseModel):
    """The part of a BioSamples sample record that this notebook uses.

    The API record carries further keys (e.g. ``release``, ``update``,
    ``webinSubmissionAccountId``) that are not declared on this model.
    """

    name: str
    accession: str
    taxId: int
    submitted: datetime
    
    # Characteristic name -> list of values recorded under that name.
    characteristics: Dict[str, List[Characteristic]]
    # Empty list when the record has no "relationships" entry.
    relationships: List[Relationship] = []
        
def get_biosample_record(sample_id):
    """Fetch one sample record from the EBI BioSamples API and parse it.

    Args:
        sample_id: BioSamples accession, e.g. ``"SAMEA111458369"``.

    Returns:
        The response body parsed into a ``BioSample`` model.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        pydantic.ValidationError: If the body does not match ``BioSample``.
    """
    # Go to HTTPS directly; requests to the http:// URL are redirected there anyway.
    request_uri = f"https://www.ebi.ac.uk/biosamples/samples/{sample_id}"

    r = requests.get(request_uri)

    # Raises HTTPError for 4xx/5xx responses and does nothing otherwise, so no
    # separate ``r.ok`` check is needed. Deliberately no ``sys.exit()``: raising
    # hands the error to the caller instead of terminating the notebook kernel.
    r.raise_for_status()

    return BioSample.parse_raw(r.content)

In [41]:
sample_id = "SAMEA111458369"

In [55]:
def sto(sample_id):
    """Build a flat summary dict for one BioSamples record.

    Fetches the record via ``get_biosample_record`` and returns a dict holding
    the accession, the taxonomy id and the ``text`` of the first value of each
    selected characteristic, keyed by the characteristic name.

    Raises:
        KeyError: If the record lacks one of the selected characteristics.
    """
    record = get_biosample_record(sample_id)

    # Characteristics copied into the summary, in output order.
    wanted_fields = (
        "organism",
        "geographic location (latitude)",
        "geographic location (longitude)",
        "geographic location (region and locality)",
        "habitat",
        "sample collection device or method",
    )

    summary = {
        "BioSamples accession ID": record.accession,
        "NCBI taxonomy ID": record.taxId,
    }
    # Only the first value of each characteristic is kept.
    summary.update(
        (field, record.characteristics[field][0].text) for field in wanted_fields
    )

    return summary

In [56]:
sto(sample_id)

{'BioSamples accession ID': 'SAMEA111458369',
 'NCBI taxonomy ID': 347362,
 'organism': 'Hermaeophaga mercurialis',
 'geographic location (latitude)': '51.67',
 'geographic location (longitude)': '-0.79',
 'geographic location (region and locality)': 'England|Naphill Common',
 'habitat': 'NOT COLLECTED',
 'sample collection device or method': 'Hand Picked'}

In [57]:
import json

In [58]:
# Read the COPO export as text and decode it from JSON in one step.
with open("../copo_biosamples_from_nhmdump_spreadsheet.json") as fh:
    raw_copo = json.loads(fh.read())

In [62]:
# Collect the values of the "biosampleAccession" mapping into a plain list.
sample_ids = [accession for accession in raw_copo['biosampleAccession'].values()]

In [63]:
sample_ids[42]

'SAMEA111458774'

In [64]:
sto(sample_ids[42])

{'BioSamples accession ID': 'SAMEA111458774',
 'NCBI taxonomy ID': 1858070,
 'organism': 'Scrobipalpa costella',
 'geographic location (latitude)': '51.49',
 'geographic location (longitude)': '-0.17',
 'geographic location (region and locality)': 'England|Natural History Museum Wildlife Garden',
 'habitat': 'NOT COLLECTED',
 'sample collection device or method': '0'}

In [16]:
sample.characteristics['geographic location (latitude)']

[Characteristic(text='51.51', tag='attribute', unit='DD')]

In [46]:
# NOTE(review): `sample` is not assigned at notebook level in any visible cell
# (only as a local inside `sto`) — presumably left over from a removed cell.
# Confirm and re-create it, e.g. with get_biosample_record(sample_id).
# Print every characteristic name together with its list of values.
for k, v in sample.characteristics.items():
    print(k, v)

ENA-CHECKLIST [Characteristic(text='ERC000053', tag='attribute', unit=None)]
ENA-FIRST-PUBLIC [Characteristic(text='2022-10-12', tag='attribute', unit=None)]
ENA-LAST-UPDATE [Characteristic(text='2022-10-12', tag='attribute', unit=None)]
External Id [Characteristic(text='SAMEA111458369', tag='Namespace:BioSample', unit=None)]
GAL [Characteristic(text='Natural History Museum', tag='attribute', unit=None)]
GAL_sample_id [Characteristic(text='NHMUK014433935', tag='attribute', unit=None)]
INSDC center alias [Characteristic(text='EarlhamInstitute', tag=None, unit=None)]
INSDC center name [Characteristic(text='EarlhamInstitute', tag=None, unit=None)]
INSDC first public [Characteristic(text='2022-10-12T14:28:02Z', tag=None, unit=None)]
INSDC last update [Characteristic(text='2022-10-12T14:28:02Z', tag=None, unit=None)]
INSDC status [Characteristic(text='public', tag=None, unit=None)]
SRA accession [Characteristic(text='ERS13552405', tag=None, unit=None)]
Submitter Id [Characteristic(text='633

In [12]:
sample_obj

{'name': '633abc0b573023627367efd0',
 'accession': 'SAMEA111457860',
 'webinSubmissionAccountId': 'Webin-39233',
 'taxId': 1594449,
 'release': '2022-10-12T14:08:26Z',
 'update': '2022-12-14T23:15:48.848Z',
 'submitted': '2022-10-12T14:08:26Z',
 'characteristics': {'ENA-CHECKLIST': [{'text': 'ERC000053',
    'tag': 'attribute'}],
  'ENA-FIRST-PUBLIC': [{'text': '2022-10-12', 'tag': 'attribute'}],
  'ENA-LAST-UPDATE': [{'text': '2022-10-12', 'tag': 'attribute'}],
  'External Id': [{'text': 'SAMEA111457860', 'tag': 'Namespace:BioSample'}],
  'GAL': [{'text': 'Natural History Museum', 'tag': 'attribute'}],
  'GAL_sample_id': [{'text': 'NHMUK013805989', 'tag': 'attribute'}],
  'INSDC center alias': [{'text': 'EarlhamInstitute'}],
  'INSDC center name': [{'text': 'EarlhamInstitute'}],
  'INSDC first public': [{'text': '2022-10-12T14:08:26Z'}],
  'INSDC last update': [{'text': '2022-10-12T14:08:26Z'}],
  'INSDC status': [{'text': 'public'}],
  'SRA accession': [{'text': 'ERS13551896'}],
  'S

In [2]:
# Base endpoint for sample searches. A plain string: there is nothing to interpolate.
# HTTPS directly; requests to the http:// URL end up on https:// anyway (see r.url below).
request_uri = "https://www.ebi.ac.uk/biosamples/samples"

In [10]:
fname = "NHMUK014453625-01.jpg"

In [11]:
specimen_id = "NHMUK014453625"

In [12]:
# Search by the "specimen id" attribute, using the `specimen_id` defined above
# instead of repeating the literal. The filter is passed as a ready-made query
# string rather than a dict so that "+" and ":" are sent as written (see r.url).
r = requests.get(request_uri, params=f"filter=attr:specimen+id:{specimen_id}")

In [13]:
r.status_code

200

In [14]:
r.url

'https://www.ebi.ac.uk/biosamples/samples?filter=attr:specimen+id:NHMUK014453625'

In [15]:
obj = json.loads(r.content)

In [17]:
bs_obj

{'name': '628cfe9dd10d9fc0282b8ab0',
 'accession': 'SAMEA14448373',
 'webinSubmissionAccountId': 'Webin-39233',
 'taxId': 6396,
 'release': '2022-05-24T16:49:50Z',
 'update': '2022-07-15T16:50:36.599Z',
 'submitted': '2022-05-24T16:49:50Z',
 'characteristics': {'ENA first public': [{'text': '2022-05-24',
    'tag': 'attribute'}],
  'ENA last update': [{'text': '2022-05-24', 'tag': 'attribute'}],
  'ENA-CHECKLIST': [{'text': 'ERC000053', 'tag': 'attribute'}],
  'External Id': [{'text': 'SAMEA14448373', 'tag': 'Namespace:BioSample'}],
  'GAL': [{'text': 'Natural History Museum', 'tag': 'attribute'}],
  'GAL_sample_id': [{'text': 'NOT PROVIDED', 'tag': 'attribute'}],
  'INSDC center alias': [{'text': 'EarlhamInstitute'}],
  'INSDC center name': [{'text': 'EarlhamInstitute'}],
  'INSDC first public': [{'text': '2022-05-24T16:49:50Z'}],
  'INSDC last update': [{'text': '2022-05-24T16:49:50Z'}],
  'INSDC status': [{'text': 'public'}],
  'SRA accession': [{'text': 'ERS12067904'}],
  'Submitte

In [16]:
# NOTE(review): this raised KeyError: '_embedded' in the recorded run, i.e. the
# decoded response had no '_embedded' key — presumably the search matched no
# samples. Confirm, and guard the lookup before indexing.
bs_obj = obj['_embedded']['samples'][0]

KeyError: '_embedded'

In [8]:
bs_obj

{'name': '628cfe9dd10d9fc0282b8ab0',
 'accession': 'SAMEA14448373',
 'webinSubmissionAccountId': 'Webin-39233',
 'taxId': 6396,
 'release': '2022-05-24T16:49:50Z',
 'update': '2022-07-15T16:50:36.599Z',
 'submitted': '2022-05-24T16:49:50Z',
 'characteristics': {'ENA first public': [{'text': '2022-05-24',
    'tag': 'attribute'}],
  'ENA last update': [{'text': '2022-05-24', 'tag': 'attribute'}],
  'ENA-CHECKLIST': [{'text': 'ERC000053', 'tag': 'attribute'}],
  'External Id': [{'text': 'SAMEA14448373', 'tag': 'Namespace:BioSample'}],
  'GAL': [{'text': 'Natural History Museum', 'tag': 'attribute'}],
  'GAL_sample_id': [{'text': 'NOT PROVIDED', 'tag': 'attribute'}],
  'INSDC center alias': [{'text': 'EarlhamInstitute'}],
  'INSDC center name': [{'text': 'EarlhamInstitute'}],
  'INSDC first public': [{'text': '2022-05-24T16:49:50Z'}],
  'INSDC last update': [{'text': '2022-05-24T16:49:50Z'}],
  'INSDC status': [{'text': 'public'}],
  'SRA accession': [{'text': 'ERS12067904'}],
  'Submitte

In [9]:
# NOTE(review): NameError in the recorded run — presumably the cell defining the
# BioSample model had not been executed in this kernel session. Run it first.
BioSample.parse_obj(bs_obj).relationships

NameError: name 'BioSample' is not defined