Rename bpa_id to sample_id throughout #52

Merged (7 commits) on Nov 16, 2018
1 change: 0 additions & 1 deletion Dockerfile-builder
@@ -9,7 +9,6 @@ ENV NO_PROXY ${PIP_TRUSTED_HOST}

RUN env | sort

COPY docker-build.key /root/.ssh/id_rsa
RUN chmod 600 /root/.ssh/id_rsa && \
echo "StrictHostKeyChecking no" > /root/.ssh/config && \
echo "UserKnownHostsFile /dev/null" >> /root/.ssh/config
2 changes: 1 addition & 1 deletion bpaingest/abstract.py
@@ -9,7 +9,7 @@


 class BaseMetadata:
-    resource_linkage = ('bpa_id',)
+    resource_linkage = ('sample_id',)

     @classmethod
     def parse_spreadsheet(cls, fname, metadata_info):
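
The renamed resource_linkage tuple names the metadata field(s) a subclass uses to tie resources back to their packages. A purely hypothetical sketch of a subclass that links on more than one field (the second field name is invented for illustration):

    from bpaingest.abstract import BaseMetadata

    class ExampleMetadata(BaseMetadata):
        # Hypothetical subclass: resources are matched to packages on both
        # the sample ID and an (invented) flowcell field.
        resource_linkage = ('sample_id', 'flowcell_id')
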
2 changes: 1 addition & 1 deletion bpaingest/genhash.py
@@ -24,7 +24,7 @@ def localpath(mirror_path, legacy_url):


 def genhash(ckan, meta, mirror_path, num_threads):
-    def calculate_hashes(bpa_id, legacy_url, resource):
+    def calculate_hashes(sample_id, legacy_url, resource):
         fpath = localpath(mirror_path, legacy_url)
         patch_obj = {}

18 changes: 9 additions & 9 deletions bpaingest/handlers/base_contextual_metadata_apply.py
@@ -18,7 +18,7 @@
 class Handler(GenericHandler):
     '''Applies BASE contextual metadata values to packages with a given BPA ID.

-    The function should be set up to be triggered by SNS messages that have the bpa_id and values
+    The function should be set up to be triggered by SNS messages that have the sample_id and values
     to apply in them.
     The packages matching the BPA ID will be looked up from CKAN and an SNS message will be created
     for each package, containing the package id and the values that have to be applied.
@@ -40,26 +40,26 @@ class Handler(GenericHandler):
     SNS_ON_ERROR_SUBJECT = 'ERROR: BASE Contextual Metadata Apply'

     def handler(self, event, context):
-        bpa_id, metadata = self._extract_data(event)
-        logger.info('Processing BPA ID %s', bpa_id)
+        sample_id, metadata = self._extract_data(event)
+        logger.info('Processing BPA ID %s', sample_id)

         ckan_service = set_up_ckan_service(self.env)

-        packages = ckan_service.get_packages_by_bpa_id(bpa_id)
+        packages = ckan_service.get_packages_by_sample_id(sample_id)
         pids_and_changes = [(p['id'], changes(p, metadata)) for p in packages]
         packages_with_changes = [x for x in pids_and_changes if len(x[1]) > 0]
         for pid, updates in packages_with_changes:
             self.sns_ckan_patch_package(pid, updates)

         unchanged_package_ids = [x[0] for x in pids_and_changes if len(x[1]) == 0]
-        self.sns_success(bpa_id, packages_with_changes, unchanged_package_ids)
+        self.sns_success(sample_id, packages_with_changes, unchanged_package_ids)

-    def sns_success(self, bpa_id, packages_with_changes, unchanged_package_ids):
-        subject = shorten('BASE Apply Contextual Metadata - BPA ID %s' % bpa_id)
+    def sns_success(self, sample_id, packages_with_changes, unchanged_package_ids):
+        subject = shorten('BASE Apply Contextual Metadata - Sample ID %s' % sample_id)
         changed_count = len(packages_with_changes)
         unchanged_count = len(unchanged_package_ids)
         msg = 'Processed BPA ID %s, found %d packages, %d already up-to-date, sent SNS patch requests for %d.' % (
-            bpa_id, changed_count + unchanged_count, unchanged_count, changed_count)
+            sample_id, changed_count + unchanged_count, unchanged_count, changed_count)

         logger.info(msg)
         if not self.env.sns_on_success:
@@ -88,7 +88,7 @@ def sns_ckan_patch_package(self, package_id, updates):

     def _extract_data(self, event):
         message = json.loads(event['Records'][0]['Sns']['Message'])
-        return (message['bpa_id'], message['metadata'])
+        return (message['sample_id'], message['metadata'])


 handler = Handler(logger)
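
For reference, a minimal sketch of the SNS message body that _extract_data above expects: the key names come from the diff, while the sample ID and metadata values are illustrative.

    import json

    # Illustrative payload published to the handler's SNS topic; the handler reads
    # event['Records'][0]['Sns']['Message'] and pulls out 'sample_id' and 'metadata'.
    example_message = json.dumps({
        'sample_id': '102.100.100/12345',              # made-up sample ID
        'metadata': {'ph': '5.3', 'texture': 'clay'},  # contextual values to apply
    })
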
20 changes: 10 additions & 10 deletions bpaingest/handlers/base_contextual_metadata_sheet.py
@@ -21,12 +21,12 @@ class Handler(GenericHandler):

     The function should be set up to be triggered by ObjectCreate S3 events for a bucket and path
     where contextual metadata spreadsheets will be uploaded.
-    The function will read each row and will create one SNS message for each, containing the bpa_id
+    The function will read each row and will create one SNS message for each, containing the sample_id
     and the metadata (rest of the values in the spreadsheet for the row).
-    The sns_apply_to_bpa_id should be set to the SNS topic arn that will receive these messages.
+    The sns_apply_to_sample_id should be set to the SNS topic arn that will receive these messages.
     '''
     ENV_VAR_DEFS = {
-        'names': ('sns_apply_to_bpa_id', 'sns_on_success', 'sns_on_error'),
+        'names': ('sns_apply_to_sample_id', 'sns_on_success', 'sns_on_error'),
     }
     SNS_ON_ERROR_SUBJECT = 'ERROR: BASE Contextual Metadata Sheet'

@@ -44,8 +44,8 @@ def handler(self, event, context):

         contextual = BASESampleContextual(dirname)
         rows = list(contextual.sample_metadata.items())
-        for bpa_id, values in rows:
-            self.sns_publish_apply_metadata(bpa_id, values)
+        for sample_id, values in rows:
+            self.sns_publish_apply_metadata(sample_id, values)
         self.sns_success(rows)

     def sns_success(self, rows):
@@ -58,11 +58,11 @@ def sns_success(self, rows):
             Subject=subject,
             Message=msg)

-    def sns_publish_apply_metadata(self, bpa_id, metadata):
-        default = 'Apply contextual metadata to sample bpa_id:%s from %s' % (
-            bpa_id, self.metadata_s3_key)
+    def sns_publish_apply_metadata(self, sample_id, metadata):
+        default = 'Apply contextual metadata to sample sample_id:%s from %s' % (
+            sample_id, self.metadata_s3_key)
         json_data = json.dumps({
-            'bpa_id': bpa_id,
+            'sample_id': sample_id,
             'metadata': metadata,
         })
         data = {
@@ -72,7 +72,7 @@ def sns_publish_apply_metadata(self, bpa_id, metadata):
         }

         sns.publish(
-            TopicArn=self.env.sns_apply_to_bpa_id,
+            TopicArn=self.env.sns_apply_to_sample_id,
             MessageStructure='json',
             Message=json.dumps(data))

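
A rough sketch of the per-row message this handler publishes, assuming the usual MessageStructure='json' convention; the 'default' wrapping is an assumption, since the full contents of the data dict are not visible in this diff.

    import json

    # Per-row payload; the key names follow the diff above, the values are made up.
    row_payload = json.dumps({
        'sample_id': '102.100.100/12345',
        'metadata': {'depth': '0-10 cm', 'vegetation_type': 'grassland'},
    })

    # With MessageStructure='json', SNS expects a JSON object keyed by delivery
    # protocol; 'default' is what ordinary subscribers receive (assumed wrapping).
    sns_message = json.dumps({'default': row_payload})
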
10 changes: 5 additions & 5 deletions bpaingest/handlers/ckan_service.py
@@ -42,21 +42,21 @@ def auth_header(self):
     def auth_admin_header(self):
         return {'Authorization': self.credentials['CKAN_ADMIN_API_KEY']}

-    def get_packages_by_bpa_id(self, bpa_id):
+    def get_packages_by_sample_id(self, sample_id):
         params = {
             'include_private': True,
-            'q': 'bpa_id:%s' % bpa_id,
+            'q': 'sample_id:%s' % sample_id,
         }
         resp = self.session.get(self.urls.package_search, headers=self.auth_header, params=params)
         try:
             resp.raise_for_status()
             json_resp = resp.json()
             if not json_resp['success']:
-                raise Exception('Package search (by bpa_id) returned success False')
+                raise Exception('Package search (by sample_id) returned success False')
             return json_resp['result']['results']
         except Exception as exc:
-            msg = 'Package search (%s) for packages with bpa_id "%s" was NOT successful!' % (
-                resp.request.url, bpa_id)
+            msg = 'Package search (%s) for packages with sample_id "%s" was NOT successful!' % (
+                resp.request.url, sample_id)
             raise Exception(msg) from exc

     def get_all_resources(self):
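
For orientation, the renamed method is a thin wrapper over CKAN's package_search action; a standalone sketch of the equivalent request (the base URL, API key and sample ID are placeholders):

    import requests

    CKAN_URL = 'https://ckan.example.org'  # placeholder
    API_KEY = 'replace-me'                 # placeholder

    # Search for packages whose sample_id field matches, including private datasets.
    resp = requests.get(
        CKAN_URL + '/api/3/action/package_search',
        headers={'Authorization': API_KEY},
        params={'include_private': True, 'q': 'sample_id:102.100.100/12345'},
    )
    resp.raise_for_status()
    packages = resp.json()['result']['results']
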
78 changes: 0 additions & 78 deletions bpaingest/libs/bpa_id_utils.py

This file was deleted.

30 changes: 15 additions & 15 deletions bpaingest/libs/ingest_utils.py
@@ -7,11 +7,11 @@

 logger = make_logger(__name__)

-bpa_id_re = re.compile(r'^102\.100\.100[/\.](\d+)$')
-bpa_id_abbrev_re = re.compile(r'^(\d+)$')
+ands_id_re = re.compile(r'^102\.100\.100[/\.](\d+)$')
+ands_id_abbrev_re = re.compile(r'^(\d+)$')
 # this format of BPA ID has been used in older projects (e.g. BASE)
-bpa_id_abbrev_2_re = re.compile(r'^102\.100\.\.100[/\.](\d+)$')
-# <BPA_ID>_<extraction>
+ands_id_abbrev_2_re = re.compile(r'^102\.100\.\.100[/\.](\d+)$')
+# <sample_id>_<extraction>
 sample_extraction_id_re = re.compile(r'^\d{4,6}_\d')


@@ -57,10 +57,10 @@ def fix_sample_extraction_id(val):
     return val


-def make_sample_extraction_id(extraction_id, bpa_id):
+def make_sample_extraction_id(extraction_id, sample_id):
     # instructions from project manager: if no extraction_id in the spreadsheet,
-    # append _1 to the bpa_id_to_ckan_name
-    return extraction_id or (bpa_id.split('.')[-1] + "_1")
+    # append _1 to the sample_id_to_ckan_name
+    return extraction_id or (sample_id.split('.')[-1] + "_1")


 def fix_date_interval(val):
@@ -96,7 +96,7 @@ def merge_pass_fail(row):
     raise Exception("more than one amplicon pass_fail column value: %s" % (vals))


-def extract_bpa_id(s, silent=False):
+def extract_ands_id(s, silent=False):
     "parse a BPA ID, with or without the prefix, returning with the prefix"
     if isinstance(s, float):
         s = int(s)
@@ -114,26 +114,26 @@ def extract_bpa_id(s, silent=False):
     # handle a sample extraction id tacked on the end with an underscore
     if '_' in s:
         s = s.rsplit('_', 1)[0]
-    m = bpa_id_re.match(s)
+    m = ands_id_re.match(s)
     if m:
         return BPA_PREFIX + m.groups()[0]
-    m = bpa_id_abbrev_re.match(s)
+    m = ands_id_abbrev_re.match(s)
     if m:
         return BPA_PREFIX + m.groups()[0]
-    m = bpa_id_abbrev_2_re.match(s)
+    m = ands_id_abbrev_2_re.match(s)
     if m:
         return BPA_PREFIX + m.groups()[0]
     if not silent:
         logger.warning("unable to parse BPA ID: `%s'" % s)
     return None


-def extract_bpa_id_silent(s):
-    return extract_bpa_id(s, silent=True)
+def extract_ands_id_silent(s):
+    return extract_ands_id(s, silent=True)


-def short_bpa_id(s):
-    return extract_bpa_id(s).split('.')[-1]
+def short_ands_id(s):
+    return extract_ands_id(s).split('.')[-1]


 def get_int(val, default=None):
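
A quick illustration of the ID formats extract_ands_id accepts after the rename; the returned prefix comes from BPA_PREFIX, which is defined elsewhere in the module and not shown in this diff, so treat the exact output as indicative.

    from bpaingest.libs import ingest_utils

    # Each of these spellings should normalise to the same prefixed ANDS/BPA ID;
    # the trailing '_2' form has its extraction suffix stripped first.
    for raw in ('102.100.100/12345', '102.100.100.12345', '12345', '12345_2'):
        print(raw, '->', ingest_utils.extract_ands_id(raw))
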
4 changes: 4 additions & 0 deletions bpaingest/metadata.py
@@ -2,6 +2,7 @@
 import shutil
 import json
 import os
+from contextlib import suppress
 from .util import make_logger
 from .libs.fetch_data import Fetcher, get_password

@@ -42,6 +43,9 @@ def _fetch_metadata(self, project_class, contextual, info_json, metadata_info):
             metadata_info,
             getattr(project_class, 'metadata_url_components', []))

+        with suppress(FileExistsError):
+            os.mkdir(self.path)
+
         for contextual_path, contextual_cls in contextual:
             os.mkdir(contextual_path)
             logger.info("fetching contextual metadata: %s" % (contextual_cls.metadata_urls))
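
The suppress(FileExistsError) guard added above is just a compact way of tolerating a directory that already exists; an equivalent standalone sketch (the path is illustrative):

    import os
    from contextlib import suppress

    path = '/tmp/bpaingest-example'  # illustrative

    # New style: ignore the error if the directory is already there.
    with suppress(FileExistsError):
        os.mkdir(path)

    # Equivalent older spelling:
    try:
        os.mkdir(path)
    except FileExistsError:
        pass

os.makedirs(path, exist_ok=True) would also work, at the cost of creating any missing intermediate directories.
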
13 changes: 8 additions & 5 deletions bpaingest/ncbi.py
@@ -37,7 +37,7 @@ def _read_2016_accessions(self):
         if not os.access(fname, os.R_OK):
             return {}
         _, biosample_rows = csv_to_named_tuple('BioSample', fname, mode='rU')
-        return dict((ingest_utils.extract_bpa_id(t.sample_name), t.accession.strip()) for t in biosample_rows)
+        return dict((ingest_utils.extract_ands_id(t.sample_name), t.accession.strip()) for t in biosample_rows)

     def _read_accessions(self):
         """
@@ -48,7 +48,7 @@ def _read_accessions(self):
         accessions = {}
         for fname in sample_objects:
             _, rows = csv_to_named_tuple('SRARow', fname, mode='rU', dialect='excel-tab')
-            accessions.update(dict((ingest_utils.extract_bpa_id(t.sample_name), t.accession) for t in rows))
+            accessions.update(dict((ingest_utils.extract_ands_id(t.sample_name), t.accession) for t in rows))
         return accessions

     def _read_ncbi_sra(self):
@@ -76,12 +76,15 @@ def _read_2016_submitted(self):
         _, upload_rows = csv_to_named_tuple('BioProject', fname, mode='rU')
         return {t.filename for t in upload_rows}

-    def get(self, bpa_id):
+    def sample_ids(self):
+        return list(self.bpaid_biosample.keys())
+
+    def get(self, sample_id):
         obj = {
             'ncbi_bioproject_accession': self.bioproject_accession,
         }
-        if bpa_id in self.bpaid_biosample:
-            obj['ncbi_biosample_accession'] = self.bpaid_biosample[bpa_id]
+        if sample_id in self.bpaid_biosample:
+            obj['ncbi_biosample_accession'] = self.bpaid_biosample[sample_id]
         return obj

     def filename_metadata(self, filename):
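
A runnable stand-in mirroring the accession lookup above, to show the shape of what get() returns; the class name and accession values are invented, and the real class builds bpaid_biosample from the NCBI spreadsheets read earlier in the file.

    class AccessionStub:
        # Invented values standing in for data read from the NCBI spreadsheets.
        bioproject_accession = 'PRJNA000000'
        bpaid_biosample = {'102.100.100/12345': 'SAMN00000000'}

        def get(self, sample_id):
            obj = {'ncbi_bioproject_accession': self.bioproject_accession}
            if sample_id in self.bpaid_biosample:
                obj['ncbi_biosample_accession'] = self.bpaid_biosample[sample_id]
            return obj

    # e.g. {'ncbi_bioproject_accession': 'PRJNA000000', 'ncbi_biosample_accession': 'SAMN00000000'}
    print(AccessionStub().get('102.100.100/12345'))
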