From 012d8667e4fee799645ca2adcb5126c4d71231d2 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Wed, 14 Nov 2018 14:12:48 +0800 Subject: [PATCH 1/6] contextual metadata access for library usage - used by bpaotu, which installs bpa-ingest as a library --- bpaingest/metadata.py | 4 + bpaingest/ncbi.py | 3 + bpaingest/projects/amdb/contextual.py | 315 ++++++++++++++------------ bpaingest/projects/amdb/ingest.py | 64 ++++-- setup.py | 2 +- 5 files changed, 228 insertions(+), 160 deletions(-) diff --git a/bpaingest/metadata.py b/bpaingest/metadata.py index 8335badf..b337eeb2 100644 --- a/bpaingest/metadata.py +++ b/bpaingest/metadata.py @@ -2,6 +2,7 @@ import shutil import json import os +from contextlib import suppress from .util import make_logger from .libs.fetch_data import Fetcher, get_password @@ -42,6 +43,9 @@ def _fetch_metadata(self, project_class, contextual, info_json, metadata_info): metadata_info, getattr(project_class, 'metadata_url_components', [])) + with suppress(FileExistsError): + os.mkdir(self.path) + for contextual_path, contextual_cls in contextual: os.mkdir(contextual_path) logger.info("fetching contextual metadata: %s" % (contextual_cls.metadata_urls)) diff --git a/bpaingest/ncbi.py b/bpaingest/ncbi.py index 501a1dd8..7a1eb1f3 100644 --- a/bpaingest/ncbi.py +++ b/bpaingest/ncbi.py @@ -76,6 +76,9 @@ def _read_2016_submitted(self): _, upload_rows = csv_to_named_tuple('BioProject', fname, mode='rU') return {t.filename for t in upload_rows} + def sample_ids(self): + return list(self.bpaid_biosample.keys()) + def get(self, bpa_id): obj = { 'ncbi_bioproject_accession': self.bioproject_accession, diff --git a/bpaingest/projects/amdb/contextual.py b/bpaingest/projects/amdb/contextual.py index fcb6030b..a38028a9 100644 --- a/bpaingest/projects/amdb/contextual.py +++ b/bpaingest/projects/amdb/contextual.py @@ -211,6 +211,9 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = 
self._package_metadata(self._read_metadata(xlsx_path)) + def sample_ids(self): + return list(self.sample_metadata.keys()) + def filename_metadata(self, *args, **kwargs): return {} @@ -269,19 +272,19 @@ def _read_metadata(self, metadata_path): fld('date_sampled', 'date sampled', coerce=ingest_utils.get_date_isoformat), fld('latitude', 'latitude', coerce=ingest_utils.get_clean_number), fld('longitude', 'longitude', coerce=ingest_utils.get_clean_number), - fld('depth', 'depth'), + fld('depth', 'depth', coerce=ingest_utils.get_clean_number), fld('horizon_classification', 'horizon'), fld('soil_sample_storage_method', 'soil sample storage method'), - fld('geo_loc_name', 'geo_loc'), + fld('geo_loc', 'geo_loc'), fld('location_description', 'location description'), fld('broad_land_use', 'broad land use'), fld('detailed_land_use', 'detailed land use'), fld('general_ecological_zone', 'general ecological zone'), fld('vegetation_type', 'vegetation type'), - fld('vegetation_total_cover', 'vegetation total cover (%)'), - fld('vegetation_dom_trees', 'vegetation dom. trees (%)'), - fld('vegetation_dom_shrubs', 'vegetation dom. shrubs (%)'), - fld('vegetation_dom_grasses', 'vegetation dom. grasses (%)'), + fld('vegetation_total_cover', 'vegetation total cover (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_trees', 'vegetation dom. trees (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_shrubs', 'vegetation dom. shrubs (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_grasses', 'vegetation dom. 
grasses (%)', coerce=ingest_utils.get_clean_number), fld('elevation', 'elevation ()', coerce=ingest_utils.get_clean_number), fld('slope', 'slope (%)', coerce=fix_slope_date), fld('slope_aspect', 'slope aspect (direction or degrees; e.g., nw or 315)'), @@ -303,34 +306,40 @@ def _read_metadata(self, metadata_path): fld('extreme_events', 'extreme events'), fld('soil_moisture', 'soil moisture (%)', coerce=ingest_utils.get_clean_number), fld('color', 'color controlled vocab (10)'), - fld('gravel', 'gravel (%)- ( >2.0 mm)'), + fld('gravel', 'gravel (%)- ( >2.0 mm)', coerce=ingest_utils.get_clean_number), fld('texture', 'texture ()', coerce=ingest_utils.get_clean_number), fld('course_sand', 'course sand (%) (200-2000 m)', coerce=ingest_utils.get_clean_number), fld('fine_sand', 'fine sand (%) - (20-200 m)', coerce=ingest_utils.get_clean_number), fld('sand', 'sand (%)', coerce=ingest_utils.get_clean_number), fld('silt', 'silt (%) (2-20 m)', coerce=ingest_utils.get_clean_number), fld('clay', 'clay (%) (<2 m)', coerce=ingest_utils.get_clean_number), - fld('ammonium_nitrogen', 'ammonium nitrogen (mg/kg)'), - fld('nitrate_nitrogen', 'nitrate nitrogen (mg/kg)'), - fld('phosphorus_colwell', 'phosphorus colwell (mg/kg)'), - fld('potassium_colwell', 'potassium colwell (mg/kg)'), - fld('sulphur', 'sulphur (mg/kg)'), - fld('organic_carbon', 'organic carbon (%)'), - fld('conductivity', 'conductivity (ds/m)'), - fld('ph_level_cacl2', 'ph level (cacl2) (ph)'), - fld('ph_level_h2o', 'ph level (h2o) (ph)'), - fld('dtpa_copper', 'dtpa copper (mg/kg)'), - fld('dtpa_iron', 'dtpa iron (mg/kg)'), - fld('dtpa_manganese', 'dtpa manganese (mg/kg)'), - fld('dtpa_zinc', 'dtpa zinc (mg/kg)'), - fld('exc_aluminium', 'exc. aluminium (meq/100g)'), - fld('exc_calcium', 'exc. calcium (meq/100g)'), - fld('exc_magnesium', 'exc. magnesium (meq/100g)'), - fld('exc_potassium', 'exc. potassium (meq/100g)'), - fld('exc_sodium', 'exc. 
sodium (meq/100g)'), - fld('boron_hot_cacl2', 'boron hot cacl2 (mg/kg)'), + fld('ammonium_nitrogen', 'ammonium nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('nitrate_nitrogen', 'nitrate nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('phosphorus_colwell', 'phosphorus colwell (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('potassium_colwell', 'potassium colwell (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('sulphur', 'sulphur (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('organic_carbon', 'organic carbon (%)', coerce=ingest_utils.get_clean_number), + fld('conductivity', 'conductivity (ds/m)', coerce=ingest_utils.get_clean_number), + fld('ph_level_cacl2', 'ph level (cacl2) (ph)', coerce=ingest_utils.get_clean_number), + fld('ph_level_h2o', 'ph level (h2o) (ph)', coerce=ingest_utils.get_clean_number), + fld('dtpa_copper', 'dtpa copper (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_iron', 'dtpa iron (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_manganese', 'dtpa manganese (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_zinc', 'dtpa zinc (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('exc_aluminium', 'exc. aluminium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_calcium', 'exc. calcium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_magnesium', 'exc. magnesium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_potassium', 'exc. potassium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_sodium', 'exc. 
sodium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('boron_hot_cacl2', 'boron hot cacl2 (mg/kg)', coerce=ingest_utils.get_clean_number), ] - wrapper = ExcelWrapper(field_spec, metadata_path, sheet_name=None, header_length=1, column_name_row_index=0) + wrapper = ExcelWrapper( + field_spec, + metadata_path, + sheet_name=None, + header_length=1, + column_name_row_index=0, + additional_context={'sample_type': 'Soil', 'environment': 'Soil'}) for error in wrapper.get_errors(): logger.error(error) return wrapper.get_all() @@ -353,129 +362,130 @@ class MarineMicrobesSampleContextual(object): fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('geo_loc', 'geo_loc (country:subregion)'), fld('sample_site', 'sample site'), fld('coastal_id', 'coastal_id'), fld('notes', 'notes'), - fld('ph_level', 'ph level (h2o) (ph)'), - fld('oxygen_lab', 'oxygen (mol/l) lab'), - fld('oxygen_ctd', 'oxygen (ml/l) ctd'), - fld('nitrate_nitrite', 'nitrate/nitrite (mol/l)'), - fld('phosphate', 'phosphate (mol/l)'), - fld('ammonium', 'ammonium (mol/l)'), - fld('total_co2', 'total co2 (mol/kg)'), - fld('total_alkalinity', 'total alkalinity (mol/kg)'), - fld('temperature', 'temperature [its-90, deg c]'), - fld('conductivity', 'conductivity [s/m]'), - fld('turbidity', 'turbidity (upoly 0, wet labs flnturt)'), - fld('salinity', 'salinity [psu] laboratory'), - fld('microbial_abundance', 'microbial abundance (cells per ml)'), - fld('chlorophyll_a', 
'chlorophyll a (g/l)'), - fld('per_total_carbon', '%total carbon'), - fld('per_total_inorganc_carbon', '% total inorganc carbon'), - fld('light_intensity', 'light intensity (lux)'), + fld('ph_level', 'ph level (h2o) (ph)', coerce=ingest_utils.get_clean_number), + fld('oxygen_lab', 'oxygen (mol/l) lab', coerce=ingest_utils.get_clean_number), + fld('oxygen_ctd', 'oxygen (ml/l) ctd', coerce=ingest_utils.get_clean_number), + fld('nitrate_nitrite', 'nitrate/nitrite (mol/l)', coerce=ingest_utils.get_clean_number), + fld('phosphate', 'phosphate (mol/l)', coerce=ingest_utils.get_clean_number), + fld('ammonium', 'ammonium (mol/l)', coerce=ingest_utils.get_clean_number), + fld('total_co2', 'total co2 (mol/kg)', coerce=ingest_utils.get_clean_number), + fld('total_alkalinity', 'total alkalinity (mol/kg)', coerce=ingest_utils.get_clean_number), + fld('temperature', 'temperature [its-90, deg c]', coerce=ingest_utils.get_clean_number), + fld('conductivity', 'conductivity [s/m]', coerce=ingest_utils.get_clean_number), + fld('turbidity', 'turbidity (upoly 0, wet labs flnturt)', coerce=ingest_utils.get_clean_number), + fld('salinity', 'salinity [psu] laboratory', coerce=ingest_utils.get_clean_number), + fld('microbial_abundance', 'microbial abundance (cells per ml)', coerce=ingest_utils.get_clean_number), + fld('chlorophyll_a', 'chlorophyll a (g/l)', coerce=ingest_utils.get_clean_number), + fld('per_total_carbon', '%total carbon', coerce=ingest_utils.get_clean_number), + fld('per_total_inorganc_carbon', '% total inorganc carbon', coerce=ingest_utils.get_clean_number), + fld('light_intensity', 'light intensity (lux)', coerce=ingest_utils.get_clean_number), ], 'Coral': [ fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal 
degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('geo_loc', 'geo_loc (country:subregion)'), fld('sample_site', 'sample site'), fld('coastal_id', 'coastal_id'), fld('host_species', 'host species'), fld('notes', 'notes'), - fld('pulse_amplitude_modulated_fluorometer_measurement', 'pulse amplitude modulated (pam) fluorometer measurement'), + fld( + 'pulse_amplitude_modulated_fluorometer_measurement', + 'pulse amplitude modulated (pam) fluorometer measurement', + coerce=ingest_utils.get_clean_number), fld('host_state', 'host state (free text field)'), - fld('host_abundance', 'host abundance (individuals per m2)'), + fld('host_abundance', 'host abundance (individuals per m2)', coerce=ingest_utils.get_clean_number), ], 'Pelagic_Public': [ fld('bpa_id', 'id', coerce=ingest_utils.extract_bpa_id), fld('organism', 'organism'), - fld('tax_id', 'tax id'), - fld('samplename_depth', 'samplename_depth'), - fld('ncbi_bioproject', 'ncbi bioproject'), + fld('tax_id', 'tax id', coerce=ingest_utils.get_clean_number), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('nrs_location_code_voyage_code', 'nrs_location_code; voyage_code'), fld('nrs_trip_code', 'nrs_trip_code'), fld('nrs_sample_code', 'nrs_sample_code'), fld('geo_loc', 'geo_loc (country:subregion)'), 
fld('sample_site', 'sample site'), fld('notes', 'notes'), - fld('ph_level', 'ph level (h2o) (ph)'), - fld('fluorescence', 'fluorescence (au)'), - fld('transmittance', 'transmittance (%)'), - fld('secchi_depth', 'secchi depth (m)'), - fld('bottom_depth', 'bottom depth'), - fld('pressure_bottle', 'pressure bottle'), - fld('temperature', 'temperature: ctd [its-90, deg c]'), - fld('salinity_ctd', 'salinity [psu] ctd'), - fld('oxygen_ctd', 'oxygen (mol/kg) ctd'), - fld('density', 'density [density, kg/m^3] ctd'), - fld('turbidity', 'turbidity (nephelometric turbidity units) ctd'), - fld('chlf_ctd', 'chlf: ctd'), - fld('silicate', 'silicate (mol/l)'), - fld('nitrate_nitrite', 'nitrate/nitrite (mol/l)'), - fld('nitrite', 'nitrite (mol/l)'), - fld('phosphate', 'phosphate (mol/l)'), - fld('ammonium', 'ammonium (mol/l)'), - fld('salinity_lab', 'salinity [psu] lab'), - fld('oxygen_lab', 'oxygen (mol/l) lab'), - fld('total_co2', 'total co2 (mol/kg)'), - fld('total_alkalinity', 'total alkalinity (mol/kg)'), - fld('tss', 'tss [mg/l]'), - fld('inorganic_fraction', 'inorganic fraction [mg/l]'), - fld('organic_fraction', 'organic fraction [mg/l]'), - fld('allo', 'allo [mg/m3]'), - fld('alpha_beta_car', 'alpha_beta_car [mg/m3]'), - fld('anth', 'anth [mg/m3]'), - fld('asta', 'asta [mg/m3]'), - fld('beta_beta_car', 'beta_beta_car [mg/m3]'), - fld('beta_epi_car', 'beta_epi_car [mg/m3]'), - fld('but_fuco', 'but_fuco [mg/m3]'), - fld('cantha', 'cantha [mg/m3]'), - fld('cphl_a', 'cphl_a [mg/m3]'), - fld('cphl_b', 'cphl_b [mg/m3]'), - fld('cphl_c1c2', 'cphl_c1c2 [mg/m3]'), - fld('cphl_c1', 'cphl_c1 [mg/m3]'), - fld('cphl_c2', 'cphl_c2 [mg/m3]'), - fld('cphl_c3', 'cphl_c3 [mg/m3]'), - fld('cphlide_a', 'cphlide_a [mg/m3]'), - fld('diadchr', 'diadchr [mg/m3]'), - fld('diadino', 'diadino [mg/m3]'), - fld('diato', 'diato [mg/m3]'), - fld('dino', 'dino [mg/m3]'), - fld('dv_cphl_a_and_cphl_a', 'dv_cphl_a_and_cphl_a [mg/m3]'), - fld('dv_cphl_a', 'dv_cphl_a [mg/m3]'), - fld('dv_cphl_b_and_cphl_b', 
'dv_cphl_b_and_cphl_b [mg/m3]'), - fld('dv_cphl_b', 'dv_cphl_b [mg/m3]'), - fld('echin', 'echin [mg/m3]'), - fld('fuco', 'fuco [mg/m3]'), - fld('gyro', 'gyro [mg/m3]'), - fld('hex_fuco', 'hex_fuco [mg/m3]'), - fld('keto_hex_fuco', 'keto_hex_fuco [mg/m3]'), - fld('lut', 'lut [mg/m3]'), - fld('lyco', 'lyco [mg/m3]'), - fld('mg_dvp', 'mg_dvp [mg/m3]'), - fld('neo', 'neo [mg/m3]'), - fld('perid', 'perid [mg/m3]'), - fld('phide_a', 'phide_a [mg/m3]'), - fld('phytin_a', 'phytin_a [mg/m3]'), - fld('phytin_b', 'phytin_b [mg/m3]'), - fld('pras', 'pras [mg/m3]'), - fld('pyrophide_a', 'pyrophide_a [mg/m3]'), - fld('pyrophytin_a', 'pyrophytin_a [mg/m3]'), - fld('viola', 'viola [mg/m3]'), - fld('zea', 'zea [mg/m3]'), + fld('ph_level', 'ph level (h2o) (ph)', coerce=ingest_utils.get_clean_number), + fld('fluorescence', 'fluorescence (au)', coerce=ingest_utils.get_clean_number), + fld('transmittance', 'transmittance (%)', coerce=ingest_utils.get_clean_number), + fld('secchi_depth', 'secchi depth (m)', coerce=ingest_utils.get_clean_number), + fld('bottom_depth', 'bottom depth', coerce=ingest_utils.get_clean_number), + fld('pressure_bottle', 'pressure bottle', coerce=ingest_utils.get_clean_number), + fld('temperature', 'temperature: ctd [its-90, deg c]', coerce=ingest_utils.get_clean_number), + fld('salinity_ctd', 'salinity [psu] ctd', coerce=ingest_utils.get_clean_number), + fld('oxygen_ctd', 'oxygen (mol/kg) ctd', coerce=ingest_utils.get_clean_number), + fld('density', 'density [density, kg/m^3] ctd', coerce=ingest_utils.get_clean_number), + fld('turbidity', 'turbidity (nephelometric turbidity units) ctd', coerce=ingest_utils.get_clean_number), + fld('chlf_ctd', 'chlf: ctd', coerce=ingest_utils.get_clean_number), + fld('silicate', 'silicate (mol/l)', coerce=ingest_utils.get_clean_number), + fld('nitrate_nitrite', 'nitrate/nitrite (mol/l)', coerce=ingest_utils.get_clean_number), + fld('nitrite', 'nitrite (mol/l)', coerce=ingest_utils.get_clean_number), + fld('phosphate', 'phosphate 
(mol/l)', coerce=ingest_utils.get_clean_number), + fld('ammonium', 'ammonium (mol/l)', coerce=ingest_utils.get_clean_number), + fld('salinity_lab', 'salinity [psu] lab', coerce=ingest_utils.get_clean_number), + fld('oxygen_lab', 'oxygen (mol/l) lab', coerce=ingest_utils.get_clean_number), + fld('total_co2', 'total co2 (mol/kg)', coerce=ingest_utils.get_clean_number), + fld('total_alkalinity', 'total alkalinity (mol/kg)', coerce=ingest_utils.get_clean_number), + fld('tss', 'tss [mg/l]', coerce=ingest_utils.get_clean_number), + fld('inorganic_fraction', 'inorganic fraction [mg/l]', coerce=ingest_utils.get_clean_number), + fld('organic_fraction', 'organic fraction [mg/l]', coerce=ingest_utils.get_clean_number), + fld('allo', 'allo [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('alpha_beta_car', 'alpha_beta_car [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('anth', 'anth [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('asta', 'asta [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('beta_beta_car', 'beta_beta_car [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('beta_epi_car', 'beta_epi_car [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('but_fuco', 'but_fuco [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cantha', 'cantha [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_a', 'cphl_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_b', 'cphl_b [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_c1c2', 'cphl_c1c2 [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_c1', 'cphl_c1 [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_c2', 'cphl_c2 [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphl_c3', 'cphl_c3 [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('cphlide_a', 'cphlide_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('diadchr', 'diadchr [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('diadino', 'diadino [mg/m3]', 
coerce=ingest_utils.get_clean_number), + fld('diato', 'diato [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('dino', 'dino [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('dv_cphl_a_and_cphl_a', 'dv_cphl_a_and_cphl_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('dv_cphl_a', 'dv_cphl_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('dv_cphl_b_and_cphl_b', 'dv_cphl_b_and_cphl_b [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('dv_cphl_b', 'dv_cphl_b [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('echin', 'echin [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('fuco', 'fuco [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('gyro', 'gyro [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('hex_fuco', 'hex_fuco [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('keto_hex_fuco', 'keto_hex_fuco [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('lut', 'lut [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('lyco', 'lyco [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('mg_dvp', 'mg_dvp [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('neo', 'neo [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('perid', 'perid [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('phide_a', 'phide_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('phytin_a', 'phytin_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('phytin_b', 'phytin_b [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('pras', 'pras [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('pyrophide_a', 'pyrophide_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('pyrophytin_a', 'pyrophytin_a [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('viola', 'viola [mg/m3]', coerce=ingest_utils.get_clean_number), + fld('zea', 'zea [mg/m3]', coerce=ingest_utils.get_clean_number), ], 'Seagrass': [ fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), @@ -489,28 +499,38 @@ class 
MarineMicrobesSampleContextual(object): fld('coastal_id', 'coastal_id'), fld('host_species', 'host species'), fld('notes', 'notes'), - fld('pulse_amplitude_modulated_pam_fluorometer_measurement', 'pulse amplitude modulated (pam) fluorometer measurement', coerce=ingest_utils.get_clean_number), + fld( + 'pulse_amplitude_modulated_pam_fluorometer_measurement', + 'pulse amplitude modulated (pam) fluorometer measurement', + coerce=ingest_utils.get_clean_number), fld('host_state', 'host state (free text field)'), fld('host_abundance', 'host abundance (individuals per m2)', coerce=ingest_utils.get_clean_number), - fld('light_intensity_surface', re.compile(r'^light intensity \(surface\).*'), coerce=ingest_utils.get_clean_number), - fld('light_intensity_meadow', re.compile(r'^light intensity \(meadow\).*'), coerce=ingest_utils.get_clean_number), + fld( + 'light_intensity_surface', + re.compile(r'^light intensity \(surface\).*'), + coerce=ingest_utils.get_clean_number), + fld( + 'light_intensity_meadow', + re.compile(r'^light intensity \(meadow\).*'), + coerce=ingest_utils.get_clean_number), ], 'Seaweed': [ fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('geo_loc', 'geo_loc (country:subregion)'), fld('sample_site', 'sample site'), fld('coastal_id', 'coastal_id'), fld('host_species', 'host species'), fld('notes', 'notes'), - fld('pulse_amplitude_modulated_pam_fluorometer_measurement', 'pulse amplitude modulated (pam) fluorometer 
measurement'), + fld( + 'pulse_amplitude_modulated_pam_fluorometer_measurement', + 'pulse amplitude modulated (pam) fluorometer measurement', + coerce=ingest_utils.get_clean_number), fld('host_state', 'host state (free text field)'), - fld('host_abundance', 'average host abundance (% of individuals per m2)'), - fld('length', 'length(cm)', coerce=ingest_utils.get_clean_number), fld('fouling', 'fouling', coerce=ingest_utils.get_clean_number), fld('fouling_organisms', 'fouling_organisms'), fld('bleaching', 'bleaching (%)', coerce=ingest_utils.get_clean_number), @@ -521,33 +541,33 @@ class MarineMicrobesSampleContextual(object): fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('geo_loc', 'geo_loc (country:subregion)'), fld('sample_site', 'sample site'), fld('coastal_id', 'coastal_id'), fld('notes', 'notes'), - fld('per_total_carbon', '%total carbon'), - fld('per_fine_sediment', '% fine sediment'), - fld('per_total_nitrogen', '% total nitrogen'), - fld('per_total_phosphorous', '% total phosphorous'), - fld('sedimentation_rate', 'sedimentation rate (g /(cm2 x y)r)'), + fld('per_total_carbon', '%total carbon', coerce=ingest_utils.get_clean_number), + fld('per_fine_sediment', '% fine sediment', coerce=ingest_utils.get_clean_number), + fld('per_total_nitrogen', '% total nitrogen', coerce=ingest_utils.get_clean_number), + fld('per_total_phosphorous', '% total phosphorous', coerce=ingest_utils.get_clean_number), + 
fld('sedimentation_rate', 'sedimentation rate (g /(cm2 x y)r)', coerce=ingest_utils.get_clean_number), ], 'Sponge': [ fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), - fld('latitude', 'latitude (decimal degrees)'), - fld('longitude', 'longitude (decimal degrees)'), - fld('depth', 'depth (m)'), + fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude (decimal degrees)', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth (m)', coerce=ingest_utils.get_clean_number), fld('geo_loc', 'geo_loc (country:subregion)'), fld('sample_site', 'sample site'), fld('coastal_id', 'coastal_id'), fld('host_species', 'host species'), fld('notes', 'notes'), fld('host_state', 'host state (free text field)'), - fld('host_abundance', 'host abundance (individuals per m2)'), + fld('host_abundance', 'host abundance (individuals per m2)', coerce=ingest_utils.get_clean_number), ] } @@ -555,6 +575,9 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) + def sample_ids(self): + return list(self.sample_metadata.keys()) + def get(self, bpa_id): if bpa_id in self.sample_metadata: return self.sample_metadata[bpa_id] @@ -588,7 +611,7 @@ def _read_metadata(self, metadata_path): header_length=1, column_name_row_index=0, suggest_template=True, - additional_context={'sample_type': sheet_name}) + additional_context={'sample_type': sheet_name, 'environment': 'Marine'}) for error in wrapper.get_errors(): logger.error(error) rows += wrapper.get_all() @@ -600,5 +623,5 @@ def filename_metadata(self, *args, **kwargs): class MarineMicrobesNCBIContextual(NCBISRAContextual): metadata_urls = ['https://downloads-qcif.bioplatforms.com/bpa/marine_microbes/metadata/ncbi/'] - name = 
'base-ncbi-contextual' + name = 'mm-ncbi-contextual' bioproject_accession = 'PRJNA385736' diff --git a/bpaingest/projects/amdb/ingest.py b/bpaingest/projects/amdb/ingest.py index 9d0d6712..9f20d30d 100755 --- a/bpaingest/projects/amdb/ingest.py +++ b/bpaingest/projects/amdb/ingest.py @@ -18,7 +18,9 @@ MarineMicrobesTrackMetadata) logger = make_logger(__name__) -common_context = [BASESampleContextual, BASENCBIContextual] + +base_common_context = [BASESampleContextual, BASENCBIContextual] +marine_common_context = [MarineMicrobesSampleContextual, MarineMicrobesNCBIContextual] # fixed read lengths provided by AB at CSIRO @@ -48,13 +50,50 @@ def build_base_amplicon_linkage(index_linkage, flow_id, index): return flow_id +class BaseContextualAccessMetadata(): + """ + for use by tools (e.g. bpaotu) which need access to the contextual metadata for all + AMD data, but not package or resource metadata + """ + + def __init__(self, metadata_path, contextual_metadata=None, metadata_info=None): + super().__init__() + self.contextual_metadata = contextual_metadata + + def _get_packages(self): + return [] + + def _get_resources(self): + return [] + + +class AccessBASEContextualMetadata(BaseContextualAccessMetadata): + auth = ('base', 'base') + """ + for use by tools (e.g. bpaotu) which need access to the contextual metadata for all + AMD data + """ + contextual_classes = base_common_context + metadata_urls = [] + + +class AccessMarineMicrobesContextualMetadata(BaseContextualAccessMetadata): + auth = ('marine', 'marine') + """ + for use by tools (e.g. 
bpaotu) which need access to the contextual metadata for all + AMD data + """ + contextual_classes = marine_common_context + metadata_urls = [] + + class BASEAmpliconsMetadata(BaseMetadata): auth = ('base', 'base') organization = 'australian-microbiome' ckan_data_type = 'base-genomics-amplicon' omics = 'genomics' technology = 'amplicons' - contextual_classes = common_context + contextual_classes = base_common_context metadata_patterns = [r'^.*\.md5$', r'^.*_metadata.*.*\.xlsx$'] metadata_urls = [ 'https://downloads-qcif.bioplatforms.com/bpa/base/raw/amplicons/', @@ -241,7 +280,7 @@ class BASEAmpliconsControlMetadata(BaseMetadata): ckan_data_type = 'base-genomics-amplicon-control' omics = 'genomics' technology = 'amplicons-control' - contextual_classes = common_context + contextual_classes = base_common_context metadata_patterns = [r'^.*\.md5$', r'^.*_metadata.*.*\.xlsx$'] metadata_urls = [ 'https://downloads-qcif.bioplatforms.com/bpa/base/raw/amplicons/', @@ -332,7 +371,7 @@ class BASEMetagenomicsMetadata(BaseMetadata): organization = 'australian-microbiome' ckan_data_type = 'base-metagenomics' omics = 'metagenomics' - contextual_classes = common_context + contextual_classes = base_common_context metadata_patterns = [r'^.*\.md5$', r'^.*_metadata.*.*\.xlsx$'] metadata_urls = [ 'https://downloads-qcif.bioplatforms.com/bpa/base/raw/metagenomics/', @@ -527,7 +566,7 @@ class BASESiteImagesMetadata(BaseMetadata): auth = ('base', 'base') organization = 'australian-microbiome' ckan_data_type = 'base-site-image' - contextual_classes = common_context + contextual_classes = base_common_context metadata_patterns = [r'^.*\.md5$'] omics = None technology = 'site-images' @@ -618,11 +657,6 @@ def _get_resources(self): return resources -index_from_comment_re = re.compile(r'([G|A|T|C|-]{6,}_[G|A|T|C|-]{6,})') -index_from_comment_pilot_re = re.compile(r'_([G|A|T|C|-]{6,})_') - -common_context = [MarineMicrobesSampleContextual, MarineMicrobesNCBIContextual] - read_lengths = { '16S': 
'300bp', 'A16S': '300bp', @@ -634,6 +668,10 @@ def mm_amplicon_read_length(amplicon): return read_lengths[amplicon] +index_from_comment_re = re.compile(r'([G|A|T|C|-]{6,}_[G|A|T|C|-]{6,})') +index_from_comment_pilot_re = re.compile(r'_([G|A|T|C|-]{6,})_') + + def index_from_comment(attrs): # return the index from a comment (for linkage on pilot data) # 34865_1_18S_UNSW_ATCTCAGG_GTAAGGAG_AWMVL @@ -686,7 +724,7 @@ class BaseMarineMicrobesAmpliconsMetadata(BaseMarineMicrobesMetadata): organization = 'australian-microbiome' ckan_data_type = 'mm-genomics-amplicon' omics = 'genomics' - contextual_classes = common_context + contextual_classes = marine_common_context metadata_patterns = [r'^.*\.md5', r'^.*_metadata.*.*\.xlsx'] resource_linkage = ('bpa_id', 'mm_amplicon_linkage') spreadsheet = { @@ -976,7 +1014,7 @@ class MarineMicrobesMetagenomicsMetadata(BaseMarineMicrobesMetadata): organization = 'australian-microbiome' ckan_data_type = 'mm-metagenomics' omics = 'metagenomics' - contextual_classes = common_context + contextual_classes = marine_common_context metadata_patterns = [r'^.*\.md5', r'^.*_metadata.*\.xlsx'] metadata_urls = [ 'https://downloads-qcif.bioplatforms.com/bpa/marine_microbes/raw/metagenomics/' @@ -1084,7 +1122,7 @@ class MarineMicrobesMetatranscriptomeMetadata(BaseMarineMicrobesMetadata): auth = ('marine', 'marine') organization = 'australian-microbiome' ckan_data_type = 'mm-metatranscriptome' - contextual_classes = common_context + contextual_classes = marine_common_context omics = 'metatranscriptomics' metadata_patterns = [r'^.*\.md5', r'^.*_metadata.*\.xlsx'] metadata_urls = [ diff --git a/setup.py b/setup.py index 63db7163..6f547135 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ keywords="", url="https://github.com/muccg/bpa-ingest", name="bpaingest", - version="5.0.6" + version="5.0.6", packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), entry_points={ 'console_scripts': [ From 
88712b19ad541801b023608118cd8ca2f851ea35 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Thu, 15 Nov 2018 14:52:40 +0800 Subject: [PATCH 2/6] remove build keys --- Dockerfile-builder | 1 - docker-build.key | 7 ------- docker-build.pub | 1 - 3 files changed, 9 deletions(-) delete mode 100644 docker-build.key delete mode 100644 docker-build.pub diff --git a/Dockerfile-builder b/Dockerfile-builder index 1b1297ee..f820dc0c 100644 --- a/Dockerfile-builder +++ b/Dockerfile-builder @@ -9,7 +9,6 @@ ENV NO_PROXY ${PIP_TRUSTED_HOST} RUN env | sort -COPY docker-build.key /root/.ssh/id_rsa RUN chmod 600 /root/.ssh/id_rsa && \ echo "StrictHostKeyChecking no" > /root/.ssh/config && \ echo "UserKnownHostsFile /dev/null" >> /root/.ssh/config diff --git a/docker-build.key b/docker-build.key deleted file mode 100644 index 4f0a1c44..00000000 --- a/docker-build.key +++ /dev/null @@ -1,7 +0,0 @@ ------BEGIN OPENSSH PRIVATE KEY----- -b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW -QyNTUxOQAAACCIRLueNd7wkSRQm8GrPUK3o2gFK8IppK+n2lB8l7dqkAAAAJht6MX6bejF -+gAAAAtzc2gtZWQyNTUxOQAAACCIRLueNd7wkSRQm8GrPUK3o2gFK8IppK+n2lB8l7dqkA -AAAEBupLXbQdr+9uZNleBpL1VYGk1lWx7w3fh/c3ISBxC1nYhEu5413vCRJFCbwas9Qrej -aAUrwimkr6faUHyXt2qQAAAAEGdib3dsYW5kQHNhdHN1bWEBAgMEBQ== ------END OPENSSH PRIVATE KEY----- diff --git a/docker-build.pub b/docker-build.pub deleted file mode 100644 index 9b1aa273..00000000 --- a/docker-build.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIhEu5413vCRJFCbwas9QrejaAUrwimkr6faUHyXt2qQ From 181c7d7c7ff017c06c1e59ee86470317abd758c3 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Thu, 15 Nov 2018 14:53:01 +0800 Subject: [PATCH 3/6] syntax error in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 63db7163..6f547135 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ keywords="", url="https://github.com/muccg/bpa-ingest", name="bpaingest", - version="5.0.6" + 
version="5.0.6", packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), entry_points={ 'console_scripts': [ From 0476d8371320a9a8d66446f485a52dee025c0dc7 Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Thu, 15 Nov 2018 16:41:07 +0800 Subject: [PATCH 4/6] expose BASE contextual field_spec --- bpaingest/projects/amdb/contextual.py | 134 +++++++++++++------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/bpaingest/projects/amdb/contextual.py b/bpaingest/projects/amdb/contextual.py index a38028a9..37e1d42e 100644 --- a/bpaingest/projects/amdb/contextual.py +++ b/bpaingest/projects/amdb/contextual.py @@ -206,6 +206,72 @@ def fix_slope_date(val): class BASESampleContextual: metadata_urls = ['https://downloads-qcif.bioplatforms.com/bpa/base/metadata/contextual/2017-06-28/'] name = 'base-contextual' + field_spec = [ + fld('bpa_id', 'sample_id', coerce=ingest_utils.extract_bpa_id), + fld('date_sampled', 'date sampled', coerce=ingest_utils.get_date_isoformat), + fld('latitude', 'latitude', coerce=ingest_utils.get_clean_number), + fld('longitude', 'longitude', coerce=ingest_utils.get_clean_number), + fld('depth', 'depth', coerce=ingest_utils.get_clean_number), + fld('horizon_classification', 'horizon'), + fld('soil_sample_storage_method', 'soil sample storage method'), + fld('geo_loc', 'geo_loc'), + fld('location_description', 'location description'), + fld('broad_land_use', 'broad land use'), + fld('detailed_land_use', 'detailed land use'), + fld('general_ecological_zone', 'general ecological zone'), + fld('vegetation_type', 'vegetation type'), + fld('vegetation_total_cover', 'vegetation total cover (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_trees', 'vegetation dom. trees (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_shrubs', 'vegetation dom. shrubs (%)', coerce=ingest_utils.get_clean_number), + fld('vegetation_dom_grasses', 'vegetation dom. 
grasses (%)', coerce=ingest_utils.get_clean_number), + fld('elevation', 'elevation ()', coerce=ingest_utils.get_clean_number), + fld('slope', 'slope (%)', coerce=fix_slope_date), + fld('slope_aspect', 'slope aspect (direction or degrees; e.g., nw or 315)'), + fld('profile_position', 'profile position controlled vocab (5)'), + fld('australian_soil_classification', 'australian soil classification controlled vocab (6)'), + fld('fao_soil_classification', 'fao soil classification controlled vocab (7)'), + fld('immediate_previous_land_use', 'immediate previous land use controlled vocab (2)'), + fld('date_since_change_in_land_use', 'date since change in land use'), + fld('crop_rotation_1yr_since_present', 'crop rotation 1yr since present'), + fld('crop_rotation_2yrs_since_present', 'crop rotation 2yrs since present'), + fld('crop_rotation_3yrs_since_present', 'crop rotation 3yrs since present'), + fld('crop_rotation_4yrs_since_present', 'crop rotation 4yrs since present'), + fld('crop_rotation_5yrs_since_present', 'crop rotation 5yrs since present'), + fld('agrochemical_additions', 'agrochemical additions'), + fld('tillage', 'tillage controlled vocab (9)'), + fld('fire_history', 'fire', coerce=fix_sometimes_date), + fld('fire_intensity_if_known', 'fire intensity if known'), + fld('flooding', 'flooding', coerce=fix_sometimes_date), + fld('extreme_events', 'extreme events'), + fld('soil_moisture', 'soil moisture (%)', coerce=ingest_utils.get_clean_number), + fld('color', 'color controlled vocab (10)'), + fld('gravel', 'gravel (%)- ( >2.0 mm)', coerce=ingest_utils.get_clean_number), + fld('texture', 'texture ()', coerce=ingest_utils.get_clean_number), + fld('course_sand', 'course sand (%) (200-2000 m)', coerce=ingest_utils.get_clean_number), + fld('fine_sand', 'fine sand (%) - (20-200 m)', coerce=ingest_utils.get_clean_number), + fld('sand', 'sand (%)', coerce=ingest_utils.get_clean_number), + fld('silt', 'silt (%) (2-20 m)', coerce=ingest_utils.get_clean_number), + 
fld('clay', 'clay (%) (<2 m)', coerce=ingest_utils.get_clean_number), + fld('ammonium_nitrogen', 'ammonium nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('nitrate_nitrogen', 'nitrate nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('phosphorus_colwell', 'phosphorus colwell (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('potassium_colwell', 'potassium colwell (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('sulphur', 'sulphur (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('organic_carbon', 'organic carbon (%)', coerce=ingest_utils.get_clean_number), + fld('conductivity', 'conductivity (ds/m)', coerce=ingest_utils.get_clean_number), + fld('ph_level_cacl2', 'ph level (cacl2) (ph)', coerce=ingest_utils.get_clean_number), + fld('ph_level_h2o', 'ph level (h2o) (ph)', coerce=ingest_utils.get_clean_number), + fld('dtpa_copper', 'dtpa copper (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_iron', 'dtpa iron (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_manganese', 'dtpa manganese (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('dtpa_zinc', 'dtpa zinc (mg/kg)', coerce=ingest_utils.get_clean_number), + fld('exc_aluminium', 'exc. aluminium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_calcium', 'exc. calcium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_magnesium', 'exc. magnesium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_potassium', 'exc. potassium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('exc_sodium', 'exc. 
sodium (meq/100g)', coerce=ingest_utils.get_clean_number), + fld('boron_hot_cacl2', 'boron hot cacl2 (mg/kg)', coerce=ingest_utils.get_clean_number), + ] def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) @@ -267,74 +333,8 @@ def _package_metadata(self, rows): return sample_metadata def _read_metadata(self, metadata_path): - field_spec = [ - fld('bpa_id', 'sample_id', coerce=ingest_utils.extract_bpa_id), - fld('date_sampled', 'date sampled', coerce=ingest_utils.get_date_isoformat), - fld('latitude', 'latitude', coerce=ingest_utils.get_clean_number), - fld('longitude', 'longitude', coerce=ingest_utils.get_clean_number), - fld('depth', 'depth', coerce=ingest_utils.get_clean_number), - fld('horizon_classification', 'horizon'), - fld('soil_sample_storage_method', 'soil sample storage method'), - fld('geo_loc', 'geo_loc'), - fld('location_description', 'location description'), - fld('broad_land_use', 'broad land use'), - fld('detailed_land_use', 'detailed land use'), - fld('general_ecological_zone', 'general ecological zone'), - fld('vegetation_type', 'vegetation type'), - fld('vegetation_total_cover', 'vegetation total cover (%)', coerce=ingest_utils.get_clean_number), - fld('vegetation_dom_trees', 'vegetation dom. trees (%)', coerce=ingest_utils.get_clean_number), - fld('vegetation_dom_shrubs', 'vegetation dom. shrubs (%)', coerce=ingest_utils.get_clean_number), - fld('vegetation_dom_grasses', 'vegetation dom. 
grasses (%)', coerce=ingest_utils.get_clean_number), - fld('elevation', 'elevation ()', coerce=ingest_utils.get_clean_number), - fld('slope', 'slope (%)', coerce=fix_slope_date), - fld('slope_aspect', 'slope aspect (direction or degrees; e.g., nw or 315)'), - fld('profile_position', 'profile position controlled vocab (5)'), - fld('australian_soil_classification', 'australian soil classification controlled vocab (6)'), - fld('fao_soil_classification', 'fao soil classification controlled vocab (7)'), - fld('immediate_previous_land_use', 'immediate previous land use controlled vocab (2)'), - fld('date_since_change_in_land_use', 'date since change in land use'), - fld('crop_rotation_1yr_since_present', 'crop rotation 1yr since present'), - fld('crop_rotation_2yrs_since_present', 'crop rotation 2yrs since present'), - fld('crop_rotation_3yrs_since_present', 'crop rotation 3yrs since present'), - fld('crop_rotation_4yrs_since_present', 'crop rotation 4yrs since present'), - fld('crop_rotation_5yrs_since_present', 'crop rotation 5yrs since present'), - fld('agrochemical_additions', 'agrochemical additions'), - fld('tillage', 'tillage controlled vocab (9)'), - fld('fire_history', 'fire', coerce=fix_sometimes_date), - fld('fire_intensity_if_known', 'fire intensity if known'), - fld('flooding', 'flooding', coerce=fix_sometimes_date), - fld('extreme_events', 'extreme events'), - fld('soil_moisture', 'soil moisture (%)', coerce=ingest_utils.get_clean_number), - fld('color', 'color controlled vocab (10)'), - fld('gravel', 'gravel (%)- ( >2.0 mm)', coerce=ingest_utils.get_clean_number), - fld('texture', 'texture ()', coerce=ingest_utils.get_clean_number), - fld('course_sand', 'course sand (%) (200-2000 m)', coerce=ingest_utils.get_clean_number), - fld('fine_sand', 'fine sand (%) - (20-200 m)', coerce=ingest_utils.get_clean_number), - fld('sand', 'sand (%)', coerce=ingest_utils.get_clean_number), - fld('silt', 'silt (%) (2-20 m)', coerce=ingest_utils.get_clean_number), - 
fld('clay', 'clay (%) (<2 m)', coerce=ingest_utils.get_clean_number), - fld('ammonium_nitrogen', 'ammonium nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('nitrate_nitrogen', 'nitrate nitrogen (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('phosphorus_colwell', 'phosphorus colwell (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('potassium_colwell', 'potassium colwell (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('sulphur', 'sulphur (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('organic_carbon', 'organic carbon (%)', coerce=ingest_utils.get_clean_number), - fld('conductivity', 'conductivity (ds/m)', coerce=ingest_utils.get_clean_number), - fld('ph_level_cacl2', 'ph level (cacl2) (ph)', coerce=ingest_utils.get_clean_number), - fld('ph_level_h2o', 'ph level (h2o) (ph)', coerce=ingest_utils.get_clean_number), - fld('dtpa_copper', 'dtpa copper (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('dtpa_iron', 'dtpa iron (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('dtpa_manganese', 'dtpa manganese (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('dtpa_zinc', 'dtpa zinc (mg/kg)', coerce=ingest_utils.get_clean_number), - fld('exc_aluminium', 'exc. aluminium (meq/100g)', coerce=ingest_utils.get_clean_number), - fld('exc_calcium', 'exc. calcium (meq/100g)', coerce=ingest_utils.get_clean_number), - fld('exc_magnesium', 'exc. magnesium (meq/100g)', coerce=ingest_utils.get_clean_number), - fld('exc_potassium', 'exc. potassium (meq/100g)', coerce=ingest_utils.get_clean_number), - fld('exc_sodium', 'exc. 
sodium (meq/100g)', coerce=ingest_utils.get_clean_number), - fld('boron_hot_cacl2', 'boron hot cacl2 (mg/kg)', coerce=ingest_utils.get_clean_number), - ] wrapper = ExcelWrapper( - field_spec, + self.field_spec, metadata_path, sheet_name=None, header_length=1, From 295e199d35dc8daac065c7527b61370bd9b0f4ce Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Thu, 15 Nov 2018 23:11:26 +0800 Subject: [PATCH 5/6] initial brute rename --- bpaingest/abstract.py | 2 +- bpaingest/genhash.py | 2 +- .../base_contextual_metadata_apply.py | 18 +- .../base_contextual_metadata_sheet.py | 20 +- bpaingest/handlers/ckan_service.py | 10 +- bpaingest/libs/bpa_id_utils.py | 78 ----- bpaingest/libs/ingest_utils.py | 30 +- bpaingest/ncbi.py | 10 +- bpaingest/projects/amdb/contextual.py | 52 ++-- bpaingest/projects/amdb/ingest.py | 134 ++++----- bpaingest/projects/amdb/tracking.py | 10 +- bpaingest/projects/gbr/files.py | 6 +- bpaingest/projects/gbr/ingest.py | 42 +-- bpaingest/projects/omg/contextual.py | 10 +- bpaingest/projects/omg/ingest.py | 58 ++-- bpaingest/projects/sepsis/contextual.py | 74 ++--- bpaingest/projects/sepsis/ingest.py | 267 +++++++++--------- bpaingest/projects/sepsis/tracking.py | 16 +- bpaingest/projects/stemcells/contextual.py | 94 +++--- bpaingest/projects/stemcells/ingest.py | 154 +++++----- bpaingest/projects/wheat_cultivars/files.py | 22 +- bpaingest/projects/wheat_cultivars/ingest.py | 20 +- bpaingest/projects/wheat_cultivars/runs.py | 6 +- .../wheat_pathogens_genomes/ingest.py | 24 +- bpaingest/util.py | 4 +- 25 files changed, 540 insertions(+), 623 deletions(-) delete mode 100755 bpaingest/libs/bpa_id_utils.py diff --git a/bpaingest/abstract.py b/bpaingest/abstract.py index 94ca3007..0f6220f7 100644 --- a/bpaingest/abstract.py +++ b/bpaingest/abstract.py @@ -9,7 +9,7 @@ class BaseMetadata: - resource_linkage = ('bpa_id',) + resource_linkage = ('sample_id',) @classmethod def parse_spreadsheet(cls, fname, metadata_info): diff --git a/bpaingest/genhash.py 
b/bpaingest/genhash.py index c86a33ba..b17c0c08 100644 --- a/bpaingest/genhash.py +++ b/bpaingest/genhash.py @@ -24,7 +24,7 @@ def localpath(mirror_path, legacy_url): def genhash(ckan, meta, mirror_path, num_threads): - def calculate_hashes(bpa_id, legacy_url, resource): + def calculate_hashes(sample_id, legacy_url, resource): fpath = localpath(mirror_path, legacy_url) patch_obj = {} diff --git a/bpaingest/handlers/base_contextual_metadata_apply.py b/bpaingest/handlers/base_contextual_metadata_apply.py index 7aa2d389..f54ed749 100644 --- a/bpaingest/handlers/base_contextual_metadata_apply.py +++ b/bpaingest/handlers/base_contextual_metadata_apply.py @@ -18,7 +18,7 @@ class Handler(GenericHandler): '''Applies BASE contextual metadata values to packages with a given BPA ID. - The function should be set up to be triggered by SNS messages that have the bpa_id and values + The function should be set up to be triggered by SNS messages that have the sample_id and values to apply in them. The packages matching the BPA ID will be looked up from CKAN and an SNS message will be created for each package, containing the package id and the values that have to be applied. 
@@ -40,26 +40,26 @@ class Handler(GenericHandler): SNS_ON_ERROR_SUBJECT = 'ERROR: BASE Contextual Metadata Apply' def handler(self, event, context): - bpa_id, metadata = self._extract_data(event) - logger.info('Processing BPA ID %s', bpa_id) + sample_id, metadata = self._extract_data(event) + logger.info('Processing BPA ID %s', sample_id) ckan_service = set_up_ckan_service(self.env) - packages = ckan_service.get_packages_by_bpa_id(bpa_id) + packages = ckan_service.get_packages_by_sample_id(sample_id) pids_and_changes = [(p['id'], changes(p, metadata)) for p in packages] packages_with_changes = [x for x in pids_and_changes if len(x[1]) > 0] for pid, updates in packages_with_changes: self.sns_ckan_patch_package(pid, updates) unchanged_package_ids = [x[0] for x in pids_and_changes if len(x[1]) == 0] - self.sns_success(bpa_id, packages_with_changes, unchanged_package_ids) + self.sns_success(sample_id, packages_with_changes, unchanged_package_ids) - def sns_success(self, bpa_id, packages_with_changes, unchanged_package_ids): - subject = shorten('BASE Apply Contextual Metadata - BPA ID %s' % bpa_id) + def sns_success(self, sample_id, packages_with_changes, unchanged_package_ids): + subject = shorten('BASE Apply Contextual Metadata - Sample ID %s' % sample_id) changed_count = len(packages_with_changes) unchanged_count = len(unchanged_package_ids) msg = 'Processed BPA ID %s, found %d packages, %d already up-to-date, sent SNS patch requests for %d.' 
% ( - bpa_id, changed_count + unchanged_count, unchanged_count, changed_count) + sample_id, changed_count + unchanged_count, unchanged_count, changed_count) logger.info(msg) if not self.env.sns_on_success: @@ -88,7 +88,7 @@ def sns_ckan_patch_package(self, package_id, updates): def _extract_data(self, event): message = json.loads(event['Records'][0]['Sns']['Message']) - return (message['bpa_id'], message['metadata']) + return (message['sample_id'], message['metadata']) handler = Handler(logger) diff --git a/bpaingest/handlers/base_contextual_metadata_sheet.py b/bpaingest/handlers/base_contextual_metadata_sheet.py index d2f400e8..57b35059 100644 --- a/bpaingest/handlers/base_contextual_metadata_sheet.py +++ b/bpaingest/handlers/base_contextual_metadata_sheet.py @@ -21,12 +21,12 @@ class Handler(GenericHandler): The function should be set up to be triggered by ObjectCreate S3 events for a bucket and path where contextual metadata spreadsheets will be uploaded. - The function will read each row and will create one SNS message for each, containing the bpa_id + The function will read each row and will create one SNS message for each, containing the sample_id and the metadata (rest of the values in the spreadsheet for the row). - The sns_apply_to_bpa_id should be set to the SNS topic arn that will receive these messages. + The sns_apply_to_sample_id should be set to the SNS topic arn that will receive these messages. 
''' ENV_VAR_DEFS = { - 'names': ('sns_apply_to_bpa_id', 'sns_on_success', 'sns_on_error'), + 'names': ('sns_apply_to_sample_id', 'sns_on_success', 'sns_on_error'), } SNS_ON_ERROR_SUBJECT = 'ERROR: BASE Contextual Metadata Sheet' @@ -44,8 +44,8 @@ def handler(self, event, context): contextual = BASESampleContextual(dirname) rows = list(contextual.sample_metadata.items()) - for bpa_id, values in rows: - self.sns_publish_apply_metadata(bpa_id, values) + for sample_id, values in rows: + self.sns_publish_apply_metadata(sample_id, values) self.sns_success(rows) def sns_success(self, rows): @@ -58,11 +58,11 @@ def sns_success(self, rows): Subject=subject, Message=msg) - def sns_publish_apply_metadata(self, bpa_id, metadata): - default = 'Apply contextual metadata to sample bpa_id:%s from %s' % ( - bpa_id, self.metadata_s3_key) + def sns_publish_apply_metadata(self, sample_id, metadata): + default = 'Apply contextual metadata to sample sample_id:%s from %s' % ( + sample_id, self.metadata_s3_key) json_data = json.dumps({ - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'metadata': metadata, }) data = { @@ -72,7 +72,7 @@ def sns_publish_apply_metadata(self, bpa_id, metadata): } sns.publish( - TopicArn=self.env.sns_apply_to_bpa_id, + TopicArn=self.env.sns_apply_to_sample_id, MessageStructure='json', Message=json.dumps(data)) diff --git a/bpaingest/handlers/ckan_service.py b/bpaingest/handlers/ckan_service.py index 2450360c..6ccbcc37 100644 --- a/bpaingest/handlers/ckan_service.py +++ b/bpaingest/handlers/ckan_service.py @@ -42,21 +42,21 @@ def auth_header(self): def auth_admin_header(self): return {'Authorization': self.credentials['CKAN_ADMIN_API_KEY']} - def get_packages_by_bpa_id(self, bpa_id): + def get_packages_by_sample_id(self, sample_id): params = { 'include_private': True, - 'q': 'bpa_id:%s' % bpa_id, + 'q': 'sample_id:%s' % sample_id, } resp = self.session.get(self.urls.package_search, headers=self.auth_header, params=params) try: resp.raise_for_status() json_resp = 
resp.json() if not json_resp['success']: - raise Exception('Package search (by bpa_id) returned success False') + raise Exception('Package search (by sample_id) returned success False') return json_resp['result']['results'] except Exception as exc: - msg = 'Package search (%s) for packages with bpa_id "%s" was NOT successful!' % ( - resp.request.url, bpa_id) + msg = 'Package search (%s) for packages with sample_id "%s" was NOT successful!' % ( + resp.request.url, sample_id) raise Exception(msg) from exc def get_all_resources(self): diff --git a/bpaingest/libs/bpa_id_utils.py b/bpaingest/libs/bpa_id_utils.py deleted file mode 100755 index 91b22bdd..00000000 --- a/bpaingest/libs/bpa_id_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- - -import re - -from ..util import make_logger - -BPA_ID = "102.100.100" - -logger = make_logger(__name__) - - -def get_bpa_id(bpa_idx, add_prefix=False): - """ - Get a BPA ID, if it does not exist, make it - It also creates the necessary project. - :rtype : bpa_id - """ - - if add_prefix is True and bpa_idx is not None: - bpa_idx = BPA_ID + '.' + bpa_idx - - validator = BPAIdValidator(bpa_idx) - if validator.is_valid(): - return bpa_idx - - -class BPAIdValidator(object): - """ - Given a BPA ID string, check validity. 
- """ - - RE_ID = re.compile(r"^102\.100\.100\.\d*", re.MULTILINE) - - def __init__(self, bpa_id): - self.valid_report = None - self.valid = None - if bpa_id is not None: - self.bpa_id = bpa_id.strip() - else: - self.bpa_id = None - - def get_id(self): - """ - Return validated ID - """ - return self.bpa_id - - def is_valid(self): - if self.valid is None: - self.is_valid_bpa_id() - return self.valid - - def is_valid_bpa_id(self): - """ - Determines if id is a valid BPA ID - """ - - if self.bpa_id is None: - self.valid_report = 'BPA ID is None' - self.valid = False - - # empties - elif self.bpa_id == '': - self.valid_report = 'BPA ID is empty string' - self.valid = False - - # no BPA prefix - elif self.bpa_id.find(BPA_ID) == -1: - self.valid_report = 'No "{0}" identifying the string as a BPA ID'.format(BPA_ID) - self.valid = False - - elif self.RE_ID.match(self.bpa_id) is None: - self.valid_report = '{} does not match {}'.format(self.bpa_id, self.RE_ID.pattern) - self.valid = False - - # this function has failed to find a reason why this can't be a BPA ID.... - else: - self.valid = True diff --git a/bpaingest/libs/ingest_utils.py b/bpaingest/libs/ingest_utils.py index a290ee29..ca44e9d8 100755 --- a/bpaingest/libs/ingest_utils.py +++ b/bpaingest/libs/ingest_utils.py @@ -7,11 +7,11 @@ logger = make_logger(__name__) -bpa_id_re = re.compile(r'^102\.100\.100[/\.](\d+)$') -bpa_id_abbrev_re = re.compile(r'^(\d+)$') +ands_id_re = re.compile(r'^102\.100\.100[/\.](\d+)$') +ands_id_abbrev_re = re.compile(r'^(\d+)$') # this format of BPA ID has been used in older projects (e.g. 
BASE) -bpa_id_abbrev_2_re = re.compile(r'^102\.100\.\.100[/\.](\d+)$') -# _ +ands_id_abbrev_2_re = re.compile(r'^102\.100\.\.100[/\.](\d+)$') +# _ sample_extraction_id_re = re.compile(r'^\d{4,6}_\d') @@ -57,10 +57,10 @@ def fix_sample_extraction_id(val): return val -def make_sample_extraction_id(extraction_id, bpa_id): +def make_sample_extraction_id(extraction_id, sample_id): # instructions from project manager: if no extraction_id in the spreadsheet, - # append _1 to the bpa_id_to_ckan_name - return extraction_id or (bpa_id.split('.')[-1] + "_1") + # append _1 to the sample_id_to_ckan_name + return extraction_id or (sample_id.split('.')[-1] + "_1") def fix_date_interval(val): @@ -96,7 +96,7 @@ def merge_pass_fail(row): raise Exception("more than one amplicon pass_fail column value: %s" % (vals)) -def extract_bpa_id(s, silent=False): +def extract_ands_id(s, silent=False): "parse a BPA ID, with or without the prefix, returning with the prefix" if isinstance(s, float): s = int(s) @@ -114,13 +114,13 @@ def extract_bpa_id(s, silent=False): # handle a sample extraction id tacked on the end with an underscore if '_' in s: s = s.rsplit('_', 1)[0] - m = bpa_id_re.match(s) + m = ands_id_re.match(s) if m: return BPA_PREFIX + m.groups()[0] - m = bpa_id_abbrev_re.match(s) + m = ands_id_abbrev_re.match(s) if m: return BPA_PREFIX + m.groups()[0] - m = bpa_id_abbrev_2_re.match(s) + m = ands_id_abbrev_2_re.match(s) if m: return BPA_PREFIX + m.groups()[0] if not silent: @@ -128,12 +128,12 @@ def extract_bpa_id(s, silent=False): return None -def extract_bpa_id_silent(s): - return extract_bpa_id(s, silent=True) +def extract_ands_id_silent(s): + return extract_ands_id(s, silent=True) -def short_bpa_id(s): - return extract_bpa_id(s).split('.')[-1] +def short_ands_id(s): + return extract_ands_id(s).split('.')[-1] def get_int(val, default=None): diff --git a/bpaingest/ncbi.py b/bpaingest/ncbi.py index 7a1eb1f3..2966922a 100644 --- a/bpaingest/ncbi.py +++ b/bpaingest/ncbi.py @@ -37,7 
+37,7 @@ def _read_2016_accessions(self): if not os.access(fname, os.R_OK): return {} _, biosample_rows = csv_to_named_tuple('BioSample', fname, mode='rU') - return dict((ingest_utils.extract_bpa_id(t.sample_name), t.accession.strip()) for t in biosample_rows) + return dict((ingest_utils.extract_sample_id(t.sample_name), t.accession.strip()) for t in biosample_rows) def _read_accessions(self): """ @@ -48,7 +48,7 @@ def _read_accessions(self): accessions = {} for fname in sample_objects: _, rows = csv_to_named_tuple('SRARow', fname, mode='rU', dialect='excel-tab') - accessions.update(dict((ingest_utils.extract_bpa_id(t.sample_name), t.accession) for t in rows)) + accessions.update(dict((ingest_utils.extract_sample_id(t.sample_name), t.accession) for t in rows)) return accessions def _read_ncbi_sra(self): @@ -79,12 +79,12 @@ def _read_2016_submitted(self): def sample_ids(self): return list(self.bpaid_biosample.keys()) - def get(self, bpa_id): + def get(self, sample_id): obj = { 'ncbi_bioproject_accession': self.bioproject_accession, } - if bpa_id in self.bpaid_biosample: - obj['ncbi_biosample_accession'] = self.bpaid_biosample[bpa_id] + if sample_id in self.bpaid_biosample: + obj['ncbi_biosample_accession'] = self.bpaid_biosample[sample_id] return obj def filename_metadata(self, filename): diff --git a/bpaingest/projects/amdb/contextual.py b/bpaingest/projects/amdb/contextual.py index 37e1d42e..3137ee75 100644 --- a/bpaingest/projects/amdb/contextual.py +++ b/bpaingest/projects/amdb/contextual.py @@ -207,7 +207,7 @@ class BASESampleContextual: metadata_urls = ['https://downloads-qcif.bioplatforms.com/bpa/base/metadata/contextual/2017-06-28/'] name = 'base-contextual' field_spec = [ - fld('bpa_id', 'sample_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled', coerce=ingest_utils.get_date_isoformat), fld('latitude', 'latitude', coerce=ingest_utils.get_clean_number), 
fld('longitude', 'longitude', coerce=ingest_utils.get_clean_number), @@ -283,10 +283,10 @@ def sample_ids(self): def filename_metadata(self, *args, **kwargs): return {} - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): @@ -312,17 +312,17 @@ def _package_metadata(self, rows): onotology_error_values = dict((t, set()) for t in ontology_cleanups) sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is None: continue - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta = {} + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: val = getattr(row, field) if field == 'latitude': if val and val > 0: - logger.warning("Positioned in northern hemisphere, inverting: %s / %s" % (row.bpa_id, val)) + logger.warning("Positioned in northern hemisphere, inverting: %s / %s" % (row.sample_id, val)) val *= -1 - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = val for cleanup_name, enforcer in ontology_cleanups.items(): try: @@ -359,7 +359,7 @@ class MarineMicrobesSampleContextual(object): name = 'mm-samplecontextual' field_specs = { 'Coastal water': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), @@ -388,7 +388,7 @@ class 
MarineMicrobesSampleContextual(object): fld('light_intensity', 'light intensity (lux)', coerce=ingest_utils.get_clean_number), ], 'Coral': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), @@ -407,7 +407,7 @@ class MarineMicrobesSampleContextual(object): fld('host_abundance', 'host abundance (individuals per m2)', coerce=ingest_utils.get_clean_number), ], 'Pelagic_Public': [ - fld('bpa_id', 'id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'id', coerce=ingest_utils.extract_ands_id), fld('organism', 'organism'), fld('tax_id', 'tax id', coerce=ingest_utils.get_clean_number), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), @@ -488,7 +488,7 @@ class MarineMicrobesSampleContextual(object): fld('zea', 'zea [mg/m3]', coerce=ingest_utils.get_clean_number), ], 'Seagrass': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), @@ -515,7 +515,7 @@ class MarineMicrobesSampleContextual(object): coerce=ingest_utils.get_clean_number), ], 'Seaweed': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal 
degrees)', coerce=ingest_utils.get_clean_number), @@ -538,7 +538,7 @@ class MarineMicrobesSampleContextual(object): fld('information', 'Information'), ], 'Sediment': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), @@ -555,7 +555,7 @@ class MarineMicrobesSampleContextual(object): fld('sedimentation_rate', 'sedimentation rate (g /(cm2 x y)r)', coerce=ingest_utils.get_clean_number), ], 'Sponge': [ - fld('bpa_id', 'bpa_id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa_id', coerce=ingest_utils.extract_ands_id), fld('date_sampled', 'date sampled (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('time_sampled', 'time sampled (hh:mm)', coerce=ingest_utils.get_time), fld('latitude', 'latitude (decimal degrees)', coerce=ingest_utils.get_clean_number), @@ -578,26 +578,26 @@ def __init__(self, path): def sample_ids(self): return list(self.sample_metadata.keys()) - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is None: continue - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta = {} + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: val = getattr(row, field) 
if field == 'latitude': if val and type(val) is float and val > 0: - logger.warning("Positioned in northern hemisphere, inverting: %s / %s" % (row.bpa_id, val)) + logger.warning("Positioned in northern hemisphere, inverting: %s / %s" % (row.sample_id, val)) val *= -1 - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = val return sample_metadata diff --git a/bpaingest/projects/amdb/ingest.py b/bpaingest/projects/amdb/ingest.py index 9f20d30d..b1246b18 100755 --- a/bpaingest/projects/amdb/ingest.py +++ b/bpaingest/projects/amdb/ingest.py @@ -9,7 +9,7 @@ from ...abstract import BaseMetadata from ...libs import ingest_utils from ...libs.excel_wrapper import make_field_definition as fld -from ...util import (apply_license, bpa_id_to_ckan_name, common_values, +from ...util import (apply_license, sample_id_to_ckan_name, common_values, make_logger, one) from .contextual import (BASENCBIContextual, BASESampleContextual, MarineMicrobesNCBIContextual, @@ -128,7 +128,7 @@ class BASEAmpliconsMetadata(BaseMetadata): # spreadsheet spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r".*sample unique id"), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r".*sample unique id"), coerce=ingest_utils.extract_ands_id), fld("sample_extraction_id", "Sample extraction ID", coerce=ingest_utils.fix_sample_extraction_id), fld("sequencing_facility", "Sequencing facility"), fld("target", "Target", coerce=lambda s: s.upper().strip()), @@ -187,10 +187,10 @@ def track_get(k): return None return getattr(track_meta, k) - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - sample_extraction_id = ingest_utils.make_sample_extraction_id(row.sample_extraction_id, bpa_id) + sample_extraction_id = ingest_utils.make_sample_extraction_id(row.sample_extraction_id, sample_id) base_fname = os.path.basename(fname) index_linkage = base_fname in self.index_linkage_spreadsheets base_amplicon_linkage = 
build_base_amplicon_linkage(index_linkage, flow_id, row.index) @@ -200,7 +200,7 @@ def track_get(k): note_extra = flow_id obj = {} amplicon = row.amplicon.upper() - name = bpa_id_to_ckan_name(sample_extraction_id, self.ckan_data_type + '-' + amplicon, base_amplicon_linkage) + name = sample_id_to_ckan_name(sample_extraction_id, self.ckan_data_type + '-' + amplicon, base_amplicon_linkage) archive_ingestion_date = ingest_utils.get_date_isoformat(track_get('date_of_transfer_to_archive')) obj.update({ @@ -208,7 +208,7 @@ def track_get(k): 'id': name, 'sample_type': 'soil', 'read_length': base_amplicon_read_length(amplicon), # hard-coded for now, on advice of AB at CSIRO - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'flow_id': flow_id, 'base_amplicon_linkage': base_amplicon_linkage, 'sample_extraction_id': sample_extraction_id, @@ -245,7 +245,7 @@ def track_get(k): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) ingest_utils.add_spatial_extra(obj) tag_names = ['amplicons', amplicon, obj['sample_type']] obj['tags'] = [{'name': t} for t in tag_names] @@ -260,14 +260,14 @@ def _get_resources(self): index_linkage = os.path.basename(md5_file) in self.index_linkage_md5s logger.info("Processing md5 file {}".format(md5_file)) for filename, md5, file_info in self.parse_md5file(md5_file): - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type for contextual_source in self.contextual_metadata: resource.update(contextual_source.filename_metadata(filename)) - sample_extraction_id = bpa_id.split('.')[-1] + '_' + file_info.get('extraction') + sample_extraction_id = sample_id.split('.')[-1] + '_' + file_info.get('extraction') xlsx_info = 
self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) resources.append(((sample_extraction_id, resource['amplicon'], build_base_amplicon_linkage(index_linkage, resource['flow_id'], resource['index'])), legacy_url, resource)) @@ -310,7 +310,7 @@ def _get_packages(self): packages = [] for (amplicon, flow_id), info in sorted(flow_id_ticket.items()): obj = {} - name = bpa_id_to_ckan_name('control', self.ckan_data_type + '-' + amplicon, flow_id).lower() + name = sample_id_to_ckan_name('control', self.ckan_data_type + '-' + amplicon, flow_id).lower() track_meta = self.track_meta.get(info['ticket']) def track_get(k): @@ -380,7 +380,7 @@ class BASEMetagenomicsMetadata(BaseMetadata): resource_linkage = ('sample_extraction_id', 'flow_id') spreadsheet = { 'fields': [ - fld('bpa_id', 'Soil sample unique ID', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'Soil sample unique ID', coerce=ingest_utils.extract_ands_id), fld('sample_extraction_id', 'Sample extraction ID', coerce=ingest_utils.fix_sample_extraction_id), fld('insert_size_range', 'Insert size range'), fld('library_construction_protocol', 'Library construction protocol'), @@ -421,7 +421,7 @@ def __init__(self, metadata_path, contextual_metadata=None, metadata_info=None): self.metadata_info = metadata_info self.track_meta = BASETrackMetadata() - def assemble_obj(self, bpa_id, sample_extraction_id, flow_id, row, track_meta): + def assemble_obj(self, sample_id, sample_extraction_id, flow_id, row, track_meta): def track_get(k): if track_meta is None: return None @@ -435,14 +435,14 @@ def row_get(k, v_fn=None): res = v_fn(res) return res - name = bpa_id_to_ckan_name(sample_extraction_id, self.ckan_data_type, flow_id) + name = sample_id_to_ckan_name(sample_extraction_id, self.ckan_data_type, flow_id) archive_ingestion_date = ingest_utils.get_date_isoformat(track_get('date_of_transfer_to_archive')) obj = { 'name': name, 'sample_type': 'soil', 'id': name, - 'bpa_id': 
bpa_id, + 'sample_id': sample_id, 'flow_id': flow_id, 'read_length': '150bp', # hard-coded for now, on advice of AB at CSIRO 'sample_extraction_id': sample_extraction_id, @@ -468,7 +468,7 @@ def row_get(k, v_fn=None): 'private': True, } for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) ingest_utils.add_spatial_extra(obj) tag_names = ['metagenomics', obj['sample_type']] obj['tags'] = [{'name': t} for t in tag_names] @@ -503,11 +503,11 @@ def __init__(self): # missing metadata (see note above) for sample_extraction_id, flow_id in self.missing_packages: - bpa_id = ingest_utils.extract_bpa_id(sample_extraction_id.split('_')[0]) - sample_extraction_id = ingest_utils.make_sample_extraction_id(sample_extraction_id, bpa_id) + sample_id = ingest_utils.extract_ands_id(sample_extraction_id.split('_')[0]) + sample_extraction_id = ingest_utils.make_sample_extraction_id(sample_extraction_id, sample_id) md5_file = one(glob(self.path + '/*%s*.md5' % (flow_id))) xlsx_info = self.metadata_info[os.path.basename(md5_file)] - packages.append(self.assemble_obj(bpa_id, sample_extraction_id, flow_id, FakePilotRow(xlsx_info), FakePilotTrackMeta())) + packages.append(self.assemble_obj(sample_id, sample_extraction_id, flow_id, FakePilotRow(xlsx_info), FakePilotTrackMeta())) # the generated package IDs will have duplicates, due to data issues in the pilot data # we simply skip over the duplicates, which don't have any significant data differences @@ -529,11 +529,11 @@ def __init__(self): flow_id = get_flow_id(fname) if flow_id is None: raise Exception("can't determine flow_id for %s / %s" % (fname, repr(row))) - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - sample_extraction_id = ingest_utils.make_sample_extraction_id(row.sample_extraction_id, bpa_id) - new_obj = self.assemble_obj(bpa_id, sample_extraction_id, flow_id, row, track_meta) + 
sample_extraction_id = ingest_utils.make_sample_extraction_id(row.sample_extraction_id, sample_id) + new_obj = self.assemble_obj(sample_id, sample_extraction_id, flow_id, row, track_meta) if new_obj['id'] in generated_packages: logger.debug('skipped attempt to generate duplicate package: %s' % new_obj['id']) continue @@ -548,14 +548,14 @@ def _get_resources(self): for md5_file in glob(self.path + '/*.md5'): logger.info("Processing md5 file {}".format(md5_file)) for filename, md5, file_info in self.parse_md5file(md5_file): - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type for contextual_source in self.contextual_metadata: resource.update(contextual_source.filename_metadata(filename)) - sample_extraction_id = bpa_id.split('.')[-1] + '_' + file_info.get('extraction') + sample_extraction_id = sample_id.split('.')[-1] + '_' + file_info.get('extraction') xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) resources.append(((sample_extraction_id, resource['flow_id']), legacy_url, resource)) @@ -607,7 +607,7 @@ def _read_md5s(self): @classmethod def id_tpl_to_site_ids(cls, id_tpl): - return ', '.join([ingest_utils.extract_bpa_id(t) for t in id_tpl]) + return ', '.join([ingest_utils.extract_ands_id(t) for t in id_tpl]) def _get_packages(self): logger.info("Ingesting BPA BASE Images metadata from {0}".format(self.path)) @@ -615,13 +615,13 @@ def _get_packages(self): for id_tpl in sorted(self.id_to_resources): info = self.id_to_resources[id_tpl] obj = {} - name = bpa_id_to_ckan_name('%s-%s' % id_tpl, self.ckan_data_type).lower() + name = sample_id_to_ckan_name('%s-%s' % id_tpl, self.ckan_data_type).lower() # find the common contextual metadata for the site IDs context = [] for abbrev in id_tpl: fragment 
= {} for contextual_source in self.contextual_metadata: - fragment.update(contextual_source.get(ingest_utils.extract_bpa_id(abbrev))) + fragment.update(contextual_source.get(ingest_utils.extract_ands_id(abbrev))) context.append(fragment) obj.update(common_values(context)) obj.update({ @@ -726,10 +726,10 @@ class BaseMarineMicrobesAmpliconsMetadata(BaseMarineMicrobesMetadata): omics = 'genomics' contextual_classes = marine_common_context metadata_patterns = [r'^.*\.md5', r'^.*_metadata.*.*\.xlsx'] - resource_linkage = ('bpa_id', 'mm_amplicon_linkage') + resource_linkage = ('sample_id', 'mm_amplicon_linkage') spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id), fld("sample_extraction_id", "Sample extraction ID"), fld("target", "Target"), fld("dilution_used", "Dilution used", coerce=ingest_utils.fix_date_interval), @@ -786,26 +786,26 @@ def get_flow_id_from_comment(comment): # the GOSHIP data has flowcells in the comments field use_flowid_from_comment = base_fname in self.flowcell_comment_spreadsheets for row in BaseMarineMicrobesAmpliconsMetadata.parse_spreadsheet(fname, self.metadata_info): - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue if use_flowid_from_comment: flow_id = get_flow_id_from_comment(row.comments) - track_meta = self.track_meta.get(bpa_id) + track_meta = self.track_meta.get(sample_id) google_track_meta = self.google_track_meta.get(row.ticket) obj = self.extract_bpam_metadata(track_meta) index = index_from_comment([row.comments, row.sample_name_on_sample_sheet]) mm_amplicon_linkage = build_mm_amplicon_linkage(use_index_linkage, flow_id, index) - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type + '-' + self.amplicon.lower(), mm_amplicon_linkage) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], 
self.ckan_data_type + '-' + self.amplicon.lower(), mm_amplicon_linkage) archive_ingestion_date = ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'flow_id': flow_id, 'mm_amplicon_linkage': mm_amplicon_linkage, - 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, bpa_id), + 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, sample_id), 'read_length': mm_amplicon_read_length(self.amplicon), 'target': row.target, 'pass_fail': ingest_utils.merge_pass_fail(row), @@ -813,8 +813,8 @@ def get_flow_id_from_comment(comment): 'reads': row.reads, 'analysis_software_version': row.analysis_software_version, 'amplicon': self.amplicon, - 'notes': 'Marine Microbes Amplicons %s %s %s' % (self.amplicon, bpa_id, flow_id), - 'title': 'Marine Microbes Amplicons %s %s %s' % (self.amplicon, bpa_id, flow_id), + 'notes': 'Marine Microbes Amplicons %s %s %s' % (self.amplicon, sample_id, flow_id), + 'title': 'Marine Microbes Amplicons %s %s %s' % (self.amplicon, sample_id, flow_id), 'omics': 'Genomics', 'analytical_platform': 'MiSeq', 'date_of_transfer': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer), @@ -833,7 +833,7 @@ def get_flow_id_from_comment(comment): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) ingest_utils.add_spatial_extra(obj) tag_names = ['amplicons', self.amplicon] if obj.get('sample_type'): @@ -855,10 +855,10 @@ def _get_resources(self): resource['resource_type'] = self.ckan_data_type for contextual_source in self.contextual_metadata: resource.update(contextual_source.filename_metadata(filename)) - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = 
self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id, build_mm_amplicon_linkage(use_index_linkage, resource['flow_id'], resource['index'])), legacy_url, resource)) + resources.append(((sample_id, build_mm_amplicon_linkage(use_index_linkage, resource['flow_id'], resource['index'])), legacy_url, resource)) return resources @@ -931,7 +931,7 @@ def _get_packages(self): packages = [] for flow_id, info in sorted(flow_id_ticket.items()): obj = {} - name = bpa_id_to_ckan_name('control', self.ckan_data_type + '-' + self.amplicon, flow_id).lower() + name = sample_id_to_ckan_name('control', self.ckan_data_type + '-' + self.amplicon, flow_id).lower() google_track_meta = self.google_track_meta.get(info['ticket']) archive_ingestion_date = ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive) @@ -1023,7 +1023,7 @@ class MarineMicrobesMetagenomicsMetadata(BaseMarineMicrobesMetadata): tracker_filename = 'MetagenomicsTrack' spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id), fld("sample_extraction_id", "Sample extraction ID"), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), @@ -1052,21 +1052,21 @@ def _get_packages(self): for fname in unique_spreadsheets(glob(self.path + '/*.xlsx')): logger.info("Processing Marine Microbes metadata file {0}".format(os.path.basename(fname))) for row in MarineMicrobesMetagenomicsMetadata.parse_spreadsheet(fname, self.metadata_info): - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - track_meta = self.track_meta.get(bpa_id) + track_meta = self.track_meta.get(sample_id) google_track_meta = self.google_track_meta.get(row.ticket) obj = self.extract_bpam_metadata(track_meta) - 
name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) archive_ingestion_date = ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Marine Microbes Metagenomics %s' % (bpa_id), - 'title': 'Marine Microbes Metagenomics %s' % (bpa_id), + 'sample_id': sample_id, + 'notes': 'Marine Microbes Metagenomics %s' % (sample_id), + 'title': 'Marine Microbes Metagenomics %s' % (sample_id), 'omics': 'metagenomics', 'analytical_platform': 'HiSeq', 'read_length': '250bp', @@ -1079,7 +1079,7 @@ def _get_packages(self): 'archive_ingestion_date': archive_ingestion_date, 'license_id': apply_license(archive_ingestion_date), 'dataset_url': google_track_meta.download, - 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, bpa_id), + 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, sample_id), 'insert_size_range': row.insert_size_range, 'library_construction_protocol': row.library_construction_protocol, 'sequencer': row.sequencer, @@ -1090,7 +1090,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) ingest_utils.add_spatial_extra(obj) tag_names = ['metagenomics', 'raw'] if obj.get('sample_type'): @@ -1111,10 +1111,10 @@ def _get_resources(self): resource['resource_type'] = self.ckan_data_type for contextual_source in self.contextual_metadata: resource.update(contextual_source.filename_metadata(filename)) - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, 
resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -1132,7 +1132,7 @@ class MarineMicrobesMetatranscriptomeMetadata(BaseMarineMicrobesMetadata): tracker_filename = 'MetatranscriptomeTrack' spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id), fld("sample_extraction_id", "Sample extraction ID"), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), @@ -1159,28 +1159,28 @@ def _get_packages(self): logger.info("Ingesting Marine Microbes Transcriptomics metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisely match, and sample_id is the primary key all_rows = set() for fname in unique_spreadsheets(glob(self.path + '/*.xlsx')): logger.info("Processing Marine Microbes Transcriptomics metadata file {0}".format(os.path.basename(fname))) for row in MarineMicrobesMetatranscriptomeMetadata.parse_spreadsheet(fname, self.metadata_info): all_rows.add(row) for row in all_rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - track_meta = self.track_meta.get(bpa_id) + track_meta = self.track_meta.get(sample_id) google_track_meta = self.google_track_meta.get(row.ticket) obj = self.extract_bpam_metadata(track_meta) - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) archive_ingestion_date = ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Marine Microbes Metatranscriptome 
%s' % (bpa_id), - 'title': 'Marine Microbes Metatranscriptome %s' % (bpa_id), + 'sample_id': sample_id, + 'notes': 'Marine Microbes Metatranscriptome %s' % (sample_id), + 'title': 'Marine Microbes Metatranscriptome %s' % (sample_id), 'omics': 'metatranscriptomics', 'analytical_platform': 'HiSeq', 'read_length': '250bp', # to be confirmed by Jason Koval @@ -1193,7 +1193,7 @@ def _get_packages(self): 'archive_ingestion_date': archive_ingestion_date, 'license_id': apply_license(archive_ingestion_date), 'dataset_url': google_track_meta.download, - 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, bpa_id), + 'sample_extraction_id': ingest_utils.make_sample_extraction_id(row.sample_extraction_id, sample_id), 'insert_size_range': row.insert_size_range, 'library_construction_protocol': row.library_construction_protocol, 'sequencer': row.sequencer, @@ -1204,7 +1204,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) ingest_utils.add_spatial_extra(obj) tag_names = ['metatranscriptome', 'raw'] if obj.get('sample_type'): @@ -1225,8 +1225,8 @@ def _get_resources(self): resource['resource_type'] = self.ckan_data_type for contextual_source in self.contextual_metadata: resource.update(contextual_source.filename_metadata(filename)) - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources diff --git a/bpaingest/projects/amdb/tracking.py b/bpaingest/projects/amdb/tracking.py index ff6f56c6..a55c734d 100644 --- a/bpaingest/projects/amdb/tracking.py +++ b/bpaingest/projects/amdb/tracking.py @@ -17,13 +17,13 
@@ def __init__(self, name): def read_track_csv(self, fname): header, rows = csv_to_named_tuple('MarineMicrobesTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) - def get(self, bpa_id): + def get(self, sample_id): data = {} - if bpa_id not in self.track_meta: - logger.debug("No %s metadata for %s" % (type(self).__name__, bpa_id)) + if sample_id not in self.track_meta: + logger.debug("No %s metadata for %s" % (type(self).__name__, sample_id)) data = { 'sample_type': '', 'costal_id': '', @@ -38,7 +38,7 @@ def get(self, bpa_id): } return data - track_meta = self.track_meta[bpa_id] + track_meta = self.track_meta[sample_id] data = { 'sample_type': track_meta.sample_type, 'costal_id': track_meta.costal_id.strip(), diff --git a/bpaingest/projects/gbr/files.py b/bpaingest/projects/gbr/files.py index 7840185e..28ed5593 100755 --- a/bpaingest/projects/gbr/files.py +++ b/bpaingest/projects/gbr/files.py @@ -6,7 +6,7 @@ logger = make_logger(__name__) AMPLICON_FILE_PATTERN = """ - (?P\d{4,6})_ + (?P\d{4,6})_ GBR_ (?PAGRF|UNSW)_ (?P16S|18S|A16S|ITS)_ @@ -19,7 +19,7 @@ PACBIO_FILE_PATTERN = """ - ^(?P\d{4,6})_ + ^(?P\d{4,6})_ GBR_ (?PAGRF|UNSW)_ (?Pm\d+_\d+)_ @@ -31,7 +31,7 @@ PACBIO_FILE_PATTERN2 = """ - ^(?P\d{4,6})_ + ^(?P\d{4,6})_ GBR_ (?PAGRF|UNSW)_ PAC_ diff --git a/bpaingest/projects/gbr/ingest.py b/bpaingest/projects/gbr/ingest.py index 1abd6e3c..de41507e 100755 --- a/bpaingest/projects/gbr/ingest.py +++ b/bpaingest/projects/gbr/ingest.py @@ -5,7 +5,7 @@ from ...libs.excel_wrapper import make_field_definition as fld from unipath import Path from glob import glob -from ...util import make_logger, bpa_id_to_ckan_name +from ...util import make_logger, sample_id_to_ckan_name from ...libs import ingest_utils from urllib.parse import urljoin from ...abstract import BaseMetadata @@ -42,10 +42,10 @@ class GbrPacbioMetadata(BaseMetadata): omics = 
'genomics' technology = 'pacbio' auth = ("bpa", "gbr") - resource_linkage = ('ticket', 'bpa_id', 'pacbio_linkage') + resource_linkage = ('ticket', 'sample_id', 'pacbio_linkage') spreadsheet = { 'fields': [ - fld('bpa_id', 'Sample unique ID', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'Sample unique ID', coerce=ingest_utils.extract_ands_id), fld('sequencing_facility', 'Sequencing facility'), fld('index', 'index'), fld('pacbio', 'Library'), @@ -80,22 +80,22 @@ def _get_packages(self): logger.info("Processing Pacbio metadata file {0}".format(fname)) for row in self.parse_spreadsheet(fname, self.metadata_info): xlsx_info = self.metadata_info[os.path.basename(fname)] - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue pacbio_linkage = make_pacbio_linkage(row.flow_cell_id, row.run_number) - name = bpa_id_to_ckan_name(ingest_utils.short_bpa_id(bpa_id), self.ckan_data_type, pacbio_linkage) + name = sample_id_to_ckan_name(ingest_utils.short_ands_id(sample_id), self.ckan_data_type, pacbio_linkage) obj = { 'name': name, 'id': name, - 'title': 'Pacbio {} {}'.format(bpa_id, row.flow_cell_id), - 'notes': 'Pacbio {} {}'.format(bpa_id, row.flow_cell_id), + 'title': 'Pacbio {} {}'.format(sample_id, row.flow_cell_id), + 'notes': 'Pacbio {} {}'.format(sample_id, row.flow_cell_id), 'tags': [{'name': 'Pacbio'}], 'type': GbrPacbioMetadata.ckan_data_type, 'private': True, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'sequencing_facility': row.sequencing_facility, 'ticket': xlsx_info['ticket'], 'run_number': row.run_number, @@ -121,11 +121,11 @@ def _get_resources(self): resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename - bpa_id = ingest_utils.extract_bpa_id(file_info['bpa_id']) + sample_id = ingest_utils.extract_ands_id(file_info['sample_id']) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) pacbio_linkage = 
make_pacbio_linkage(file_info['flow_cell_id'], file_info['run_number']) - resources.append(((xlsx_info['ticket'], bpa_id, pacbio_linkage), legacy_url, resource)) + resources.append(((xlsx_info['ticket'], sample_id, pacbio_linkage), legacy_url, resource)) return resources @@ -137,11 +137,11 @@ class GbrAmpliconsMetadata(BaseMetadata): omics = 'genomics' technology = 'amplicons' auth = ("bpa", "gbr") - resource_linkage = ('bpa_id', 'amplicon', 'index') + resource_linkage = ('sample_id', 'amplicon', 'index') extract_index_re = re.compile('^.*_([GATC]{8}_[GATC]{8})$') spreadsheet = { 'fields': [ - fld('bpa_id', 'Sample unique ID', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'Sample unique ID', coerce=ingest_utils.extract_ands_id), fld('sample_extraction_id', 'Sample extraction ID', coerce=ingest_utils.fix_sample_extraction_id), fld('sequencing_facility', 'Sequencing facility'), fld('target_range', 'Target Range'), @@ -181,18 +181,18 @@ def _get_packages(self): for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells Transcriptomics metadata file {0}".format(fname)) for row in self.parse_spreadsheet(fname, self.metadata_info): - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue index = self.extract_index_re.match(row.name).groups()[0].upper() amplicon = row.amplicon.upper() - name = bpa_id_to_ckan_name(bpa_id, self.ckan_data_type + '-' + amplicon, index) + name = sample_id_to_ckan_name(sample_id, self.ckan_data_type + '-' + amplicon, index) obj = { 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'title': 'Amplicon {} {}'.format(bpa_id, index), - 'notes': 'Amplicon {} {}'.format(bpa_id, index), + 'sample_id': sample_id, + 'title': 'Amplicon {} {}'.format(sample_id, index), + 'notes': 'Amplicon {} {}'.format(sample_id, index), 'tags': [{'name': 'Amplicon'}], 'type': GbrAmpliconsMetadata.ckan_data_type, 'private': True, @@ -227,8 +227,8 @@ def _get_resources(self): resource = 
file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename - bpa_id = ingest_utils.extract_bpa_id(file_info['bpa_id']) + sample_id = ingest_utils.extract_ands_id(file_info['sample_id']) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id, file_info['amplicon'], file_info['index']), legacy_url, resource)) + resources.append(((sample_id, file_info['amplicon'], file_info['index']), legacy_url, resource)) return resources diff --git a/bpaingest/projects/omg/contextual.py b/bpaingest/projects/omg/contextual.py index 185a1169..8c2f4eca 100644 --- a/bpaingest/projects/omg/contextual.py +++ b/bpaingest/projects/omg/contextual.py @@ -23,7 +23,7 @@ def get(self, bpa_sample_id, bpa_library_id): def _read_metadata(self, fname): field_spec = [ - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('voucher_id', 'voucher_id'), fld('tissue_number', 'tissue_number'), fld('institution_name', 'institution_name'), @@ -94,7 +94,7 @@ def _read_metadata(self, fname): if not row.bpa_sample_id: continue assert(row.bpa_sample_id not in sample_metadata) - bpa_sample_id = ingest_utils.extract_bpa_id(row.bpa_sample_id) + bpa_sample_id = ingest_utils.extract_ands_id(row.bpa_sample_id) sample_metadata[bpa_sample_id] = row_meta = {} for field in row._fields: value = getattr(row, field) @@ -120,8 +120,8 @@ def get(self, bpa_sample_id, bpa_library_id): def _read_metadata(self, fname): field_spec = [ - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('library_type', 'library_type'), fld('library_prep_date', 
'library_prep_date', coerce=ingest_utils.get_date_isoformat), fld('library_prepared_by', 'library_prepared_by'), @@ -155,7 +155,7 @@ def _read_metadata(self, fname): if not row.bpa_library_id: continue assert(row.bpa_library_id not in library_metadata) - bpa_library_id = ingest_utils.extract_bpa_id(row.bpa_library_id) + bpa_library_id = ingest_utils.extract_ands_id(row.bpa_library_id) library_metadata[bpa_library_id] = row_meta = {} for field in row._fields: value = getattr(row, field) diff --git a/bpaingest/projects/omg/ingest.py b/bpaingest/projects/omg/ingest.py index 2461e0d5..7524a1fc 100755 --- a/bpaingest/projects/omg/ingest.py +++ b/bpaingest/projects/omg/ingest.py @@ -5,7 +5,7 @@ from ...abstract import BaseMetadata -from ...util import make_logger, bpa_id_to_ckan_name, common_values +from ...util import make_logger, sample_id_to_ckan_name, common_values from urllib.parse import urljoin from glob import glob @@ -56,9 +56,9 @@ class OMG10XRawIlluminaMetadata(OMGBaseMetadata): resource_linkage = ('archive_name',) spreadsheet = { 'fields': [ - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('facility_sample_id', 'facility_sample_id'), fld('library_type', 'library_type'), fld('library_prep_date', 'library_prep_date'), @@ -141,7 +141,7 @@ def make_row_metadata(row): packages = [] for (flow_id, fname), rows in fname_rows.items(): - name = bpa_id_to_ckan_name(fname, self.ckan_data_type, flow_id) + name = sample_id_to_ckan_name(fname, self.ckan_data_type, flow_id) assert(fname not in self.file_package) self.file_package[fname] = fname 
row_metadata = [make_row_metadata(row) for row in rows] @@ -236,9 +236,9 @@ class OMG10XRawMetadata(OMGBaseMetadata): resource_linkage = ('bpa_sample_id', 'flow_id') spreadsheet = { 'fields': [ - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('facility_sample_id', 'facility_sample_id'), fld('library_type', 'library_type'), fld('library_prep_date', 'library_prep_date'), @@ -308,7 +308,7 @@ def _get_packages(self): flow_id = obj['flow_id'] self.flow_lookup[obj['ticket']] = flow_id - name = bpa_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) + name = sample_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) context = {} for contextual_source in self.contextual_metadata: context.update(contextual_source.get(bpa_sample_id, bpa_library_id)) @@ -355,7 +355,7 @@ def _get_resources(self): xlsx_info = self.metadata_info[os.path.basename(md5_file)] ticket = xlsx_info['ticket'] flow_id = self.flow_lookup[ticket] - bpa_sample_id = ingest_utils.extract_bpa_id(file_info['bpa_sample_id']) + bpa_sample_id = ingest_utils.extract_ands_id(file_info['bpa_sample_id']) resource = file_info.copy() resource['md5'] = resource['id'] = md5 @@ -380,9 +380,9 @@ class OMG10XProcessedIlluminaMetadata(OMGBaseMetadata): resource_linkage = ('bpa_sample_id', 'flow_id') spreadsheet = { 'fields': [ - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 
'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('facility_sample_id', 'facility_sample_id'), fld('library_type', 'library_type'), fld('library_prep_date', 'library_prep_date'), @@ -461,7 +461,7 @@ def track_get(k): if bpa_sample_id is None: continue obj = {} - name = bpa_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) + name = sample_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) assert(row.file not in self.file_package) self.file_package[row.file] = bpa_sample_id, flow_id context = {} @@ -528,9 +528,9 @@ class OMGExonCaptureMetadata(OMGBaseMetadata): resource_linkage = ('bpa_library_id', 'flowcell_id', 'library_index_sequence') spreadsheet = { 'fields': [ - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('facility_sample_id', 'facility_sample_id'), fld('library_type', 'library_type'), fld('library_prep_date', 'library_prep_date', coerce=ingest_utils.get_date_isoformat), @@ -602,7 +602,7 @@ def track_get(k): if library_id is None: continue linkage = self.flow_cell_index_linkage(row.flowcell_id, row.library_index_sequence) - name = bpa_id_to_ckan_name(library_id, self.ckan_data_type, linkage) + name = sample_id_to_ckan_name(library_id, self.ckan_data_type, linkage) obj = row._asdict() context = {} for contextual_source in self.contextual_metadata: @@ -642,7 +642,7 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = 
filename resource['resource_type'] = self.ckan_data_type - library_id = ingest_utils.extract_bpa_id(resource['bpa_library_id']) + library_id = ingest_utils.extract_ands_id(resource['bpa_library_id']) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) resources.append(((library_id, resource['flow_cell_id'], resource['index']), legacy_url, resource)) @@ -664,9 +664,9 @@ class OMGGenomicsHiSeqMetadata(OMGBaseMetadata): resource_linkage = ('bpa_sample_id', 'flowcell_id') spreadsheet = { 'fields': [ - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('facility_sample_id', 'facility_sample_id'), fld('library_type', 'library_type'), fld('library_prep_date', 'library_prep_date'), @@ -750,7 +750,7 @@ def track_get(k): bpa_library_id = obj['bpa_library_id'] if bpa_sample_id is None: continue - name = bpa_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) + name = sample_id_to_ckan_name(bpa_sample_id, self.ckan_data_type, flow_id) context = {} for contextual_source in self.contextual_metadata: context.update(contextual_source.get(bpa_sample_id, bpa_library_id)) @@ -793,7 +793,7 @@ def _get_resources(self): xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) resources.append( - ((ingest_utils.extract_bpa_id(resource['bpa_sample_id']), resource['flow_cell_id']), legacy_url, resource)) + ((ingest_utils.extract_ands_id(resource['bpa_sample_id']), resource['flow_cell_id']), legacy_url, resource)) return resources @@ -820,9 +820,9 @@ 
class OMGGenomicsDDRADMetadata(OMGBaseMetadata): fld('genus', 'genus'), fld('species', 'species'), fld('voucher_id', 'voucher_id'), - fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'bpa_dataset_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_library_id', 'bpa_library_id', coerce=ingest_utils.extract_ands_id), + fld('bpa_sample_id', 'bpa_sample_id', coerce=ingest_utils.extract_ands_id), fld('plate_name', 'plate_name'), fld('plate_well', 'plate_well'), fld('facility_sample_id', 'facility_sample_id'), @@ -902,7 +902,7 @@ def track_get(k): if track_meta is None: return None return getattr(track_meta, k) - name = bpa_id_to_ckan_name(bpa_dataset_id, self.ckan_data_type, flowcell_id) + name = sample_id_to_ckan_name(bpa_dataset_id, self.ckan_data_type, flowcell_id) obj.update({ 'name': name, 'id': name, @@ -940,5 +940,5 @@ def _get_resources(self): xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) resources.append( - ((ingest_utils.extract_bpa_id(resource['bpa_dataset_id']), resource['flowcell_id']), legacy_url, resource)) + ((ingest_utils.extract_ands_id(resource['bpa_dataset_id']), resource['flowcell_id']), legacy_url, resource)) return resources diff --git a/bpaingest/projects/sepsis/contextual.py b/bpaingest/projects/sepsis/contextual.py index ebd02c24..75c6328c 100644 --- a/bpaingest/projects/sepsis/contextual.py +++ b/bpaingest/projects/sepsis/contextual.py @@ -80,7 +80,7 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id, submission_obj): + def get(self, sample_id, submission_obj): if 'taxon_or_organism' in submission_obj and 'strain_or_isolate' in submission_obj: tpl = 
(submission_obj['taxon_or_organism'], submission_obj['strain_or_isolate']) if tpl in self.sample_metadata: @@ -147,20 +147,20 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id, submission_obj): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id, submission_obj): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if not row.bpa_id: + if not row.sample_id: continue - if row.bpa_id in sample_metadata: - logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.bpa_id)) - sample_metadata[row.bpa_id] = row_meta = {} + if row.sample_id in sample_metadata: + logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.sample_id)) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: if field != 'taxon_or_organism' and field != 'strain_or_isolate': row_meta[field] = getattr(row, field) @@ -168,7 +168,7 @@ def _package_metadata(self, rows): def _read_metadata(self, metadata_path): field_spec = [ - fld('bpa_id', "BPA_sample_ID", coerce=ingest_utils.extract_bpa_id), + fld('sample_id', "BPA_sample_ID", coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', "Taxon_OR_organism"), fld('strain_or_isolate', "Strain_OR_isolate"), fld('serovar', "Serovar", coerce=int_or_comment), @@ -205,20 +205,20 @@ def __init__(self, path): for xlsx_path in glob(path + '/*.xlsx'): self.sample_metadata.update(self._package_metadata(self._read_metadata(xlsx_path))) - def get(self, bpa_id, submission_obj): - if bpa_id in self.sample_metadata: 
- return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id, submission_obj): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if not row.bpa_id: + if not row.sample_id: continue - if row.bpa_id in sample_metadata: - logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.bpa_id)) - sample_metadata[row.bpa_id] = row_meta = {} + if row.sample_id in sample_metadata: + logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.sample_id)) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: if field != 'taxon_or_organism' and field != 'strain_or_isolate': row_meta[field] = getattr(row, field) @@ -227,7 +227,7 @@ def _package_metadata(self, rows): def _read_metadata(self, metadata_path): field_spec = [ fld('sample_submission_date', 'sample submission date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name i.e. 5 digit bpa id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name i.e. 
5 digit bpa id', coerce=ingest_utils.extract_ands_id), fld('sample_type', 'sample type'), fld('volume_ul', 'volume (ul)'), fld('concentration_ng_per_ul', 'concentration (ng/ul)'), @@ -274,20 +274,20 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata.update(self._package_metadata(self._read_metadata(xlsx_path))) - def get(self, bpa_id, submission_obj): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id, submission_obj): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if not row.bpa_id: + if not row.sample_id: continue - if row.bpa_id in sample_metadata: - logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.bpa_id)) - sample_metadata[row.bpa_id] = row_meta = {} + if row.sample_id in sample_metadata: + logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.sample_id)) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: if field != 'taxon_or_organism' and field != 'strain_or_isolate': row_meta[field] = getattr(row, field) @@ -296,7 +296,7 @@ def _package_metadata(self, rows): def _read_metadata(self, metadata_path): field_spec = [ fld('sample_submission_date', 'sample submission date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name i.e. 5 digit bpa id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name i.e. 
5 digit bpa id', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), fld('serovar', 'serovar', coerce=int_or_comment), @@ -338,22 +338,22 @@ def __init__(self, path, analytical_platform): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id, submission_obj): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(bpa_id))) + def get(self, sample_id, submission_obj): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, repr(sample_id))) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if not row.bpa_id: + if not row.sample_id: continue if row.analytical_platform.lower() != self.analytical_platform.lower(): continue - if row.bpa_id in sample_metadata: - logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.bpa_id)) - sample_metadata[row.bpa_id] = row_meta = {} + if row.sample_id in sample_metadata: + logger.warning("{}: duplicate sample metadata row for {}".format(self.__class__.__name__, row.sample_id)) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: if field != 'taxon_or_organism' and field != 'strain_or_isolate': row_meta[field] = getattr(row, field) @@ -362,7 +362,7 @@ def _package_metadata(self, rows): def _read_metadata(self, metadata_path): field_spec = [ fld('sample_submission_date', 'Sample submission date (YYYY-MM-DD)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'Sample name i.e. 5 digit BPA ID', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'Sample name i.e. 
5 digit BPA ID', coerce=ingest_utils.extract_ands_id), fld('sample_type', 'Sample type'), fld('protein_yield_total_ug', 'protein yield - total (g)'), # it really is ug, just unicode stripping drops the 'u' fld('protein_yield_facility_ug', 'Protein Yield / Facility (g)'), diff --git a/bpaingest/projects/sepsis/ingest.py b/bpaingest/projects/sepsis/ingest.py index bddebdd2..a149c71d 100755 --- a/bpaingest/projects/sepsis/ingest.py +++ b/bpaingest/projects/sepsis/ingest.py @@ -6,7 +6,7 @@ from hashlib import md5 as md5hash from ...libs import ingest_utils -from ...util import make_logger, bpa_id_to_ckan_name, csv_to_named_tuple, common_values, clean_tag_name +from ...util import make_logger, sample_id_to_ckan_name, csv_to_named_tuple, common_values, clean_tag_name from ...abstract import BaseMetadata from ...libs.excel_wrapper import make_field_definition as fld from glob import glob @@ -35,15 +35,15 @@ def fix_version(s): return str(s) -def parse_pooled_bpa_id(s): +def parse_pooled_sample_id(s): if isinstance(s, str) and ',' in s: - return tuple([ingest_utils.extract_bpa_id(t.strip()) for t in s.split(',')]) + return tuple([ingest_utils.extract_ands_id(t.strip()) for t in s.split(',')]) else: - return ingest_utils.extract_bpa_id(s) + return ingest_utils.extract_ands_id(s) -def make_bpa_id_list(s): - return tuple([ingest_utils.extract_bpa_id(t.strip()) for t in s.split(',')]) +def make_sample_id_list(s): + return tuple([ingest_utils.extract_ands_id(t.strip()) for t in s.split(',')]) expanded_names = { @@ -52,6 +52,7 @@ def make_bpa_id_list(s): 'proteindatabase': 'protein database' } + def expanded_tag_name(tag_name): ''' This function will return unique tag name. 
@@ -119,7 +120,7 @@ class SepsisGenomicsMiseqMetadata(BaseSepsisMetadata): technology = 'miseq' spreadsheet = { 'fields': [ - fld("bpa_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_ands_id), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), fld("sequencer", "Sequencer"), @@ -152,17 +153,17 @@ def _get_packages(self): ticket = xlsx_info['ticket'] google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - track_meta = self.bpam_track_meta.get(bpa_id) + sample_id = row.sample_id + track_meta = self.bpam_track_meta.get(sample_id) obj = track_meta.copy() - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'archive_ingestion_date': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), 'notes': 'ARP Genomics Miseq Raw Data: %s %s %s Replicate %s' % (track_meta['taxon_or_organism'], track_meta['strain_or_isolate'], obj['growth_media'], obj['replicate']), - 'title': 'Sepsis Genomics Miseq %s' % (bpa_id.split('.')[-1]), + 'title': 'Sepsis Genomics Miseq %s' % (sample_id.split('.')[-1]), 'ticket': row.ticket, 'facility': row.facility_code.upper(), 'insert_size_range': row.insert_size_range, @@ -174,7 +175,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, track_meta)) + obj.update(contextual_source.get(sample_id, track_meta)) tag_names = sepsis_contextual_tags(self, obj) obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -192,10 +193,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename 
resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -210,7 +211,7 @@ class SepsisGenomicsPacbioMetadata(BaseSepsisMetadata): technology = 'pacbio' spreadsheet = { 'fields': [ - fld("bpa_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_ands_id), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), fld("sequencer", "Sequencer"), @@ -238,7 +239,7 @@ def __init__(self, metadata_path, contextual_metadata=None, metadata_info=None): def read_track_csv(self, fname): header, rows = csv_to_named_tuple('SepsisGenomicsPacbioTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def _get_packages(self): logger.info("Ingesting Sepsis Genomics Pacbio metadata from {0}".format(self.path)) @@ -250,16 +251,16 @@ def _get_packages(self): ticket = xlsx_info['ticket'] google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - track_meta = self.bpam_track_meta.get(bpa_id) + sample_id = row.sample_id + track_meta = self.bpam_track_meta.get(sample_id) obj = track_meta.copy() - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'archive_ingestion_date': 
ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), - 'title': 'Sepsis Genomics Pacbio %s' % (bpa_id.split('.')[-1]), + 'title': 'Sepsis Genomics Pacbio %s' % (sample_id.split('.')[-1]), 'ticket': row.ticket, 'facility': row.facility_code.upper(), 'notes': 'ARP Genomics Pacbio Raw Data: %s %s %s Replicate %s' % (track_meta['taxon_or_organism'], track_meta['strain_or_isolate'], obj['growth_media'], obj['replicate']), @@ -275,7 +276,7 @@ def _get_packages(self): 'data_generated': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, track_meta)) + obj.update(contextual_source.get(sample_id, track_meta)) tag_names = sepsis_contextual_tags(self, obj) obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -291,10 +292,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -309,7 +310,7 @@ class SepsisTranscriptomicsHiseqMetadata(BaseSepsisMetadata): technology = 'hiseq' spreadsheet = { 'fields': [ - fld("bpa_id", "Antibiotic Resistant Pathogen sample unique ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "Antibiotic Resistant Pathogen sample unique ID", coerce=ingest_utils.extract_ands_id), fld("sample", "Sample (MGR code)"), fld("library_construction_protocol", "Library construction protocol"), fld("barcode_tag", "Barcode tag"), @@ -337,7 +338,7 @@ def read_track_csv(self, fname): if fname is None: return {} header, rows = csv_to_named_tuple('SepsisGenomicsHiseqTrack', fname) - return 
dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def _get_packages(self): logger.info("Ingesting Sepsis Transcriptomics Hiseq metadata from {0}".format(self.path)) @@ -347,7 +348,7 @@ def _get_packages(self): # for these to be combined together, into a single package, with two flow-cells. # Should be an uncommon case, only in AGRF data. - bpa_id_info = defaultdict(list) + sample_id_info = defaultdict(list) for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Sepsis Transcriptomics metadata file {0}".format(fname)) @@ -357,31 +358,31 @@ def _get_packages(self): google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - bpa_id_info[bpa_id].append([row, xlsx_info, google_track_meta]) + sample_id_info[sample_id].append([row, xlsx_info, google_track_meta]) # collate together the flow cell IDs - bpa_id_flowcells = defaultdict(set) + sample_id_flowcells = defaultdict(set) for md5_file in glob(self.path + '/*.md5'): for filename, md5, file_info in self.parse_md5file(md5_file): - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) - bpa_id_flowcells[bpa_id].add(file_info['flow_cell_id']) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) + sample_id_flowcells[sample_id].add(file_info['flow_cell_id']) - for bpa_id, info in bpa_id_info.items(): + for sample_id, info in sample_id_info.items(): tickets = ', '.join(sorted(set(xlsx_info['ticket'] for _, xlsx_info, _ in info))) archive_ingestion_dates = ', '.join( sorted(set(google_track_meta.date_of_transfer_to_archive for _, _, google_track_meta in info))) - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) - track_meta = self.bpam_track_meta.get(bpa_id) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) + track_meta = 
self.bpam_track_meta.get(sample_id) obj = track_meta.copy() obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'flow_cell_ids': ', '.join(sorted(bpa_id_flowcells[bpa_id])), - 'title': 'ARP Transcriptomics Hiseq %s' % (bpa_id.split('.')[-1]), + 'sample_id': sample_id, + 'flow_cell_ids': ', '.join(sorted(sample_id_flowcells[sample_id])), + 'title': 'ARP Transcriptomics Hiseq %s' % (sample_id.split('.')[-1]), 'archive_ingestion_dates': archive_ingestion_dates, 'ticket': tickets, 'facility': row.facility_code.upper(), @@ -396,7 +397,7 @@ def _get_packages(self): 'data_generated': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, track_meta)) + obj.update(contextual_source.get(sample_id, track_meta)) tag_names = sepsis_contextual_tags(self, obj) obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -414,10 +415,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -432,7 +433,7 @@ class SepsisMetabolomicsGCMSMetadata(BaseSepsisMetadata): technology = 'gcms' spreadsheet = { 'fields': [ - fld('bpa_id', 'bacterial sample unique id', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bacterial sample unique id', coerce=ingest_utils.extract_ands_id), fld('sample_fractionation_extract_solvent', 'sample fractionation / extraction solvent'), fld('gc_column_type', 'gc/column type'), fld('gradient_time_min_flow', 'gradient time (min) / flow'), @@ -461,7 +462,7 @@ def read_track_csv(self, fname): if fname is None: return {} header, rows = 
csv_to_named_tuple('SepsisMetabolomicsGCMSTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def _get_packages(self): packages = [] @@ -472,18 +473,18 @@ def _get_packages(self): ticket = xlsx_info['ticket'] google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - track_meta = self.bpam_track_meta.get(bpa_id) + track_meta = self.bpam_track_meta.get(sample_id) obj = track_meta.copy() - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'archive_ingestion_date': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), - 'title': 'ARP Metabolomics GCMS %s' % (bpa_id.split('.')[-1]), + 'title': 'ARP Metabolomics GCMS %s' % (sample_id.split('.')[-1]), 'ticket': row.ticket, 'facility': row.facility_code.upper(), 'notes': 'ARP Metabolomics GCMS Raw Data: %s %s %s Replicate %s' % (track_meta['taxon_or_organism'], track_meta['strain_or_isolate'], obj['growth_media'], obj['replicate']), @@ -498,7 +499,7 @@ def _get_packages(self): 'data_generated': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, track_meta)) + obj.update(contextual_source.get(sample_id, track_meta)) tag_names = sepsis_contextual_tags(self, obj) obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -514,10 +515,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = 
self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -532,7 +533,7 @@ class SepsisMetabolomicsLCMSMetadata(BaseSepsisMetadata): technology = 'lcms' spreadsheet = { 'fields': [ - fld("bpa_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_ands_id), fld("sample_fractionation_extract_solvent", "Sample fractionation / Extraction Solvent"), fld("lc_column_type", "LC/column type"), fld("gradient_time_min_flow", "Gradient time (min) / flow"), @@ -561,7 +562,7 @@ def read_track_csv(self, fname): if fname is None: return {} header, rows = csv_to_named_tuple('SepsisMetabolomicsLCMSTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def _get_packages(self): packages = [] @@ -572,18 +573,18 @@ def _get_packages(self): ticket = xlsx_info['ticket'] google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - track_meta = self.bpam_track_meta.get(bpa_id) + track_meta = self.bpam_track_meta.get(sample_id) obj = track_meta.copy() - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'archive_ingestion_date': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), - 'title': 'ARP Metabolomics LCMS %s' % (bpa_id.split('.')[-1]), + 'title': 'ARP Metabolomics LCMS %s' % (sample_id.split('.')[-1]), 'ticket': row.ticket, 'facility': row.facility_code.upper(), 'notes': 
'ARP Metabolomics LCMS Raw Data: %s %s %s Replicate %s' % (track_meta['taxon_or_organism'], track_meta['strain_or_isolate'], obj['growth_media'], obj['replicate']), @@ -598,7 +599,7 @@ def _get_packages(self): 'data_generated': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, track_meta)) + obj.update(contextual_source.get(sample_id, track_meta)) tag_names = sepsis_contextual_tags(self, obj) obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -614,10 +615,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -632,7 +633,7 @@ class SepsisProteomicsMS1QuantificationMetadata(BaseSepsisMetadata): technology = 'ms1quantification' spreadsheet = { 'fields': [ - fld("bpa_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "Bacterial sample unique ID", coerce=ingest_utils.extract_ands_id), fld("facility", "Facility"), fld("sample_fractionation_none_number", "Sample fractionation (none/number)"), fld("lc_column_type", "LC/column type"), @@ -663,7 +664,7 @@ def read_track_csv(self, fname): if fname is None: return {} header, rows = csv_to_named_tuple('SepsisProteomicsMS1QuantificationTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def _get_packages(self): packages = [] @@ -674,21 +675,21 @@ def _get_packages(self): ticket = xlsx_info['ticket'] google_track_meta = 
self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - bpam_track_meta = self.bpam_track_meta.get(bpa_id) + bpam_track_meta = self.bpam_track_meta.get(sample_id) if 'taxon_or_organism' not in bpam_track_meta: continue obj = bpam_track_meta.copy() - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, bpam_track_meta)) + obj.update(contextual_source.get(sample_id, bpam_track_meta)) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'title': 'ARP Proteomics MS1Quantification %s' % (bpa_id.split('.')[-1]), + 'sample_id': sample_id, + 'title': 'ARP Proteomics MS1Quantification %s' % (sample_id.split('.')[-1]), 'ticket': row.ticket, 'facility': row.facility_code.upper(), 'archive_ingestion_date': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), @@ -705,7 +706,7 @@ def _get_packages(self): 'data_generated': True, }) tag_names = sepsis_contextual_tags(self, obj) - obj['tags']=[{'name': expanded_tag_name(t)} for t in tag_names] + obj['tags'] = [{'name': expanded_tag_name(t)} for t in tag_names] packages.append(obj) return packages @@ -719,10 +720,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['resource_type'] = self.ckan_data_type - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -737,7 +738,7 @@ class SepsisProteomicsSwathMSBaseSepsisMetadata(BaseSepsisMetadata): 
technology = 'swathms' spreadsheet = { 'fields': [ - fld("bpa_id", "Bacterial sample unique ID", coerce=parse_pooled_bpa_id), + fld("sample_id", "Bacterial sample unique ID", coerce=parse_pooled_sample_id), fld("facility", "Facility"), fld("sample_fractionation_none_number", "Sample fractionation (none/number)"), fld("lc_column_type", "LC/column type"), @@ -765,13 +766,13 @@ def read_track_csv(self, fname): if fname is None: return {} header, rows = csv_to_named_tuple('SepsisProteomicsSwathMSTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) def get_spreadsheet_data(self): """ proteomics SWATH is a bit different, the spreadsheets might have dupes by `bpa id`, so some data in the sheet is per-file and some is per-ID. the only way to go from - filename back to the pool/bpa_id is via the spreadsheet, so we also need to build + filename back to the pool/sample_id is via the spreadsheet, so we also need to build that mapping """ package_data = {} @@ -783,24 +784,24 @@ def get_spreadsheet_data(self): ticket = xlsx_info['ticket'] google_track_meta = self.google_track_meta.get(ticket) for row in rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue contextual_meta = {} - # if `bpa_id` is a tuple, we've got a pooled sample - if type(bpa_id) is tuple: + # if `sample_id` is a tuple, we've got a pooled sample + if type(sample_id) is tuple: data_type = '2d' - printable_bpa_id = '_'.join([t.split('.')[-1] for t in sorted(bpa_id)]) - track_meta = common_values([self.bpam_track_meta.get(t) for t in bpa_id]) + printable_sample_id = '_'.join([t.split('.')[-1] for t in sorted(sample_id)]) + track_meta = common_values([self.bpam_track_meta.get(t) for t in sample_id]) for contextual_source in self.contextual_metadata: - contextual_meta.update(common_values([contextual_source.get(t, track_meta) for t in 
bpa_id])) + contextual_meta.update(common_values([contextual_source.get(t, track_meta) for t in sample_id])) else: data_type = '1d' - printable_bpa_id = bpa_id - track_meta = self.bpam_track_meta.get(bpa_id) + printable_sample_id = sample_id + track_meta = self.bpam_track_meta.get(sample_id) for contextual_source in self.contextual_metadata: - contextual_meta.update(contextual_source.get(bpa_id, track_meta)) - name = bpa_id_to_ckan_name(printable_bpa_id.split('.')[-1], self.ckan_data_type) + contextual_meta.update(contextual_source.get(sample_id, track_meta)) + name = sample_id_to_ckan_name(printable_sample_id.split('.')[-1], self.ckan_data_type) package_meta = { 'facility': row.facility_code.upper(), 'ticket': row.ticket, @@ -810,9 +811,9 @@ def get_spreadsheet_data(self): 'archive_ingestion_date': ingest_utils.get_date_isoformat(google_track_meta.date_of_transfer_to_archive), } package_meta.update(contextual_meta) - package_data[name] = (name, data_type, printable_bpa_id, track_meta, package_meta) + package_data[name] = (name, data_type, printable_sample_id, track_meta, package_meta) file_data[row.raw_file_name] = { - 'package_name': printable_bpa_id, + 'package_name': printable_sample_id, 'sample_fractionation_none_number': row.sample_fractionation_none_number, 'sample_on_column': row.sample_on_column, 'acquisition_mode_fragmentation': row.acquisition_mode_fragmentation, @@ -821,7 +822,7 @@ def get_spreadsheet_data(self): def get_swath_packages(self, data_type): packages = [] - for package_name, (name, package_data_type, printable_bpa_id, track_meta, submission_meta) in list(self.package_data.items()): + for package_name, (name, package_data_type, printable_sample_id, track_meta, submission_meta) in list(self.package_data.items()): if package_data_type != data_type: continue obj = track_meta.copy() @@ -829,22 +830,22 @@ def get_swath_packages(self, data_type): pool = '' if data_type == '1d': obj.update({ - 'bpa_id': printable_bpa_id, + 'sample_id': 
printable_sample_id, }) if data_type == '2d': pool = 'Pool ' obj.update({ - 'pool_bpa_ids': printable_bpa_id, + 'pool_sample_ids': printable_sample_id, }) # package won't get replicate number if datatype is 2d, because it's getting common values. see code above. if 'replicate' in obj: - replicate = ' Replicate %s'% obj['replicate'] + replicate = ' Replicate %s' % obj['replicate'] else: replicate = '' obj.update({ 'name': name, 'id': name, - 'title': 'ARP Proteomics SwathMS %s%s' % (pool, printable_bpa_id.split('.')[-1]), + 'title': 'ARP Proteomics SwathMS %s%s' % (pool, printable_sample_id.split('.')[-1]), 'notes': 'ARP Proteomics SwathMS %sRaw Data: %s %s %s%s' % (pool, track_meta['taxon_or_organism'], track_meta['strain_or_isolate'], obj['growth_media'], replicate), 'type': self.ckan_data_type, 'private': True, @@ -874,7 +875,7 @@ def get_swath_resources(self, data_type): resource.update(file_meta) resource['name'] = filename if data_type == '1d': - package_id = ingest_utils.extract_bpa_id(file_info.get('id')) + package_id = ingest_utils.extract_ands_id(file_info.get('id')) elif data_type == '2d': package_id = package_name @@ -902,7 +903,7 @@ class SepsisProteomicsSwathMSCombinedSampleMetadata(BaseSepsisMetadata): technology = 'swathms-combined-sample' spreadsheet = { 'fields': [ - fld('bpa_id_list', 'bacterial sample unique id', coerce=make_bpa_id_list), + fld('sample_id_list', 'bacterial sample unique id', coerce=make_sample_id_list), fld('facility', 'facility'), fld('sample_fractionation_none_number', 'sample fractionation (none/number)'), fld('lc_column_type', 'lc/column type'), @@ -949,7 +950,7 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = 
self.google_track_meta.get(ticket) taxons, strains = self.google_track_meta.get_taxons_strains(ticket) obj.update({ @@ -1011,7 +1012,7 @@ class SepsisProteomics2DLibraryMetadata(BaseSepsisMetadata): technology = '2dlibrary' spreadsheet = { 'fields': [ - fld('bpa_id_list', 'bacterial sample unique id', coerce=make_bpa_id_list), + fld('sample_id_list', 'bacterial sample unique id', coerce=make_sample_id_list), fld('facility', 'facility'), fld('sample_fractionation_none_number', 'sample fractionation (none/number)'), fld('lc_column_type', 'lc/column type'), @@ -1058,7 +1059,7 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = self.google_track_meta.get(ticket) obj.update({ 'name': name, @@ -1133,7 +1134,7 @@ def _get_resources(self): class SepsisProteomicsSwathMSPoolMetadata(SepsisProteomicsSwathMSBaseSepsisMetadata): ckan_data_type = 'arp-proteomics-swathms-pool' pool = True - resource_linkage = ('pool_bpa_ids',) + resource_linkage = ('pool_sample_ids',) md5 = { 'match': [ files.proteomics_swathms_lib_filename_re, @@ -1156,21 +1157,21 @@ def _get_resources(self): class BaseSepsisAnalysedMetadata(BaseSepsisMetadata): - def apply_common_context(self, obj, bpa_ids): + def apply_common_context(self, obj, sample_ids): # find the contextual metadata in common between these BPA IDs context_objs = [] - for bpa_id in bpa_ids: + for sample_id in sample_ids: context_obj = {} for contextual_source in self.contextual_metadata: - context_obj.update(contextual_source.get(bpa_id, obj)) + context_obj.update(contextual_source.get(sample_id, obj)) context_objs.append(context_obj) obj.update(common_values(context_objs)) # find the tracking metadata in common between these BPA IDs tracking_objs = 
[] - for bpa_id in bpa_ids: + for sample_id in sample_ids: tracking_obj = {} for bpam_source in self.bpam_track_meta: - tracking_obj.update(bpam_source.get(bpa_id)) + tracking_obj.update(bpam_source.get(sample_id)) tracking_objs.append(tracking_obj) obj.update(common_values(tracking_objs)) return obj @@ -1206,7 +1207,7 @@ class SepsisProteomicsAnalysedMetadata(BaseSepsisAnalysedMetadata): 'fields': [ fld('data_analysis_date', 'data analysis date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), fld('facility_project_code_experiment_code', 'facility project code_facility experiment code'), - fld('bpa_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), fld('serovar', 'serovar'), @@ -1267,18 +1268,18 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = self.google_track_meta.get(ticket) - bpa_ids = list(sorted(set([t.bpa_id for t in rows if t.bpa_id]))) + sample_ids = list(sorted(set([t.sample_id for t in rows if t.sample_id]))) obj.update(self.google_drive_track_to_object(track_meta)) - self.apply_common_context(obj, bpa_ids) + self.apply_common_context(obj, sample_ids) obj.update({ 'name': name, 'id': name, 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'proteomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'data_generated': 'True', 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), @@ -1339,7 +1340,7 @@ class 
SepsisTranscriptomicsAnalysedMetadata(BaseSepsisAnalysedMetadata): spreadsheet = { 'fields': [ fld('data_analysis_date', 'data analysis date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), fld('serovar', 'serovar'), @@ -1398,18 +1399,18 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = self.google_track_meta.get(ticket) - bpa_ids = list(sorted(set([t.bpa_id for t in rows]))) + sample_ids = list(sorted(set([t.sample_id for t in rows]))) obj.update(self.google_drive_track_to_object(track_meta)) - self.apply_common_context(obj, bpa_ids) + self.apply_common_context(obj, sample_ids) obj.update({ 'name': name, 'id': name, 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'transcriptomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'data_generated': 'True', 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), @@ -1464,7 +1465,7 @@ class SepsisMetabolomicsAnalysedMetadata(BaseSepsisAnalysedMetadata): spreadsheet = { 'fields': [ fld('data_analysis_date', 'data analysis date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), 
fld('serovar', 'serovar'), @@ -1520,19 +1521,19 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = self.google_track_meta.get(ticket) - bpa_ids = list(sorted(set([t.bpa_id for t in rows if t.bpa_id]))) + sample_ids = list(sorted(set([t.sample_id for t in rows if t.sample_id]))) analytical_platform = list(sorted(set([t.analytical_platform for t in rows if t.analytical_platform]))) obj.update(self.google_drive_track_to_object(track_meta)) - self.apply_common_context(obj, bpa_ids) + self.apply_common_context(obj, sample_ids) obj.update({ 'name': name, 'id': name, 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'metabolomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'analytical_platform': ', '.join(analytical_platform), 'data_generated': 'True', 'type': self.ckan_data_type, @@ -1589,7 +1590,7 @@ class SepsisGenomicsAnalysedMetadata(BaseSepsisAnalysedMetadata): spreadsheet = { 'fields': [ fld('data_analysis_date', 'data analysis date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), fld('serovar', 'serovar'), @@ -1642,11 +1643,11 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) 
track_meta = self.google_track_meta.get(ticket) - bpa_ids = list(sorted(set([t.bpa_id for t in rows if t.bpa_id]))) + sample_ids = list(sorted(set([t.sample_id for t in rows if t.sample_id]))) obj.update(self.google_drive_track_to_object(track_meta)) - self.apply_common_context(obj, bpa_ids) + self.apply_common_context(obj, sample_ids) analytical_platform = list(sorted(set([t.analytical_platform for t in rows if t.analytical_platform]))) obj.update({ 'name': name, @@ -1654,7 +1655,7 @@ def _get_packages(self): 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'genomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'data_generated': 'True', 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), @@ -1711,7 +1712,7 @@ class SepsisProteomicsProteinDatabaseMetadata(BaseSepsisAnalysedMetadata): spreadsheet = { 'fields': [ fld('database_generation_date', 'database generation date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'sample name (5 digit bpa id)', coerce=ingest_utils.extract_ands_id), fld('taxon_or_organism', 'taxon_or_organism'), fld('strain_or_isolate', 'strain_or_isolate'), fld('serovar', 'serovar'), @@ -1761,9 +1762,9 @@ def _get_packages(self): # we're hitting the 100-char limit, so we have to hash the folder name when # generating the CKAN name folder_name_md5 = md5hash(folder_name.encode('utf8')).hexdigest() - name = bpa_id_to_ckan_name(folder_name_md5, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name_md5, self.ckan_data_type) track_meta = self.google_track_meta.get(ticket) - bpa_ids = list(sorted(set([t.bpa_id for t in rows if t.bpa_id]))) + sample_ids = list(sorted(set([t.sample_id for t in rows if t.sample_id]))) obj.update(self.google_drive_track_to_object(track_meta)) obj.update({ 'name': name, @@ -1771,7 +1772,7 @@ def 
_get_packages(self): 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'proteomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'data_generated': 'True', 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), diff --git a/bpaingest/projects/sepsis/tracking.py b/bpaingest/projects/sepsis/tracking.py index ac351035..0c1eb3b7 100644 --- a/bpaingest/projects/sepsis/tracking.py +++ b/bpaingest/projects/sepsis/tracking.py @@ -13,13 +13,13 @@ def __init__(self, name): def read_track_csv(self, fname): header, rows = csv_to_named_tuple('SepsisTrack', fname) - return dict((ingest_utils.extract_bpa_id(t.five_digit_bpa_id), t) for t in rows) + return dict((ingest_utils.extract_ands_id(t.five_digit_bpa_id), t) for t in rows) - def get(self, bpa_id): - if bpa_id not in self.track_meta: - logger.debug("No %s metadata for %s" % (type(self).__name__, bpa_id)) + def get(self, sample_id): + if sample_id not in self.track_meta: + logger.debug("No %s metadata for %s" % (type(self).__name__, sample_id)) return {} - track_meta = self.track_meta[bpa_id] + track_meta = self.track_meta[sample_id] return { 'data_type': track_meta.data_type, 'taxon_or_organism': track_meta.taxon_or_organism, @@ -37,9 +37,9 @@ def get(self, bpa_id): class SepsisGenomicsTrackMetadata(SepsisTrackMetadata): - def get(self, bpa_id): - obj = super(SepsisGenomicsTrackMetadata, self).get(bpa_id) - track_meta = self.track_meta.get(bpa_id) + def get(self, sample_id): + obj = super(SepsisGenomicsTrackMetadata, self).get(sample_id) + track_meta = self.track_meta.get(sample_id) if track_meta: obj['growth_condition_notes'] = track_meta.growth_condition_notes return obj diff --git a/bpaingest/projects/stemcells/contextual.py b/bpaingest/projects/stemcells/contextual.py index eefd18be..951750dd 100644 --- a/bpaingest/projects/stemcells/contextual.py +++ b/bpaingest/projects/stemcells/contextual.py @@ -21,21 +21,21 @@ def 
__init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, bpa_id)) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, sample_id)) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is None: continue - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta = {} + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = getattr(row, field) return sample_metadata @@ -49,8 +49,8 @@ def _read_metadata(self, metadata_path): fld('sample_submission_date', 'sample submission date', coerce=ingest_utils.get_date_isoformat), fld('archive_ingestion_date', 'archive ingestion date', coerce=ingest_utils.get_date_isoformat), fld('total_samples', 'total samples', coerce=ingest_utils.get_int), - fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_id', 'bpa id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_ands_id), + fld('sample_id', 'bpa id', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number'), fld('well_number', 'well number', coerce=ingest_utils.get_int), fld('sample_name', 'sample name'), @@ -91,21 +91,21 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - 
logger.warning("no %s metadata available for: %s" % (type(self).__name__, bpa_id)) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, sample_id)) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is None: continue - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta = {} + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = getattr(row, field) return sample_metadata @@ -119,8 +119,8 @@ def _read_metadata(self, metadata_path): fld('sample_submission_date', 'sample submission date', coerce=ingest_utils.get_date_isoformat), fld('archive_ingestion_date', 'archive ingestion date', coerce=ingest_utils.get_date_isoformat), fld('total_samples', 'total samples', coerce=ingest_utils.get_int), - fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_id', 'bpa id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_ands_id), + fld('sample_id', 'bpa id', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number', coerce=ingest_utils.get_int), fld('well_number', 'well number', coerce=ingest_utils.get_int), fld('sample_name', 'sample name'), @@ -161,21 +161,21 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, bpa_id)) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + 
logger.warning("no %s metadata available for: %s" % (type(self).__name__, sample_id)) return {} def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is None: continue - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta = {} + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta = {} for field in row._fields: - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = getattr(row, field) return sample_metadata @@ -189,8 +189,8 @@ def _read_metadata(self, metadata_path): fld('sample_submission_date', 'sample submission date', coerce=ingest_utils.get_date_isoformat), fld('archive_ingestion_date', 'archive ingestion date', coerce=ingest_utils.get_date_isoformat), fld('total_samples', 'total samples', coerce=ingest_utils.get_int), - fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_id', 'bpa id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_ands_id), + fld('sample_id', 'bpa id', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number', coerce=ingest_utils.get_int), fld('well_number', 'well number', coerce=ingest_utils.get_int), fld('sample_name', 'sample name'), @@ -231,8 +231,8 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id, analytical_platform): - tpl = (bpa_id, analytical_platform) + def get(self, sample_id, analytical_platform): + tpl = (sample_id, analytical_platform) if tpl in self.sample_metadata: return self.sample_metadata[tpl] logger.warning("no %s metadata available for: %s" % (type(self).__name__, tpl)) @@ -242,13 +242,13 @@ def get(self, bpa_id, analytical_platform): def _package_metadata(self, rows): sample_metadata = {} for row in rows: - if row.bpa_id is None: + if row.sample_id is 
None: continue - tpl = (row.bpa_id, row.analytical_platform) + tpl = (row.sample_id, row.analytical_platform) assert(tpl not in sample_metadata) sample_metadata[tpl] = row_meta = {} for field in row._fields: - if field != 'bpa_id' and field != 'analytical_platform': + if field != 'sample_id' and field != 'analytical_platform': row_meta[field] = getattr(row, field) return sample_metadata @@ -262,8 +262,8 @@ def _read_metadata(self, metadata_path): fld('sample_submission_date', 'sample submission date', coerce=ingest_utils.get_date_isoformat), fld('archive_ingestion_date', 'archive ingestion date', coerce=ingest_utils.get_date_isoformat), fld('total_samples', 'total samples', coerce=ingest_utils.get_int), - fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_id', 'bpa id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'data set id', coerce=ingest_utils.extract_ands_id), + fld('sample_id', 'bpa id', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number', coerce=ingest_utils.get_int), fld('well_number', 'well number', coerce=ingest_utils.get_int), fld('sample_name', 'sample name', coerce=ingest_utils.get_int), @@ -301,10 +301,10 @@ def __init__(self, path): xlsx_path = one(glob(path + '/*.xlsx')) self.sample_metadata = self._package_metadata(self._read_metadata(xlsx_path)) - def get(self, bpa_id): - if bpa_id in self.sample_metadata: - return self.sample_metadata[bpa_id] - logger.warning("no %s metadata available for: %s" % (type(self).__name__, bpa_id)) + def get(self, sample_id): + if sample_id in self.sample_metadata: + return self.sample_metadata[sample_id] + logger.warning("no %s metadata available for: %s" % (type(self).__name__, sample_id)) return {} def _package_metadata(self, rows): @@ -312,11 +312,11 @@ def _package_metadata(self, rows): for row in rows: row_meta = {} for field in row._fields: - if field != 'bpa_id': + if field != 'sample_id': row_meta[field] = getattr(row, field) - if 
row.bpa_id: - assert(row.bpa_id not in sample_metadata) - sample_metadata[row.bpa_id] = row_meta + if row.sample_id: + assert(row.sample_id not in sample_metadata) + sample_metadata[row.sample_id] = row_meta return sample_metadata def _read_metadata(self, metadata_path): @@ -331,8 +331,8 @@ def _read_metadata(self, metadata_path): fld('sample_submission_date', 'sample submission date', coerce=ingest_utils.get_date_isoformat), fld('archive_ingestion_date', 'archive ingestion date', coerce=ingest_utils.get_date_isoformat), fld('total_samples', 'total samples', coerce=ingest_utils.get_int), - fld('bpa_dataset_id', 'dataset id', coerce=ingest_utils.extract_bpa_id), - fld('bpa_id', 'bpa id', coerce=ingest_utils.extract_bpa_id), + fld('bpa_dataset_id', 'dataset id', coerce=ingest_utils.extract_ands_id), + fld('sample_id', 'bpa id', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number', coerce=ingest_utils.get_int), fld('well_number', 'well number', coerce=ingest_utils.get_int), fld('sample_name', 'sample name', coerce=ingest_utils.get_int), diff --git a/bpaingest/projects/stemcells/ingest.py b/bpaingest/projects/stemcells/ingest.py index b1ca0181..6b232ca0 100755 --- a/bpaingest/projects/stemcells/ingest.py +++ b/bpaingest/projects/stemcells/ingest.py @@ -6,7 +6,7 @@ from hashlib import md5 as md5hash from ...libs import ingest_utils -from ...util import make_logger, bpa_id_to_ckan_name, common_values, clean_tag_name +from ...util import make_logger, sample_id_to_ckan_name, common_values, clean_tag_name from ...abstract import BaseMetadata from ...libs.excel_wrapper import ExcelWrapper, make_field_definition as fld from .tracking import StemcellsTrackMetadata @@ -32,7 +32,7 @@ ] -def parse_bpa_id_range(s): +def parse_sample_id_range(s): return s.strip().split('/')[-1] @@ -47,7 +47,7 @@ class StemcellsTranscriptomeMetadata(BaseMetadata): ckan_data_type = 'stemcells-transcriptomics' spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample 
unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id), fld("sample_extaction_id", "Sample extraction ID"), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), @@ -77,24 +77,24 @@ def _get_packages(self): logger.info("Ingesting Stemcells Transcriptomics metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key all_rows = set() for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells Transcriptomics metadata file {0}".format(fname)) all_rows.update(StemcellsTranscriptomeMetadata.parse_spreadsheet(fname, self.metadata_info)) for row in all_rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue obj = {} - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) track_meta = self.track_meta.get(row.ticket) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Stemcell Transcriptomics %s' % (bpa_id), - 'title': 'Stemcell Transcriptomics %s' % (bpa_id), + 'sample_id': sample_id, + 'notes': 'Stemcell Transcriptomics %s' % (sample_id), + 'title': 'Stemcell Transcriptomics %s' % (sample_id), 'omics': 'transcriptomics', 'insert_size_range': row.insert_size_range, 'library_construction_protocol': row.library_construction_protocol, @@ -115,7 +115,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) tag_names = ['transcriptome', 'raw'] obj['tags'] = [{'name': t} for t 
in tag_names] packages.append(obj) @@ -130,10 +130,10 @@ def _get_resources(self): resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -148,7 +148,7 @@ class StemcellsSmallRNAMetadata(BaseMetadata): ckan_data_type = 'stemcells-smallrna' spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id), fld("sample_extaction_id", "Sample extraction ID"), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), @@ -178,24 +178,24 @@ def _get_packages(self): logger.info("Ingesting Stemcells SmallRNA metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. 
we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key all_rows = set() for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells SmallRNA metadata file {0}".format(fname)) all_rows.update(StemcellsSmallRNAMetadata.parse_spreadsheet(fname, self.metadata_info)) for row in all_rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue obj = {} - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) track_meta = self.track_meta.get(row.ticket) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Stemcell SmallRNA %s' % (bpa_id), - 'title': 'Stemcell SmallRNA %s' % (bpa_id), + 'sample_id': sample_id, + 'notes': 'Stemcell SmallRNA %s' % (sample_id), + 'title': 'Stemcell SmallRNA %s' % (sample_id), 'omics': 'transcriptomics', 'insert_size_range': row.insert_size_range, 'library_construction_protocol': row.library_construction_protocol, @@ -216,7 +216,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) tag_names = ['small-rna', 'raw'] obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -231,10 +231,10 @@ def _get_resources(self): resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources @@ -247,10 
+247,10 @@ class StemcellsSingleCellRNASeqMetadata(BaseMetadata): technology = 'singlecellrna' auth = ('stemcell', 'stemcell') ckan_data_type = 'stemcells-singlecellrnaseq' - resource_linkage = ('bpa_id_range',) + resource_linkage = ('sample_id_range',) spreadsheet = { 'fields': [ - fld("bpa_id_range", re.compile(r'^.*sample unique id$'), coerce=parse_bpa_id_range), + fld("sample_id_range", re.compile(r'^.*sample unique id$'), coerce=parse_sample_id_range), fld("sample_extaction_id", "Sample extraction ID"), fld("insert_size_range", "Insert size range"), fld("library_construction_protocol", "Library construction protocol"), @@ -281,31 +281,31 @@ def _get_packages(self): logger.info("Ingesting Stemcells SingleCellRNASeq metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key all_rows = set() for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells SingleCellRNASeq metadata file {0}".format(fname)) all_rows.update(StemcellsSingleCellRNASeqMetadata.parse_spreadsheet(fname, self.metadata_info)) for row in all_rows: - bpa_id_range = row.bpa_id_range - if bpa_id_range is None: + sample_id_range = row.sample_id_range + if sample_id_range is None: continue obj = {} - name = bpa_id_to_ckan_name(bpa_id_range, self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id_range, self.ckan_data_type) track_meta = self.track_meta.get(row.ticket) # check that it really is a range - if '-' not in bpa_id_range: - logger.error("Skipping row with BPA ID Range `%s'" % (bpa_id_range)) + if '-' not in sample_id_range: + logger.error("Skipping row with BPA ID Range `%s'" % (sample_id_range)) continue # NB: this isn't really the BPA ID, it's the first BPA ID - bpa_id = ingest_utils.extract_bpa_id(bpa_id_range.split('-', 1)[0]) 
+ sample_id = ingest_utils.extract_ands_id(sample_id_range.split('-', 1)[0]) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'bpa_id_range': bpa_id_range, - 'notes': 'Stemcell SingleCellRNASeq %s' % (bpa_id_range), - 'title': 'Stemcell SingleCellRNASeq %s' % (bpa_id_range), + 'sample_id': sample_id, + 'sample_id_range': sample_id_range, + 'notes': 'Stemcell SingleCellRNASeq %s' % (sample_id_range), + 'title': 'Stemcell SingleCellRNASeq %s' % (sample_id_range), 'insert_size_range': row.insert_size_range, 'library_construction_protocol': row.library_construction_protocol, 'sequencer': row.sequencer, @@ -327,7 +327,7 @@ def _get_packages(self): }) for contextual_source in self.contextual_metadata: # NB: the rows in the contextual metadata are all identical across the range, so this works - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) tag_names = ['single-cell-rnaseq', 'raw'] obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -344,10 +344,10 @@ def _get_resources(self): resource = file_info.copy() resource['md5'] = resource['id'] = md5 resource['name'] = filename - bpa_id_range = file_info.get('id') + sample_id_range = file_info.get('id') xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id_range,), legacy_url, resource)) + resources.append(((sample_id_range,), legacy_url, resource)) return resources @@ -359,11 +359,11 @@ class StemcellsMetabolomicsMetadata(BaseMetadata): organization = 'bpa-stemcells' omics = 'metabolomics' auth = ('stemcell', 'stemcell') - resource_linkage = ('bpa_id', 'analytical_platform') + resource_linkage = ('sample_id', 'analytical_platform') ckan_data_type = 'stemcells-metabolomic' spreadsheet = { 'fields': [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id), + fld("sample_id", re.compile(r'^.*sample unique id$'), 
coerce=ingest_utils.extract_ands_id), fld("sample_fractionation_extraction_solvent", "sample fractionation / extraction solvent"), fld("analytical_platform", "platform", coerce=fix_analytical_platform), fld("instrument_column_type", "instrument/column type"), @@ -394,25 +394,25 @@ def _get_packages(self): logger.info("Ingesting Stemcells Metabolomics metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key all_rows = set() for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells Metabolomics metadata file {0}".format(fname)) all_rows.update(StemcellsMetabolomicsMetadata.parse_spreadsheet(fname, self.metadata_info)) for row in all_rows: - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue obj = {} - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1] + '-' + row.analytical_platform, self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1] + '-' + row.analytical_platform, self.ckan_data_type) track_meta = self.track_meta.get(row.ticket) analytical_platform = fix_analytical_platform(row.analytical_platform) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Stemcell Metabolomics %s %s' % (bpa_id, analytical_platform), - 'title': 'Stemcell Metabolomics %s %s' % (bpa_id, analytical_platform), + 'sample_id': sample_id, + 'notes': 'Stemcell Metabolomics %s %s' % (sample_id, analytical_platform), + 'title': 'Stemcell Metabolomics %s %s' % (sample_id, analytical_platform), 'omics': 'metabolomics', 'sample_fractionation_extraction_solvent': row.sample_fractionation_extraction_solvent, 'analytical_platform': analytical_platform, @@ -435,7 +435,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in 
self.contextual_metadata: - obj.update(contextual_source.get(bpa_id, analytical_platform)) + obj.update(contextual_source.get(sample_id, analytical_platform)) tag_names = ['metabolomic', clean_tag_name(analytical_platform), 'raw'] obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -451,10 +451,10 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource['analytical_platform'] = fix_analytical_platform(resource['analytical_platform']) - bpa_id = ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id, resource['analytical_platform']), legacy_url, resource)) + resources.append(((sample_id, resource['analytical_platform']), legacy_url, resource)) return resources @@ -489,7 +489,7 @@ def read_all_rows(self, mode): def parse_spreadsheet(self, fname, additional_context, mode): if mode == '1d': field_spec = [ - fld("bpa_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_bpa_id_silent), + fld("sample_id", re.compile(r'^.*sample unique id$'), coerce=ingest_utils.extract_ands_id_silent), ] elif mode == '2d': field_spec = [ @@ -541,22 +541,22 @@ def _get_packages(self): logger.info("Ingesting Stemcells Proteomics metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. 
we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key # # we also have rows relating to pooled data, and non-pooled data (this class # considers only non-pooled data) all_rows = self.read_all_rows('1d') - bpa_id_ticket_facility = dict((t.bpa_id, (t.ticket, t.facility_code)) for t in all_rows if t.bpa_id) - for bpa_id, (ticket, facility_code) in sorted(bpa_id_ticket_facility.items()): + sample_id_ticket_facility = dict((t.sample_id, (t.ticket, t.facility_code)) for t in all_rows if t.sample_id) + for sample_id, (ticket, facility_code) in sorted(sample_id_ticket_facility.items()): obj = {} - name = bpa_id_to_ckan_name(bpa_id.split('.')[-1], self.ckan_data_type) + name = sample_id_to_ckan_name(sample_id.split('.')[-1], self.ckan_data_type) track_meta = self.track_meta.get(ticket) obj.update({ 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'notes': 'Stemcell Proteomics %s' % (bpa_id), - 'title': 'Stemcell Proteomics %s' % (bpa_id), + 'sample_id': sample_id, + 'notes': 'Stemcell Proteomics %s' % (sample_id), + 'title': 'Stemcell Proteomics %s' % (sample_id), 'omics': 'proteomics', 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), @@ -573,7 +573,7 @@ def _get_packages(self): 'private': True, }) for contextual_source in self.contextual_metadata: - obj.update(contextual_source.get(bpa_id)) + obj.update(contextual_source.get(sample_id)) tag_names = ['proteomic', 'raw'] obj['tags'] = [{'name': t} for t in tag_names] packages.append(obj) @@ -595,10 +595,10 @@ def _get_resources(self): resource_meta = self.filename_metadata.get(filename, {}) for k in ("sample_fractionation", "lc_column_type", "gradient_time", "sample_on_column", "mass_spectrometer", "acquisition_mode", "database", "database_size"): resource[k] = getattr(resource_meta, k) - bpa_id = 
ingest_utils.extract_bpa_id(file_info.get('id')) + sample_id = ingest_utils.extract_ands_id(file_info.get('id')) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], filename) - resources.append(((bpa_id, ), legacy_url, resource)) + resources.append(((sample_id, ), legacy_url, resource)) return resources @@ -624,7 +624,7 @@ def _get_packages(self): logger.info("Ingesting Stemcells Proteomics Pool metadata from {0}".format(self.path)) packages = [] # duplicate rows are an issue in this project. we filter them out by uniquifying - # this is harmless as they have to precisly match, and BPA_ID is the primary key + # this is harmless as they have to precisly match, and sample_id is the primary key # # we also have rows relating to pooled data, and non-pooled data (this class # considers only non-pooled data) @@ -632,7 +632,7 @@ def _get_packages(self): pool_id_ticket_facility = dict((t.pool_id, (t.ticket, t.facility_code)) for t in all_rows if t.pool_id) for pool_id, (ticket, facility_code) in sorted(pool_id_ticket_facility.items()): obj = {} - name = bpa_id_to_ckan_name(pool_id, self.ckan_data_type) + name = sample_id_to_ckan_name(pool_id, self.ckan_data_type) track_meta = self.track_meta.get(ticket) obj.update({ 'name': name, @@ -705,7 +705,7 @@ class StemcellsProteomicsAnalysedMetadata(BaseMetadata): 'fields': [ fld('date_submission', 'date submission yy/mm/dd', coerce=ingest_utils.get_date_isoformat), fld('facility_project_code_experiment_code', 'facility project_code _facility experiment code'), - fld('bpa_id', 'bpa unique identifier', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa unique identifier', coerce=ingest_utils.extract_ands_id), fld('sample_name', 'sample name'), fld('replicate_group_id', 'replicate group id'), fld('species', 'species'), @@ -762,17 +762,17 @@ def _get_packages(self): for ticket, rows in list(ticket_rows.items()): obj = common_values([t._asdict() for t in rows]) track_meta = 
self.track_meta.get(ticket) - name = bpa_id_to_ckan_name(track_meta.folder_name, self.ckan_data_type) + name = sample_id_to_ckan_name(track_meta.folder_name, self.ckan_data_type) # folder names can be quite long, truncate name = name[:100] - bpa_ids = sorted(set([ingest_utils.extract_bpa_id(t.bpa_id) for t in rows])) + sample_ids = sorted(set([ingest_utils.extract_ands_id(t.sample_id) for t in rows])) obj.update({ 'name': name, 'id': name, 'notes': 'Stemcell Proteomics Analysed %s' % (track_meta.folder_name), 'title': 'Stemcell Proteomics Analysed %s' % (track_meta.folder_name), 'omics': 'proteomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), 'data_type': track_meta.data_type, @@ -823,7 +823,7 @@ class StemcellsMetabolomicsAnalysedMetadata(BaseMetadata): spreadsheet = { 'fields': [ fld('data_analysis_date', 'data analysis date'), - fld('bpa_id_range', 'bpa unique identifier **'), + fld('sample_id_range', 'bpa unique identifier **'), fld('sample_name', 'sample name **'), fld('replicate_group_id', 'replicate group id**'), fld('species', 'species**'), @@ -876,16 +876,16 @@ def _get_packages(self): packages = [] for (ticket, folder_name), rows in list(folder_rows.items()): obj = common_values([t._asdict() for t in rows]) - name = bpa_id_to_ckan_name(folder_name, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name, self.ckan_data_type) track_meta = self.track_meta.get(ticket) - bpa_ids = sorted(set([t.bpa_id_range.strip() for t in rows])) + sample_ids = sorted(set([t.sample_id_range.strip() for t in rows])) obj.update({ 'name': name, 'id': name, 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'metabolomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), 
'data_type': track_meta.data_type, @@ -939,7 +939,7 @@ class StemcellsTranscriptomeAnalysedMetadata(BaseMetadata): spreadsheet = { 'fields': [ fld('date', 'date (yyyy-mm-dd)', coerce=ingest_utils.get_date_isoformat), - fld('bpa_id', 'bpa identifier', coerce=ingest_utils.extract_bpa_id), + fld('sample_id', 'bpa identifier', coerce=ingest_utils.extract_ands_id), fld('plate_number', 'plate number'), fld('well_no', 'well no'), fld('sample_name', 'sample name'), @@ -1000,16 +1000,16 @@ def _get_packages(self): packages = [] for (ticket, folder_name), rows in list(folder_rows.items()): obj = common_values([t._asdict() for t in rows]) - name = bpa_id_to_ckan_name(folder_name, self.ckan_data_type) + name = sample_id_to_ckan_name(folder_name, self.ckan_data_type) track_meta = self.track_meta.get(ticket) - bpa_ids = sorted(set([t.bpa_id.strip() for t in rows])) + sample_ids = sorted(set([t.sample_id.strip() for t in rows])) obj.update({ 'name': name, 'id': name, 'notes': '%s' % (folder_name), 'title': '%s' % (folder_name), 'omics': 'transcriptomics', - 'bpa_ids': ', '.join(bpa_ids), + 'sample_ids': ', '.join(sample_ids), 'type': self.ckan_data_type, 'date_of_transfer': ingest_utils.get_date_isoformat(track_meta.date_of_transfer), 'data_type': track_meta.data_type, diff --git a/bpaingest/projects/wheat_cultivars/files.py b/bpaingest/projects/wheat_cultivars/files.py index ab36b0ed..102a6e2d 100755 --- a/bpaingest/projects/wheat_cultivars/files.py +++ b/bpaingest/projects/wheat_cultivars/files.py @@ -1,7 +1,7 @@ from collections import namedtuple -from ...libs import bpa_id_utils from ...util import make_logger +from ...libs.ingest_utils import extract_ands_id logger = make_logger(__name__) @@ -13,25 +13,19 @@ def parse_base_pair(val): return int(val[:-2]) * 1000 -def make_protocol(**kwargs): - fields = ('library_type', 'base_pairs', 'library_construction_protocol', 'sequencer') - return dict((t, kwargs.get(t)) for t in fields) - - def make_file_metadata(md5_lines): """ Add 
md5 data """ for md5_line in md5_lines: - bpa_idx = md5_line.bpa_id - bpa_id = bpa_id_utils.get_bpa_id(bpa_idx) - if bpa_id is None: + sample_id = extract_ands_id(md5_line.sample_id) + if sample_id is None: continue - run_key = md5_line.bpa_id + md5_line.flowcell + md5_line.lib_type + md5_line.lib_size + run_key = md5_line.sample_id + md5_line.flowcell + md5_line.lib_type + md5_line.lib_size yield { 'run': run_key, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'library_type': md5_line.lib_type, 'base_pairs': parse_base_pair(md5_line.lib_size), 'flowcell': md5_line.flowcell, @@ -50,7 +44,7 @@ def cultivars_parse_md5_file(md5_file): """ class MD5ParsedLine(object): - Cultivar = namedtuple('Cultivar', 'desc bpa_id') + Cultivar = namedtuple('Cultivar', 'desc sample_id') cultivars = { 'DRY': Cultivar('Drysdale', '102.100.100.13703'), 'GLA': Cultivar('Gladius', '102.100.100.13704'), @@ -75,7 +69,7 @@ def __init__(self, line): self.cultivar_key = None self.cultivar = None - self.bpa_id = None + self.sample_id = None self.lib_type = None self.lib_size = None self.flowcell = None @@ -123,7 +117,7 @@ def __parse_line(self): self._ok = False return - self.bpa_id = self.cultivar.bpa_id + self.sample_id = self.cultivar.sample_id # WYA_PE_300bp_AD0ALYACXX_ATCACG_L003_R2.fastq.gz # [Cultivar_key]_[Library_Type]_[Library_Size]_[FLowcel]_[Barcode]_L[Lane_number]_R[Read_Number]. diff --git a/bpaingest/projects/wheat_cultivars/ingest.py b/bpaingest/projects/wheat_cultivars/ingest.py index 61aa9d56..42e6f862 100755 --- a/bpaingest/projects/wheat_cultivars/ingest.py +++ b/bpaingest/projects/wheat_cultivars/ingest.py @@ -7,7 +7,7 @@ from ...libs.excel_wrapper import make_field_definition as fld from ...libs import ingest_utils -from ...util import make_logger, bpa_id_to_ckan_name +from ...util import make_logger, sample_id_to_ckan_name from ...abstract import BaseMetadata from ...util import clean_tag_name from . 
import files @@ -24,7 +24,7 @@ class WheatCultivarsMetadata(BaseMetadata): 'fields': [ fld("source_name", "BPA ID"), fld("code", "CODE"), - fld("bpa_id", "BPA ID", coerce=lambda s: s.replace("/", ".")), + fld("sample_id", "BPA ID", coerce=lambda s: s.replace("/", ".")), fld("characteristics", "Characteristics"), fld("organism", "Organism"), fld("variety", "Variety"), @@ -57,15 +57,15 @@ def _get_packages(self): for fname in glob(self.path + '/*.xlsx'): logger.info("Processing Stemcells Transcriptomics metadata file {0}".format(fname)) for row in self.parse_spreadsheet(fname, self.metadata_info): - bpa_id = row.bpa_id - if bpa_id is None: + sample_id = row.sample_id + if sample_id is None: continue - name = bpa_id_to_ckan_name(bpa_id) + name = sample_id_to_ckan_name(sample_id) obj = { 'name': name, - 'id': bpa_id, - 'bpa_id': bpa_id, - 'title': bpa_id, + 'id': sample_id, + 'sample_id': sample_id, + 'title': sample_id, 'notes': '%s (%s): %s' % (row.variety, row.code, row.classification), 'type': self.ckan_data_type, 'private': False, @@ -91,8 +91,8 @@ def _get_resources(self): resource['md5'] = resource['id'] = md5 resource['name'] = filename resource.update(self.runs.get(resource['run'], BLANK_RUN)) - bpa_id = ingest_utils.extract_bpa_id(file_info['bpa_id']) + sample_id = ingest_utils.extract_ands_id(file_info['sample_id']) xlsx_info = self.metadata_info[os.path.basename(md5_file)] legacy_url = urljoin(xlsx_info['base_url'], '../all/' + filename) - resources.append(((bpa_id, ), legacy_url, resource)) + resources.append(((sample_id, ), legacy_url, resource)) return resources diff --git a/bpaingest/projects/wheat_cultivars/runs.py b/bpaingest/projects/wheat_cultivars/runs.py index a4495f3f..0c48f09d 100755 --- a/bpaingest/projects/wheat_cultivars/runs.py +++ b/bpaingest/projects/wheat_cultivars/runs.py @@ -18,7 +18,7 @@ def get_run_data(file_name): The run metadata for this set """ - field_spec = [fld('bpa_id', 'Soil sample unique ID', coerce=lambda s: s.replace('/', 
'.')), + field_spec = [fld('sample_id', 'Soil sample unique ID', coerce=lambda s: s.replace('/', '.')), fld('variety', 'Variety'), fld('cultivar_code', 'Code'), fld('library', 'Library code'), @@ -40,7 +40,7 @@ def get_run_data(file_name): def parse_run_data(path): """ Run data is uniquely defined by - - bpa_id + - sample_id - flowcell - library type - library size @@ -67,7 +67,7 @@ def is_metadata(path): run_lookup = {} for run in run_data: - key = run.bpa_id + run.flowcell + run.library + run.library_construction + key = run.sample_id + run.flowcell + run.library + run.library_construction run_lookup[key] = make_run(number=run.run_number, casava_version=run.casava_version, library_construction_protocol=run.library_construction_protocol, diff --git a/bpaingest/projects/wheat_pathogens_genomes/ingest.py b/bpaingest/projects/wheat_pathogens_genomes/ingest.py index 602af9bd..b78b5a3a 100755 --- a/bpaingest/projects/wheat_pathogens_genomes/ingest.py +++ b/bpaingest/projects/wheat_pathogens_genomes/ingest.py @@ -10,7 +10,7 @@ from ...libs.excel_wrapper import make_field_definition as fld from ...libs import ingest_utils -from ...util import make_logger, bpa_id_to_ckan_name, common_values +from ...util import make_logger, sample_id_to_ckan_name, common_values from ...abstract import BaseMetadata logger = make_logger(__name__) @@ -23,7 +23,7 @@ class WheatPathogensGenomesMetadata(BaseMetadata): omics = 'genomics' spreadsheet = { 'fields': [ - fld("bpa_id", "BPA ID", coerce=ingest_utils.extract_bpa_id), + fld("sample_id", "BPA ID", coerce=ingest_utils.extract_ands_id), fld("official_variety", "Isolate name"), fld("kingdom", "Kingdom"), fld("phylum", "Phylum"), @@ -77,21 +77,21 @@ def _get_packages(self): # including MD5s. 
Common values per BPA ID extracted to be package metadata by_bpaid = defaultdict(list) for row in self.parse_spreadsheet(fname, self.metadata_info): - by_bpaid[row.bpa_id].append(row) - for bpa_id, rows in list(by_bpaid.items()): + by_bpaid[row.sample_id].append(row) + for sample_id, rows in list(by_bpaid.items()): data = common_values([t._asdict() for t in rows]) - bpa_id = data['bpa_id'] - if bpa_id is None: + sample_id = data['sample_id'] + if sample_id is None: continue - name = bpa_id_to_ckan_name(bpa_id) + name = sample_id_to_ckan_name(sample_id) obj = { 'name': name, 'id': name, - 'bpa_id': bpa_id, - 'title': bpa_id, + 'sample_id': sample_id, + 'title': sample_id, 'notes': '%s' % (data['official_variety']), 'type': self.ckan_data_type, - 'bpa_id': bpa_id, + 'sample_id': sample_id, 'kingdom': data['kingdom'], 'phylum': data['phylum'], 'species': data['species'], @@ -121,7 +121,7 @@ def get_file_name(s): for fname in glob(self.path + '/Wheat_pathogens_genomic_metadata.xlsx'): xlsx_info = self.metadata_info[os.path.basename(fname)] for row in self.parse_spreadsheet(fname, self.metadata_info): - bpa_id = row.bpa_id + sample_id = row.sample_id resource = { 'flowcell': row.flow_cell_id, 'run_number': ingest_utils.get_clean_number(row.run_number), @@ -139,5 +139,5 @@ def get_file_name(s): } resource['md5'] = resource['id'] = row.md5_checksum legacy_url = urljoin(xlsx_info['base_url'], '../../all/' + resource['name']) - resources.append(((bpa_id,), legacy_url, resource)) + resources.append(((sample_id,), legacy_url, resource)) return resources diff --git a/bpaingest/util.py b/bpaingest/util.py index cfd11462..2135a218 100755 --- a/bpaingest/util.py +++ b/bpaingest/util.py @@ -18,11 +18,11 @@ def one(l): return l[0] -def bpa_id_to_ckan_name(bpa_id, suborg=None, postfix=None): +def sample_id_to_ckan_name(sample_id, suborg=None, postfix=None): r = 'bpa-' if suborg is not None: r += suborg + '-' - r += bpa_id.replace('/', '_').replace('.', '_').replace(' ', '') + r += 
sample_id.replace('/', '_').replace('.', '_').replace(' ', '') if postfix is not None: r += '-' + postfix # CKAN insists upon lowercase From 2e2da4ebed775136622931f70992ee9db1a6beca Mon Sep 17 00:00:00 2001 From: Grahame Bowland Date: Thu, 15 Nov 2018 23:33:46 +0800 Subject: [PATCH 6/6] dumpstate runs through --- bpaingest/ncbi.py | 4 ++-- bpaingest/projects/wheat_pathogens_genomes/ingest.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bpaingest/ncbi.py b/bpaingest/ncbi.py index 2966922a..8d437873 100644 --- a/bpaingest/ncbi.py +++ b/bpaingest/ncbi.py @@ -37,7 +37,7 @@ def _read_2016_accessions(self): if not os.access(fname, os.R_OK): return {} _, biosample_rows = csv_to_named_tuple('BioSample', fname, mode='rU') - return dict((ingest_utils.extract_sample_id(t.sample_name), t.accession.strip()) for t in biosample_rows) + return dict((ingest_utils.extract_ands_id(t.sample_name), t.accession.strip()) for t in biosample_rows) def _read_accessions(self): """ @@ -48,7 +48,7 @@ def _read_accessions(self): accessions = {} for fname in sample_objects: _, rows = csv_to_named_tuple('SRARow', fname, mode='rU', dialect='excel-tab') - accessions.update(dict((ingest_utils.extract_sample_id(t.sample_name), t.accession) for t in rows)) + accessions.update(dict((ingest_utils.extract_ands_id(t.sample_name), t.accession) for t in rows)) return accessions def _read_ncbi_sra(self): diff --git a/bpaingest/projects/wheat_pathogens_genomes/ingest.py b/bpaingest/projects/wheat_pathogens_genomes/ingest.py index b78b5a3a..4c0e1983 100755 --- a/bpaingest/projects/wheat_pathogens_genomes/ingest.py +++ b/bpaingest/projects/wheat_pathogens_genomes/ingest.py @@ -28,7 +28,7 @@ class WheatPathogensGenomesMetadata(BaseMetadata): fld("kingdom", "Kingdom"), fld("phylum", "Phylum"), fld("species", "Species"), - fld("sample_id", "Researcher Sample ID"), + fld("researcher_sample_id", "Researcher Sample ID"), fld("other_id", "Other IDs"), fld("original_source_host_species", 
"Original source host species"), fld("collection_date", "Isolate collection date"), @@ -95,7 +95,7 @@ def _get_packages(self): 'kingdom': data['kingdom'], 'phylum': data['phylum'], 'species': data['species'], - 'sample_id': data['sample_id'], + 'researcher_sample_id': data['researcher_sample_id'], 'sample_label': data['other_id'], 'dna_source': data['sample_dna_source'], 'official_variety_name': data['official_variety'],