v1.1.5 geo downloader fixes and documentation fixes (#29)
* added batch_size parameter to run_pipeline

* added CLI functionality

* batch_size python/CLI and tests

* removed test; changed default behavior: won't raise error if file-to-be-downloaded already exists

* Update setup.py

* Update test_batch_size.py

* Rename test_batch_size.py to test_pipeline_batch_size.py

* dropped redundant tests and sped up one

* Feature/public data (#21)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* version 1.1 (#22)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* progress bars

* documenting `download`

* Update cli.py

* restore sample_name filter

* added rawMetaDataset class and moved get_sample_sheet_s3 to more logical place here (#24)

* updated docs for 1.1.1

* Update README.md

* Update setup.py

* exposed create_sample_sheet and download no_clean options

* manifest file download in lambda

* manifest file download in lambda

* manifest file download in lambda

* v1.1.3 bump bug fix

* handles blank sample_name and ensures names are unique.

* Update setup.py

* geo downloader tweaks, fixed docs
Marc Maxmeister committed Oct 8, 2019
1 parent aa95129 commit 8840d1a
Showing 5 changed files with 61 additions and 32 deletions.
73 changes: 49 additions & 24 deletions methylprep/download/geo.py
@@ -13,20 +13,28 @@
import pandas as pd
from tqdm import tqdm

#logging.basicConfig(level=logging.DEBUG) # always verbose
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel( logging.INFO )

def geo_download(geo_id, series_path, geo_platforms, clean=True):
"""Downloads the IDATs and metadata for a GEO series
Arguments:
geo_id [required]
the GEO Accession for the desired series
the GEO Accession for the desired series (e.g. GSE134293)
series_path [required]
the directory to download the data to
geo_platforms [required]
the list of supported GEO platforms
clean
whether or not to delete files once they are no longer needed (True by default)"""
whether or not to delete files once they are no longer needed (True by default)
Note about GEO IDs:
You can use the NIH online search to find data sets, then click "Send to:" at the bottom of a results page
and export a list of unique IDs as a text file. These IDs are not the GEO IDs used here. First, remove the first
three digits from the number, so Series ID 200134293 is GEO accession ID 134293, then include the GSE part,
like "GSE134293", in your CLI parameters. """
series_dir = Path(series_path)
raw_filename = f"{geo_id}_RAW.tar"
miniml_filename = f"{geo_id}_family.xml"
@@ -38,15 +38,28 @@ def geo_download(geo_id, series_path, geo_platforms, clean=True):
if not os.path.exists(f"{series_path}/{platform}"):
os.mkdir(f"{series_path}/{platform}")

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=1200) # 20 mins
ftp.login()
ftp.cwd(f"geo/series/{geo_id[:-3]}nnn/{geo_id}")

#LOGGER.info(f"Downloading {geo_id}")
if not os.path.exists(f"{series_path}/{miniml_filename}"):
if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
LOGGER.info(f"Downloading {miniml_filename}")
miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
miniml_file.close()
LOGGER.info(f"Downloaded {miniml_filename}")
LOGGER.info(f"Unpacking {miniml_filename}")
min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
for file in min_tar.getnames():
if file == miniml_filename:
min_tar.extract(file, path=series_path)
if clean:
os.remove(f"{series_path}/{miniml_filename}.tgz")

if not list(series_dir.glob('**/*.idat')):
if not list(series_dir.glob('*.idat.gz')):
if not os.path.exists(f"{series_path}/{raw_filename}"):
#LOGGER.info(f"Downloading {raw_filename}")
raw_file = open(f"{series_path}/{raw_filename}", 'wb')
filesize = ftp.size(f"suppl/{raw_filename}")
try:
@@ -58,6 +79,7 @@ def tqdm_callback(data):
except Exception as e:
LOGGER.info('tqdm: Failed to create a progress bar, but it is downloading...')
ftp.retrbinary(f"RETR suppl/{raw_filename}", raw_file.write)
LOGGER.info(f"Closing file {raw_filename}")
raw_file.close()
LOGGER.info(f"Downloaded {raw_filename}")
LOGGER.info(f"Unpacking {raw_filename}")
@@ -80,21 +102,6 @@ def tqdm_callback(data):
if clean:
os.remove(gz_string)

if not os.path.exists(f"{series_path}/{miniml_filename}"):
if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
LOGGER.info(f"Downloading {miniml_filename}")
miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
miniml_file.close()
LOGGER.info(f"Downloaded {miniml_filename}")
LOGGER.info(f"Unpacking {miniml_filename}")
min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
for file in min_tar.getnames():
if file == miniml_filename:
min_tar.extract(file, path=series_path)
if clean:
os.remove(f"{series_path}/{miniml_filename}.tgz")

LOGGER.info(f"Downloaded and unpacked {geo_id}")

ftp.quit()
@@ -136,24 +143,42 @@ def geo_metadata(geo_id, series_path, geo_platforms, path):
attributes_dir['title'] = title
for char in sample.find_all('Characteristics'):
attributes_dir[char['tag']] = char.text.strip()
split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
if sample.find('Description'):
attributes_dir['description'] = sample.find('Description').text.strip()

# only some MINiML files have this.
try:
split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
except:
LOGGER.info( "MINiML file does not provide `methylprep_name` (sentrix_id_R00C00)" )
# MUST have methylprep_name match filename

if platform in geo_platforms:
for idat in sample.find_all('Supplementary-Data'):
if idat['type'] == 'IDAT':
file_name = (idat.text.split("/")[-1]).strip()[:-3]
shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
try:
shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
except FileNotFoundError:
# this doesn't throw an error if file is already in the right folder
if not Path(f"{series_path}/{platform}/{file_name}").is_file():
raise FileNotFoundError ("Could not move file after downloading.")

meta_dicts[platform][accession] = attributes_dir
samples_dict[platform][accession] = title
else:
raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
# this ought to keep other idat files from being included in the package.
LOGGER.warning(f'Sample: {title} has unrecognized platform: {platform}; not moving data file')
fp.close()
#raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
LOGGER.info(f"Found {len(attributes_dir)} tags for {len(samples)} samples: {attributes_dir}")
fp.close()

seen_platforms = []

for platform in geo_platforms:
LOGGER.debug(f"exporting {platform}")
if meta_dicts[platform]:
meta_dict_filename = f"{geo_id}_{platform}_dict.pkl"
pickle.dump(meta_dicts[platform], open(f"{series_path}/{meta_dict_filename}", 'wb'))
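The GEO-ID note added to the geo_download docstring above explains how to turn an NIH search "Series ID" into the GSE accession this function expects. A minimal sketch of that conversion; series_id_to_accession is a hypothetical helper for illustration, not part of this commit:

    def series_id_to_accession(series_id: int) -> str:
        # NIH "Send to:" exports list Series IDs like 200134293; dropping the
        # leading three digits and prefixing "GSE" yields the accession that
        # geo_download expects (e.g. "GSE134293").
        return "GSE" + str(series_id)[3:]

    assert series_id_to_accession(200134293) == "GSE134293"
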
12 changes: 8 additions & 4 deletions methylprep/download/process_data.py
@@ -18,13 +18,13 @@

LOGGER = logging.getLogger(__name__)

GEO_PLATFORMS = ['GPL21145', 'GPL13534']
GEO_PLATFORMS = ['GPL21145', 'GPL13534'] # GPL23976 -- is an 850k genotyping array that is bundled with some datasets
AE_PLATFORMS = ['A-MEXP-2255', 'A-GEOD-21145']
PLATFORMS = GEO_PLATFORMS + AE_PLATFORMS
BATCH_SIZE = 100


def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True, verbose=False):
"""Downloads the IDATs and metadata for a series then generates one metadata dictionary and one beta value matrix for each platform in the series
Arguments:
@@ -34,10 +34,14 @@ def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
the path to the directory to download the data to. It is assumed that a dictionaries directory and a beta values
directory exist for each platform (and one will be created for each if not)
dict_only
if True, only downloads data and creates dictionaries for each platform
if True, downloads idat files and meta data and creates data dictionaries for each platform
batch_size
the batch_size to use when processing samples (number of samples run at a time).
By default is set to the constant 100."""
By default is set to the constant 100.
clean
if True, removes intermediate processing files
verbose
if True, adds additional debugging information"""
if not os.path.exists(f"{str(path)}/{PLATFORMS[0]}_beta_values"):
initialize(str(path))

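For context, a minimal usage sketch of run_series with the new clean and verbose options; the import path and the local directory name are assumptions, not part of this commit:

    from pathlib import Path
    from methylprep.download import run_series  # import path assumed

    # Download and process GSE134293 in batches of 50 samples, keeping the
    # intermediate files (clean=False) and printing extra debug output.
    run_series('GSE134293', Path('GSE134293'), batch_size=50, clean=False, verbose=True)
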
4 changes: 2 additions & 2 deletions methylprep/processing/pipeline.py
@@ -91,11 +91,11 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
betas
if True, will return a single data frame of betavalues instead of a list of SampleDataContainer objects.
Format is a "wide matrix": columns contain probes and rows contain samples.
m_factor
m_value
if True, will return a single data frame of m_factor values instead of a list of SampleDataContainer objects.
Format is a "wide matrix": columns contain probes and rows contain samples.
if batch_size is set to more than 200 samples, nothing is returned but, all the files are saved.
if batch_size is set to more than 200 samples, nothing is returned but all the files are saved. You can recreate the output by loading the files.
Processing note:
The sample_sheet parser will ensure every sample has a unique name and assign one (e.g. Sample1) if missing, or append a number (e.g. _1) if not unique.
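A short sketch of how the documented betas return value and batch_size behavior fit together; the data directory path here is only illustrative:

    from methylprep import run_pipeline

    # betas=True returns one wide DataFrame (rows = samples, columns = probes).
    # With batch_size > 200, nothing is returned; the per-batch files written to
    # disk must be loaded to rebuild the full matrix.
    betas = run_pipeline('GSE134293/GPL13534', betas=True, batch_size=50, export=True)
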
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

setup(
name='methylprep',
version='1.1.4',
version='1.1.5',
description='Python-based Illumina methylation array preprocessing software',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
2 changes: 1 addition & 1 deletion tests/processing/test_pipeline.py
@@ -1,9 +1,9 @@
import sys
import numpy as np
from pathlib import Path
# App
from methylprep.processing import pipeline
from methylprep.utils.files import download_file
from pathlib import Path
#patching
try:
# python 3.4+ should use builtin unittest.mock not mock package
