v1.1.5 geo downloader fixes and documentation fixes (#29)
* added batch_size parameter to run_pipeline

* added CLI functionality

* batch_size python/CLI and tests

* removed test; changed default behavior: won't raise error if file-to-be-downloaded already exists

* Update setup.py

* Update test_batch_size.py

* Rename test_batch_size.py to test_pipeline_batch_size.py

* dropped redundant tests and sped up one

* Feature/public data (#21)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* version 1.1 (#22)

* download command, as well as some batch_size adjustments

* fixed string issue

* renaming update and removed redundant tests

* bs4 required for Array ingester

* tests

* workaround to return objects with batch size changes

* workaround to return objects with batch size changes

* bug

* tests pass for batch_size

* progress bars

* documenting `download`

* Update cli.py

* restore sample_name filter

* added rawMetaDataset class and moved get_sample_sheet_s3 to more logical place here (#24)

* updated docs for 1.1.1

* Update README.md

* Update setup.py

* exposed create_sample_sheet and download no_clean options

* manifest file download in lambda

* manifest file download in lambda

* manifest file download in lambda

* v1.1.3 bump bug fix

* handles blank sample_name and ensures names are unique.

* Update setup.py

* geo downloader tweaks, fixed docs
Marc Maxmeister committed Oct 8, 2019
1 parent aa95129 commit 8840d1a
Showing 5 changed files with 61 additions and 32 deletions.
73 changes: 49 additions & 24 deletions methylprep/download/geo.py
@@ -13,20 +13,28 @@
import pandas as pd
from tqdm import tqdm

#logging.basicConfig(level=logging.DEBUG) # always verbose
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel( logging.INFO )

def geo_download(geo_id, series_path, geo_platforms, clean=True):
"""Downloads the IDATs and metadata for a GEO series
Arguments:
geo_id [required]
the GEO Accession for the desired series
the GEO Accession for the desired series (e.g. GSE134293)
series_path [required]
the directory to download the data to
geo_platforms [required]
the list of supported GEO platforms
clean
whether or not to delete files once they are no longer needed (True by default)"""
whether or not to delete files once they are no longer needed (True by default)
Note about GEO IDs:
You can use the NIH online search to find data sets, then click "Send to:" at the bottom of a results page
and export a list of unique IDs as a text file. These IDs are not the GEO IDs used here. First, remove the first
three digits from the number, so Series ID 200134293 is GEO accession ID 134293, then include the GSE part,
like "GSE134293", in your CLI parameters. """
series_dir = Path(series_path)
raw_filename = f"{geo_id}_RAW.tar"
miniml_filename = f"{geo_id}_family.xml"
@@ -38,15 +38,28 @@ def geo_download(geo_id, series_path, geo_platforms, clean=True):
if not os.path.exists(f"{series_path}/{platform}"):
os.mkdir(f"{series_path}/{platform}")

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=1200) # 20 mins
ftp.login()
ftp.cwd(f"geo/series/{geo_id[:-3]}nnn/{geo_id}")

#LOGGER.info(f"Downloading {geo_id}")
if not os.path.exists(f"{series_path}/{miniml_filename}"):
if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
LOGGER.info(f"Downloading {miniml_filename}")
miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
miniml_file.close()
LOGGER.info(f"Downloaded {miniml_filename}")
LOGGER.info(f"Unpacking {miniml_filename}")
min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
for file in min_tar.getnames():
if file == miniml_filename:
min_tar.extract(file, path=series_path)
if clean:
os.remove(f"{series_path}/{miniml_filename}.tgz")

if not list(series_dir.glob('**/*.idat')):
if not list(series_dir.glob('*.idat.gz')):
if not os.path.exists(f"{series_path}/{raw_filename}"):
#LOGGER.info(f"Downloading {raw_filename}")
raw_file = open(f"{series_path}/{raw_filename}", 'wb')
filesize = ftp.size(f"suppl/{raw_filename}")
try:
@@ -58,6 +79,7 @@ def tqdm_callback(data):
except Exception as e:
LOGGER.info('tqdm: Failed to create a progress bar, but it is downloading...')
ftp.retrbinary(f"RETR suppl/{raw_filename}", raw_file.write)
LOGGER.info(f"Closing file {raw_filename}")
raw_file.close()
LOGGER.info(f"Downloaded {raw_filename}")
LOGGER.info(f"Unpacking {raw_filename}")
@@ -80,21 +102,6 @@ def tqdm_callback(data):
if clean:
os.remove(gz_string)

if not os.path.exists(f"{series_path}/{miniml_filename}"):
if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
LOGGER.info(f"Downloading {miniml_filename}")
miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
miniml_file.close()
LOGGER.info(f"Downloaded {miniml_filename}")
LOGGER.info(f"Unpacking {miniml_filename}")
min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
for file in min_tar.getnames():
if file == miniml_filename:
min_tar.extract(file, path=series_path)
if clean:
os.remove(f"{series_path}/{miniml_filename}.tgz")

LOGGER.info(f"Downloaded and unpacked {geo_id}")

ftp.quit()
@@ -136,24 +143,42 @@ def geo_metadata(geo_id, series_path, geo_platforms, path):
attributes_dir['title'] = title
for char in sample.find_all('Characteristics'):
attributes_dir[char['tag']] = char.text.strip()
split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
if sample.find('Description'):
attributes_dir['description'] = sample.find('Description').text.strip()

# only some MINiML files have this.
try:
split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
except:
LOGGER.info( "MINiML file does not provide `methylprep_name` (sentrix_id_R00C00)" )
# MUST have methylprep_name match filename

if platform in geo_platforms:
for idat in sample.find_all('Supplementary-Data'):
if idat['type'] == 'IDAT':
file_name = (idat.text.split("/")[-1]).strip()[:-3]
shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
try:
shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
except FileNotFoundError:
# this doesn't throw an error if file is already in the right folder
if not Path(f"{series_path}/{platform}/{file_name}").is_file():
raise FileNotFoundError ("Could not move file after downloading.")

meta_dicts[platform][accession] = attributes_dir
samples_dict[platform][accession] = title
else:
raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
# this ought to keep other idat files from being included in the package.
LOGGER.warning(f'Sample: {title} has unrecognized platform: {platform}; not moving data file')
fp.close()
#raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
LOGGER.info(f"Found {len(attributes_dir)} tags for {len(samples)} samples: {attributes_dir}")
fp.close()

seen_platforms = []

for platform in geo_platforms:
LOGGER.debug(f"exporting {platform}")
if meta_dicts[platform]:
meta_dict_filename = f"{geo_id}_{platform}_dict.pkl"
pickle.dump(meta_dicts[platform], open(f"{series_path}/{meta_dict_filename}", 'wb'))
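The GEO-ID note added to the geo_download docstring above explains how to turn an NIH search "Series ID" into the GSE accession this function expects. A minimal sketch of that conversion; series_id_to_accession is a hypothetical helper for illustration, not part of this commit:

    def series_id_to_accession(series_id: int) -> str:
        # NIH "Send to:" exports list Series IDs like 200134293; dropping the
        # leading three digits and prefixing "GSE" yields the accession that
        # geo_download expects (e.g. "GSE134293").
        return "GSE" + str(series_id)[3:]

    assert series_id_to_accession(200134293) == "GSE134293"
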
12 changes: 8 additions & 4 deletions methylprep/download/process_data.py
@@ -18,13 +18,13 @@

LOGGER = logging.getLogger(__name__)

GEO_PLATFORMS = ['GPL21145', 'GPL13534']
GEO_PLATFORMS = ['GPL21145', 'GPL13534'] # GPL23976 -- is an 850k genotyping array that is bundled with some datasets
AE_PLATFORMS = ['A-MEXP-2255', 'A-GEOD-21145']
PLATFORMS = GEO_PLATFORMS + AE_PLATFORMS
BATCH_SIZE = 100


def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True, verbose=False):
"""Downloads the IDATs and metadata for a series then generates one metadata dictionary and one beta value matrix for each platform in the series
Arguments:
@@ -34,10 +34,14 @@ def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
the path to the directory to download the data to. It is assumed that a dictionaries directory and a beta values
directory exist for each platform (and one will be created for each if not)
dict_only
if True, only downloads data and creates dictionaries for each platform
if True, downloads idat files and meta data and creates data dictionaries for each platform
batch_size
the batch_size to use when processing samples (number of samples run at a time).
By default is set to the constant 100."""
By default is set to the constant 100.
clean
if True, removes intermediate processing files
verbose
if True, adds additional debugging information"""
if not os.path.exists(f"{str(path)}/{PLATFORMS[0]}_beta_values"):
initialize(str(path))

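For context, a minimal usage sketch of run_series with the new clean and verbose options; the import path and the local directory name are assumptions, not part of this commit:

    from pathlib import Path
    from methylprep.download import run_series  # import path assumed

    # Download and process GSE134293 in batches of 50 samples, keeping the
    # intermediate files (clean=False) and printing extra debug output.
    run_series('GSE134293', Path('GSE134293'), batch_size=50, clean=False, verbose=True)
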
4 changes: 2 additions & 2 deletions methylprep/processing/pipeline.py
@@ -91,11 +91,11 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
betas
if True, will return a single data frame of betavalues instead of a list of SampleDataContainer objects.
Format is a "wide matrix": columns contain probes and rows contain samples.
m_factor
m_value
if True, will return a single data frame of m_factor values instead of a list of SampleDataContainer objects.
Format is a "wide matrix": columns contain probes and rows contain samples.
if batch_size is set to more than 200 samples, nothing is returned but, all the files are saved.
if batch_size is set to more than 200 samples, nothing is returned but all the files are saved. You can recreate the output by loading the files.
Processing note:
The sample_sheet parser will ensure every sample has a unique name and assign one (e.g. Sample1) if missing, or append a number (e.g. _1) if not unique.
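A short sketch of how the documented betas return value and batch_size behavior fit together; the data directory path here is only illustrative:

    from methylprep import run_pipeline

    # betas=True returns one wide DataFrame (rows = samples, columns = probes).
    # With batch_size > 200, nothing is returned; the per-batch files written to
    # disk must be loaded to rebuild the full matrix.
    betas = run_pipeline('GSE134293/GPL13534', betas=True, batch_size=50, export=True)
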
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@

setup(
name='methylprep',
version='1.1.4',
version='1.1.5',
description='Python-based Illumina methylation array preprocessing software',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
2 changes: 1 addition & 1 deletion tests/processing/test_pipeline.py
@@ -1,9 +1,9 @@
import sys
import numpy as np
from pathlib import Path
# App
from methylprep.processing import pipeline
from methylprep.utils.files import download_file
from pathlib import Path
#patching
try:
# python 3.4+ should use builtin unittest.mock not mock package
