
v1.1.5 geo downloader fixes and documentation fixes #29

Merged: 31 commits, Oct 8, 2019

Changes from all commits (31 commits):
cda035f  added batch_size parameter to run_pipeline (markt, Aug 14, 2019)
6e18690  added CLI functionality (markt, Aug 14, 2019)
4ac726d  batch_size python/CLI and tests (markt, Aug 19, 2019)
738cb70  removed test; changed default behavior: won't raise error if file-to-… (marcmaxson, Sep 3, 2019)
2c83ace  merging batch_size into dev (marcmaxson, Sep 5, 2019)
a0eec8c  Merge branch 'feature/batch_size' into dev (marcmaxson, Sep 5, 2019)
6e023c3  Update setup.py (Sep 5, 2019)
278fa79  Update test_batch_size.py (Sep 5, 2019)
bc88f56  Rename test_batch_size.py to test_pipeline_batch_size.py (Sep 5, 2019)
3ce41d7  dropped redundant tests and sped up one (Sep 5, 2019)
0239c7c  Feature/public data (#21) (Sep 5, 2019)
6ec0262  version 1.1 (#22) (Sep 5, 2019)
7904833  Merge branch 'master' into dev (Sep 5, 2019)
e7ff5d7  documenting `download` (Sep 5, 2019)
f5361f6  Update cli.py (Sep 11, 2019)
e96fc17  restore sample_name filter (Sep 11, 2019)
2734ac0  added rawMetaDataset class and moved get_sample_sheet_s3 to more logi… (Sep 16, 2019)
6599053  updated docs for 1.1.1 (Sep 18, 2019)
ba59d97  Update README.md (Sep 18, 2019)
64ef237  Update setup.py (Sep 18, 2019)
3683bce  exposed create_sample_sheet and download no_clean options (marcmaxson, Sep 18, 2019)
20690a8  Merge branch 'dev' of https://github.com/LifeEGX/methylprep into dev (marcmaxson, Sep 18, 2019)
d2eca62  manifest file download in lambda (marcmaxson, Sep 19, 2019)
647615e  manifest file download in lambda (marcmaxson, Sep 19, 2019)
0c06039  manifest file download in lambda (marcmaxson, Sep 19, 2019)
076a8c2  v1.1.3 bump bug fix (marcmaxson, Sep 19, 2019)
217ba6a  handles blank sample_name and ensures names are unique. (marcmaxson, Sep 24, 2019)
514a700  Update setup.py (Sep 24, 2019)
b616000  Merge branch 'master' into dev (Sep 24, 2019)
cabd58f  geo downloader tweaks, fixed docs (marcmaxson, Oct 8, 2019)
7b24881  Merge branch 'master' into dev (Oct 8, 2019)
73 changes: 49 additions & 24 deletions methylprep/download/geo.py
@@ -13,20 +13,28 @@
 import pandas as pd
 from tqdm import tqdm
 
+#logging.basicConfig(level=logging.DEBUG) # always verbose
 LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel( logging.INFO )
 
 def geo_download(geo_id, series_path, geo_platforms, clean=True):
     """Downloads the IDATs and metadata for a GEO series
 
     Arguments:
         geo_id [required]
-            the GEO Accession for the desired series
+            the GEO Accession for the desired series (e.g. GSE134293)
         series_path [required]
             the directory to download the data to
         geo_platforms [required]
             the list of supported GEO platforms
         clean
-            whether or not to delete files once they are no longer needed (True by default)"""
+            whether or not to delete files once they are no longer needed (True by default)
+
+    Note about GEO IDs:
+        You can use the NIH online search to find data sets, then click "Send to:" at the bottom of a results page
+        and export a list of unique IDs as a text file. These IDs are not the GEO_IDs used here. First, remove the
+        first three digits from the number, so Series ID 200134293 is GEO accession 134293; then include the GSE
+        part, like "GSE134293", in your CLI parameters."""
     series_dir = Path(series_path)
     raw_filename = f"{geo_id}_RAW.tar"
     miniml_filename = f"{geo_id}_family.xml"
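
(Aside: the ID conversion in that note is simple string surgery. A hypothetical helper, not part of methylprep, just to make the rule concrete:)

def series_id_to_gse(series_id: str) -> str:
    """Hypothetical helper: convert an NIH search Series ID (e.g. '200134293')
    to the GEO accession that methylprep expects (e.g. 'GSE134293')."""
    # drop the first three digits (the series-type prefix) and prepend 'GSE'
    return 'GSE' + series_id[3:]

assert series_id_to_gse('200134293') == 'GSE134293'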
@@ -38,15 +38,28 @@ def geo_download(geo_id, series_path, geo_platforms, clean=True):
         if not os.path.exists(f"{series_path}/{platform}"):
             os.mkdir(f"{series_path}/{platform}")
 
-    ftp = FTP('ftp.ncbi.nlm.nih.gov')
+    ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=1200) # 20 mins
     ftp.login()
     ftp.cwd(f"geo/series/{geo_id[:-3]}nnn/{geo_id}")
 
+    #LOGGER.info(f"Downloading {geo_id}")
+    if not os.path.exists(f"{series_path}/{miniml_filename}"):
+        if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
+            LOGGER.info(f"Downloading {miniml_filename}")
+            miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
+            ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
+            miniml_file.close()
+            LOGGER.info(f"Downloaded {miniml_filename}")
+        LOGGER.info(f"Unpacking {miniml_filename}")
+        min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
+        for file in min_tar.getnames():
+            if file == miniml_filename:
+                min_tar.extract(file, path=series_path)
+        if clean:
+            os.remove(f"{series_path}/{miniml_filename}.tgz")
+
     if not list(series_dir.glob('**/*.idat')):
+        if not list(series_dir.glob('*.idat.gz')):
             if not os.path.exists(f"{series_path}/{raw_filename}"):
-                #LOGGER.info(f"Downloading {raw_filename}")
                 raw_file = open(f"{series_path}/{raw_filename}", 'wb')
                 filesize = ftp.size(f"suppl/{raw_filename}")
                 try:
@@ -58,6 +79,7 @@ def tqdm_callback(data):
                 except Exception as e:
                     LOGGER.info('tqdm: Failed to create a progress bar, but it is downloading...')
                     ftp.retrbinary(f"RETR suppl/{raw_filename}", raw_file.write)
+                LOGGER.info(f"Closing file {raw_filename}")
                 raw_file.close()
                 LOGGER.info(f"Downloaded {raw_filename}")
                 LOGGER.info(f"Unpacking {raw_filename}")
@@ -80,21 +102,6 @@
                 if clean:
                     os.remove(gz_string)
 
-    if not os.path.exists(f"{series_path}/{miniml_filename}"):
-        if not os.path.exists(f"{series_path}/{miniml_filename}.tgz"):
-            LOGGER.info(f"Downloading {miniml_filename}")
-            miniml_file = open(f"{series_path}/{miniml_filename}.tgz", 'wb')
-            ftp.retrbinary(f"RETR miniml/{miniml_filename}.tgz", miniml_file.write)
-            miniml_file.close()
-            LOGGER.info(f"Downloaded {miniml_filename}")
-        LOGGER.info(f"Unpacking {miniml_filename}")
-        min_tar = tarfile.open(f"{series_path}/{miniml_filename}.tgz")
-        for file in min_tar.getnames():
-            if file == miniml_filename:
-                min_tar.extract(file, path=series_path)
-        if clean:
-            os.remove(f"{series_path}/{miniml_filename}.tgz")
-
     LOGGER.info(f"Downloaded and unpacked {geo_id}")
 
     ftp.quit()
@@ -136,24 +143,42 @@ def geo_metadata(geo_id, series_path, geo_platforms, path):
         attributes_dir['title'] = title
         for char in sample.find_all('Characteristics'):
             attributes_dir[char['tag']] = char.text.strip()
-        split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
-        attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
         if sample.find('Description'):
             attributes_dir['description'] = sample.find('Description').text.strip()
 
+        # only some MINiML files have this.
+        try:
+            split_idat = sample.find('Supplementary-Data').text.split("/")[-1].split("_")
+            attributes_dir['methylprep_name'] = f"{split_idat[1]}_{split_idat[2]}"
+        except:
+            LOGGER.info( "MINiML file does not provide `methylprep_name` (sentrix_id_R00C00)" )
+            # MUST have methylprep_name match filename
+
         if platform in geo_platforms:
             for idat in sample.find_all('Supplementary-Data'):
                 if idat['type'] == 'IDAT':
                     file_name = (idat.text.split("/")[-1]).strip()[:-3]
-                    shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
+                    try:
+                        shutil.move(f"{series_path}/{file_name}", f"{series_path}/{platform}/{file_name}")
+                    except FileNotFoundError:
+                        # this doesn't throw an error if the file is already in the right folder
+                        if not Path(f"{series_path}/{platform}/{file_name}").is_file():
+                            raise FileNotFoundError("Could not move file after downloading.")
 
             meta_dicts[platform][accession] = attributes_dir
             samples_dict[platform][accession] = title
         else:
-            raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
+            # this ought to keep other idat files from being included in the package.
+            LOGGER.warning(f'Sample: {title} has unrecognized platform: {platform}; not moving data file')
+            fp.close()
+            #raise ValueError(f'Sample: {title} has unrecognized platform: {platform}')
+    LOGGER.info(f"Found {len(attributes_dir)} tags for {len(samples)} samples: {attributes_dir}")
     fp.close()
 
     seen_platforms = []
 
     for platform in geo_platforms:
         LOGGER.debug(f"exporting {platform}")
         if meta_dicts[platform]:
             meta_dict_filename = f"{geo_id}_{platform}_dict.pkl"
             pickle.dump(meta_dicts[platform], open(f"{series_path}/{meta_dict_filename}", 'wb'))
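
(For orientation, a minimal sketch of calling the updated downloader directly from Python; the accession and output folder are made-up example values, and the module path follows this diff:)

from pathlib import Path
from methylprep.download.geo import geo_download

geo_id = 'GSE134293'                  # example accession from the docstring above
series_path = Path('data') / geo_id
series_path.mkdir(parents=True, exist_ok=True)

# GPL21145 (EPIC) and GPL13534 (450k) are the supported GEO platforms here
geo_download(geo_id, str(series_path), ['GPL21145', 'GPL13534'], clean=True)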
12 changes: 8 additions & 4 deletions methylprep/download/process_data.py
@@ -18,13 +18,13 @@
 
 LOGGER = logging.getLogger(__name__)
 
-GEO_PLATFORMS = ['GPL21145', 'GPL13534']
+GEO_PLATFORMS = ['GPL21145', 'GPL13534'] # GPL23976 -- is an 850k genotyping array that is bundled with some datasets
 AE_PLATFORMS = ['A-MEXP-2255', 'A-GEOD-21145']
 PLATFORMS = GEO_PLATFORMS + AE_PLATFORMS
 BATCH_SIZE = 100
 
 
-def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
+def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True, verbose=False):
     """Downloads the IDATs and metadata for a series, then generates one metadata dictionary and one beta-value matrix for each platform in the series
 
     Arguments:
@@ -34,10 +34,14 @@ def run_series(id, path, dict_only=False, batch_size=BATCH_SIZE, clean=True):
         the path to the directory to download the data to. It is assumed that dictionary and beta-value
         directories have been created for each platform (and they will be created if not)
     dict_only
-        if True, only downloads data and creates dictionaries for each platform
+        if True, downloads idat files and metadata and creates data dictionaries for each platform
     batch_size
         the batch_size to use when processing samples (number of samples run at a time).
-        By default is set to the constant 100."""
+        By default it is set to the constant 100.
+    clean
+        if True, removes intermediate processing files
+    verbose
+        if True, adds additional debugging information"""
     if not os.path.exists(f"{str(path)}/{PLATFORMS[0]}_beta_values"):
         initialize(str(path))
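
(A sketch of the updated run_series signature in use; the series id and directory are illustrative values, and the keyword defaults match the diff above:)

from methylprep.download.process_data import run_series

run_series(
    'GSE134293',        # GEO (or ArrayExpress) series id
    'data/GSE134293',   # download/processing directory
    dict_only=False,    # False: process idats too, not just metadata dictionaries
    batch_size=50,      # samples processed per batch (default constant is 100)
    clean=True,         # remove intermediate processing files
    verbose=False,      # new in this PR: extra debugging output when True
)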
4 changes: 2 additions & 2 deletions methylprep/processing/pipeline.py
@@ -91,11 +91,11 @@ def run_pipeline(data_dir, array_type=None, export=False, manifest_filepath=None
         betas
             if True, will return a single data frame of beta values instead of a list of SampleDataContainer objects.
             Format is a "wide matrix": columns contain probes and rows contain samples.
-        m_factor
+        m_value
             if True, will return a single data frame of m_values instead of a list of SampleDataContainer objects.
             Format is a "wide matrix": columns contain probes and rows contain samples.
 
-    if batch_size is set to more than 200 samples, nothing is returned but, all the files are saved.
+    if batch_size is set to more than 200 samples, nothing is returned but all the files are saved. You can recreate the output by loading the files.
 
     Processing note:
         The sample_sheet parser will ensure every sample has a unique name and assign one (e.g. Sample1) if missing, or append a number (e.g. _1) if not unique.
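
(And the run_pipeline behavior that docstring describes; a minimal sketch assuming the package-level import, with illustrative paths:)

from methylprep import run_pipeline

# betas=True returns one wide DataFrame: probes in columns, samples in rows
betas = run_pipeline('data/GSE134293/GPL13534', betas=True, batch_size=50)

# with batch_size > 200, nothing is returned; each batch is saved to disk
# and the combined output can be recreated by loading those files
run_pipeline('data/GSE134293/GPL13534', betas=True, batch_size=500, export=True)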
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='methylprep',
-    version='1.1.4',
+    version='1.1.5',
     description='Python-based Illumina methylation array preprocessing software',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
2 changes: 1 addition & 1 deletion tests/processing/test_pipeline.py
@@ -1,9 +1,9 @@
 import sys
 import numpy as np
+from pathlib import Path
 # App
 from methylprep.processing import pipeline
 from methylprep.utils.files import download_file
-from pathlib import Path
 #patching
 try:
     # python 3.4+ should use builtin unittest.mock not mock package