Skip to content

Commit

Permalink
Merge abf7b43 into 1b7a67a
Browse files Browse the repository at this point in the history
  • Loading branch information
netsettler committed Aug 14, 2020
2 parents 1b7a67a + abf7b43 commit f20531e
Show file tree
Hide file tree
Showing 12 changed files with 756 additions and 126 deletions.
80 changes: 80 additions & 0 deletions dcicutils/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Tools for handling data files, formats, etc.
"""

import gzip
import io
import os
import random

from .misc_utils import remove_suffix, check_true


def gunzip_content(content):
    """
    Decompress gzip-compressed bytes and return the result as text.

    Args:
        content: the gzip-compressed payload, as bytes
    Returns:
        the decompressed payload decoded as UTF-8
    """
    # Seed the in-memory buffer directly with the payload; it starts positioned at offset 0.
    compressed_stream = io.BytesIO(content)
    with gzip.GzipFile(fileobj=compressed_stream, mode='rb') as gz_reader:
        raw_bytes = gz_reader.read()
    return raw_bytes.decode('utf-8')


def generate_sample_fastq_content(num=10, length=10):
    """
    Build (pseudo)randomly generated fastq text.

    Each record is four lines: a header of the form '@SEQUENCE<i> length=<length>',
    a sequence of bases drawn one at a time with random.choice from 'ACTG',
    a '+' separator line, and a quality line of 'I' repeated once per base.

    Args:
        num int: the number of records to generate (default 10)
        length int: the number of bases in each record (default 10)
    Returns:
        the generated records concatenated into a single string
    """
    alphabet = 'ACTG'
    records = []
    for seq_index in range(num):
        header_line = '@SEQUENCE{} length={}\n'.format(seq_index, length)
        # One random.choice call per base, in order, so seeded or mocked randomness is reproducible.
        sequence_line = ''.join(random.choice(alphabet) for _ in range(length)) + '\n'
        quality_line = 'I' * length + '\n'
        records.append(header_line + sequence_line + '+\n' + quality_line)
    return ''.join(records)


# File extensions accepted as fastq names; generate_sample_fastq_file appends ".fastq" to any other name.
FASTQ_SUFFIXES = [".fastq", ".fq"]


def generate_sample_fastq_file(filename, num=10, length=10, compressed=None):
    """
    Creates a new fastq file with the given name, containing (pseudo)randomly generated content.

    Example usage:

        generate_sample_fastq_file('fastq_sample.fastq.gz', num=25, length=50)
           creates a new fastq file with 25 sequences, each of length 50.

        generate_sample_fastq_file('fastq_sample.fastq.gz')
           creates a new fastq file with default characteristics (10 sequences, each of length 10).

    The name actually written may differ from the one requested: ".fastq" is appended when
    the name (ignoring a trailing ".gz" on compressed output) does not already end in one of
    FASTQ_SUFFIXES, and compressed output always ends in ".gz".

    Args:
        filename str: the name of a file to create
        num int: the number of random sequences (default 10)
        length int: the length of the random sequences (default 10)
        compressed bool: True to write gzipped output, False to write plain text,
            or None (the default) to choose based on whether filename ends in ".gz"

    Returns:
        the filename that was actually written

    Raises:
        whatever check_true raises if compressed is not True, False, or None
    """
    check_true(compressed is None or isinstance(compressed, bool),
               "compressed must be one of True, False, or None (for autoselection)")
    if compressed is None:
        compressed = filename.endswith('.gz')
    if compressed:
        # Strip an existing ".gz" whether compression was autoselected or requested explicitly,
        # so the fastq-extension check below sees the real extension. Previously an explicit
        # compressed=True with 'x.fastq.gz' produced 'x.fastq.gz.fastq.gz'.
        filename = remove_suffix(".gz", filename)
    _, ext = os.path.splitext(filename)
    if ext not in FASTQ_SUFFIXES:
        filename = filename + ".fastq"
    content = generate_sample_fastq_content(num=num, length=length)
    if compressed:
        filename += ".gz"
        # gzip.open in mode 'w' is a binary stream, so the text must be encoded first.
        with gzip.open(filename, 'w') as zipfile:
            zipfile.write(content.encode('ascii'))
    else:
        with io.open(filename, 'w') as outfile:
            outfile.write(content)
    return filename
18 changes: 18 additions & 0 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,3 +616,21 @@ def check_true(test_value: object,
"""
if not test_value:
raise error_class(message)


def remove_prefix(prefix, text, required=False):
    """
    Return text with a leading prefix removed.

    Args:
        prefix: the string to strip from the front of text
        text: the string to strip it from
        required: if true, a missing prefix is an error rather than a no-op (default False)
    Returns:
        text without the prefix, or text unchanged if it does not start with the prefix
    Raises:
        ValueError: if required is true and text does not start with prefix
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    if required:
        raise ValueError('Prefix %s is not the initial substring of %s' % (prefix, text))
    return text


def remove_suffix(suffix, text, required=False):
    """
    Return text with a trailing suffix removed.

    Args:
        suffix: the string to strip from the end of text
        text: the string to strip it from
        required: if true, a missing suffix is an error rather than a no-op (default False)
    Returns:
        text without the suffix, or text unchanged if it does not end with the suffix
    Raises:
        ValueError: if required is true and text does not end with suffix
    """
    if text.endswith(suffix):
        # Slice to an explicit end index rather than a negative one, so that an
        # empty suffix keeps the whole string (text[:-0] would be empty).
        keep_length = len(text) - len(suffix)
        return text[:keep_length]
    if required:
        raise ValueError('Suffix %s is not the final substring of %s' % (suffix, text))
    return text
78 changes: 53 additions & 25 deletions dcicutils/qa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,30 @@ def retry_options(cls, name_key, retries_allowed=None, wait_seconds=None,
# When true, the mock file system prints a trace line for each open, read, and write it performs.
FILE_SYSTEM_VERBOSE = True


class MockFileWriter:
    """
    Context manager that collects writes in memory and, on exit, stores them in a mock file system.

    Entering yields an in-memory stream (bytes-based if binary, text-based otherwise).
    Exiting stores everything written to that stream, as bytes, under the file's name
    in the file system's `files` mapping.
    """

    def __init__(self, file_system, file, binary=False, encoding='utf-8'):
        """
        Args:
            file_system: an object with a `files` mapping that receives the written content
            file: the key under which the content is stored
            binary: if true the stream accepts bytes, otherwise str (default False)
            encoding: the encoding used to turn text content into bytes on exit (default 'utf-8')
        """
        if binary:
            buffer = io.BytesIO()
        else:
            buffer = io.StringIO()
        self.stream = buffer
        self.encoding = encoding
        self.file = file
        self.file_system = file_system

    def __enter__(self):
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        written = self.stream.getvalue()
        if FILE_SYSTEM_VERBOSE:
            print("Writing %r to %s." % (written, self.file))
        # The mock file system stores bytes only, so text content is encoded before it is saved.
        if isinstance(written, bytes):
            stored = written
        else:
            stored = written.encode(self.encoding)
        self.file_system.files[self.file] = stored


class MockFileSystem:
"""Extremely low-tech mock file system."""

def __init__(self):
self.files = {}
def __init__(self, files=None, default_encoding='utf-8'):
self.default_encoding = default_encoding
self.files = {filename: content.encode(default_encoding) for filename, content in (files or {}).items()}

def exists(self, file):
return bool(self.files.get(file))
Expand All @@ -351,36 +370,45 @@ def open(self, file, mode='r'):
if FILE_SYSTEM_VERBOSE:
print("Opening %r in mode %r." % (file, mode))
if mode == 'w':
return self._open_for_write(file_system=self, file=file)
return self._open_for_write(file_system=self, file=file, binary=False)
elif mode == 'wb':
return self._open_for_write(file_system=self, file=file, binary=True)
elif mode == 'r':
return self._open_for_read(file)
return self._open_for_read(file, binary=False)
elif mode == 'rb':
return self._open_for_read(file, binary=True)
else:
raise AssertionError("Mocked io.open doesn't handle mode=%r." % mode)

def _open_for_read(self, file):
text = self.files.get(file)
if text is None:
def _open_for_read(self, file, binary=False, encoding=None):
content = self.files.get(file)
if content is None:
raise FileNotFoundError("No such file or directory: %s" % file)
if FILE_SYSTEM_VERBOSE:
print("Read %s to %s." % (text, file))
return io.StringIO(text)

def _open_for_write(self, file_system, file):
print("Read %r to %s." % (content, file))
return io.BytesIO(content) if binary else io.StringIO(content.decode(encoding or self.default_encoding))

class MockFileWriter:
def _open_for_write(self, file_system, file, binary=False, encoding=None):
return MockFileWriter(file_system=file_system, file=file, binary=binary,
encoding=encoding or self.default_encoding)

def __init__(self, file_system, file):
self.file_system = file_system
self.file = file
self.stream = io.StringIO()

def __enter__(self):
return self.stream
class NotReallyRandom:
    """
    A deterministic stand-in for some operations of the `random` module.

    Rather than producing random values, each operation cycles through its possible
    results in order, driven by one counter that is shared by all operations and
    advanced by one on every call.
    """

    def __init__(self):
        # Number of values handed out so far; every result is derived from it.
        self.counter = 0

    def _random_int(self, n):
        """Returns an integer between 0 and n, upper-exclusive, not one of the published 'random' operations."""
        result = self.counter % n
        self.counter += 1
        return result

    def randint(self, a, b):
        """
        Returns a number between a and b, inclusive at both ends, though not especially randomly.

        Raises:
            AssertionError: if a and b are not both ints with a < b
        """
        # Raised explicitly rather than via an `assert` statement so the check still runs under
        # `python -O`; the exception type and message are the same as an assert would produce.
        if not (isinstance(a, int) and isinstance(b, int) and a < b):
            raise AssertionError("Arguments must be two strictly ascending ints.")
        # a < b is guaranteed here, so the inclusive range size is simply b - a + 1.
        return a + self._random_int(b - a + 1)

    def choice(self, things):
        """Returns the next element of things in rotation, indexed by the shared counter modulo len(things)."""
        return things[self._random_int(len(things))]

0 comments on commit f20531e

Please sign in to comment.