In [1]:
import os, re, sys
from pymongo import MongoClient
from pprint import pprint
import datetime
import json
import xml.etree.ElementTree as et
import glob

In [2]:
# Open a MongoDB client; no arguments are given, so pymongo's default connection settings apply.
client = MongoClient()
# Handle to the 'tg3' database on that client.
db = client.tg3

In [147]:
def to_camel_case(snake_str):
    '''
    Turn a snake_case name into camelCase.

    Names that start with an underscore (e.g. '_id') are returned untouched.
    The first underscore-separated piece is kept as-is; every later piece is
    passed through str.title() before being glued back on.
    '''
    # Guard clause: leading-underscore names are passed through verbatim.
    if re.search('^_', snake_str) is not None:
        return snake_str

    pieces = snake_str.split('_')
    head, tail = pieces[0], pieces[1:]
    return head + "".join([word.title() for word in tail])

def convert_keys(obj):
    '''
    Recursively rewrite dictionary keys with to_camel_case().

    Lists are walked element by element, dicts get every key converted and
    every value processed recursively, and anything else is returned as-is.
    '''
    if isinstance(obj, list):
        return [convert_keys(item) for item in obj]

    if isinstance(obj, dict):
        converted = {}
        for key in obj:
            converted[to_camel_case(key)] = convert_keys(obj[key])
        return converted

    return obj
                   
def read_fc_run_params(run_params_file):
    '''
    Read run parameters from an XML file into a flat dict.

    Only the children of the root's first child element are considered.
    Each child contributes ``tag -> text`` with trailing whitespace stripped;
    children without text are skipped.
    '''
    first_section = et.parse(run_params_file).getroot()[0]

    params = {}
    for node in first_section:
        value = node.text
        if value is None:
            continue
        params[node.tag] = value.rstrip()
    return params


def find_dir(top_dir, dir_name, max_depth):
    '''
    Search below top_dir, one directory level at a time, for a path
    containing dir_name.

    Levels 1..max_depth are tried in order using glob patterns of the form
    '*', '*/*', '*/*/*', ... The first globbed path whose string contains
    dir_name is returned; None is returned when nothing matches.
    '''
    depth = 1
    while depth <= max_depth:
        pattern = os.path.join(top_dir, "/".join(["*"] * depth))
        hits = [path for path in glob.glob(pattern) if dir_name in path]
        if hits:
            return hits[0]
        depth += 1
    return None
                
class TG3Object(dict):
    '''
    Base class for records stored in TG3 collections.

    Data is kept as instance attributes (not as dict items); to_db() exports
    those attributes as a dict with camelCase keys.
    '''

    def __init__(self, _id=None, type=None):
        # Both fields default to None and are stored as plain attributes.
        self._id, self.type = _id, type

    def to_db(self):
        '''Return the instance attributes with keys converted by convert_keys().'''
        attributes = self.__dict__
        return convert_keys(attributes)


class Run(TG3Object):
    '''
    GenLIMS object in the 'runs' collection
    '''
    
    def __init__(self, *args, **kwargs):
        '''
        Accepts an optional ``protocol_id`` keyword (default None); all other
        arguments are forwarded to TG3Object.__init__.
        '''
        # Pop the same key that is being checked for. The previous code tested
        # for 'protocol_id' but popped 'procedure_id', which raised KeyError
        # whenever a protocol_id was actually supplied.
        self.protocol_id = kwargs.pop('protocol_id', None)
        TG3Object.__init__(self, *args, **kwargs)


class FlowcellRun(Run):
    '''
    GenLIMS object in 'runs' collection of type 'flowcell'
    '''
    
    def __init__(self, *args, **kwargs):
        Run.__init__(self, *args, **kwargs)
        
        # overwrite sample type
        self.type = 'flowcell'
    
    def _init_from_fc_packet(self, fc_run_id, fc_packet):
        '''
        Populate this run from a flowcell packet.

        fc_run_id becomes ``_id``. Every key in fc_packet except
        'flowcell_dir' is copied onto the instance as an attribute.
        'flowcell_dir' (required; KeyError if missing) is only used to locate
        the run parameters file. The caller's dict is left unmodified.
        '''
        self._id = fc_run_id
        
        # Read rather than pop, so the same packet can be reused by the caller.
        fc_dir = fc_packet['flowcell_dir']
        for k, v in fc_packet.items():
            if k != 'flowcell_dir':
                setattr(self, k, v)
        
        self._get_fc_params(fc_dir)
        self._get_fc_protocol()
        
    def _get_fc_params(self, fc_dir):
        '''
        Load runParameters.xml for this run into ``self.parameters``.

        The path prefix of fc_dir up to and including 'genomics' is replaced
        by the local location found with find_dir('/', 'genomics', 3).
        '''
        # NOTE(review): find_dir returns None when nothing matches, in which
        # case re.sub below will raise - confirm whether that needs handling.
        root_dir = find_dir('/', 'genomics', 3)
        local_fc_dir = re.sub('.*genomics', root_dir, fc_dir)
        run_params_file = os.path.join(local_fc_dir, self._id, 'runParameters.xml')
        
        self.parameters = read_fc_run_params(run_params_file)
                
    def _get_fc_protocol(self):
        '''
        Derive ``protocol_id`` from the instrument and the 'Flowcell' run
        parameter: '<instrument name>[_rapid]_<version>'.

        Requires ``self.instrument`` (set from the packet) and
        ``self.parameters`` to be populated first. Raises KeyError for an
        instrument id that is not in the lookup table.
        '''
        instrument_dict = {'D00565': 'HiSeq2500',
                           'H135': 'HiScanSQ'}
        
        fc_param = self.parameters.get('Flowcell')
        is_rapid = 'rapid' in fc_param.lower()
        version = matchdefault('v[0-9]+', fc_param)

        # Use this run's own instrument. The previous code read `r.instrument`,
        # i.e. whatever global `r` happened to exist in the notebook kernel.
        id_parts = [instrument_dict[self.instrument]]
        if is_rapid:
            id_parts.append('rapid')
        id_parts.append(version)
        self.protocol_id = '_'.join(id_parts)

    
class Sample(TG3Object):
    '''
    GenLIMS object in the 'samples' collection
    '''
    
    def __init__(self, *args, **kwargs):
        '''
        Accepts optional ``protocol_id``, ``project_id`` and ``subproject_id``
        keywords (each defaulting to None); everything else is forwarded to
        TG3Object.__init__.
        '''
        sample_fields = ['protocol_id', 'project_id', 'subproject_id']
        for field in sample_fields:
            # Pop the field being processed. The previous code always popped
            # 'protocol_id', so project_id / subproject_id either raised
            # KeyError or were assigned the protocol id.
            setattr(self, field, kwargs.pop(field, None))
        TG3Object.__init__(self, *args, **kwargs)
        
        
class SequencedLibrarySample(Sample):
    '''
    Sample record of type 'sequenced library' in the 'samples' collection.
    '''

    def __init__(self, *args, **kwargs):
        # Take parent_id out of the keywords (None when absent) before the
        # remaining arguments are handed to Sample.
        self.parent_id = kwargs.pop('parent_id', None)
        Sample.__init__(self, *args, **kwargs)

        # The type is fixed for this class, whatever the caller passed.
        self.type = 'sequenced library'

    def _init_from_lib_packet(self, lib_id, lib_packet):
        '''
        Fill in this sample from a library packet.

        The library id becomes parent_id, and _id is '<lib_id>_<run_tag>'.
        run_id, project_id and subproject_id are read from the packet with
        .get(), and raw_data is built from the packet's 'fastq_dir'.
        '''
        self.parent_id = lib_id
        self.run_id = lib_packet.get('run_id')

        run_tag = lib_packet.get('run_tag')
        self._id = '%s_%s' % (lib_id, run_tag)

        self.project_id = lib_packet.get('project_id')
        self.subproject_id = lib_packet.get('subproject_id')

        self._get_raw_data(lib_packet)

    def _get_raw_data(self, lib_packet):
        '''Store the get_lib_fastqs() result for the packet's 'fastq_dir' in raw_data.'''
        fastq_dir = lib_packet.get('fastq_dir')
        self.raw_data = get_lib_fastqs(fastq_dir)

In [130]:
# wrapper function for regex matching
def matchdefault(pattern, string, default=''):
    '''
    Search string for pattern and return the matched text.

    Returns the whole match (group 0) of the first hit, or default when the
    pattern is not found anywhere in string.
    '''
    found = re.search(pattern, string)
    return default if found is None else found.group()

### file/path/string parsing functions ###

def get_lib_id(lib_str):
    '''
    Extract a library id of the form 'lib<digits>' (first digit non-zero)
    from lib_str; returns '' when none is present.
    '''
    return matchdefault('lib[1-9]+[0-9]*', lib_str)

def parse_fc_run_id(fc_run_id):
    '''
    Split a flowcell run id into its components.

    The id is underscore-separated: a yymmdd date, the instrument id, the run
    number, then position letter + flowcell id. For example
    '160122_D00565_0101_BC81LMANXX' gives
    ('2016-01-22', 'D00565', 101, 'C81LMANXX', 'B').

    Returns (date, instrument_id, run_num, fc_id, fc_pos), with date as an
    ISO 'YYYY-MM-DD' string.
    '''
    pieces = fc_run_id.split('_')

    run_day = datetime.datetime.strptime(pieces[0], '%y%m%d')
    date = run_day.date().isoformat()

    instrument_id = pieces[1]
    run_num = int(pieces[2])

    # Flowcell id: the run of capitals/digits ending in 'XX' that follows
    # '_A', '_B' or '_D'. The position is the single character just before it.
    fc_id = matchdefault('(?<=(_(A|B|D)))([A-Z]|[0-9])*XX', fc_run_id)
    position_pattern = '.{1}(?=%s)' % fc_id
    fc_pos = matchdefault(position_pattern, fc_run_id)

    return (date, instrument_id, run_num, fc_id, fc_pos)
    
def get_proj_id(proj_str):
    '''
    Extract project and subproject numbers from a string such as
    'P54-18-28092065' (-> (54, 18)).

    Returns (proj_id, subproj_id) as ints. The '-<subproject>' part is
    optional in the pattern, so subproj_id is None when it is absent
    (previously int('') raised ValueError in that case).

    Raises ValueError when no 'P<digits>' project tag is found at all.
    '''
    proj = matchdefault('P+[0-9]+(-[0-9]+){,1}', proj_str)
    proj_id = int(matchdefault('(?<=P)[0-9]+', proj))

    subproj = matchdefault('(?<=-)[0-9]+', proj)
    subproj_id = int(subproj) if subproj else None

    return proj_id, subproj_id

### file parsing/annotating functions ###

def get_file_type(file_path):
    '''
    Split a file path into (file type, compression).

    Both values are extensions without the leading dot. If the final
    extension contains a 'z' (e.g. '.gz', '.bz2') it is treated as the
    compression suffix and the extension before it is the file type:
    'reads.fastq.gz' -> ('fastq', 'gz').

    For uncompressed files compression is None: 'reads.fastq' ->
    ('fastq', None). Previously this case raised UnboundLocalError because
    `compression` was never assigned and `ext` was still the splitext tuple.
    '''
    stem, last_ext = os.path.splitext(file_path)
    if 'z' in last_ext:
        compression = last_ext.lstrip('.')
        ext = os.path.splitext(stem)[-1].lstrip('.')
    else:
        compression = None
        ext = last_ext.lstrip('.')
    return ext, compression

def get_fastq_source(file_path):
    '''
    Pull lane, read and sample number out of a fastq file name.

    Looks for '_L00<1-8>', '_R<1-2>' and '_S<digits>' in file_path and
    returns (lane_id, read_id, sample_num), with sample_num as an int.
    lane_id and read_id are '' when not found.
    '''
    lane_id = matchdefault('(?<=_)L00[1-8]', file_path)
    read_id = matchdefault('(?<=_)R[1-2]', file_path)
    sample_digits = matchdefault('(?<=_S)[0-9]+', file_path)
    sample_num = int(sample_digits)
    return (lane_id, read_id, sample_num)
    
def collect_fastq_info(file_path):
    '''
    Describe one raw data file as a dict.

    Returns {'path', 'lane_id', 'read_id', 'sample_number'}. The path has
    everything before '/genomics' removed. Lane, read and sample number are
    parsed from the file name only when the file type is 'fastq'; for any
    other file type they are None (previously those names were unbound and
    building the result raised UnboundLocalError).
    '''
    file_type = get_file_type(file_path)[0]

    # Defaults for non-fastq files.
    lane_id = read_id = sample_num = None
    if file_type == 'fastq':
        lane_id, read_id, sample_num = get_fastq_source(file_path)

    # Store the path relative to the shared '/genomics' root, so it does not
    # depend on where the volume is mounted.
    file_path = re.sub('.*(?=/genomics)', '', file_path)

    return {'path': file_path, 'lane_id': lane_id,
            'read_id': read_id, 'sample_number': sample_num}

# describe raw files for current lib
def get_lib_fastqs(fastq_dir):
    '''
    List every entry in fastq_dir and describe each with collect_fastq_info().

    If fastq_dir is not an existing directory, 'mnt' in the path is replaced
    by 'Volumes' before listing (server path vs. locally mounted volume).
    '''
    # check if logged into server or accessing mounted volume
    if not os.path.isdir(fastq_dir):
        fastq_dir = re.sub('mnt', 'Volumes', fastq_dir)

    file_infos = []
    for entry in os.listdir(fastq_dir):
        entry_path = os.path.join(fastq_dir, entry)
        file_infos.append(collect_fastq_info(entry_path))
    return file_infos

class LibListParser(object):
    '''
    Reads a tab-separated library list file and turns each row into a
    library packet and a flowcell packet.
    '''

    def __init__(self, lib_list_file):
        self.lib_list_file = lib_list_file

    def _parse_lib_line(self, line):
        '''
        Parse one tab-separated row.

        Columns used: 0 = library, 2 = flowcell run id, 3 = project,
        4 = flowcell directory, last = fastq directory.

        Returns (lib_id, lib_packet, flowcell_run_id, flowcell_packet).
        '''
        columns = line.strip().split('\t')

        lib_id = get_lib_id(columns[0])

        project_id, subproject_id = get_proj_id(columns[3])
        lib_packet = {'project_id': project_id,
                      'subproject_id': subproject_id,
                      'fastq_dir': columns[-1]}

        flowcell_run_id, fc_dir = columns[2], columns[4]
        run_fields = parse_fc_run_id(flowcell_run_id)
        fc_date, instr, run_num, fc_id, fc_pos = run_fields

        flowcell_packet = {'date': fc_date,
                           'instrument': instr,
                           'run_number': run_num,
                           'flowcell_id': fc_id,
                           'flowcell_pos': fc_pos,
                           'flowcell_dir': fc_dir}

        return (lib_id, lib_packet, flowcell_run_id, flowcell_packet)

    def read_lib_list(self):
        '''
        Read the whole library list file.

        The first line is skipped. Returns (fc_dict, lib_dict): fc_dict maps
        each flowcell run id to the packet from the first row that mentions
        it; lib_dict maps each library id to the list of its packets, one per
        row, each tagged with 'run_id' and 'run_tag' (the flowcell id).
        '''
        lib_dict = {}
        fc_dict = {}
        with open(self.lib_list_file) as handle:
            # Discard the first line, then process every remaining row.
            next(handle, None)
            for row in handle:
                parsed = self._parse_lib_line(row)
                lib_id, lib_packet, fc_run_id, fc_packet = parsed

                lib_packet['run_id'] = fc_run_id
                lib_packet['run_tag'] = fc_packet.get('flowcell_id')

                if lib_id not in lib_dict:
                    lib_dict[lib_id] = []
                lib_dict[lib_id].append(lib_packet)

                # Keep only the first packet seen for each flowcell run.
                fc_dict.setdefault(fc_run_id, fc_packet)

        return fc_dict, lib_dict

In [142]:
class LibListImporter(object):
    '''
    Builds database-ready run and sample records from a library list file
    and writes them out as JSON.
    '''
    
    def __init__(self, lib_list_file):
        
        self.lib_list_file = lib_list_file
        
    def get_fc_run_info(self):
        '''
        Parse the library list and build the record lists.

        Creates one FlowcellRun per flowcell run and one
        SequencedLibrarySample per (library, run) packet; each sample gets the
        protocol id of the run it belongs to. The records are converted with
        to_db() and stored on the instance as ``runs_objects`` and
        ``samples_objects``.

        Returns (runs_objects, samples_objects).
        '''
        fc_dict, lib_dict = LibListParser(self.lib_list_file).read_lib_list()

        self.runs_objects = []
        # run id -> protocol id, used below to label each library's sample
        protocol_dict = {}
        for fc_id, fc_packet in fc_dict.items():
            run_obj = FlowcellRun()
            run_obj._init_from_fc_packet(fc_id, fc_packet)
            if run_obj._id not in protocol_dict:
                protocol_dict[run_obj._id] = run_obj.protocol_id

            self.runs_objects.append(run_obj.to_db())

        self.samples_objects = []
        for lib_id, lib_packet in lib_dict.items():
            # lib_packet is a list: one packet per run the library was on
            for p_i in lib_packet:
                lib_run_id = p_i.get('run_id')
                lib_protocol = protocol_dict.get(lib_run_id, None)
                seq_lib_obj = SequencedLibrarySample(protocol_id = lib_protocol)
                seq_lib_obj._init_from_lib_packet(lib_id, p_i)

                self.samples_objects.append(seq_lib_obj.to_db())

        return self.runs_objects, self.samples_objects
    
    def write_to_json(self, file_base):
        '''
        Write the records built by get_fc_run_info() to
        '<file_base>_fc_runs.json' and '<file_base>_seq_libs.json'.

        get_fc_run_info() must have been called first, otherwise the record
        attributes do not exist.
        '''
        runs_json_file = file_base + '_fc_runs.json'
        samples_json_file = file_base + '_seq_libs.json'
        
        # json.dump emits text, so open in text mode. The previous 'wb+' mode
        # fails on Python 3, and the read access ('+') was never used.
        with open(runs_json_file, 'w') as f:
            json.dump(self.runs_objects, f)
            
        with open(samples_json_file, 'w') as f:
            json.dump(self.samples_objects, f)

In [6]:
# Extract every line of the master library list that mentions '160122' into its own file.
# NOTE(review): LibListParser.read_lib_list() skips the first line of its input; if this
# grep output has no header row, the first matching record would be dropped - confirm.
!grep 160122 ../data/lib_list_master.txt > ../data/lib_list_160122.txt

In [93]:
# Parse the filtered library list and inspect one flowcell packet and one
# library packet.
lib_list_file = "../data/lib_list_160122.txt"
# read_lib_list is a method of LibListParser; there is no module-level
# function of that name, so the bare call failed on a fresh kernel.
fc_dict, lib_dict = LibListParser(lib_list_file).read_lib_list()

# Single-argument print(...) behaves the same under Python 2 and 3.
print(fc_dict)

fc_id = '160122_D00565_0101_BC81LMANXX'
fc_packet = fc_dict[fc_id]
print(fc_packet)

lib_id = 'lib10320'
# each library maps to a list of packets (one per run); show the first
lib_packet = lib_dict[lib_id][0]
lib_packet

{'160122_D00565_0101_BC81LMANXX': {'run_number': 101, 'flowcell_pos': 'B', 'flowcell_dir': '/mnt/genomics/Illumina', 'instrument': 'D00565', 'flowcell_id': 'C81LMANXX', 'date': '2016-01-22'}}
{'run_number': 101, 'flowcell_pos': 'B', 'flowcell_dir': '/mnt/genomics/Illumina', 'instrument': 'D00565', 'flowcell_id': 'C81LMANXX', 'date': '2016-01-22'}


{'fastq_dir': '/mnt/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562',
 'project_id': 54,
 'run_id': '160122_D00565_0101_BC81LMANXX',
 'run_tag': 'C81LMANXX',
 'subproject_id': 18}

In [148]:
# Build run and sample records from the library list and export them as JSON.
lli = LibListImporter(lib_list_file)
# get_fc_run_info() also stores its results on lli, so the returned lists are discarded here.
_, _ = lli.get_fc_run_info()
# Writes '../data/lib_list_import_fc_runs.json' and '../data/lib_list_import_seq_libs.json'.
lli.write_to_json('../data/lib_list_import')

In [149]:
#!grep /mnt/genomics/Illumina data/lib_list_master.txt | awk '($4 ~ /\-[0-9]+\-/ || $3 ~ /^16/) { print }'

[
    {
        "_id": "lib10473_C81LMANXX",
        "parentId": "lib10473",
        "projectId": 127,
        "protocolId": "HiSeq2500_v4",
        "rawData": [
            {
                "laneId": "L004",
                "path": "/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P127-1-28085060/lib10473-32550613/10202-mono_S125_L004_R1_001.fastq.gz",
                "readId": "R1",
                "sampleNumber": 125
            }
        ],
        "runId": "160122_D00565_0101_BC81LMANXX",
        "subprojectId": 1,
        "type": "sequenced library"
    },
    {
        "_id": "lib10472_C81LMANXX",
        "parentId": "lib10472",
        "projectId": 127,
        "protocolId": "HiSeq2500_v4",
        "rawData": [
            {
                "laneId": "L004",
                "path": "/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P127-1-28085060/lib10472-32558547/10202-Bcell_S124_L004_R1_001.fastq.gz",
                "readId": "R1",
                "sampl

In [121]:
# NOTE(review): create_lib_list_objects is not defined in any cell visible here, and this
# cell's execution count (121) predates the class-definition cells above - presumably a
# leftover from an earlier version of LibListImporter.get_fc_run_info(); confirm, then
# update or remove.
runs_collection, samples_collection = create_lib_list_objects(lib_list_file)

{'160122_D00565_0101_BC81LMANXX': {'run_number': 101, 'flowcell_pos': 'B', 'flowcell_dir': '/mnt/genomics/Illumina', 'instrument': 'D00565', 'flowcell_id': 'C81LMANXX', 'date': '2016-01-22'}}


In [138]:
# Pretty-print the sample records as sorted, indented JSON.
# NOTE(review): no visible cell assigns a module-level `samples_objects` (only the
# LibListImporter.samples_objects attribute is set) - confirm where this name comes from.
print json.dumps(samples_objects, indent=4, sort_keys=True)

[
    {
        "_id": "lib10473_C81LMANXX", 
        "parentId": "lib10473", 
        "protocolId": "HiSeq2500_v4", 
        "rawData": [
            {
                "laneId": "L004", 
                "path": "/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P127-1-28085060/lib10473-32550613/10202-mono_S125_L004_R1_001.fastq.gz", 
                "readId": "R1", 
                "sampleNumber": 125
            }
        ], 
        "runId": "160122_D00565_0101_BC81LMANXX", 
        "type": "sequenced library"
    }, 
    {
        "_id": "lib10472_C81LMANXX", 
        "parentId": "lib10472", 
        "protocolId": "HiSeq2500_v4", 
        "rawData": [
            {
                "laneId": "L004", 
                "path": "/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P127-1-28085060/lib10472-32558547/10202-Bcell_S124_L004_R1_001.fastq.gz", 
                "readId": "R1", 
                "sampleNumber": 124
            }
        ], 
        "runId": "160122