In [1]:
import os, re, sys
from pymongo import MongoClient
from pprint import pprint
import datetime
import json

In [2]:
client = MongoClient()
db = client.tg3

In [26]:

# wrapper function for regex matching
def matchdefault(pattern, string, default=''):
    
    regex = re.compile(pattern)
    match = regex.search(string)
    if match is not None:
        return match.group()
    else:
        return default

### file/path/string parsing functions ###

def get_lib_id(lib_str):
    lib_id = matchdefault('lib[1-9]+[0-9]*', lib_str)
    
    return lib_id

def parse_fc_run_id(fc_run_id):
    fc_parts = fc_run_id.split('_')

    d = datetime.datetime.strptime(fc_parts[0], '%y%m%d')
    
    date = datetime.date.isoformat(d)
    instrument_id = fc_parts[1]
    run_num = int(fc_parts[2])
    
    fc_id = matchdefault('(?<=(_(A|B|D)))([A-Z]|[0-9])*XX', fc_run_id)
    fc_pos = matchdefault('.{1}(?=%s)' % fc_id, fc_run_id)

    return date, instrument_id, run_num, fc_id, fc_pos
    
def get_proj_id(proj_str):
    proj = matchdefault('P+[0-9]+(-[0-9]+){,1}', proj_str)
    proj_id = int(matchdefault('(?<=P)[0-9]+', proj))
    subproj_id = int(matchdefault('(?<=-)[0-9]+', proj))
    
    return proj_id, subproj_id

def parse_lib_line(line):
    l_parts = line.strip().split('\t')
    
    lib_id = get_lib_id(l_parts[0])
    
    project_id, subproject_id = get_proj_id(l_parts[3])
    fastq_dir = l_parts[-1]
    
    lib_packet = {'project_id': project_id,
                  'subproject_id': subproject_id,
                  'fastq_dir': fastq_dir}
    
    flowcell_run_id = l_parts[2]
    fc_dir = l_parts[4]
    fc_date, instr, run_num, fc_id, fc_pos = parse_fc_run_id(flowcell_run_id)
    
    flowcell_packet = {'date': fc_date,
                       'instrument': instr,
                       'run_number': run_num,
                       'flowcell_id': fc_id,
                       'flowcell_pos': fc_pos,
                       'flowcell_dir': fc_dir}
    
    return lib_id, lib_packet, flowcell_run_id, flowcell_packet

### file parsing/annotating functions ###

def get_file_type(file_path):
    ext = os.path.splitext(file_path)
    if 'z' in ext[-1]:
        compression = ext[-1].lstrip('.')
        ext = os.path.splitext(ext[0])[-1].lstrip('.')
    return ext, compression

def get_fastq_source(file_path):
    lane_id = matchdefault('(?<=_)L00[1-8]', file_path)
    read_id = matchdefault('(?<=_)R[1-2]', file_path)
    sample_num = int(matchdefault('(?<=_S)[0-9]+', file_path))
    return lane_id, read_id, sample_num
    
def collect_fastq_info(file_path):
    file_type,compression = get_file_type(file_path)
    
    if file_type == 'fastq':
        lane_id, read_id, sample_num = get_fastq_source(file_path)
    
    file_path = re.sub('.*(?=/genomics)', '', file_path)
    
    return {'path': file_path, 'lane_id': lane_id, 
            'read_id': read_id, 'sample_number': sample_num}


# describe raw files for current lib
def get_lib_fastqs(fastq_dir):
    # check if logged into server or accessing mounted volume
    if not os.path.isdir(fastq_dir):
        fastq_dir = re.sub('mnt', 'Volumes', fastq_dir)
        
    return [collect_fastq_info(os.path.join(fastq_dir, f))
            for f in os.listdir(fastq_dir)]

# read and extract info from library list file
def read_lib_list(lib_list_file):
    lib_dict = {}
    fc_dict = {}
    with open(lib_list_file) as f:
        for i, l in enumerate(f):
            if i > 0:
                lib_id, lib_packet, fc_run_id, fc_packet = parse_lib_line(l)
                lib_packet['run_id'] = fc_run_id
                lib_packet['run_tag'] = fc_packet.get('flowcell_id')
                        
                lib_dict.setdefault(lib_id, []).append(lib_packet)

                if fc_run_id not in fc_dict:
                    fc_dict[fc_run_id] = fc_packet
                    
    return fc_dict, lib_dict

In [4]:
def to_camel_case(snake_str):
    components = snake_str.split('_')
    return components[0] + "".join(x.title() for x in components[1:])

def convert_dict_keys(obj):
    if isinstance(obj, list):
        new_obj = [ convert_dict_keys(i) for i in obj ]
    elif isinstance(obj, dict):
        new_obj = { to_camel_case(k): convert_dict_keys(obj[k]) \
                    for k in obj }
    else:
        new_obj = obj
    return new_obj

def merge_two_dicts(x, y):
    '''Given two dicts, merge them into a new dict as a shallow copy.'''
    z = x.copy()
    z.update(y)
    return z

def lib_dict_to_db(lib_dict):
    lib_db = [ merge_two_dicts({'_id': i}, convert_dict_keys(lib_dict[i])) \
               for i in lib_dict ]
    return lib_db

def fc_dict_to_db(fc_dict):
    fc_db = [ merge_two_dicts({'_id': i}, convert_dict_keys(fc_dict[i])) \
              for i in fc_dict ]
    return fc_db

In [7]:
def to_camel_case(snake_str):
    if not re.search('^_', snake_str):
        components = snake_str.split('_')
        return components[0] + "".join(x.title() for x in components[1:])
    else:
        return snake_str

def convert_keys(obj):
    if isinstance(obj, list):
        return [convert_keys(i) for i in obj]
    elif isinstance(obj, dict):
        return {to_camel_case(k): convert_keys(obj[k])
                for k in obj}
    else:
        return obj

class TG3Object(dict):
    '''
    Generic functions for objects in TG3 collections.
    '''
    
    def __init__(self, _id=None, type=None):
        
        self._id = _id
        self.type = type

    def to_db(self):
        return convert_keys(self.__dict__)
        
    
class Sample(TG3Object):
    '''
    GenLIMS object in the 'samples' collection
    '''
    
    def __init__(self, *args, **kwargs):
        if 'protocol_id' in kwargs:
            self.procedure_id = kwargs.pop('procedure_id')
        else:
            self.procedure_id = None
        TG3Object.__init__(self, *args, **kwargs)
        
        
class SequencedLibrarySample(Sample):
    '''
    GenLIMS object in 'samples' collection of type 'sequenced library'
    '''
    
    def __init__(self, *args, **kwargs):
        if 'parent_id' in kwargs:
            self.parent_id = kwargs.pop('parent_id')
        else:
            self.parent_id = None
        Sample.__init__(self, *args, **kwargs)
        
        # overwrite sample type
        self.type = 'sequenced library'
    
    def _init_from_lib_packet(self, lib_id, lib_packet):
        self.parent_id = lib_id
        self.run_id = lib_packet.get('run_id')
        self._id = '%s_%s' % (lib_id, lib_packet.get('run_tag'))
        
        self._get_raw_data(lib_packet)
        
    def _get_raw_data(self, lib_packet):
        self.raw_data = get_lib_fastqs(lib_packet.get('fastq_dir'))
    
# def create_seq_lib_object(lib_dict_item):
#     seq_lib_dict = {}
#     for lib in lib_dict:
#         for run in lib_dict[lib]:
#             seq_lib_id = lib + '_' + run.get('run_id')
#             print seq_lib_id
o = SequencedLibrarySample()
o._init_from_lib_packet(lib_id, lib_packet)
o.to_db()

{'_id': 'lib10320_C81LMANXX',
 'parentId': 'lib10320',
 'procedureId': None,
 'rawData': [{'laneId': 'L003',
   'path': '/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562/1-930954-44-C03_S68_L003_R1_001.fastq.gz',
   'readId': 'R1',
   'sampleNumber': 68}],
 'runId': '160122_D00565_0101_BC81LMANXX',
 'type': 'sequenced library'}

In [14]:
!grep 160122 ../data/lib_list_master.txt > ../data/lib_list_160122.txt

In [51]:
lib_list_file = "../data/lib_list_160122.txt"
fc_dict, lib_dict = read_lib_list(lib_list_file)
lib_id = 'lib10320'
lib_packet = lib_dict[lib_id][0]
lib_packet

{'fastq_dir': '/mnt/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562',
 'project_id': 54,
 'run_id': '160122_D00565_0101_BC81LMANXX',
 'run_tag': 'C81LMANXX',
 'subproject_id': 18}

In [52]:
fc_dict
fc_id = '160122_D00565_0101_BC81LMANXX'
fc_packet = fc_dict[fc_id]
fc_packet

{'date': '2016-01-22',
 'flowcell_dir': '/mnt/genomics/Illumina',
 'flowcell_id': 'C81LMANXX',
 'flowcell_pos': 'B',
 'instrument': 'D00565',
 'run_number': 101}

In [53]:
import xml.etree.ElementTree as et
import glob

def find_dir(top_dir, dir_name, max_depth):
    for d in range(1, max_depth + 1):
        max_glob = "/".join("*" * d)
        top_glob = os.path.join(top_dir, max_glob)
        for f in glob.glob(top_glob):
            if dir_name in f:
                return f
                break
                
def read_fc_run_params(run_params_file):
    tree = et.parse(run_params_file)
    return {param.tag: param.text.rstrip() for param in tree.getroot()[0]
            if param.text is not None}

def get_fc_protocol(param_dict):
    fc_version = matchdefault('v[0-9]+', param_dict.get('Flowcell'))
    print fc_version

    
class Run(TG3Object):
    '''
    GenLIMS object in the 'runs' collection
    '''
    
    def __init__(self, *args, **kwargs):
        if 'protocol_id' in kwargs:
            self.procedure_id = kwargs.pop('procedure_id')
        else:
            self.procedure_id = None
        TG3Object.__init__(self, *args, **kwargs)
        
        
class FlowcellRun(Run):
    '''
    GenLIMS object in 'runs' collection of type 'flowcell'
    '''
    
    def __init__(self, *args, **kwargs):
        if 'parent_id' in kwargs:
            self.parent_id = kwargs.pop('parent_id')
        else:
            self.parent_id = None
        Run.__init__(self, *args, **kwargs)
        
        # overwrite sample type
        self.type = 'flowcell'
    
    def _init_from_fc_packet(self, fc_run_id, fc_packet):
        self._id = fc_run_id
        
        fc_dir = fc_packet.pop('flowcell_dir')
        for k, v in fc_packet.items():
            self.update({k: v})
        
        self._get_fc_params(fc_dir)
        
    def _get_fc_params(self, fc_dir):
        root_dir = find_dir('/', 'genomics', 3)
        local_fc_dir = re.sub('.*genomics', root_dir, fc_dir)
        print fc_dir
        run_params_file = os.path.join(local_fc_dir, self._id, 'runParameters.xml')
        print run_params_file
        
        print(read_fc_run_params(run_params_file))
        
        self.raw_data = get_lib_fastqs(lib_packet.get('fastq_dir'))
# param_dict = read_run_params(runparams_file)
# get_fc_protocol(param_dict)

r = FlowcellRun()
r._init_from_fc_packet(fc_id, fc_packet)
r.__dict__

/mnt/genomics/Illumina
/Volumes/genomics/Illumina/160122_D00565_0101_BC81LMANXX/runParameters.xml
{'EnableCameraLogging': 'false', 'Index': 'HiSeq v4 Dual Index', 'TemplateCycleCount': '5', 'AlignToPhiX': '', 'RunStartDate': '160122', 'UseExistingRecipe': 'false', 'TileWidth': '2048', 'SelectedSections': '', 'EnableLft': 'true', 'RecipeFragmentVersion': '1.5.21.0', 'Flowcell': 'HiSeq Flow Cell v4', 'EnableBasecalling': 'true', 'SlideHolder': 'HiSeq Flow Cell Holder', 'CPLDVersion': '3.0.0', 'NumSwaths': '3', 'WashBarcode': 'C81A1ANXX', 'SelectedSurface': 'BothLaneSurfaces', 'PeriodicSave': 'Save All Thumbnails', 'Sbs': 'HiSeq SBS Kit v4', 'ScanID': '-999', 'NumTilesPerSwath': '16', 'ReagentKits': '', 'ScanNumber': '101', 'RunMode': 'RapidHighOutput', 'ReagentBottles': '', 'Barcode': 'C81LMANXX', 'Read1': '58', 'KeepIntensityFiles': 'false', 'Read2': '0', 'SampleSheet': 'Z:\\Illumina HiSeq2500\\BaseSpace Sample Sheets\\2016Jan21_BC81LMANXX_V4HO_P54-17-18-19_P82-4_P66-3_P127-1_P117-4_P11

{'_id': '160122_D00565_0101_BC81LMANXX',
 'parent_id': None,
 'procedure_id': None,
 'raw_data': [{'lane_id': 'L003',
   'path': '/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562/1-930954-44-C03_S68_L003_R1_001.fastq.gz',
   'read_id': 'R1',
   'sample_number': 68}],
 'type': 'flowcell'}

In [30]:
root_dir = find_dir('/', 'genomics', 3)
fc_dir = '/mnt/genomics/Illumina'
re.sub('.*genomics', root_dir, fc_dir)

'/Volumes/genomics/Illumina'

In [33]:
print json.dumps(lib_dict_to_db(lib_dict)[1], indent=4, sort_keys=True)

{
    "_id": "lib10205", 
    "assays": [
        {
            "assayId": "C81A1ANXX", 
            "rawData": [
                {
                    "compression": "gz", 
                    "laneId": 7, 
                    "path": "/genomics/Illumina/151216_D00565_0100_AC81A1ANXX/Unaligned/P43-36-27317307/lib10205-31660697/PA342-17_S271_L007_R1_001.fastq.gz", 
                    "readId": 1, 
                    "type": "fastq"
                }
            ]
        }
    ], 
    "projectId": 43, 
    "subprojectId": 36
}


In [36]:
lib10000 = [lib for lib in lib_dict_to_db(lib_dict) if lib['_id'] == 'lib10000']

In [37]:
print json.dumps(lib10000, indent=4, sort_keys=True)

[
    {
        "_id": "lib10000", 
        "assays": [
            {
                "assayId": "C81A1ANXX", 
                "rawData": [
                    {
                        "compression": "gz", 
                        "laneId": 1, 
                        "path": "/genomics/Illumina/151216_D00565_0100_AC81A1ANXX/Unaligned/P43-29-27330309/lib10000-31653654/1D-HC29-C04_S27_L001_R1_001.fastq.gz", 
                        "readId": 1, 
                        "type": "fastq"
                    }
                ]
            }
        ], 
        "projectId": 43, 
        "subprojectId": 29
    }
]


In [31]:
print json.dumps(fc_dict_to_db(fc_dict)[0], indent=4, sort_keys=True)

{
    "_id": "C81A1ANXX", 
    "date": "2015-12-16", 
    "instrumentId": "D00565", 
    "lanes": [
        {
            "laneId": 1, 
            "libraries": [
                {
                    "libId": "lib9974"
                }, 
                {
                    "libId": "lib10000"
                }
            ]
        }, 
        {
            "laneId": 2, 
            "libraries": [
                {
                    "libId": "lib10032"
                }, 
                {
                    "libId": "lib10039"
                }
            ]
        }, 
        {
            "laneId": 3, 
            "libraries": [
                {
                    "libId": "lib10091"
                }, 
                {
                    "libId": "lib10086"
                }
            ]
        }, 
        {
            "laneId": 4, 
            "libraries": [
                {
                    "libId": "lib10121"
                }, 
                {
             

In [201]:
[ {'lane_id': l} for l in range(1, 9) ]

[{'lane_id': 1},
 {'lane_id': 2},
 {'lane_id': 3},
 {'lane_id': 4},
 {'lane_id': 5},
 {'lane_id': 6},
 {'lane_id': 7},
 {'lane_id': 8}]