In [1]:
import os, re, sys
from pymongo import MongoClient
from pprint import pprint
import datetime
import json

In [2]:
# Open a MongoDB client (no host/port arguments are passed, so pymongo's
# default connection settings apply) and select the `tg3` database.
client = MongoClient()
db = client.tg3

In [93]:
# regex helper: text of the first match, or '' when there is none
def match_or_empty(string, pattern):
    '''Return the text of the first match of ``pattern`` in ``string``.

    An empty string is returned when the pattern does not match anywhere.
    '''
    found = re.compile(pattern).search(string)
    return '' if found is None else found.group()
    
# regex helper with a caller-supplied fallback value
def matchdefault(pattern, string, default=''):
    '''Return the text of the first match of ``pattern`` in ``string``.

    ``default`` (an empty string unless overridden) is returned when the
    pattern does not match.
    '''
    found = re.search(pattern, string)
    if found is None:
        return default
    return found.group()

### file/path/string parsing functions ###

def get_lib_id(lib_str):
    '''Extract the library ID from ``lib_str``.

    The ID is ``lib`` followed by a number that does not start with zero
    (e.g. ``lib10320``). Returns '' when no such ID is present.
    '''
    return match_or_empty(lib_str, 'lib[1-9]+[0-9]*')

def parse_fc_str(fc_str):
    '''Split an underscore-delimited flowcell run string into its parts.

    The first three fields are read as ``<yymmdd>_<instrument>_<run number>``
    (e.g. ``160122_D00565_0101_BC81LMANXX``).

    Returns a tuple ``(date, instrument_id, run_id, fc_id)`` where ``date``
    is an ISO ``YYYY-MM-DD`` string, ``run_id`` is an int and ``fc_id`` is
    the flowcell ID, or '' if the flowcell pattern does not match.
    '''
    fields = fc_str.split('_')

    run_date = datetime.datetime.strptime(fields[0], '%y%m%d')
    date = run_date.date().isoformat()

    instrument_id = fields[1]
    run_id = int(fields[2])

    # flowcell ID: the run of capitals/digits ending in 'XX' that directly
    # follows '_A', '_B' or '_D'
    fc_id = match_or_empty(fc_str, '(?<=(_(A|B|D)))([A-Z]|[0-9])*XX')

    return date, instrument_id, run_id, fc_id
    
    
def get_proj_id(proj_str):
    '''Extract the project and sub-project numbers from a project string.

    The string is searched for ``P<project>`` optionally followed by
    ``-<subproject>``; e.g. ``P54-18-28092065`` gives ``(54, 18)``.

    Returns:
        ``(proj_id, subproj_id)``. ``proj_id`` is an int; ``subproj_id`` is
        an int, or None when there is no ``-<subproject>`` part.

    Raises:
        ValueError: if no ``P<digits>`` project ID is found in ``proj_str``.
    '''
    # One search with capture groups replaces the three separate searches;
    # the sub-project group is optional, exactly as in the original pattern.
    found = re.search('P+([0-9]+)(?:-([0-9]+))?', proj_str)
    if found is None:
        raise ValueError('no project ID found in %r' % (proj_str,))

    proj_num, subproj_num = found.groups()
    proj_id = int(proj_num)
    # A missing sub-project used to reach int('') and raise ValueError even
    # though the pattern explicitly allows it to be absent.
    subproj_id = int(subproj_num) if subproj_num is not None else None

    return proj_id, subproj_id

def parse_lib_line(line):
    '''Parse one tab-delimited row of the library list.

    Columns read: 0 = library string, 2 = flowcell run string,
    3 = project string, last = FASTQ directory.

    Returns ``(lib_id, lib_packet, flowcell_id, flowcell_packet)`` where
    ``lib_packet`` holds ``project_id``, ``subproject_id`` and ``fastq_dir``
    and ``flowcell_packet`` holds ``date``, ``instrument`` and ``run_number``.
    '''
    fields = line.strip().split('\t')

    lib_id = get_lib_id(fields[0])

    proj_id, subproj_id = get_proj_id(fields[3])
    lib_packet = {
        'project_id': proj_id,
        'subproject_id': subproj_id,
        'fastq_dir': fields[-1],
    }

    run_date, instrument, run_number, flowcell_id = parse_fc_str(fields[2])
    flowcell_packet = {
        'date': run_date,
        'instrument': instrument,
        'run_number': run_number,
    }

    return lib_id, lib_packet, flowcell_id, flowcell_packet

### file parsing/annotating functions ###

def get_file_type(file_path):
    '''Return ``(file_type, compression)`` extension names for a path.

    If the final extension contains a 'z' (e.g. ``.gz``) it is treated as a
    compression suffix and the file type is taken from the extension that
    precedes it. Otherwise the final extension is the file type and
    ``compression`` is ''.

    Both values are returned without the leading dot, e.g.
    ``'reads.fastq.gz'`` -> ``('fastq', 'gz')`` and
    ``'reads.fastq'`` -> ``('fastq', '')``.
    '''
    root, last_ext = os.path.splitext(file_path)

    if 'z' in last_ext:
        compression = last_ext.lstrip('.')
        file_type = os.path.splitext(root)[-1].lstrip('.')
    else:
        # Previously this path left `compression` unbound (UnboundLocalError)
        # and would have returned the raw (root, ext) tuple as the file type.
        compression = ''
        file_type = last_ext.lstrip('.')

    return file_type, compression

def get_fastq_source(file_path):
    '''Pull the lane and read tags out of a FASTQ file path.

    The lane tag is ``L001``-``L008`` and the read tag is ``R1``/``R2``,
    each taken where it directly follows an underscore.

    Returns ``(lane_id, read_id)``; a tag that is not found is ''.
    '''
    return (matchdefault('(?<=_)L00[1-8]', file_path),
            matchdefault('(?<=_)R[1-2]', file_path))
    
def collect_fastq_info(file_path):
    '''Describe one file found in a library's FASTQ directory.

    Returns a dict with:
        path    -- ``file_path`` with everything before the last
                   ``/genomics`` removed (unchanged if it does not occur)
        lane_id -- lane tag from the file name; '' if absent or the file
                   is not a FASTQ
        read_id -- read tag from the file name; '' if absent or the file
                   is not a FASTQ
    '''
    # only the file type is needed here; the compression suffix is ignored
    file_type, _compression = get_file_type(file_path)

    # Defaults for non-FASTQ files. These names used to be assigned only in
    # the FASTQ branch, so any other file raised UnboundLocalError at return.
    lane_id, read_id = '', ''
    if file_type == 'fastq':
        lane_id, read_id = get_fastq_source(file_path)

    # drop whatever precedes '/genomics' so the stored path does not depend
    # on where the volume is mounted
    file_path = re.sub('.*(?=/genomics)', '', file_path)

    return {'path': file_path, 'lane_id': lane_id, 'read_id': read_id}


# describe raw files for current lib
def get_lib_fastqs(fastq_dir):
    '''Return one description dict per entry in a library's FASTQ directory.

    If ``fastq_dir`` is not an existing directory, a leading ``/mnt`` is
    rewritten to ``/Volumes`` before listing. Entries are processed in sorted
    name order and each is passed to ``collect_fastq_info``.

    Raises:
        OSError: if the (possibly rewritten) directory cannot be listed.
    '''
    # check if logged into server or accessing mounted volume
    if not os.path.isdir(fastq_dir):
        # Only the leading mount point is rewritten; the previous unanchored
        # 'mnt' -> 'Volumes' substitution would also have altered any later
        # path component that happens to contain 'mnt'.
        fastq_dir = re.sub('^/mnt(?=/|$)', '/Volumes', fastq_dir)

    # os.listdir returns names in arbitrary order; sort so that repeated runs
    # produce the same list
    return [collect_fastq_info(os.path.join(fastq_dir, f))
            for f in sorted(os.listdir(fastq_dir))]

# read and extract info from library list file
def read_lib_list(lib_list_file, header=True):
    '''Read a tab-delimited library list and index it by flowcell and library.

    Args:
        lib_list_file: path to the library list file.
        header: when True (the default, matching the previous hard-coded
            behaviour) the first line is treated as a column header and
            skipped. Pass False for files that have no header line, such as
            a subset produced by filtering a master list.

    Returns:
        ``(fc_dict, lib_dict)``. ``fc_dict`` maps flowcell ID to the flowcell
        packet of the first row seen for that flowcell. ``lib_dict`` maps
        library ID to a list of library packets, one per row, each with
        ``'run_id'`` set to that row's flowcell ID.
    '''
    lib_dict = {}
    fc_dict = {}
    with open(lib_list_file) as f:
        if header:
            # discard the header; the default keeps an empty file from raising
            next(f, None)

        for l in f:
            # blank lines (e.g. a trailing newline) carry no record and would
            # otherwise fail inside parse_lib_line
            if not l.strip():
                continue

            lib_id, lib_packet, fc_id, fc_packet = parse_lib_line(l)
            lib_packet['run_id'] = fc_id

            lib_dict.setdefault(lib_id, []).append(lib_packet)

            if fc_id not in fc_dict:
                fc_dict[fc_id] = fc_packet

    return fc_dict, lib_dict

In [17]:
def to_camel_case(snake_str):
    '''Convert a snake_case string to camelCase.

    Strings that start with an underscore (such as the ``_id`` key used for
    documents in this notebook) are returned unchanged; splitting them on
    '_' would drop the underscore and turn ``_id`` into ``Id``.
    '''
    if snake_str.startswith('_'):
        return snake_str

    components = snake_str.split('_')
    # first component stays as written, the rest are title-cased and appended
    return components[0] + "".join(x.title() for x in components[1:])

def convert_dict_keys(obj):
    '''Recursively rename dict keys with ``to_camel_case``.

    Lists are walked element by element; dicts get every key converted and
    every value processed in turn; any other object is returned as is.
    '''
    if isinstance(obj, list):
        return [convert_dict_keys(item) for item in obj]

    if isinstance(obj, dict):
        return {to_camel_case(key): convert_dict_keys(value)
                for key, value in obj.items()}

    return obj

def merge_two_dicts(x, y):
    '''Return a new dict with the items of x overlaid by those of y.

    Neither argument is modified. The copy is shallow, and where both
    contain the same key the value from y is kept.
    '''
    merged = x.copy()
    merged.update(y)
    return merged

def lib_dict_to_db(lib_dict):
    '''Build a list of documents, one per key of ``lib_dict``.

    Each value has its keys converted by ``convert_dict_keys`` and is then
    merged on top of ``{'_id': <key>}``.
    '''
    # NOTE(review): merge_two_dicts hands the converted value to dict.update,
    # so each value is presumably expected to be a dict; the per-library
    # lists built by read_lib_list would not fit that -- confirm.
    lib_db = []
    for lib_id in lib_dict:
        converted = convert_dict_keys(lib_dict[lib_id])
        lib_db.append(merge_two_dicts({'_id': lib_id}, converted))
    return lib_db

def fc_dict_to_db(fc_dict):
    '''Build a list of documents, one per key of ``fc_dict``.

    Each value has its keys converted by ``convert_dict_keys`` and is then
    merged on top of ``{'_id': <key>}``.
    '''
    fc_db = []
    for fc_id in fc_dict:
        converted = convert_dict_keys(fc_dict[fc_id])
        fc_db.append(merge_two_dicts({'_id': fc_id}, converted))
    return fc_db

In [5]:
# Keep only the master-list lines containing '160122' and write them to a
# separate file (the shell redirect overwrites any existing copy).
!grep 160122 ../data/lib_list_master.txt > ../data/lib_list_160122.txt

In [162]:
def to_camel_case(snake_str):
    '''Convert a snake_case string to camelCase.

    A string that begins with an underscore is returned untouched.
    '''
    # guard: leading-underscore names are passed through as they are
    if re.match('_', snake_str) is not None:
        return snake_str

    parts = snake_str.split('_')
    tail = [part.title() for part in parts[1:]]
    return parts[0] + "".join(tail)

def convert_keys(obj):
    '''Recursively rename dict keys with ``to_camel_case``.

    Dicts are rebuilt with converted keys and recursively converted values,
    lists are converted element by element, and anything else is returned
    unchanged.
    '''
    if isinstance(obj, dict):
        converted = {}
        for key in obj:
            new_key = to_camel_case(key)
            converted[new_key] = convert_keys(obj[key])
        return converted

    if isinstance(obj, list):
        return [convert_keys(item) for item in obj]

    return obj

class TG3Object(dict):
    '''
    Base class for objects in TG3 collections.

    Fields are stored as instance attributes; ``to_db`` exports those
    attributes with their names converted by ``convert_keys``.
    '''

    def __init__(self, _id=None, type=None):
        '''Store the object's ID and type (both default to None).'''
        self._id = _id
        self.type = type

    def to_db(self):
        '''Return the instance attributes as a dict with converted keys.'''
        return convert_keys(vars(self))
        
    
class Sample(TG3Object):
    '''
    GenLIMS object in the 'samples' collection.

    Adds an optional ``procedure_id`` keyword (None when not given) on top
    of the TG3Object fields.
    '''

    def __init__(self, *args, **kwargs):
        # take procedure_id out of kwargs before delegating: the base
        # initialiser only accepts _id and type
        self.procedure_id = kwargs.pop('procedure_id', None)
        TG3Object.__init__(self, *args, **kwargs)
        
        
class SequencedLibrarySample(Sample):
    '''
    GenLIMS object in 'samples' collection of type 'sequenced library'.

    Adds an optional ``parent_id`` keyword (None when not given) on top of
    the Sample fields.
    '''

    def __init__(self, *args, **kwargs):
        # remove parent_id before delegating so it is not forwarded upward
        self.parent_id = kwargs.pop('parent_id', None)
        Sample.__init__(self, *args, **kwargs)

    def _init_from_library(self, lib_id, lib_packet):
        '''Fill in this object from a library ID and its packet.

        Sets ``parent_id`` to the library ID, ``run_id`` to the packet's
        'run_id', ``_id`` to ``<lib_id>_<run_id>``, then loads ``raw_data``.
        '''
        self.parent_id = lib_id

        run_id = lib_packet.get('run_id')
        self.run_id = run_id
        self._id = lib_id + '_' + run_id

        self._get_raw_data(lib_packet)

    def _get_raw_data(self, lib_packet):
        '''Set ``raw_data`` from the files in the packet's 'fastq_dir'.'''
        fastq_dir = lib_packet.get('fastq_dir')
        self.raw_data = get_lib_fastqs(fastq_dir)
    
# def create_seq_lib_object(lib_dict_item):
#     seq_lib_dict = {}
#     for lib in lib_dict:
#         for run in lib_dict[lib]:
#             seq_lib_id = lib + '_' + run.get('run_id')
#             print seq_lib_id
# Build a sequenced-library object from one library packet and show its
# exported (camelCased) form.
# NOTE(review): `lib_id` and `lib_packet` are not defined in this cell or in
# any cell above it; they are assigned in cells further down, so this cell
# presumably relies on out-of-order execution -- confirm.
o = SequencedLibrarySample()
o._init_from_library(lib_id, lib_packet)
o.to_db()

{'_id': 'lib10320_C81LMANXX',
 'parentId': 'lib10320',
 'procedureId': None,
 'rawData': [{'laneId': 'L003',
   'path': '/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562/1-930954-44-C03_S68_L003_R1_001.fastq.gz',
   'readId': 'R1'}],
 'runId': 'C81LMANXX',
 'type': None}

In [160]:
# Display the library packet currently bound in the kernel.
lib_packet

{'fastq_dir': '/mnt/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562',
 'project_id': 54,
 'run_id': 'C81LMANXX',
 'subproject_id': 18}

In [14]:
# Read every line of the date-filtered library list into memory so single
# rows can be parsed by hand in the cells below.
lib_list_file = "../data/lib_list_160122.txt"
with open(lib_list_file) as f:
    ll = f.readlines()

In [63]:
# Parse the first line of the list and describe the files in its FASTQ
# directory.
lib_id, lib_packet, fc_id, fc_packet = parse_lib_line(ll[0])
get_lib_fastqs(lib_packet.get('fastq_dir'))
# scratch: manual walk-through of the same steps for a single file
# fastq_dir = lib_packet.get('fastq_dir')
# if not os.path.isdir(fastq_dir):
#     fastq_dir = re.sub('mnt', 'Volumes', fastq_dir)
# fq = [os.path.join(fastq_dir, f) for f in os.listdir(fastq_dir)][0]
# get_fastq_source(fq)

[{'lane_id': 'L001',
  'path': '/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-17-28096068/lib10397-32552614/927495-25_S5_L001_R1_001.fastq.gz',
  'read_id': 'R1'},
 {'lane_id': 'L002',
  'path': '/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-17-28096068/lib10397-32552614/927495-25_S5_L002_R1_001.fastq.gz',
  'read_id': 'R1'}]

In [99]:
# Index the whole list by flowcell and by library.
# NOTE(review): read_lib_list skips the first line of the file as a header,
# but this file is the output of a grep filter and its first line parses as a
# data row in an earlier cell -- the first record is presumably being dropped
# here; confirm.
fc_dict, lib_dict = read_lib_list(lib_list_file)

In [133]:
# Pick one library and take the first of its packets for inspection.
lib_id = 'lib10320'
lib_packet = lib_dict[lib_id][0]
lib_packet

{'fastq_dir': '/mnt/genomics/Illumina/160122_D00565_0101_BC81LMANXX/Unaligned/P54-18-28092065/lib10320-32561562',
 'project_id': 54,
 'run_id': 'C81LMANXX',
 'subproject_id': 18}

In [212]:
# List the lane ID of every raw-file record.
# NOTE(review): `raw_files` is not assigned in any visible cell; it presumably
# came from a cell that has since been removed or edited -- confirm.
[ f.get('lane_id') for f in raw_files ]

[7]

In [33]:
# Pretty-print the second library document as indented JSON with sorted keys.
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(json.dumps(lib_dict_to_db(lib_dict)[1], indent=4, sort_keys=True))

{
    "_id": "lib10205", 
    "assays": [
        {
            "assayId": "C81A1ANXX", 
            "rawData": [
                {
                    "compression": "gz", 
                    "laneId": 7, 
                    "path": "/genomics/Illumina/151216_D00565_0100_AC81A1ANXX/Unaligned/P43-36-27317307/lib10205-31660697/PA342-17_S271_L007_R1_001.fastq.gz", 
                    "readId": 1, 
                    "type": "fastq"
                }
            ]
        }
    ], 
    "projectId": 43, 
    "subprojectId": 36
}


In [36]:
# Select the document(s) whose _id is 'lib10000'.
lib10000 = [lib for lib in lib_dict_to_db(lib_dict) if lib['_id'] == 'lib10000']

In [37]:
# Pretty-print the selected documents as indented JSON with sorted keys.
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(json.dumps(lib10000, indent=4, sort_keys=True))

[
    {
        "_id": "lib10000", 
        "assays": [
            {
                "assayId": "C81A1ANXX", 
                "rawData": [
                    {
                        "compression": "gz", 
                        "laneId": 1, 
                        "path": "/genomics/Illumina/151216_D00565_0100_AC81A1ANXX/Unaligned/P43-29-27330309/lib10000-31653654/1D-HC29-C04_S27_L001_R1_001.fastq.gz", 
                        "readId": 1, 
                        "type": "fastq"
                    }
                ]
            }
        ], 
        "projectId": 43, 
        "subprojectId": 29
    }
]


In [31]:
# Pretty-print the first flowcell document as indented JSON with sorted keys.
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(json.dumps(fc_dict_to_db(fc_dict)[0], indent=4, sort_keys=True))

{
    "_id": "C81A1ANXX", 
    "date": "2015-12-16", 
    "instrumentId": "D00565", 
    "lanes": [
        {
            "laneId": 1, 
            "libraries": [
                {
                    "libId": "lib9974"
                }, 
                {
                    "libId": "lib10000"
                }
            ]
        }, 
        {
            "laneId": 2, 
            "libraries": [
                {
                    "libId": "lib10032"
                }, 
                {
                    "libId": "lib10039"
                }
            ]
        }, 
        {
            "laneId": 3, 
            "libraries": [
                {
                    "libId": "lib10091"
                }, 
                {
                    "libId": "lib10086"
                }
            ]
        }, 
        {
            "laneId": 4, 
            "libraries": [
                {
                    "libId": "lib10121"
                }, 
                {
             

In [201]:
# Sketch: one record per lane for lane numbers 1 through 8.
[ {'lane_id': l} for l in range(1, 9) ]

[{'lane_id': 1},
 {'lane_id': 2},
 {'lane_id': 3},
 {'lane_id': 4},
 {'lane_id': 5},
 {'lane_id': 6},
 {'lane_id': 7},
 {'lane_id': 8}]