In [1]:
import sys, os, re, zipfile, StringIO
import pandas as pd

In [None]:
def get_project_list(fc_dir):
    """Return the paths of the entries in `fc_dir` whose names start with 'Project_'.

    Each returned path is `fc_dir` joined with the entry name. Entries come
    back in the order `os.listdir` yields them (no sorting is applied).
    """
    project_list = []
    for entry in os.listdir(fc_dir):
        if re.search('^Project_', entry):
            project_list.append(os.path.join(fc_dir, entry))

    return project_list

def get_lib_list(qc_dir):
    """Return the sorted, de-duplicated names in `qc_dir` that contain 'lib'.

    Only the bare entry names are returned, not full paths.
    """
    unique_names = set()
    for entry in os.listdir(qc_dir):
        if 'lib' in entry:
            unique_names.add(entry)

    return sorted(unique_names)

class FastQCReport(object):
    """Helpers for pulling adapter information out of FastQC zip archives.

    The instance only stores the value it was constructed with. The helper
    functions take all of their inputs as arguments and use no instance
    state, so they are static methods: they can be called on the class or
    on an instance without the instance being passed as the first argument.
    """

    def __init__(self, fastqc_file):
        # Stored as given; nothing is opened or read at construction time.
        self.fastqc_file = fastqc_file

    @staticmethod
    def read_fastqc(fastqc_path):
        """Return the lines of the 'fastqc_data.txt' member of a zip archive.

        Args:
            fastqc_path: path of the zip archive to open.

        Returns:
            List of lines (line endings kept) from 'fastqc_data.txt'.
        """
        with zipfile.ZipFile(fastqc_path) as f:
            f_src = StringIO.StringIO(f.read('fastqc_data.txt'))
            fastqc_lines = f_src.readlines()

        return fastqc_lines

    @staticmethod
    def find_adapter_seqs(fastqc_lines, lib):
        """Collect the report lines that mention 'Index', tagged with `lib`.

        Args:
            fastqc_lines: iterable of report lines.
            lib: label placed at the front of every returned row.

        Returns:
            One row per matching line: `[lib]` followed by the line's
            tab-separated fields. If no line matches, a single placeholder
            row `[lib, '', '0', '0', '']` is returned.
        """
        adapter_lines = [[lib] + l.strip().split('\t')
                         for l in fastqc_lines if 'Index' in l]
        if not adapter_lines:
            adapter_lines = [[lib, '', '0', '0', '']]

        return adapter_lines

    @staticmethod
    def count_adapter_seqs(fastqc_lines):
        """Return how many report lines contain 'TruSeq Adapter'."""
        return sum(1 for l in fastqc_lines if 'TruSeq Adapter' in l)

    @staticmethod
    def collect_fastqc_data(qc_dir, lib_list):
        """Gather the adapter rows of every library under `qc_dir`.

        For each name in `lib_list` the archive `<qc_dir>/<lib>/qcR1.zip` is
        read and its adapter rows (see `find_adapter_seqs`) are appended.

        Returns:
            Flat list of rows across all libraries, in `lib_list` order.
        """
        adapter_list = []
        for lib in lib_list:
            fastqc_path = os.path.join(qc_dir, lib, 'qcR1.zip')
            # Call through the class: bare names are not resolved against
            # the class namespace from inside a method body.
            fastqc_lines = FastQCReport.read_fastqc(fastqc_path)
            adapter_list += FastQCReport.find_adapter_seqs(fastqc_lines, lib)

        return adapter_list

def summarize_fastqc_data(qc_dir, lib_list):
    """Count how many libraries have at least one 'TruSeq Adapter' line.

    For each name in `lib_list` the archive `<qc_dir>/<lib>/qcR1.zip` is read
    with the module-level `read_fastqc`.

    Args:
        qc_dir: directory that holds one sub-directory per library.
        lib_list: library sub-directory names to inspect.

    Returns:
        Tuple `(number of libraries, number of libraries with adapter lines)`.
    """
    adapter_count = 0
    for lib in lib_list:
        fastqc_path = os.path.join(qc_dir, lib, 'qcR1.zip')
        fastqc_lines = read_fastqc(fastqc_path)
        # The adapter check is done inline: there is no module-level
        # count_adapter_seqs to call, only the one inside FastQCReport.
        if any('TruSeq Adapter' in l for l in fastqc_lines):
            adapter_count += 1

    return (len(lib_list), adapter_count)

In [2]:
# Example FastQC archive used by the exploratory cells below.
# NOTE(review): hard-coded absolute path — the following cells only run where
# this exact path exists.
fastqc_file = "/Volumes/genomics/Illumina/160307_D00565_0103_BC893JANXX/Project_P90-17Processed_160310/QC/lib11034_C893JANXX/qcR1.zip"

In [3]:
def read_fastqc(fastqc_path):
    """Return the lines of the 'fastqc_data.txt' member of a zip archive.

    The archive at `fastqc_path` is opened, the member is read fully into
    memory and split into lines (line endings kept). The archive is closed
    before the function returns.
    """
    with zipfile.ZipFile(fastqc_path) as archive:
        return StringIO.StringIO(archive.read('fastqc_data.txt')).readlines()

# Load the report lines once; the cells below reuse `fastqc_lines`.
fastqc_lines = read_fastqc(fastqc_file)

In [61]:
def find_modules(fastqc_lines):
    """Map each module name to the line indices of its marker lines.

    Marker lines are those starting with '>>'. A marker without 'END' in it
    opens a module: its name is the text after the leading '>' characters, up
    to the first tab. A marker containing 'END' is recorded under the most
    recently opened module.

    Returns:
        Dict of module name -> list of marker line indices, in file order.
    """
    positions = {}
    for line_no, line in enumerate(fastqc_lines):
        if not re.search('^>>', line):
            continue
        if 'END' not in line:
            # New module header: remember its name for the matching END marker.
            module_name = line.lstrip('>>').rstrip().split('\t')[0]
        positions.setdefault(module_name, []).append(line_no)
    return positions

def get_module(fastqc_lines, module_pos, module_name):
    """Parse one module of a FastQC report into a DataFrame.

    Args:
        fastqc_lines: all lines of the report.
        module_pos: dict of module name -> [start index, end index], as
            produced by `find_modules`.
        module_name: key of the module to extract.

    Returns:
        DataFrame built from the tab-separated lines between the module's
        start and end markers. The first of those lines is used as the
        header, lower-cased and with every '#' removed. A module with no
        lines between its markers yields an empty DataFrame.

    Raises:
        KeyError: if `module_name` is not in `module_pos`.
    """
    module_start = module_pos[module_name][0] + 1
    module_stop = module_pos[module_name][1]
    module_lines = fastqc_lines[module_start:module_stop]
    if not module_lines:
        # Nothing between the start and END markers: there is no header line
        # to index, so return an empty table instead of raising IndexError.
        return pd.DataFrame()

    header_line = re.sub('#', '', module_lines[0].lower())
    table_text = ''.join([header_line] + module_lines[1:])
    return pd.read_table(StringIO.StringIO(table_text))

# Index the modules in the report, then parse the 'Overrepresented sequences'
# module into a DataFrame.
module = get_module(fastqc_lines, find_modules(fastqc_lines), 'Overrepresented sequences')

In [159]:
def check_type(seq, source):
    """Classify an overrepresented sequence from its bases and reported source.

    Args:
        seq: the sequence string.
        source: the 'possible source' text reported for the sequence.

    Returns:
        A one-element `pd.Series` holding the label:
        'empty' if `seq` has no character other than 'N'; otherwise
        'adapter', 'primer' or 'no hit' for the first pattern found in
        `source`; otherwise a missing value (None) when nothing matches.
    """
    # Ordered (pattern, label, flags) triples so that the result does not
    # depend on dict iteration order. 'adapter' and 'primer' are matched
    # case-insensitively because the source text capitalises them
    # (e.g. 'TruSeq Adapter'); 'No' keeps its original case-sensitive match.
    type_patterns = [('adapter', 'adapter', re.IGNORECASE),
                     ('primer', 'primer', re.IGNORECASE),
                     ('No', 'no hit', 0)]

    if not re.search('[^N]', seq):
        return pd.Series('empty')

    for pattern, label, flags in type_patterns:
        if re.search(pattern, source, flags):
            return pd.Series(label)

    # Always hand back a Series so a row-wise `apply` sees one return type.
    return pd.Series([None])

#     return seq_type

# def label_overrep_seqs(module):
#     return module.assign(seq_type = lambda x: )

# label_overrep_seqs(module)
# Label every row by passing its 'sequence' and 'possible source' values to
# check_type, and store the result in a new 'seq_type' column.
# NOTE: this adds the column to `module` in place, so re-running the cell
# operates on the already-labelled frame.
module['seq_type'] = module.apply(lambda x: check_type(x['sequence'], x['possible source']), axis=1)
# Display the labelled table.
module

Unnamed: 0,sequence,count,percentage,possible source,seq_type
0,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN,67790,0.494957,No Hit,empty
1,GGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATC...,20278,0.148056,No Hit,no hit
2,GCTCAATCTCGGGTGGCTGAACGCCACTTGTCCCTCTAAGAAGTTG...,14307,0.10446,No Hit,no hit
3,ACGACTTTTACTTCCTCTAGATAGTCAAGTTCGACCGTCTTCTCAG...,14136,0.103212,No Hit,no hit
4,GTTCGATTAGTCTTTCGCCCCTATACCCAGGTCGGACGACCGATTT...,13885,0.101379,No Hit,no hit


In [52]:
# Scratch check of the header clean-up used in get_module: lower-case the
# first module line and drop every '#'.
# NOTE(review): in the cells shown here `module_lines` is only assigned inside
# get_module, so this presumably relied on a variable left over in the
# kernel — confirm before re-running.
re.sub('#', '', module_lines[0].lower())

'sequence\tcount\tpercentage\tpossible source\n'

In [139]:
# Sanity check of the pattern used in check_type: '[^N]' matches as soon as
# the string holds any character other than 'N', so 'NA' is not all-N.
re.search('[^N]', 'NA')

<_sre.SRE_Match at 0x1082a4100>