In [129]:
from interop import py_interop_run_metrics, py_interop_run, py_interop_summary
import pandas as pd
import math

In [130]:
def parse_interop_data(run_folder, num_reads, num_lanes):
    """
    Parses summary statistics out of interops data using the Illumina interops package
    """

    # make empty dict to store output
    interop_dict = {'read_summaries': {}}

    
    # taken from illumina interops package documentation, all of this is required, 
    # even though only the summary variable is used further on
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)
    

    
    for read in range(num_reads):
        
        new_read = read + 1
        
        if new_read not in interop_dict['read_summaries']:
            
            interop_dict['read_summaries'][new_read] = {}
        
            
        for lane in range(num_lanes):
            
            new_lane = lane + 1
                
            if new_lane not in interop_dict['read_summaries'][new_read]:
                    
                interop_dict['read_summaries'][new_read][new_lane] = {}
                    
            interop_dict['read_summaries'][read+1][lane+1]['percent_q30'] = summary.at(read).at(lane).percent_gt_q30()
            interop_dict['read_summaries'][read+1][lane+1]['density'] = summary.at(read).at(lane).density().mean()
            interop_dict['read_summaries'][read+1][lane+1]['density_pf'] = summary.at(read).at(lane).density_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['cluster_count'] = summary.at(read).at(lane).density_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['cluster_count_pf'] = summary.at(read).at(lane).cluster_count_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['error_rate'] = summary.at(read).at(lane).error_rate().mean()
            interop_dict['read_summaries'][read+1][lane+1]['percent_aligned'] = summary.at(read).at(lane).percent_aligned().mean()
            interop_dict['read_summaries'][read+1][lane+1]['percent_pf'] = summary.at(read).at(lane).percent_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['phasing'] = summary.at(read).at(lane).phasing().mean()
            interop_dict['read_summaries'][read+1][lane+1]['prephasing'] = summary.at(read).at(lane).prephasing().mean()
            interop_dict['read_summaries'][read+1][lane+1]['reads'] = summary.at(read).at(lane).reads()
            interop_dict['read_summaries'][read+1][lane+1]['reads_pf'] = summary.at(read).at(lane).reads_pf()
            interop_dict['read_summaries'][read+1][lane+1]['yield'] = summary.at(read).at(lane).yield_g()

    return interop_dict

In [131]:
parse_interop_data('/media/joseph/Storage/data/archive/nextseq/190913_NB551319_0026_AHT5G5AFXY/', 4, 4)

{'read_summaries': {1: {1: {'percent_q30': 92.2387924194336,
    'density': 244107.328125,
    'density_pf': 214203.265625,
    'cluster_count': 214203.265625,
    'cluster_count_pf': 643285.3125,
    'error_rate': 0.3278927803039551,
    'percent_aligned': 0.9878919124603271,
    'percent_pf': 87.7578353881836,
    'phasing': 0.21830081939697266,
    'prephasing': 0.11702568084001541,
    'reads': 52782588.0,
    'reads_pf': 46316548.0,
    'yield': 3.473161220550537},
   2: {'percent_q30': 92.34395599365234,
    'density': 234868.71875,
    'density_pf': 207354.796875,
    'cluster_count': 207354.796875,
    'cluster_count_pf': 622718.25,
    'error_rate': 0.4515427052974701,
    'percent_aligned': 1.002793312072754,
    'percent_pf': 88.27922058105469,
    'phasing': 0.2153674066066742,
    'prephasing': 0.1224762499332428,
    'reads': 50784988.0,
    'reads_pf': 44835700.0,
    'yield': 3.3620235919952393},
   3: {'percent_q30': 92.49681091308594,
    'density': 247218.6875,
    '

In [132]:
run_folder ='/media/joseph/Storage/data/archive/miseq/190916_M00766_0252_000000000-CJMB5'

run_metrics = py_interop_run_metrics.run_metrics()
valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
py_interop_run_metrics.list_index_metrics_to_load(valid_to_load)
run_folder = run_metrics.read(run_folder, valid_to_load)
summary = py_interop_summary.index_flowcell_summary()
py_interop_summary.summarize_index_metrics(run_metrics, summary)




In [133]:
n_lanes = 1

columns = ( ('Index Number', 'id'), ('Sample Id', 'sample_id'), ('Project', 'project_name'), ('Index 1 (I7)', 'index1'), ('Index 2 (I5)', 'index2'), ('% Reads Identified (PF)', 'fraction_mapped'))

df = pd.DataFrame()

for x in range(0, n_lanes):
    
    d = []
    
    for label, func in columns:

        lane_summary = summary.at(x)

        d.append( (label, pd.Series([getattr(lane_summary.at(i), func)() for i in range(lane_summary.size())], index=[lane_summary.at(i).id() for i in range(lane_summary.size())])))

    new_df = pd.DataFrame.from_dict(dict(d))
    new_df['lane'] = x

    df = df.append(new_df )



In [134]:
df_grouped = df.groupby('Sample Id').mean()

In [135]:
#df_grouped.to_dict('index')

In [136]:
index_dict = df.to_dict('records')

In [137]:
def parse_interop_index_metrics(run_folder, n_lanes):
    """
    Get the index stats from the interop files

    """

    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_index_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder, valid_to_load)
    my_summary = py_interop_summary.index_flowcell_summary()
    py_interop_summary.summarize_index_metrics(run_metrics, my_summary)

    columns = ( ('Index Number', 'id'),
    ('Sample Id', 'sample_id'),
    ('Project', 'project_name'),
    ('Index 1 (I7)', 'index1'),
    ('Index 2 (I5)', 'index2'),
    ('% Reads Identified (PF)', 'fraction_mapped'))

    df = pd.DataFrame()

    for x in range(0, n_lanes):

        d = []

        for label, func in columns:

            lane_summary = summary.at(x)

            d.append( (label, pd.Series([getattr(lane_summary.at(i), func)() for i in range(lane_summary.size())], index=[lane_summary.at(i).id() for i in range(lane_summary.size())])))

        new_df = pd.DataFrame.from_dict(dict(d))
        new_df['lane'] = x

        df = df.append(new_df )

    df_grouped = df.groupby('Sample Id').mean()

    index_dict = df_grouped.to_dict('index')

    return index_dict

In [138]:
parse_interop_index_metrics('/media/joseph/Storage/data/archive/miseq/190916_M00766_0252_000000000-CJMB5', 1)

{'19M12481': {'Index Number': 1,
  '% Reads Identified (PF)': 4.841100215911865,
  'lane': 0},
 '19M12638': {'Index Number': 2,
  '% Reads Identified (PF)': 4.885499954223633,
  'lane': 0},
 '19M13283': {'Index Number': 3,
  '% Reads Identified (PF)': 5.553899765014648,
  'lane': 0},
 '19M13333': {'Index Number': 4,
  '% Reads Identified (PF)': 4.89139986038208,
  'lane': 0},
 '19M13455': {'Index Number': 5,
  '% Reads Identified (PF)': 5.307199954986572,
  'lane': 0},
 '19M13506': {'Index Number': 6,
  '% Reads Identified (PF)': 3.9500999450683594,
  'lane': 0},
 '19M13507': {'Index Number': 7,
  '% Reads Identified (PF)': 4.873499870300293,
  'lane': 0},
 '19M13508': {'Index Number': 8,
  '% Reads Identified (PF)': 4.499599933624268,
  'lane': 0},
 '19M13509': {'Index Number': 9,
  '% Reads Identified (PF)': 4.046999931335449,
  'lane': 0},
 '19M13510': {'Index Number': 10,
  '% Reads Identified (PF)': 4.175099849700928,
  'lane': 0},
 '19M13511': {'Index Number': 11,
  '% Reads Iden

In [139]:
def parse_interop_data(run_folder_dir, num_reads, num_lanes):
    """
    Parses summary statistics out of interops data using the Illumina interops package
    """

    # make empty dict to store output
    interop_dict = {'read_summaries': {}}


    # taken from illumina interops package documentation, all of this is required, 
    # even though only the summary variable is used further on
    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_summary_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder_dir, valid_to_load)
    summary = py_interop_summary.run_summary()
    py_interop_summary.summarize_run_metrics(run_metrics, summary)


    for read in range(num_reads):

        new_read = read + 1

        if new_read not in interop_dict['read_summaries']:

            interop_dict['read_summaries'][new_read] = {}


        for lane in range(num_lanes):

            new_lane = lane + 1

            if new_lane not in interop_dict['read_summaries'][new_read]:

                interop_dict['read_summaries'][new_read][new_lane] = {}

            interop_dict['read_summaries'][read+1][lane+1]['percent_q30'] = summary.at(read).at(lane).percent_gt_q30()
            interop_dict['read_summaries'][read+1][lane+1]['density'] = summary.at(read).at(lane).density().mean()
            interop_dict['read_summaries'][read+1][lane+1]['density_pf'] = summary.at(read).at(lane).density_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['cluster_count'] = summary.at(read).at(lane).density_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['cluster_count_pf'] = summary.at(read).at(lane).cluster_count_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['error_rate'] = summary.at(read).at(lane).error_rate().mean()
            interop_dict['read_summaries'][read+1][lane+1]['percent_aligned'] = summary.at(read).at(lane).percent_aligned().mean()
            interop_dict['read_summaries'][read+1][lane+1]['percent_pf'] = summary.at(read).at(lane).percent_pf().mean()
            interop_dict['read_summaries'][read+1][lane+1]['phasing'] = summary.at(read).at(lane).phasing().mean()
            interop_dict['read_summaries'][read+1][lane+1]['prephasing'] = summary.at(read).at(lane).prephasing().mean()
            interop_dict['read_summaries'][read+1][lane+1]['reads'] = summary.at(read).at(lane).reads()
            interop_dict['read_summaries'][read+1][lane+1]['reads_pf'] = summary.at(read).at(lane).reads_pf()
            interop_dict['read_summaries'][read+1][lane+1]['yield_g'] = summary.at(read).at(lane).yield_g()

            for key in interop_dict['read_summaries'][read+1][lane+1]:

                if math.isnan(interop_dict['read_summaries'][read+1][lane+1][key]):

                    interop_dict['read_summaries'][read+1][lane+1][key] = None


    run_metrics = py_interop_run_metrics.run_metrics()
    valid_to_load = py_interop_run.uchar_vector(py_interop_run.MetricCount, 0)
    py_interop_run_metrics.list_index_metrics_to_load(valid_to_load)
    run_folder = run_metrics.read(run_folder_dir, valid_to_load)
    my_summary = py_interop_summary.index_flowcell_summary()
    py_interop_summary.summarize_index_metrics(run_metrics, my_summary)

    columns = ( ('Index Number', 'id'),
    ('Sample Id', 'sample_id'),
    ('Project', 'project_name'),
    ('Index 1 (I7)', 'index1'),
    ('Index 2 (I5)', 'index2'),
    ('% Reads Identified (PF)', 'fraction_mapped'))

    df = pd.DataFrame()

    for x in range(num_lanes):

        d = []

        for label, func in columns:

            lane_summary = my_summary.at(x)

            d.append( (label, pd.Series([getattr(lane_summary.at(i), func)() for i in range(lane_summary.size())], index=[lane_summary.at(i).id() for i in range(lane_summary.size())])))

        new_df = pd.DataFrame.from_dict(dict(d))
        new_df['lane'] = x

        df = df.append(new_df )

    df_grouped = df.groupby('Sample Id').mean()

    index_dict = df_grouped.to_dict('index')

    interop_dict['index_stats'] = index_dict

    return interop_dict

In [140]:
parse_interop_data('/media/joseph/Storage/data/archive/nextseq/190913_NB551319_0026_AHT5G5AFXY/', 4, 4)

{'read_summaries': {1: {1: {'percent_q30': 92.2387924194336,
    'density': 244107.328125,
    'density_pf': 214203.265625,
    'cluster_count': 214203.265625,
    'cluster_count_pf': 643285.3125,
    'error_rate': 0.3278927803039551,
    'percent_aligned': 0.9878919124603271,
    'percent_pf': 87.7578353881836,
    'phasing': 0.21830081939697266,
    'prephasing': 0.11702568084001541,
    'reads': 52782588.0,
    'reads_pf': 46316548.0,
    'yield_g': 3.473161220550537},
   2: {'percent_q30': 92.34395599365234,
    'density': 234868.71875,
    'density_pf': 207354.796875,
    'cluster_count': 207354.796875,
    'cluster_count_pf': 622718.25,
    'error_rate': 0.4515427052974701,
    'percent_aligned': 1.002793312072754,
    'percent_pf': 88.27922058105469,
    'phasing': 0.2153674066066742,
    'prephasing': 0.1224762499332428,
    'reads': 50784988.0,
    'reads_pf': 44835700.0,
    'yield_g': 3.3620235919952393},
   3: {'percent_q30': 92.49681091308594,
    'density': 247218.6875,
 