In [1]:
import pandas as pd
import numpy as np

In [3]:

def report_alignments(df: pd.DataFrame) -> pd.DataFrame:
    """Generate a dataframe of diagnostic information from the alignments table.

    The read_length, mapping_quality and perc_of_alignment statistics are
    computed over every row of ``df``; the ``*_per_read`` statistics are
    computed over one value per unique ``read_name``.

    Parameters:
    -----------------------------
        :  df (pd.DataFrame): the alignment table. Must contain the columns
           'read_name', 'fragment_id', 'mapping_quality',
           'num_contained_fragments', 'read_length' and 'perc_of_alignment'.

    Returns:
    -----------------------------
        :  results (pd.DataFrame): diagnostic information, one row per
           statistic, with columns 'parameter' (name) and 'value'.

    Raises:
    -----------------------------
        :  KeyError: if any required column is absent from ``df``.
    """
    required = (
        'read_name',
        'fragment_id',
        'mapping_quality',
        'num_contained_fragments',
        'read_length',
        'perc_of_alignment',
    )
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"alignment table is missing required column(s): {missing}"
        )

    # One row per read: how many rows (non-null fragment ids) the read has and
    # the total of its num_contained_fragments. String aggregator names are
    # used because passing np.mean/np.sum to agg is deprecated in pandas >= 2.1.
    per_read = df.groupby('read_name', as_index=False).agg({
        'fragment_id': 'count',
        'num_contained_fragments': 'sum',
    })

    params = {
        'n_unique_reads': df['read_name'].nunique(),
        'n_fragments': len(df),
        'n_unique_fragments': df['fragment_id'].nunique(),
    }

    # (label, series) pairs; each contributes mean/std/max/min entries named
    # '<stat>_<label>', in that order.
    summarised = (
        ('read_length', df['read_length']),
        ('read_mapping_qual', df['mapping_quality']),
        ('read_perc_of_alignment', df['perc_of_alignment']),
        ('fragments_per_read', per_read['fragment_id']),
        ('contained_fragments_per_read', per_read['num_contained_fragments']),
    )
    for label, series in summarised:
        for stat in ('mean', 'std', 'max', 'min'):
            params[f'{stat}_{label}'] = getattr(series, stat)()

    results = pd.DataFrame(
        [{'parameter': name, 'value': value} for name, value in params.items()]
    )
    return results
    
    

In [11]:
# Alignment table written under the bc11r2_bwasw pipeline-test directory.
# NOTE(review): this is an absolute path on a shared filesystem, so the cell
# can only run on a machine where that location is mounted.
my_path = '/nfs/turbo/umms-indikar/shared/projects/poreC/pipeline_test/bc11r2_bwasw/tables/alignment_table.csv'
df = pd.read_csv(my_path)

# Build the diagnostics table for this file and preview its first rows.
res = report_alignments(df)
res.head()

Unnamed: 0,parameter,value
0,n_unique_reads,61176.0
1,n_fragments,459023.0
2,n_unique_fragments,101195.0
3,mean_read_length,4504.503175
4,std_read_length,3396.043896


In [12]:
# Second table to compare against `df`: the Run02_Barcode11 CSV under the
# bc11r2_nanopore_output pipeline-test directory.
# NOTE: `my_path` is rebound here, replacing the path assigned in the
# previous cell.
my_path = '/nfs/turbo/umms-indikar/shared/projects/poreC/pipeline_test/bc11r2_nanopore_output/Run02_Barcode11.csv'
df2 = pd.read_csv(my_path)

# Same diagnostics for the second table. `res` is overwritten, so the report
# for `df` is no longer held in any variable after this cell runs.
res = report_alignments(df2)
res.head()

Unnamed: 0,parameter,value
0,n_unique_reads,61176.0
1,n_fragments,459029.0
2,n_unique_fragments,101198.0
3,mean_read_length,4504.507308
4,std_read_length,3396.032432


In [9]:
# Rows of df2 whose fragment_id does not occur anywhere in df['fragment_id'].
# NOTE(review): this cell's execution count (9) is lower than that of the
# cells that load `df` and `df2` (11 and 12), so the output shown may come
# from an earlier kernel state - re-run top to bottom to confirm it.
extra_frags = df2[~df2['fragment_id'].isin(df['fragment_id'])]
len(extra_frags)

223

In [10]:
# Show the df2-only rows; pandas elides the middle rows and columns in the
# rendered table.
extra_frags

Unnamed: 0,read_idx,align_idx,align_type,chrom,start,end,strand,read_name,read_length,read_start,...,filter_reason,fragment_id,num_contained_fragments,num_overlapping_fragments,overlap_length,fragment_start,fragment_end,perc_of_alignment,perc_of_fragment,is_contained
138,28,138,primary,NC_000074.7,101937934,101938430,1,9262e178-89db-4bc4-b817-15b485b5e6fa,2447,1955,...,low_mq,5933631,0,2,492,101937938,101939029,99.19355,45.096240,0
1432,245,1432,primary,NC_000087.8,48117392,48118892,1,be5b23cf-db7e-4a1e-8683-d923075bf92b,5388,1865,...,low_mq,12684790,4,6,703,48118189,48118933,46.86666,94.489250,0
1742,291,1742,primary,NC_000079.7,114520346,114521106,1,672a9d76-9c92-4397-bdbb-aad521eb014d,4930,4128,...,low_mq,9011219,1,3,456,114520650,114521180,60.00000,86.037730,0
2657,440,2657,primary,NC_000084.7,86589048,86589107,0,196d7906-040b-48e8-a2db-7f83307f905b,853,170,...,low_mq,11444957,0,1,59,86588821,86589862,100.00000,5.667627,0
3524,554,3524,primary,NC_000080.7,17912483,17913288,0,cb68deb9-5dd9-4f9b-9106-f40bc17a542f,1252,380,...,low_mq,9112710,3,5,345,17912931,17913276,42.85714,100.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441250,8869,67658,primary,NC_000073.7,25829308,25829773,1,aa1ac69e-1802-4f49-b51e-b3a9b7ee3410,2904,73,...,low_mq,4853132,0,2,380,25829393,25830305,81.72043,41.666670,0
443989,9224,70397,primary,NC_000076.7,4627117,4627173,0,edd01370-914f-48b3-a32d-53633c684bff,1991,1872,...,low_mq,6690585,0,1,56,4626964,4627390,100.00000,13.145540,0
445603,9418,72011,primary,NC_000068.8,176477543,176478998,1,502e2e30-1d6a-44f8-ad45-3e939c77f1a2,6077,2673,...,low_mq,1784605,4,6,472,176477871,176478343,32.43986,100.000000,1
451067,10174,77475,primary,NC_000068.8,97751942,97752979,0,fd5360d1-4211-4c92-b016-607a03e7df35,3072,1279,...,low_mq,1394709,1,1,1037,97751941,97752979,100.00000,99.903660,1


In [17]:
# List the column labels of `df` (the table loaded from alignment_table.csv).
df.columns

Index(['read_idx', 'align_idx', 'align_type', 'chrom', 'start', 'end',
       'strand', 'read_name', 'read_length', 'read_start', 'read_end',
       'mapping_quality', 'align_score', 'align_base_qscore', 'phase_set',
       'phase_qual', 'haplotype', 'pass_filter', 'filter_reason',
       'fragment_id', 'num_contained_fragments', 'num_overlapping_fragments',
       'overlap_length', 'fragment_start', 'fragment_end', 'perc_of_alignment',
       'perc_of_fragment', 'is_contained', 'align_start', 'align_end',
       'Chromosome', 'Total length (bp)', 'GenBank accession',
       'RefSeq accession', 'fragment_midpoint'],
      dtype='object')