In [1]:
import sys
import re
import pandas as pd
import pysam
import numpy as np

In [2]:
# Report which pysam build the kernel imported (version string and module path).
print(f"{pysam.__version__=}", f"{pysam.__file__=}", sep="\n")

pysam.__version__='0.19.1'
pysam.__file__='/home/cstansbu/.local/lib/python3.9/site-packages/pysam/__init__.py'


In [52]:
def get_align_type(align):
    """Classify an alignment record by its SAM flags.

    Returns one of 'primary', 'secondary' or 'supplementary'. When both the
    secondary and the supplementary flag are set, 'supplementary' wins.
    """
    # Supplementary is checked first because it takes precedence.
    if align.is_supplementary:
        return 'supplementary'
    if align.is_secondary:
        return 'secondary'
    return 'primary'
    

def parse_read_name(align, sep):
    """Split a (possibly digested) read name into its components.

    A digested fragment is expected to be named
    ``<read_name><sep><fragment_index><sep><offset>``, with both trailing
    fields being integers. Any name that does not follow this layout is
    reported as an undigested read instead of raising.

    Args:
        align: alignment record exposing ``query_name``
            (e.g. a ``pysam.AlignedSegment``).
        sep: separator placed between the fields of a digested read name.

    Returns:
        tuple: ``(read_name, fragment_index, is_digested, offset)``.
        For undigested reads this is ``(<full name>, -1, False, 0)``.
    """
    # `query_name` is the current pysam spelling of the deprecated `qname`.
    full_name = align.query_name
    undigested = (full_name, -1, False, 0)

    # An empty separator cannot delimit fields (and str.split rejects it).
    if not sep or sep not in full_name:
        return undigested

    # Split once and reuse the pieces.
    fields = full_name.split(sep)
    try:
        fragment_index = int(fields[1])
        offset = int(fields[2])
    except (IndexError, ValueError):
        # The separator merely occurs inside an ordinary read name
        # (too few fields, or non-numeric index/offset): not a digested read.
        return undigested

    return fields[0], fragment_index, True, offset


def get_mean_alignment_base_qualities(align):
    """Mean of the base qualities over the aligned span of the read.

    If the record has no qualities (``query_alignment_qualities`` is None)
    the sentinel list ``[-1]`` is averaged instead, giving -1.0.
    """
    quals = align.query_alignment_qualities
    return np.mean([-1] if quals is None else quals)
    

def bam_to_df(bampath, sep="_"):
    """Parse a BAM file into a table with one row per alignment record.

    Args:
        bampath: path to the BAM file, passed to ``pysam.AlignmentFile``.
        sep: separator used in digested read names; forwarded to
            ``parse_read_name``.

    Returns:
        pandas.DataFrame with the columns read_name, fragment_index,
        is_digested_fragment, align_type, is_forward, is_mapped,
        mean_align_base_quality, read_length, read_start, read_end, chrom,
        reference_start, reference_end, mapping_quality and alignment_score.
        ``read_start``/``read_end`` are the record's ``qstart``/``qend``
        shifted by the offset parsed from the read name;
        ``alignment_score`` is the ``AS`` tag, or 0 when the tag is absent.
    """
    records = []

    # Open through a context manager so the file handle is always released,
    # even if a record fails to parse half-way through the file.
    with pysam.AlignmentFile(bampath) as bam:
        for align in bam:

            # parse read name
            read_name, fragment_index, is_digested, offset = parse_read_name(align, sep)

            # alignment score tag; records without it get a score of 0
            as_tag = align.get_tag('AS') if align.has_tag('AS') else 0

            # get the alignment type
            align_type = get_align_type(align)

            # get base qualities
            mean_qual = get_mean_alignment_base_qualities(align)

            records.append({
                'read_name' : read_name,
                'fragment_index' : fragment_index,
                'is_digested_fragment' : is_digested,
                'align_type' : align_type,
                'is_forward' : align.is_forward,
                'is_mapped' : align.is_mapped,
                'mean_align_base_quality' : mean_qual,
                'read_length' : align.query_length,
                'read_start' : align.qstart + offset,
                'read_end' : align.qend + offset,
                'chrom' : align.reference_name,
                'reference_start' : align.reference_start,
                'reference_end' : align.reference_end,
                'mapping_quality' : align.mapping_quality,
                'alignment_score' : as_tag,
            })

    # Build the frame once from the accumulated row dicts.
    return pd.DataFrame(records)

# Parse the `raw` BAM into a table using the default "_" read-name separator.
bam_path = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/o1b24.GRCm39.raw.bam"
df = bam_to_df(bam_path)
print(f"{df.shape=}")
# Last expression of the cell, so the notebook renders the first rows.
df.head()

[W::hts_idx_load3] The index file is older than the data file: /scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/o1b24.GRCm39.raw.bam.bai


df.shape=(2315202, 15)


Unnamed: 0,read_name,fragment_index,is_digested_fragment,align_type,is_forward,is_mapped,mean_align_base_quality,read_length,read_start,read_end,chrom,reference_start,reference_end,mapping_quality,alignment_score
0,c1490127-edfa-40be-96a3-8721c3267924,-1,False,secondary,True,True,-1.0,0,119,542,1,3126059,3126479.0,0,710
1,933283a2-3f3f-4eed-bcb9-a7acc686cc4b,-1,False,primary,True,True,15.893162,536,206,440,1,3132750,3132988.0,60,400
2,2868dda4-c627-4545-bed7-85f2cd6f2907,-1,False,primary,False,True,18.970464,538,177,414,1,3132750,3132988.0,60,426
3,81d9e714-bcc6-47f1-980d-c154043bd172,-1,False,primary,True,True,19.384615,539,205,439,1,3132753,3132985.0,60,370
4,aff66c6b-f9fa-405e-b677-8c1c5054c9d3,-1,False,primary,False,True,17.168103,511,162,394,1,3132753,3132986.0,60,440


In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a deliberate
# guard that stops "Run All" before the cells below — confirm, and consider an
# explicit `raise` with a message instead.
break

In [4]:
# Load the alignment table stored as parquet for the digested reads.
fpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/align_table/o1b24.GRCm39.digested.parquet"
df = pd.read_parquet(fpath)  # rebinds the notebook-global `df`
print(df.shape)
# Last expression of the cell, so the notebook renders the first rows.
df.head()

(4780553, 15)


Unnamed: 0,read_name,fragment_index,is_digested_fragment,align_type,is_forward,is_mapped,mean_align_base_quality,read_length,read_start,read_end,chrom,reference_start,reference_end,mapping_quality,alignment_score
0,a852d735-5a6e-40bd-a745-a73a1121b273,5,True,primary,True,True,18.423333,300,1026,1326,1,3113340,3113640.0,1,582
1,c1490127-edfa-40be-96a3-8721c3267924,1,True,secondary,True,True,-1.0,0,119,542,1,3126059,3126479.0,0,710
2,c1490127-edfa-40be-96a3-8721c3267924,1,True,secondary,True,True,-1.0,0,119,542,1,3126059,3126479.0,0,710
3,933283a2-3f3f-4eed-bcb9-a7acc686cc4b,1,True,primary,True,True,15.893162,447,206,440,1,3132750,3132988.0,60,400
4,2868dda4-c627-4545-bed7-85f2cd6f2907,1,True,primary,False,True,18.970464,447,212,449,1,3132750,3132988.0,60,426


In [38]:
# Convert the digested and raw BAMs of one sample to CSV alignment tables.
fid = "o1b24"

# Input BAMs and their output CSVs, matched by position.
paths = [
    f"/scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/{fid}.GRCm39.digested.bam",
    f"/scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/{fid}.GRCm39.raw.bam",
]

outpaths = [
    f'/scratch/indikar_root/indikar1/shared_data/scPore-C/{fid}.GRCm39.digested.csv',
    f'/scratch/indikar_root/indikar1/shared_data/scPore-C/{fid}.GRCm39.raw.csv',
]

# Fail fast: a length mismatch would otherwise only surface after a whole
# BAM has already been parsed.
if len(paths) != len(outpaths):
    raise ValueError(
        f"expected one output path per BAM, got {len(paths)} inputs "
        f"and {len(outpaths)} outputs"
    )

# Walk the inputs and outputs pairwise instead of indexing a parallel list.
for bam_path, outpath in zip(paths, outpaths):
    df = bam_to_df(bam_path)
    print(f"{df.shape=}")
    df.to_csv(outpath, index=False)
    print(outpath)

[E::idx_find_and_load] Could not retrieve index file for '/scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/o1b24.GRCm39.digested.bam'


df.shape=(4780553, 11)
/scratch/indikar_root/indikar1/shared_data/scPore-C/o1b24.GRCm39.digested.csv


[W::hts_idx_load3] The index file is older than the data file: /scratch/indikar_root/indikar1/cstansbu/scpc_test/minimap2/o1b24.GRCm39.raw.bam.bai


df.shape=(2315202, 11)
/scratch/indikar_root/indikar1/shared_data/scPore-C/o1b24.GRCm39.raw.csv


In [42]:
# Load the NlaIII motif locations; the file has a .csv extension but is read
# as tab-separated.
fpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/NlaIII_locations/o1b24.csv"
df = pd.read_csv(fpath, sep='\t')  # rebinds the notebook-global `df`
df.head()

Unnamed: 0,seqID,patternName,pattern,strand,start,end,matched
0,c700b24a-02db-4189-9921-ee954d36e5e2,CATG,CATG,+,46,49,CATG
1,c700b24a-02db-4189-9921-ee954d36e5e2,CATG,CATG,+,143,146,CATG
2,c700b24a-02db-4189-9921-ee954d36e5e2,CATG,CATG,+,188,191,CATG
3,c700b24a-02db-4189-9921-ee954d36e5e2,CATG,CATG,+,363,366,CATG
4,c700b24a-02db-4189-9921-ee954d36e5e2,CATG,CATG,+,539,542,CATG


In [43]:
# Tally the rows of the motif table per value of the `strand` column.
df['strand'].value_counts()

strand
+    3027675
-    3027675
Name: count, dtype: int64