In [2]:
import pysam 
from Bio.Seq import Seq
import re
import sys
import pandas as pd

In [3]:
file  = "b11"
# load the cut sites
site_path = f"/scratch/indikar_root/indikar1/cstansbu/scpc_test/sites/{file}.cut.sites.pq"
sites = pd.read_parquet(site_path)
print(f"{sites.shape=}")

# load alignments
align_path = f"/scratch/indikar_root/indikar1/cstansbu/scpc_test/align_table/{file}.alignments.pq"

align_raw = pd.read_parquet(align_path)
print(f"{align_raw.shape=}")
align_raw.head()

sites.shape=(99264, 3)
align_raw.shape=(111696, 11)


Unnamed: 0,read_name,alignment_idx,read_start,read_end,chrom,ref_start,ref_end,mapping_quality,is_reverse,is_duplicate,alignment_score
0,000089c8-3e22-4132-a2de-414295fe5107,0,0,247,,-1,,0,False,False,0
1,0000f63b-c9b1-4f0b-9a00-5d836e11c801,0,0,345,,-1,,0,False,False,0
2,00015b8d-f66a-4993-a038-727b21b1f44d,0,0,687,,-1,,0,False,False,0
3,00018ec7-f45b-5a19-8a5d-84a17d84e11f,0,0,250,,-1,,0,False,False,0
4,0002d641-96d6-4e75-a051-19430f9b2639,0,0,1869,,-1,,0,False,False,0


In [4]:
# define params
mapq_threshold = 40.0

# load data
align = pd.read_parquet(align_path)

# filter low-quality alignments
align = align[align['mapping_quality'] > mapq_threshold]

# only consider reads with multiple alignments
align['n_alignments'] = align.groupby('read_name')['alignment_idx'].transform('nunique')
align = align[align['n_alignments'] > 1]

# add alignment length
align['length_on_read'] = align['read_end'] - align['read_start']
print(f"{align.shape=}")

align.shape=(6573, 13)


In [None]:
def get_concatemers(df):
    """A function to resolve concatemers 
    from multiple alignments """
    concatemers = []

    for read_name, align_group in df.groupby('read_name'):
        column_sort = [
            'read_start', 
            'length_on_read', 
            'mapping_quality'
        ]

        sort_order = [
            True, 
            False, 
            False
        ]
        
        # sort nodes by the starting position on the read
        align_group = align_group.sort_values(by=column_sort, 
                                              ascending=sort_order)
        # get the longest, highest quality alignments
        align_group = align_group.drop_duplicates(subset='read_start')

        # find overlapping alignments
        intervals = []
        for rs, re in align_group[['read_start', 'read_end']].values:
            interval = pd.Interval(rs, re)
            intervals.append(interval)

        # drop the lower quality overlapping alignments
        to_drop = []
        for i, align_1 in enumerate(intervals):
            for j, align_2 in enumerate(intervals):
                if i == j:
                    continue
                    
                if align_1.overlaps(align_2):
                    overlaps = align_group.take([i, j])
                    overlaps = overlaps.sort_values(by=column_sort, 
                                                    ascending=sort_order)
                    to_drop.extend(overlaps['alignment_idx'].values[1:])

        # drop unique only
        to_drop = list(set(to_drop))
        align_group = align_group[~align_group['alignment_idx'].isin(to_drop)]

        if len(align_group) > 1:
            concatemers.append(align_group)

    concatemers = pd.concat(concatemers)
    return concatemers
        

cmers = get_concatemers(align)
print(cmers['read_name'].nunique())
cmers.head()

In [13]:
align_group[['read_start', 'read_end']]

NameError: name 'align_group' is not defined

In [9]:
ligated = sites[sites['n_sites'] > 0].copy()
print(f"{ligated.shape=}")
ligated.head()

ligated.shape=(26052, 3)


Unnamed: 0,read_name,n_sites,sites
2,e54d5bd7-8c3b-4ae3-80a6-761195345671,1,6
3,e49a298f-51a9-4c38-b4a6-3dccaefd48c4,6,485;1066;1087;1338;1561;1591
8,9ad4a632-2e1b-40c8-8671-e469bf78f7ee,1,3
11,e0e7f714-c179-4218-9dca-7daa330b1117,2,346;350
12,35b6178a-67d9-477e-9b8c-131acadebaae,3,1058;1130;1672


In [23]:
# filter out short reads
align = align_raw.copy()
align['n_alignments'] = align.groupby('read_name')['alignment_idx'].transform('count')
align = align[align['n_alignments'] > 1]

# filter out the low-order alignments
align = align[align['read_name'].isin(ligated['read_name'].unique())]

print(f"{align.shape=}")

# join 
align = pd.merge(align, ligated, 
                 how='left',
                 left_on='read_name',
                 right_on='read_name')

# remove low-quality mapping
align = align[align['mapping_quality'] > 40]
print(f"{align.shape=}")

# split out sites
align['sites'] = align['sites'].apply(lambda x: x.split(";"))

buffer = 100

align.head(10)


align.shape=(14210, 12)
align.shape=(6750, 14)


Unnamed: 0,read_name,alignment_idx,read_start,read_end,chrom,ref_start,ref_end,mapping_quality,is_reverse,is_duplicate,alignment_score,n_alignments,n_sites,sites
8,005c5b69-cee1-44d2-bcd9-fa83735f929c,1,0,185,13,65458054,65458248.0,52,True,False,276,4,4,"[55, 330, 798, 1111]"
9,005c5b69-cee1-44d2-bcd9-fa83735f929c,2,1015,1381,MT,4871,5240.0,60,True,False,696,4,4,"[55, 330, 798, 1111]"
10,005c5b69-cee1-44d2-bcd9-fa83735f929c,3,0,334,MT,15285,15619.0,60,True,False,648,4,4,"[55, 330, 798, 1111]"
11,0067de08-ee57-4478-8392-9fbfb67cda33,0,1310,1775,6,112134176,112134644.0,60,True,False,886,3,5,"[1336, 1515, 1673, 1910, 2134]"
12,0067de08-ee57-4478-8392-9fbfb67cda33,1,0,249,6,112135049,112135299.0,60,False,False,446,3,5,"[1336, 1515, 1673, 1910, 2134]"
13,0067de08-ee57-4478-8392-9fbfb67cda33,2,0,357,6,112138207,112138565.0,60,True,False,652,3,5,"[1336, 1515, 1673, 1910, 2134]"
14,00f4853e-e697-4518-91c3-3a1857165018,0,935,1524,4,46573150,46573737.0,60,True,False,1038,13,10,"[1126, 1154, 1296, 1409, 1458, 1539, 1671, 225..."
15,00f4853e-e697-4518-91c3-3a1857165018,1,0,134,7,108174814,108174947.0,60,False,False,242,13,10,"[1126, 1154, 1296, 1409, 1458, 1539, 1671, 225..."
18,00f4853e-e697-4518-91c3-3a1857165018,4,0,90,16,35711909,35711999.0,60,True,False,174,13,10,"[1126, 1154, 1296, 1409, 1458, 1539, 1671, 225..."
30,01056986-b130-4e27-a988-5fc165b80dd1,3,0,120,9,65170316,65170436.0,60,True,False,228,7,8,"[1147, 1367, 1408, 2400, 2516, 2599, 2744, 2822]"


In [None]:
# join

