In [1]:
import pandas as pd
import numpy as np
import cooler
import matplotlib.pyplot as plt
import matplotlib.colors
from matplotlib.patches import Patch
import scipy.io
import pysam


from sklearn.decomposition import PCA
from scipy import stats
import networkx as nx
from sklearn.metrics import jaccard_score

import os
import sys
from importlib import reload

import filters
import pore_c_utils as pcu
import binning_utils as binning
import plotting as hicPlot
import nanoporeDataModel as ndm

In [2]:
# Read the contact table from CSV; later cells use its 'cell' and
# 'read_name' columns.
filepath = "filtered_data/clique1MBfiltered.csv"
df = pd.read_csv(filepath)

# Single print call, newline-separated: the shape first, then the column index.
print(f"{df.shape=}", df.columns, sep="\n")

# Last expression of the cell, so the first rows are rendered as the output.
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


df.shape=(636130, 65)
Index(['index', 'read_name', 'read_length', 'read_idx', 'contact_is_direct',
       'contact_is_cis', 'contact_read_distance', 'contact_genome_distance',
       'contact_fragment_adjacent', 'contact_fragment_distance',
       'haplotype_pair_type', 'align1_align_idx', 'align1_chrom',
       'align1_start', 'align1_end', 'align1_strand', 'align1_mapping_quality',
       'align1_align_score', 'align1_align_base_qscore', 'align1_phase_set',
       'align1_phase_qual', 'align1_haplotype', 'align1_fragment_id',
       'align1_fragment_start', 'align1_fragment_end', 'align2_align_idx',
       'align2_chrom', 'align2_start', 'align2_end', 'align2_strand',
       'align2_mapping_quality', 'align2_align_score',
       'align2_align_base_qscore', 'align2_phase_set', 'align2_haplotype',
       'align2_fragment_id', 'align2_fragment_start', 'align2_fragment_end',
       'batch_id', 'align1_chromosome_name', 'align1_chrom_length',
       'align1_genbank', 'align1_refSeq', 'ali

Unnamed: 0,index,read_name,read_length,read_idx,contact_is_direct,contact_is_cis,contact_read_distance,contact_genome_distance,contact_fragment_adjacent,contact_fragment_distance,...,align1_absolute_midpoint,align2_absolute_midpoint,mean_mapping_quality,contact_count,align1_fragcount,align2_fragcount,align1_n_reads,align2_n_reads,contact_has_support,cell
0,27771,132f1be4-df3c-4a1f-bc53-40df856b0c42,4312,35703,False,True,1619,11803,False,12375,...,2630943949,2630956324,50.0,1,4,3,,,1,run01
1,206517,b0e13742-1d17-47d9-a017-668f37b6788f,2616,7786,False,True,1432,714701,False,715354,...,2630229012,2630944366,96.0,1,4,2,,,1,run01
2,117363,330b862a-138c-48d7-9534-be879100a320,1022,31146,True,True,1,1378,False,1844,...,2630227938,2630229781,49.5,1,3,2,,,1,run01
3,468866,af44682d-de48-467d-8cd2-f79d45b85f69,1364,36416,True,True,254,729058,False,729603,...,2630227390,2630956993,47.5,1,2,3,,,1,run01
4,180875,7470a373-22cb-49b2-a0df-9e5279d9559c,3386,7678,False,True,866,716159,False,716559,...,2630227390,2630943949,50.5,1,2,4,,,1,run01


In [4]:
# Directory that is listed to find each run's '.coord_sort.bam' alignment files.
# NOTE(review): this is an absolute path, so the cell only runs on a machine
# where this location exists — consider making it configurable.
bampathDir = '/nfs/turbo/umms-indikar/shared/tools/Pore-C-Snakemake/results_ES5to12/mapping/'

def sam2SequenceFrame(bam):
    """Convert the alignments of a BAM file into a table, one row per alignment.

    Params:
        bam: an iterable of alignments (e.g. an open pysam.AlignmentFile).
            Each alignment's query name must have the form
            "<read_name>:<read_idx>:<align_idx>".

    Returns:
        pd.DataFrame with the columns read_name, read_idx, align_idx, seq,
        read_length, read_start and read_end. read_idx and align_idx are kept
        as the strings obtained from splitting the query name. When `bam`
        yields no alignments the frame is empty but still has these columns,
        so downstream column access does not fail.

    Raises:
        ValueError: if a query name does not split into exactly three
            ":"-separated fields.
    """
    columns = [
        'read_name',
        'read_idx',
        'align_idx',
        'seq',
        'read_length',
        'read_start',
        'read_end',
    ]

    newRows = []
    for align in bam:
        read_name, read_idx, align_idx = align.query_name.split(":")
        newRows.append({
            'read_name': read_name,
            'read_idx': read_idx,
            'align_idx': align_idx,
            # `query_sequence` is the supported name; `seq` is a deprecated alias.
            'seq': align.query_sequence,
            'read_length': align.infer_read_length(),
            'read_start': align.query_alignment_start,
            'read_end': align.query_alignment_end,
        })

    # Passing `columns` fixes the schema even when there are no rows.
    return pd.DataFrame(newRows, columns=columns)
    

# Maps each run id found in df['cell'] to one table holding the alignments of
# every '.coord_sort.bam' file in bampathDir whose name contains that run id.
seqs = {}

for runId in df['cell'].unique():

    print(f"{runId=}")
    dfList = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the concatenated table reproducible between runs.
    for file in sorted(os.listdir(bampathDir)):
        if runId in file and file.endswith('.coord_sort.bam'):
            bamPath = os.path.join(bampathDir, file)

            # Run id as encoded in the file name. It gets its own variable so
            # the outer loop variable (used for matching and as the dict key)
            # is never overwritten while files are still being scanned.
            fileRunId = file.split("_")[1]

            # The context manager closes the BAM handle once it has been read.
            with pysam.AlignmentFile(bamPath) as bam:
                readDf = sam2SequenceFrame(bam)

            readDf['file'] = file
            readDf['runId'] = fileRunId

            dfList.append(readDf)

    if not dfList:
        # pd.concat raises on an empty list; report the run and move on.
        print(f"no '.coord_sort.bam' files found for {runId=}; skipping")
        continue

    cellDf = pd.concat(dfList, ignore_index=True)
    print(f"{cellDf.shape=}")
    seqs[runId] = cellDf


seqs.keys()

runId='run01'
cellDf.shape=(6226401, 9)
cellDf.shape=(6226401, 9)
runId='run02'
cellDf.shape=(5946864, 9)
cellDf.shape=(5946864, 9)
runId='run04'
cellDf.shape=(8193791, 9)
cellDf.shape=(8193791, 9)
runId='run05'
cellDf.shape=(2802500, 9)
cellDf.shape=(2802500, 9)
runId='run06'
cellDf.shape=(3650575, 9)
cellDf.shape=(3650575, 9)
runId='run07'
cellDf.shape=(4258406, 9)
cellDf.shape=(4258406, 9)


dict_keys(['run01', 'run02', 'run04', 'run05', 'run06', 'run07'])

In [5]:
# Restrict every run's table to alignments whose read_name occurs in df.
# Only existing keys are reassigned, so the dict keeps its size while it is
# being iterated.
for runId, data in seqs.items():
    print(f"{runId=}")
    print(f"{data.shape=}")

    # Boolean mask: True where the alignment's read is present in df.
    inContacts = data['read_name'].isin(df['read_name'])
    data = data[inContacts]

    # Shape after filtering, followed by a blank separator line.
    print(f"{data.shape=}", end="\n\n")

    seqs[runId] = data

print('done')

runId='run01'
data.shape=(6226401, 9)
data.shape=(322752, 9)

runId='run02'
data.shape=(5946864, 9)
data.shape=(170051, 9)

runId='run04'
data.shape=(8193791, 9)
data.shape=(214386, 9)

runId='run05'
data.shape=(2802500, 9)
data.shape=(233556, 9)

runId='run06'
data.shape=(3650575, 9)
data.shape=(237408, 9)

runId='run07'
data.shape=(4258406, 9)
data.shape=(408510, 9)

done


In [6]:
# Write one CSV per run into filtered_data/, without the index column.
for runId in seqs:
    data = seqs[runId]
    filepath = f'filtered_data/{runId}_sequences.csv'

    data.to_csv(path_or_buf=filepath, index=False)
    print(f'saved: {filepath}')

print('done.')

saved: filtered_data/run01_sequences.csv
saved: filtered_data/run02_sequences.csv
saved: filtered_data/run04_sequences.csv
saved: filtered_data/run05_sequences.csv
saved: filtered_data/run06_sequences.csv
saved: filtered_data/run07_sequences.csv
done.
