In [2]:
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import os
import re
import pysam
from Bio.Seq import Seq
import pyranges as pr
from rapidfuzz import fuzz
import mappy as mp
import matplotlib.pyplot as plt
import seaborn as sns

# AS Summary

In [3]:
# fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/first_AS_test_03122024/other_reports/adaptive_sampling_PAS55331_1cbba94d_30992c93.csv"
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/second_AS_test_03132024/other_reports/adaptive_sampling_PAS55331_36a181e6_7709a876.csv"
sampling_summary = pd.read_csv(fpath)
print(f"{sampling_summary.shape=}")
sampling_summary.head()

sampling_summary.shape=(66026, 7)


Unnamed: 0,batch_time,read_number,channel,num_samples,read_id,sequence_length,decision
0,1710350000.0,142,2589,3003,c7433835-805f-4da3-b315-47771ddc364f,57,unblock
1,1710350000.0,61,2591,3005,885a9435-33dd-4266-b48d-b10c49551e2c,134,unblock
2,1710350000.0,409,2567,3007,7c713cfe-2581-420e-9454-abf261d125d4,72,unblock
3,1710350000.0,132,2315,3003,78668dac-c876-4453-a383-4c952be6ecd3,207,unblock
4,1710350000.0,1757,2302,3007,679b0bef-86e7-45c1-9b64-1d099e2a2cd4,212,unblock


In [4]:
sampling_summary[sampling_summary['decision'] == 'unblock']['sequence_length'].median()

389.0

# load barcodes for 2000 closest cells

In [5]:
# load the barcodes in the reference file
fpath = "/home/cstansbu/git_repositories/ONT-single-cell/notebooks/2000_closest.fasta"
truseq = "ACACTCTTTCCCTACACGACGCTCTTCCGATCT"

codes = pd.read_csv(fpath, comment=">", header=None, names=['full_seq'])
codes['barcode'] = codes['full_seq'].apply(lambda x: x.replace(truseq, ""))

codes.head()

Unnamed: 0,full_seq,barcode
0,ACACTCTTTCCCTACACGACGCTCTTCCGATCTCACTAAGAGCGTATGG,CACTAAGAGCGTATGG
1,ACACTCTTTCCCTACACGACGCTCTTCCGATCTTTCACGCCAGCTGTTA,TTCACGCCAGCTGTTA
2,ACACTCTTTCCCTACACGACGCTCTTCCGATCTGATTCGATCTACGCGG,GATTCGATCTACGCGG
3,ACACTCTTTCCCTACACGACGCTCTTCCGATCTTAGACCAGTGCCGTTG,TAGACCAGTGCCGTTG
4,ACACTCTTTCCCTACACGACGCTCTTCCGATCTATCGGATGTTCAAAGA,ATCGGATGTTCAAAGA


# load demultiplexing results

In [6]:
# fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/first_AS_test_03122024/demux/test_putative_bc.csv"
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/second_AS_test_03132024/demux/test_putative_bc.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['detected'] = df['putative_bc'].notna()
df['clostest_cell'] = df['putative_bc'].isin(codes['barcode'].values)

# annotote the reads that recived an unblock signal
unblocked = sampling_summary[sampling_summary['decision'] == 'unblock']['read_id'].values
df['unblock_signal'] = df['read_id'].isin(unblocked)

df.head()

df.shape=(356314, 7)


Unnamed: 0,read_id,putative_bc,putative_bc_min_q,putative_umi,umi_end,pre_bc_flanking,post_umi_flanking,detected,clostest_cell,unblock_signal
0,5a5cea8d-efc8-40ba-9bed-4f08fbeb50dc,,,,,,,False,False,False
1,d2003d6f-7384-4468-930d-730b89799430,,,,,,,False,False,False
2,4c4c9594-cef1-498f-8378-c23320957a50,,,,,,,False,False,False
3,edcd1150-0f2f-4e3f-9d36-d50c0e9454c2,,,,,,,False,False,False
4,9c188551-2286-4646-8324-80b8fd79475a,,,,,,,False,False,False


# Load the raw reads 

In [7]:
# load the reads
# fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/first_AS_test_03122024/merged.fastq.gz"
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/second_AS_test_03132024/merged.fastq.gz"

fastq_file = pysam.FastqFile(fpath)

fastq = []

for read in fastq_file:
    quals = read.get_quality_array()
    read_row = {
        'read_name' : read.name,
        'read_length' : len(read.sequence),
        'mean_bqual' : np.mean(quals),
        'median' : np.median(quals),
        'min_bqual' : np.min(quals),
        'seq': read.sequence,
    }
    fastq.append(read_row)

fastq  = pd.DataFrame(fastq)
fastq.head()

Unnamed: 0,read_name,read_length,mean_bqual,median,min_bqual,seq
0,5a5cea8d-efc8-40ba-9bed-4f08fbeb50dc,184,27.478261,28.0,1,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
1,d2003d6f-7384-4468-930d-730b89799430,170,4.858824,4.0,1,GCTGCTGCTGCTTGCTGCTGGCTGTTGTATTGGTAGAATAAACTGG...
2,4c4c9594-cef1-498f-8378-c23320957a50,153,28.562092,29.0,6,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
3,edcd1150-0f2f-4e3f-9d36-d50c0e9454c2,304,7.648026,6.0,1,TTTTTTTTTTTCAGCCCAAAAAAAAAAAAAAGGCCAGAGGCAATGG...
4,9c188551-2286-4646-8324-80b8fd79475a,482,8.369295,6.0,1,TTTTCACCCCCTTTGAAGGAGAGGCCACCAGGGCAAAGGCGGTGCT...


# merge raw reads

In [8]:
df = pd.merge(df, fastq,
               how='left',
               left_on='read_id',
               right_on='read_name',)

print(f"{df.shape=}")
df.head()

df.shape=(356314, 16)


Unnamed: 0,read_id,putative_bc,putative_bc_min_q,putative_umi,umi_end,pre_bc_flanking,post_umi_flanking,detected,clostest_cell,unblock_signal,read_name,read_length,mean_bqual,median,min_bqual,seq
0,5a5cea8d-efc8-40ba-9bed-4f08fbeb50dc,,,,,,,False,False,False,5a5cea8d-efc8-40ba-9bed-4f08fbeb50dc,184,27.478261,28.0,1,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
1,d2003d6f-7384-4468-930d-730b89799430,,,,,,,False,False,False,d2003d6f-7384-4468-930d-730b89799430,170,4.858824,4.0,1,GCTGCTGCTGCTTGCTGCTGGCTGTTGTATTGGTAGAATAAACTGG...
2,4c4c9594-cef1-498f-8378-c23320957a50,,,,,,,False,False,False,4c4c9594-cef1-498f-8378-c23320957a50,153,28.562092,29.0,6,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...
3,edcd1150-0f2f-4e3f-9d36-d50c0e9454c2,,,,,,,False,False,False,edcd1150-0f2f-4e3f-9d36-d50c0e9454c2,304,7.648026,6.0,1,TTTTTTTTTTTCAGCCCAAAAAAAAAAAAAAGGCCAGAGGCAATGG...
4,9c188551-2286-4646-8324-80b8fd79475a,,,,,,,False,False,False,9c188551-2286-4646-8324-80b8fd79475a,482,8.369295,6.0,1,TTTTCACCCCCTTTGAAGGAGAGGCCACCAGGGCAAAGGCGGTGCT...


# load the aligned reads

In [9]:
# fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/first_AS_test_03122024/minimap_test/AS_alignments.bam"
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/second_AS_test_03132024/minimap_test/AS_alignments.bam"

bam_file = pysam.AlignmentFile(fpath)

align = []

for read in bam_file:
    
    if read.reference_name is None:
        continue
        
    align_rec = {
        'read_name' : read.query_name,
        'detected_bc' : read.reference_name.split(":")[0],
        'mapping_quality' : read.mapping_quality,
        'read_start' : read.query_alignment_start,
        'read_end' : read.query_alignment_end,
    }
    align.append(align_rec)

align = pd.DataFrame(align)
align.head()

Unnamed: 0,read_name,detected_bc,mapping_quality,read_start,read_end
0,253eecb9-301b-4813-bfa6-0e6342017b57,TCATGCCCACTTTATC,4,19,58
1,6b075e47-6fde-4129-8cc3-fcd2ec81cca2,CATAGACTCACGGAGA,12,14,53
2,3f6e8771-8a39-43fe-91bc-fbe05cbbb4ee,CCGTTCACAAGACCTT,18,16,44
3,09b79109-a369-40b3-9492-c120253c7f3e,TTCATTGCAGAATGTA,21,44,65
4,0dfa9d78-3eb9-4221-aa4f-f8b0b46af23f,CGGACACGTGCCGGTT,0,37,64


# Merge alignments

In [10]:
df = pd.merge(df, align,
               how='left',
               left_on='read_name',
               right_on='read_name',)

df['align_bc'] = df['detected_bc'].notna()
df['align_demux_match'] = df['putative_bc'] == df['detected_bc']
print(f"{df.shape=}")

df.head()

df.shape=(357115, 22)


Unnamed: 0,read_id,putative_bc,putative_bc_min_q,putative_umi,umi_end,pre_bc_flanking,post_umi_flanking,detected,clostest_cell,unblock_signal,...,mean_bqual,median,min_bqual,seq,detected_bc,mapping_quality,read_start,read_end,align_bc,align_demux_match
0,5a5cea8d-efc8-40ba-9bed-4f08fbeb50dc,,,,,,,False,False,False,...,27.478261,28.0,1,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,,,,,False,False
1,d2003d6f-7384-4468-930d-730b89799430,,,,,,,False,False,False,...,4.858824,4.0,1,GCTGCTGCTGCTTGCTGCTGGCTGTTGTATTGGTAGAATAAACTGG...,,,,,False,False
2,4c4c9594-cef1-498f-8378-c23320957a50,,,,,,,False,False,False,...,28.562092,29.0,6,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,,,,,False,False
3,edcd1150-0f2f-4e3f-9d36-d50c0e9454c2,,,,,,,False,False,False,...,7.648026,6.0,1,TTTTTTTTTTTCAGCCCAAAAAAAAAAAAAAGGCCAGAGGCAATGG...,,,,,False,False
4,9c188551-2286-4646-8324-80b8fd79475a,,,,,,,False,False,False,...,8.369295,6.0,1,TTTTCACCCCCTTTGAAGGAGAGGCCACCAGGGCAAAGGCGGTGCT...,,,,,False,False


# summarize

In [11]:
print(df['detected'].value_counts(normalize=True).to_string())
print()
print(df['clostest_cell'].value_counts(normalize=True).to_string())
print()
print(df['unblock_signal'].value_counts(normalize=True).to_string())
print()
print(df['align_bc'].value_counts(normalize=True).to_string())
print()
print(df['align_demux_match'].value_counts(normalize=True).to_string())

detected
False    0.534357
True     0.465643

clostest_cell
False    0.838836
True     0.161164

unblock_signal
False    0.838467
True     0.161533

align_bc
False    0.705997
True     0.294003

align_demux_match
False    0.841295
True     0.158705


In [54]:
# some filtering
min_base_q = 7

pdf = df.copy()

# drop the reads that recieved an unblock signal
pdf = pdf[~pdf['unblock_signal']]
pdf = pdf[pdf['mean_bqual'] > min_base_q]
print(f"{pdf.shape=}")

cols = [
    'putative_bc', 
    'clostest_cell', 
    'umi_end',
    'read_length', 
    'detected_bc', 
    'mapping_quality',
    'read_start',
    'align_demux_match',
]


for idx, row in pdf.sample(2).iterrows():

    print(f"\n================ {row['read_id']} ================")
    seq = row['seq']
    demux = row['putative_bc']
    if len(str(demux)) > 4:
        print(demux)
        

    
    print(row[cols].to_string())
    print()
    print(seq)



pdf.shape=(274747, 22)

putative_bc            NaN
clostest_cell        False
umi_end                NaN
read_length            765
detected_bc            NaN
mapping_quality        NaN
read_start             NaN
align_demux_match    False

TGTTATGTACGTACCTTTAAACCATTGCGTATTGCTAGCAGTGGTATCAACACAAGTACATGGCATCCACCTCCTGCTGGCAGAACTCCTTGACAGTCCGTCCACCCTCGATGAAGTGCTCAAAGAACTTGCTCTCGCGCTGCAGGTAGCCCGAGGTGAGCAGCCGCAGGTAGACCACAAGGTAGTGGGAGGTGCTCTGGTCATTGAAGGAGGCCAGCAGGTCAGCAACAGAGTCTCTCCACCTGCTCAATCAGGTCCATGAACGTGTTGTGGAAATCCTGATTGTAAAATCAGTGAAGCCCTGGGACACCAGGTCTTCCTTGCTCTTGGCAGACACAGCCTTGAACCGCTGCAACTCCTTGCTGTCATCCAGCAGTGCCTCCAAGTGGGAGAATCCAAAGCCCGATAGAAACAGTTGCCGTCAGGCCTGGTATTGCGGTGTACGAGTACTTTTTGTGGAGGTCCTTGATCTTATCTTGATGGTGTCATCTTCAGCATACTCCTTGTATCAGACCAGAGCTCCAGCCGCTCTGACACCAGAGGGTTCTGCACAGCAATCTCTTGCTGAATTCGGTCCTGCTGAGCCATGATGGCTTCGTCATAGGCCAGACGGTCCAGCACGCTGGAGTCGCTCCCAGCGGCTCCTGCTTCTGCTGCTGAGGTTCCTCCGCCGCCATCTTTAAGCAGCGCCGCACTCCCCATGTACTCTGCGTTGATACCACTGCTTGCAATATCAGCACCAACAGAAAGCTGAGCAAT

In [56]:
pdf['sign'] = np.sign(pdf['umi_end'])
pdf['sign'].value_counts(dropna=False, normalize=True)

sign
 NaN    0.496999
-1.0    0.476886
 1.0    0.026115
Name: proportion, dtype: float64

In [17]:
pdf.columns

Index(['read_id', 'putative_bc', 'putative_bc_min_q', 'putative_umi',
       'umi_end', 'pre_bc_flanking', 'post_umi_flanking', 'detected',
       'clostest_cell', 'unblock_signal', 'read_name', 'read_length',
       'mean_bqual', 'median', 'min_bqual', 'seq', 'detected_bc',
       'mapping_quality', 'read_start', 'read_end', 'align_bc',
       'align_demux_match'],
      dtype='object')

In [10]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [19]:
400 * 0.6

240.0

# Throughput summary

In [None]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/first_AS_test_03122024/throughput_PAS55331_1cbba94d_30992c93.csv"
df = pd.read_csv(fpath)
print(f"{df.shape=}")
df.head()

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

sns.lineplot(data=df, 
             x='Experiment Time (minutes)',
             y='Reads',
             label='Reads')

sns.lineplot(data=df, 
             x='Experiment Time (minutes)',
             y='Basecalled Reads Passed',
             label='Basecalled Reads')

sns.despine()

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3.5, 2.5

df['Percent Basecalled'] = df['Basecalled Reads Passed'] / df['Reads']

sns.lineplot(data=df, 
             x='Experiment Time (minutes)',
             y='Percent Basecalled')

sns.despine()