In [43]:
import pandas as pd
import numpy as np
import scanpy as sc
import sys
import os
import pysam
import mappy as mp
import re
import matplotlib.pyplot as plt
import seaborn as sns
from Bio.Seq import Seq
from collections import Counter
import io

In [5]:
# Load the barcode-location table. The file carries a .csv extension but is
# read as tab-separated.
fpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/barcode_locations/o3b24.csv"
codes = pd.read_csv(fpath, sep='\t')

# exclude miscalled barcodes
# .copy() makes the filtered frame independent of the raw table, so the column
# assignments below do not write into a slice (avoids SettingWithCopyWarning).
codes = codes[codes['patternName'] == 'b24'].copy()

# count the barcodes per read: once per (read, strand) pair and once per read
codes['n_barcode_per_strand'] = codes.groupby(['seqID', 'strand'])['patternName'].transform('count')
codes['n_barcode'] = codes.groupby('seqID')['patternName'].transform('count')

print(f"{codes.shape=}")
codes.head()

codes.shape=(172653, 9)


Unnamed: 0,seqID,patternName,pattern,strand,start,end,matched,n_barcode_per_strand,n_barcode
0,2a2a0067-90d8-480d-b863-6ec4097df341,b24,GCATAGTTCTGCATGATGGGTTAG,+,40,63,GCATAGTTCTGCATGATGGGTTAG,1,2
1,2a2a0067-90d8-480d-b863-6ec4097df341,b24,GCATAGTTCTGCATGATGGGTTAG,-,1561,1584,GCATAGTTCTGCATGATGGGTTAG,1,2
2,a830eca5-7897-4d18-95cf-0014429e2751,b24,GCATAGTTCTGCATGATGGGTTAG,+,43,66,GCATAGTTCTGCATGATGGGTTAG,1,2
3,a830eca5-7897-4d18-95cf-0014429e2751,b24,GCATAGTTCTGCATGATGGGTTAG,-,1456,1479,GCATAGTTCTGCATGATGGGTTAG,1,2
4,d8ed6771-2c05-4367-aafe-4c9fa9091f82,b24,GCATAGTTCTGCATGATGGGTTAG,+,35,58,GCATAGTTCTGCATGATGGGTTAG,1,2


In [6]:
# Keep reads whose total number of barcode matches (the per-read `n_barcode`
# count, not the per-strand count) equals `n_codes`.
n_codes = 2
has_n_codes = codes['n_barcode'] == n_codes
multi = codes.loc[has_n_codes].copy()

# Row count versus distinct read count for the retained subset.
print(f"{multi.shape=}")
print(f"{multi['seqID'].nunique()=}")
multi.head()

multi.shape=(138656, 9)
multi['seqID'].nunique()=69328


Unnamed: 0,seqID,patternName,pattern,strand,start,end,matched,n_barcode_per_strand,n_barcode
0,2a2a0067-90d8-480d-b863-6ec4097df341,b24,GCATAGTTCTGCATGATGGGTTAG,+,40,63,GCATAGTTCTGCATGATGGGTTAG,1,2
1,2a2a0067-90d8-480d-b863-6ec4097df341,b24,GCATAGTTCTGCATGATGGGTTAG,-,1561,1584,GCATAGTTCTGCATGATGGGTTAG,1,2
2,a830eca5-7897-4d18-95cf-0014429e2751,b24,GCATAGTTCTGCATGATGGGTTAG,+,43,66,GCATAGTTCTGCATGATGGGTTAG,1,2
3,a830eca5-7897-4d18-95cf-0014429e2751,b24,GCATAGTTCTGCATGATGGGTTAG,-,1456,1479,GCATAGTTCTGCATGATGGGTTAG,1,2
4,d8ed6771-2c05-4367-aafe-4c9fa9091f82,b24,GCATAGTTCTGCATGATGGGTTAG,+,35,58,GCATAGTTCTGCATGATGGGTTAG,1,2


In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell
# always fails. Presumably it is a deliberate stopper that halts "Run All" at
# this point — confirm, and consider removing it so the notebook runs top to bottom.
break

In [36]:
# Scan the raw FASTQ for reads that contain the exact construct
#     pre_code + barcode + post_code + adapter
# and record the `buffer` bases that immediately follow each match.

fpath = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/fastq/o3b24.raw.fastq"

barcode = 'GCATAGTTCTGCATGATGGGTTAG'
pre_code = "GGTGCTG"
post_code = "TTAACCT"
adapters = [
    'TTCTGTTGGTGCTGATATTGC',
    'ACTTGCCTGTCGCTCTATCTTC',
]

buffer = 24   # number of bases captured right after each match
stop = 10000  # maximum number of reads to scan

# The search strings depend only on the constants above, so build them once
# rather than once per read.
targets = [
    (adapter, f"{pre_code}{barcode}{post_code}{adapter}")
    for adapter in adapters
]

# Explicit column list: an empty result still yields a frame with these
# columns, so the summaries below do not fail with a KeyError.
flanking_columns = [
    'read_name',
    'position',
    'end_pos',
    'adapter',
    'flanking_sequence',
    'seq',
]
flanking_seqs = []

# The context manager closes the file handle even if the loop raises.
with pysam.FastqFile(fpath) as fastq_file:
    for count, read in enumerate(fastq_file):
        if count == stop:
            break

        seq = read.sequence

        for adapter, search in targets:
            # `offset` is kept as a cell-level name because a later cell displays it.
            offset = len(search)
            res = seq.find(search)  # index of the first exact occurrence, -1 if absent
            if res == -1:
                continue

            end_pos = res + offset + buffer  # index just past the captured flank
            flanking_seqs.append({
                'read_name': read.name,
                'position': res,
                'end_pos': end_pos,
                'adapter': adapter,
                'flanking_sequence': seq[res + offset:end_pos],
                'seq': seq,
            })

flanking_seqs = pd.DataFrame(flanking_seqs, columns=flanking_columns)

print(flanking_seqs['adapter'].value_counts())
print()
print(flanking_seqs[['adapter', 'position']].value_counts().head(5))

adapter
TTCTGTTGGTGCTGATATTGC     4030
ACTTGCCTGTCGCTCTATCTTC    3239
Name: count, dtype: int64

adapter                 position
TTCTGTTGGTGCTGATATTGC   32          479
                        33          455
                        34          418
ACTTGCCTGTCGCTCTATCTTC  32          403
TTCTGTTGGTGCTGATATTGC   37          400
Name: count, dtype: int64


In [61]:
# Inspect `offset`: the FASTQ-scanning cell sets it to len(search), so this
# shows the length of the most recently built search string.
offset

60

In [37]:
# Number of recorded hits per read, then the share of reads having each hit count.
hits_per_read = flanking_seqs.loc[:, 'read_name'].value_counts()
t = hits_per_read
hits_per_read.value_counts(normalize=True)

count
1    0.997241
2    0.002759
Name: proportion, dtype: float64

In [38]:
# Tally every (adapter, flanking sequence) pair; the empty first argument
# emits the same leading blank line before the table.
pair_counts = flanking_seqs[['adapter', 'flanking_sequence']].value_counts()
print('', pair_counts, sep='\n')


adapter                 flanking_sequence       
TTCTGTTGGTGCTGATATTGC   GGCGTCTGCTTGGGTGTTTAACCT    2759
ACTTGCCTGTCGCTCTATCTTC  GGCGTCTGCTTGGGTGTTTAACCT    2592
TTCTGTTGGTGCTGATATTGC   GGTGCTGAACAACCGAACCTTTGA     111
ACTTGCCTGTCGCTCTATCTTC  AGCATATGCTTGGGTGTTTAACCT      83
TTCTGTTGGTGCTGATATTGC   AGCATATGCTTGGGTGTTTAACCT      80
                                                    ... 
                        AGCATATGCTGGGTGTTTAACCTA       1
                        AGCATATGCTGGGTGTTTAACCTT       1
                        AGCATATGCTTGGGTGCTTAACCT       1
                        AGCATATGCTTGGGTGTTTAAACC       1
                        TGTGCTGTGGAAGATGAGACCCTG       1
Name: count, Length: 713, dtype: int64


# Align downstream read sequence to the reference genome

In [44]:
# Load (or build) the minimap2 index for the reference FASTA.
ref_path = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/references/GRCm39.fa"
a = mp.Aligner(ref_path)

# mappy does not raise when the index cannot be loaded or built; it returns an
# aligner object that evaluates false. Fail loudly here instead of printing
# 'done' and producing zero alignments later.
if not a:
    raise RuntimeError(f"failed to load/build minimap2 index from {ref_path}")
print('done')

done


In [54]:
# Map the stretch of each read that follows a recorded hit against the
# reference index, keeping one output row per reported alignment.
buffer = 250  # number of bases after `end_pos` handed to the aligner

# Explicit column list: if nothing aligns, the resulting frame still has these
# columns, so downstream lookups such as res['read_name'] keep working.
map_columns = [
    'read_name',
    'adapter',
    'read_start_pos',
    'chrom',
    'ref_start',
    'mapq',
    'is_primary',
]

res = []
# itertuples avoids building a Series for every row, as iterrows does.
for row in flanking_seqs.itertuples(index=False):
    start_pos = row.end_pos  # recorded end of the hit; mapping starts here
    search_seq = row.seq[start_pos:start_pos + buffer]

    # The read may end at or before `end_pos`; there is nothing to align then.
    if not search_seq:
        continue

    for hit in a.map(search_seq):  # traverse alignments
        res.append({
            'read_name': row.read_name,
            'adapter': row.adapter,
            'read_start_pos': start_pos,
            'chrom': hit.ctg,
            'ref_start': hit.r_st,
            'mapq': hit.mapq,
            'is_primary': hit.is_primary,
        })

res = pd.DataFrame(res, columns=map_columns)
res.head()

Unnamed: 0,read_name,adapter,read_start_pos,chrom,ref_start,mapq,is_primary
0,2a2a0067-90d8-480d-b863-6ec4097df341,ACTTGCCTGTCGCTCTATCTTC,116,4,106565530,60,True
1,a830eca5-7897-4d18-95cf-0014429e2751,ACTTGCCTGTCGCTCTATCTTC,119,11,72604180,60,True
2,a830eca5-7897-4d18-95cf-0014429e2751,ACTTGCCTGTCGCTCTATCTTC,119,2,98497393,4,True
3,a830eca5-7897-4d18-95cf-0014429e2751,ACTTGCCTGTCGCTCTATCTTC,119,2,98496908,0,False
4,a830eca5-7897-4d18-95cf-0014429e2751,ACTTGCCTGTCGCTCTATCTTC,119,14,3052659,0,False


In [57]:
# Distinct-read funnel: reads with a recorded hit, reads with at least one
# alignment, and reads with at least one alignment of mapq above 55.
n_reads_with_hit = flanking_seqs['read_name'].nunique()
n_reads_aligned = res['read_name'].nunique()

t = res.loc[res['mapq'] > 55]
n_reads_high_mapq = t['read_name'].nunique()

# A double-newline separator reproduces the blank line between the three numbers.
print(n_reads_with_hit, n_reads_aligned, n_reads_high_mapq, sep='\n\n')

7249

6695

5418


In [59]:
# Fraction of reads with a recorded hit that also have an alignment with
# mapq > 55 (`t` is the mapq-filtered alignment table). Computed from the
# frames instead of hand-copied counts, so it cannot go stale on a re-run.
t['read_name'].nunique() / flanking_seqs['read_name'].nunique()

0.7474134363360464