In [1]:
import sys
import re
import os
import numpy as np
import pandas as pd
import pysam
from Bio.Seq import Seq
from Bio import Restriction
from Bio import SeqIO

In [50]:
def _match_starts(needle, haystack):
    """Return the start offsets of the non-overlapping literal matches of ``needle``."""
    # re.escape keeps the match literal, i.e. the same semantics as str.count.
    return [m.start() for m in re.finditer(re.escape(needle), haystack)]


def _join_or_sentinel(positions):
    """Join positions with ';', or return the '-1' sentinel when there are none."""
    return ";".join(str(p) for p in positions) if positions else "-1"


def _quality_summary(quals):
    """Summarise base-call qualities as plain ints; every field is -1 when there are none."""
    if quals is None or len(quals) == 0:
        # Records without quality values cannot be summarised; use the same
        # -1 sentinel that marks "no site found" in the position columns.
        return {
            'mean_base_quality' : -1,
            'median_base_quality' : -1,
            'min_base_quality' : -1,
            'max_base_quality' : -1,
        }
    return {
        'mean_base_quality' : int(np.mean(quals)),
        'median_base_quality' : int(np.median(quals)),
        'min_base_quality' : int(np.min(quals)),
        'max_base_quality' : int(np.max(quals)),
    }


def parse_fastq(fpath, rb, barcode, barcode_rc):
    """Summarise every read of a FASTQ file in one DataFrame row.

    Parameters
    ----------
    fpath : str
        Path handed to ``pysam.FastxFile``.
    rb : object
        Object whose ``search(Seq)`` returns a mapping of enzyme -> cut
        positions (a ``Restriction.RestrictionBatch`` in this notebook).
        Only the first entry of that mapping is used.
    barcode : str
        Barcode sequence to look for in each read.
    barcode_rc : str
        Reverse complement of ``barcode``, searched for independently.

    Returns
    -------
    pandas.DataFrame
        One row per read with the read name, sequence length, number and
        positions of enzyme sites, number and start offsets of forward and
        reverse-complement barcode matches, and mean/median/min/max base
        quality. Position columns are ';'-joined strings, '-1' when nothing
        was found; quality columns are -1 when the read has no qualities.
        The columns are present even when the file contains no reads.
    """
    columns = [
        'read_name',
        'seq_length',
        'n_enzymes',
        'enzyme_sites',
        'n_barcode_forward',
        'forward_sites',
        'n_barcode_reverse_comp',
        'reverse_comp_sites',
        'mean_base_quality',
        'median_base_quality',
        'min_base_quality',
        'max_base_quality',
    ]

    res = []
    # The context manager closes the file handle even if a read fails to parse.
    with pysam.FastxFile(fpath) as fastx:
        for read in fastx:
            read_seq = read.sequence

            # look for restriction sites (first enzyme of the batch only)
            search_results = rb.search(Seq(read_seq))
            sites = list(search_results.values())[0]

            # barcode searching: one scan yields both the offsets and the count
            forward_sites = _match_starts(barcode, read_seq)
            reverse_comp_sites = _match_starts(barcode_rc, read_seq)

            row = {
                'read_name' : read.name,
                'seq_length' : len(read_seq),
                'n_enzymes' : len(sites),
                'enzyme_sites' : _join_or_sentinel(sites),
                'n_barcode_forward' : len(forward_sites),
                'forward_sites' : _join_or_sentinel(forward_sites),
                'n_barcode_reverse_comp' : len(reverse_comp_sites),
                'reverse_comp_sites' : _join_or_sentinel(reverse_comp_sites),
            }
            # get the base call qualities
            row.update(_quality_summary(read.get_quality_array()))
            res.append(row)

    return pd.DataFrame(res, columns=columns)


# Inputs for this run.
fastq_path = "/scratch/indikar_root/indikar1/cstansbu/scpc_test/fastq/b01.raw.fastq"
barcode_path = "../config/barcodes.txt"
enzyme = 'NlaIII'

# set up restriction enzyme
rb = Restriction.RestrictionBatch([enzyme])

# The cell id is the FASTQ file name up to its first "." ("b01" for "b01.raw.fastq").
barcode_id = os.path.basename(fastq_path).split(".")[0]
code_df = pd.read_csv(barcode_path)

# Look the barcode up explicitly so a missing cell id fails with a clear message
# instead of a bare IndexError.
matches = code_df.loc[code_df['cell_id'] == barcode_id, 'barcode']
if matches.empty:
    raise ValueError(f"no barcode listed for cell_id {barcode_id!r} in {barcode_path}")
barcode = matches.iloc[0]
barcode_rc = str(Seq(barcode).reverse_complement())

df = parse_fastq(fastq_path, rb, barcode, barcode_rc)
df.head()

Unnamed: 0,read_name,seq_length,n_enzymes,enzyme_sites,n_barcode_forward,forward_sites,n_barcode_reverse_comp,reverse_comp_sites,mean_base_quality,median_base_quality,min_base_quality,max_base_quality
0,d2e216fe-c419-430d-9db0-fe09922fdc83,532,0,-1,5,103;162;221;397;456,0,-1,36,41,2,50
1,69df1420-7e2b-43b4-b44d-a2fc93f85617,518,0,-1,4,42;101;283;401,0,-1,34,40,2,50
2,fe7ece18-4bef-42ae-b7ef-e1f4ceeceba0,577,1,13,2,342;401,0,-1,13,11,3,41
3,4fbae019-2d11-48a5-9747-c06a9c890ee4,492,0,-1,0,-1,4,148;229;310;367,34,39,3,50
4,7e329a6b-4628-4d4d-8c02-61d80a3999dd,1158,0,-1,6,58;115;177;305;426;485,0,-1,36,41,2,50


0

In [3]:
# NOTE(review): `break` outside a loop is a SyntaxError (see the recorded output), so this
# cell always fails; presumably it is here to halt "Run All" at this point — confirm.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Build the Biopython restriction batch for `enzyme`; `enzyme` must already be defined.
rb = Restriction.RestrictionBatch([enzyme])

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails;
# presumably it is here to halt "Run All" before the cells below — confirm.
break

In [None]:
# Read the ONT barcode table and build a Barcode -> Sequence lookup dict.
fpath = "../resources/ONT_barcode_96_sequences.csv"
df = pd.read_csv(fpath)
codes = dict(zip(df["Barcode"].values, df["Sequence"].values))
df.head()

In [None]:
# for _, record in res.sample(40).iterrows():
#     barcode = record['barcode']
#     read_name = record['read_name']
#     ont_seq = codes[barcode]
#     read_seq = record['sequence']
#     seq_len = record["seq_length"]
#     n_NlaIII = record["n_NlaIII"]

#     rev_comp = str(Seq(barcode).reverse_complement())

#     has_code = ont_seq in read_seq
#     has_code_rev = rev_comp in read_seq
#     print(f"{barcode=} {n_NlaIII=} {seq_len=} {has_code=}")

In [None]:
# Count, for every read in `res`, how often its ONT barcode and the barcode's
# reverse complement occur in the read sequence.
records = []   # collected in a list so `df` is only ever bound to a DataFrame
rc_cache = {}  # barcode -> reverse-complemented ONT sequence, computed once per barcode

for _, record in res.iterrows():
    barcode = record['barcode']
    ont_seq = codes[barcode]
    read_seq = record['sequence']

    if barcode not in rc_cache:
        rc_cache[barcode] = str(Seq(ont_seq).reverse_complement())

    records.append({
        'run' : record['run'],
        'read_name' : record['read_name'],
        'barcode' : barcode,
        'seq_len' : record["seq_length"],
        'n_NlaIII' : record["n_NlaIII"],
        'n_barcodes' : read_seq.count(ont_seq),
        'n_barcodes_rc' : read_seq.count(rc_cache[barcode]),
        'mean_base_quality' : record['mean_base_quality'],
    })

df = pd.DataFrame(records)
df.head()


# count NlaIII sites

In [None]:
# Label each read by its NlaIII site count: 0 -> 'none', 1 -> 'pair', anything else -> 'higher'.
df['ord'] = np.select(
    [df['n_NlaIII'] == 0, df['n_NlaIII'] == 1],
    ['none', 'pair'],
    default='higher',
)
df['ord'].value_counts(normalize=True)

# count barcodes

In [None]:
# Label each read by its forward-barcode count: 1 -> 'one', >1 -> 'mulitple', otherwise 'none'.
df['cat'] = np.select(
    [df['n_barcodes'] > 1, df['n_barcodes'] == 1],
    ['mulitple', 'one'],
    default='none',
)

print(df.shape)
print(df['barcode'].nunique())
df['cat'].value_counts(normalize=True)

In [None]:
# Same labelling as for the forward barcode, applied to the reverse-complement count.
df['cat_rc'] = np.select(
    [df['n_barcodes_rc'] > 1, df['n_barcodes_rc'] == 1],
    ['mulitple', 'one'],
    default='none',
)

print(df.shape)
print(df['barcode'].nunique())
df['cat_rc'].value_counts(normalize=True)

In [None]:
# Joint share of reads by forward-barcode label (rows) and NlaIII label (columns).
plt.rcParams.update({'figure.figsize': (5, 3), 'figure.dpi': 200})

M = pd.crosstab(index=df['cat'], columns=df['ord'], normalize=True)
print(M)
sns.heatmap(M, annot=True, fmt=".2f", lw=1, cbar=False)

In [None]:
# Joint share of reads by reverse-complement barcode label (rows) and NlaIII label (columns).
plt.rcParams.update({'figure.figsize': (5, 3), 'figure.dpi': 200})

M = pd.crosstab(index=df['cat_rc'], columns=df['ord'], normalize=True)
print(M)
sns.heatmap(M, annot=True, fmt=".2f", lw=1, cbar=False)

In [None]:
# Joint share of reads by forward label (rows) and reverse-complement label (columns).
plt.rcParams.update({'figure.figsize': (5, 3), 'figure.dpi': 200})

M = pd.crosstab(index=df['cat'], columns=df['cat_rc'], normalize=True)
print(M)
sns.heatmap(M, annot=True, fmt=".2f", lw=1, cbar=False)

# number of reads with exactly one barcode (RC + forward)

In [None]:
# Keep reads with exactly one forward barcode AND exactly one reverse-complement barcode
# (restricting both counts to {0, 1} and then dropping zeros leaves only the ones).
one_forward = df['n_barcodes'] == 1
one_reverse = df['n_barcodes_rc'] == 1
test = df[one_forward & one_reverse]

print(f"{test.shape=}")
print(test['ord'].value_counts(normalize=True))

test.head()

In [None]:
# Mean base quality grouped by forward-barcode count (outliers hidden).
plt.rcParams.update({'figure.figsize': (8, 2.5), 'figure.dpi': 200})

sns.boxplot(data=df, x='n_barcodes', y='mean_base_quality', showfliers=False)

plt.show()

In [None]:
# Read length grouped by forward-barcode count (outliers hidden).
plt.rcParams.update({'figure.figsize': (8, 2.5), 'figure.dpi': 200})

sns.boxplot(data=df, x='n_barcodes', y='seq_len', showfliers=False)
plt.show()

In [None]:
# Forward-barcode count grouped by run.
plt.rcParams.update({'figure.figsize': (4, 3), 'figure.dpi': 200})

sns.boxplot(data=df, x='run', y='n_barcodes')
plt.show()

In [None]:
# Forward-barcode count grouped by barcode (outliers hidden, x tick labels removed).
plt.rcParams.update({'figure.figsize': (8, 2.5), 'figure.dpi': 200})

sns.boxplot(data=df, x='barcode', y='n_barcodes', showfliers=False)

plt.xticks([])
plt.show()

# multiple barcodes

In [None]:
# Print up to 10 reads whose forward-barcode count exceeds 15.
barcodes = df.loc[df['n_barcodes'] > 15, 'read_name'].unique()
pdf = res.loc[res['read_name'].isin(barcodes)]

for _, record in pdf.head(10).iterrows():
    print("===== ", record['run'], record['barcode'], " =====")
    ont_seq = codes[record['barcode']]
    ont_seq_rc = str(Seq(ont_seq).reverse_complement())
    print('READ NAME: ', record['read_name'])
    print('NUMBER NlaIII SITES: ', record['n_NlaIII'])
    print('ONT BARCODE: ', ont_seq)
    print('ONT BARCODE (RC): ', ont_seq_rc)

    # Wrap each motif in an ANSI colour, in this order: forward barcode (91),
    # reverse complement (94), then CATG (92).
    highlighted_text = record['sequence']
    for motif, colour in ((ont_seq, "\033[91m"), (ont_seq_rc, "\033[94m"), ('CATG', "\033[92m")):
        highlighted_text = re.sub(motif, f"{colour}{motif}\033[0m", highlighted_text)
    print('READ SEQUENCE: ', highlighted_text)
    print()

# Exactly one forward barcode

In [None]:
# Print up to 10 reads that contain the forward barcode exactly once.
barcodes = df.loc[df['n_barcodes'] == 1, 'read_name'].unique()
pdf = res.loc[res['read_name'].isin(barcodes)]

for _, record in pdf.head(10).iterrows():
    print("===== ", record['run'], record['barcode'], " =====")
    ont_seq = codes[record['barcode']]
    ont_seq_rc = str(Seq(ont_seq).reverse_complement())
    print('READ NAME: ', record['read_name'])
    print('NUMBER NlaIII SITES: ', record['n_NlaIII'])
    print('ONT BARCODE: ', ont_seq)
    print('ONT BARCODE (RC): ', ont_seq_rc)

    # Wrap each motif in an ANSI colour, in this order: forward barcode (91),
    # reverse complement (94), then CATG (92).
    highlighted_text = record['sequence']
    for motif, colour in ((ont_seq, "\033[91m"), (ont_seq_rc, "\033[94m"), ('CATG', "\033[92m")):
        highlighted_text = re.sub(motif, f"{colour}{motif}\033[0m", highlighted_text)
    print('READ SEQUENCE: ', highlighted_text)
    print()


In [None]:
# Print up to 10 reads that contain the reverse-complement barcode exactly once.
barcodes = df.loc[df['n_barcodes_rc'] == 1, 'read_name'].unique()
pdf = res.loc[res['read_name'].isin(barcodes)]

for _, record in pdf.head(10).iterrows():
    print("===== ", record['run'], record['barcode'], " =====")
    ont_seq = codes[record['barcode']]
    ont_seq_rc = str(Seq(ont_seq).reverse_complement())
    print('READ NAME: ', record['read_name'])
    print('NUMBER NlaIII SITES: ', record['n_NlaIII'])
    print('ONT BARCODE: ', ont_seq)
    print('ONT BARCODE (RC): ', ont_seq_rc)

    # Wrap each motif in an ANSI colour, in this order: forward barcode (91),
    # reverse complement (94), then CATG (92).
    highlighted_text = record['sequence']
    for motif, colour in ((ont_seq, "\033[91m"), (ont_seq_rc, "\033[94m"), ('CATG', "\033[92m")):
        highlighted_text = re.sub(motif, f"{colour}{motif}\033[0m", highlighted_text)
    print('READ SEQUENCE: ', highlighted_text)
    print()

# No forward barcodes, reverse only

In [None]:
# Print up to 10 reads with no forward barcode and exactly one reverse-complement barcode.
reverse_only = (df['n_barcodes'] == 0) & (df['n_barcodes_rc'] == 1)
barcodes = df.loc[reverse_only, 'read_name'].unique()
pdf = res.loc[res['read_name'].isin(barcodes)]

for _, record in pdf.head(10).iterrows():
    print("===== ", record['run'], record['barcode'], " =====")
    ont_seq = codes[record['barcode']]
    ont_seq_rc = str(Seq(ont_seq).reverse_complement())
    print('READ NAME: ', record['read_name'])
    print('NUMBER NlaIII SITES: ', record['n_NlaIII'])
    print('ONT BARCODE: ', ont_seq)
    print('ONT BARCODE (RC): ', ont_seq_rc)

    # Wrap each motif in an ANSI colour, in this order: forward barcode (91),
    # reverse complement (94), then CATG (92).
    highlighted_text = record['sequence']
    for motif, colour in ((ont_seq, "\033[91m"), (ont_seq_rc, "\033[94m"), ('CATG', "\033[92m")):
        highlighted_text = re.sub(motif, f"{colour}{motif}\033[0m", highlighted_text)
    print('READ SEQUENCE: ', highlighted_text)
    print()


In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails;
# presumably it is here to halt "Run All" before the cell below — confirm.
break

In [None]:
# look for all barcodes in a few sequences
# DataFrame.sample raises when asked for more rows than exist, so cap the sample size;
# the fixed random_state makes the printed examples reproducible across runs.
n_examples = min(100, len(res))
for _, record in res.sample(n_examples, random_state=0).iterrows():
    read_seq = record['sequence']
    barcode = record['barcode']
    # Report every ONT barcode sequence that occurs in the read.
    for bc, ont_seq in codes.items():
        if ont_seq in read_seq:
            print(f"cell barcode: {barcode} found: {bc}")