In [1]:
import pandas as pd
import numpy as np
import glob
import os
import pysam

In [2]:
# Find every Run3.putative_bc.csv sitting in a demultiplex/ directory one
# level below the cstansbu scratch root.
putative_bc_pattern = "/scratch/indikar_root/indikar1/cstansbu/*/demultiplex/Run3.putative_bc.csv"
file_list = glob.glob(putative_bc_pattern)
file_list

['/scratch/indikar_root/indikar1/cstansbu/low_sensitivity/demultiplex/Run3.putative_bc.csv',
 '/scratch/indikar_root/indikar1/cstansbu/high_sensitivity/demultiplex/Run3.putative_bc.csv']

# Demultiplexing Report

In [3]:
# Putative-barcode tables to compare, keyed by sensitivity label.
file_paths = dict(
    high="/scratch/indikar_root/indikar1/cstansbu/high_sensitivity/demultiplex/Run3.putative_bc.csv",
    low="/scratch/indikar_root/indikar1/cstansbu/low_sensitivity/demultiplex/Run3.putative_bc.csv",
)

# Read each table, tag its rows with the sensitivity label and report its shape.
tables = []
for k in file_paths:
    tmp = pd.read_csv(file_paths[k]).assign(sensitivity=k)
    print(f"{k} sensitivity: {tmp.shape=}")
    tables.append(tmp)

# Stack the tables, then record for every row how many distinct (non-null)
# putative barcodes its read_id received across all tables. Rows are ordered
# so that the entries for one read sit next to each other.
df = (
    pd.concat(tables)
    .assign(n_unique=lambda stacked: stacked.groupby('read_id')['putative_bc'].transform('nunique'))
    .sort_values(by=['read_id', 'sensitivity'])
)
df.head(6)

high sensitivity: tmp.shape=(238424, 8)
low sensitivity: tmp.shape=(238424, 8)


Unnamed: 0,read_id,putative_bc,putative_bc_min_q,putative_umi,polyT_end,pre_bc_flanking,post_umi_flanking,sensitivity,n_unique
27899,00001862-a60a-45d5-8c54-eeebf9cd617e,TCCCGTTGTTTCAGGT,28.0,CTTCTCACGACG,-119.0,GATCT,TTTTT,high,1
27899,00001862-a60a-45d5-8c54-eeebf9cd617e,TCCCGTTGTTTCAGGT,28.0,CTTCTCACGACG,-119.0,GATCT,TTTTT,low,1
127139,00003087-bb68-44f1-a202-5847b8ccb047,CCATTTCAGAACCAGG,21.0,CCCCTTCGGCAC,-142.0,GATCT,TTTTT,high,1
127139,00003087-bb68-44f1-a202-5847b8ccb047,CCATTTCAGAACCAGG,21.0,CCCCTTCGGCAC,-142.0,GATCT,TTTTT,low,1
58015,00003be1-d42d-4479-aacd-fc0edd146917,ACTCTAAGTTGGTGCC,23.0,AGACATTAGGAC,156.0,GATCT,TTTTT,high,1
58015,00003be1-d42d-4479-aacd-fc0edd146917,ACTCTAAGTTGGTGCC,23.0,AGACATTAGGAC,156.0,GATCT,TTTTT,low,1


In [4]:
# Tally rows by their n_unique value; dropna=False would also show a NaN
# bucket if any row had no n_unique.
df['n_unique'].value_counts(dropna=False)

n_unique
1    438212
0     38636
Name: count, dtype: int64

# Check whitelist

In [5]:
# Whitelist files to compare, keyed by sensitivity label.
file_paths = dict(
    high="/scratch/indikar_root/indikar1/cstansbu/high_sensitivity/demultiplex/Run3.whitelist.csv",
    low="/scratch/indikar_root/indikar1/cstansbu/low_sensitivity/demultiplex/Run3.whitelist.csv",
)

# The whitelist files are read as header-less, single-column barcode lists.
whitelists = []
for k in file_paths:
    tmp = pd.read_csv(file_paths[k], header=None, names=['barcode']).assign(sensitivity=k)
    print(f"{k} sensitivity: {tmp.shape=}")
    whitelists.append(tmp)

df = pd.concat(whitelists)
print(f"{df['barcode'].nunique()=}")

# bc_count = number of distinct sensitivity labels under which a barcode appears.
df['bc_count'] = df.groupby('barcode')['sensitivity'].transform('nunique')

df.head()

high sensitivity: tmp.shape=(20356, 2)
low sensitivity: tmp.shape=(9200, 2)
df['barcode'].nunique()=20356


Unnamed: 0,barcode,sensitivity,bc_count
0,CTATCATCAAGCGTGC,high,2
1,GCATTTGAGATTCAGT,high,2
2,AGGTGTGAGTTGGTGC,high,2
3,GTGATATTCCGGACAT,high,2
4,GCTGCCTTCTGGATTG,high,2


In [6]:
# Tally whitelist rows by the number of sensitivity labels their barcode appears under.
df['bc_count'].value_counts()

bc_count
2    18400
1    11156
Name: count, dtype: int64

In [7]:
# Rows whose barcode appears under exactly one sensitivity label.
df.loc[df['bc_count'].eq(1)]

Unnamed: 0,barcode,sensitivity,bc_count
9200,TGGTTCCCATTGCACC,high,1
9201,ATCTACTCACTTGGCT,high,1
9202,CTCAACGGTTGCTGGC,high,1
9203,CTTGCTTAGAACGGCA,high,1
9204,CGGCCAATCAATCCTC,high,1
...,...,...,...
20351,AGCCGATTCATGGTAT,high,1
20352,AAGGTTAAGTAGACCC,high,1
20353,CGCATAGCAAACTACA,high,1
20354,ACAGCTCAGGCTACCG,high,1


# Check FASTQ

In [8]:
# Matched-read FASTQ files to compare, keyed by sensitivity label.
file_paths = {
    'high' : "/scratch/indikar_root/indikar1/cstansbu/high_sensitivity/demultiplex/Run3.matched_reads.fastq.gz",
    'low' : "/scratch/indikar_root/indikar1/cstansbu/low_sensitivity/demultiplex/Run3.matched_reads.fastq.gz",
}

# Build one record per read per file. Each FASTQ header is parsed as
# `<barcode>_<umi>#<read_id>[_<suffix>]`.
records = []
for k, fpath in file_paths.items():
    # Open inside a context manager so each file handle is closed as soon as
    # the file has been consumed instead of staying open for the kernel's life.
    with pysam.FastxFile(fpath) as fastq:
        for read in fastq:
            read_name = read.name
            # Split on "_" once; the first two fields carry barcode and UMI.
            name_fields = read_name.split("_")
            records.append({
                'sensitivity' : k,
                'read_name' : read_name.split("#")[-1].split("_")[0],
                'barcode' : name_fields[0],
                'umi' : name_fields[1].split("#")[0],
            })

result = pd.DataFrame(records)
# n_barcode = number of distinct barcodes assigned to a read across all files.
result['n_barcode'] = result.groupby('read_name')['barcode'].transform('nunique')
result = result.sort_values(by=['read_name', 'sensitivity'])
result.head()

Unnamed: 0,sensitivity,read_name,barcode,umi,n_barcode
23784,high,00001862-a60a-45d5-8c54-eeebf9cd617e,TCCCGTTGTTTCAGGT,CTTCTCACGACG,1
226412,low,00001862-a60a-45d5-8c54-eeebf9cd617e,TCCCGTTGTTTCAGGT,CTTCTCACGACG,1
108206,high,00003087-bb68-44f1-a202-5847b8ccb047,CCATTTCAGAACCAGG,CCCCTTCGGCAC,1
307222,low,00003087-bb68-44f1-a202-5847b8ccb047,CCATTTCAGAACCAGG,CCCCTTCGGCAC,1
49641,high,00003be1-d42d-4479-aacd-fc0edd146917,ACTCTAAGTTGGTGCC,AGACATTAGGAC,1


In [9]:
# Tally rows (one per read per FASTQ file, not one per read) by n_barcode.
result['n_barcode'].value_counts() 

n_barcode
1    391957
2      6548
Name: count, dtype: int64

In [10]:
# Reads that were assigned more than one distinct barcode across the files.
# `> 1` (rather than `== 2`) keeps the filter correct if more than two
# sensitivity settings are ever compared.
mismatch = result[result['n_barcode'] > 1]
mismatch.head(6)

Unnamed: 0,sensitivity,read_name,barcode,umi,n_barcode
194738,high,00010160-91e5-4a72-b20a-5663e1008173,TGCTTAGAGGCGAATG,GGCAGACCGAAT,2
389981,low,00010160-91e5-4a72-b20a-5663e1008173,TGCTTAGAGGCGAAGC,TGGGCAGACCGA,2
62321,high,003edd44-6ec6-44be-a1ec-55d5e0d62aca,TGAGTGAGTCTGGTAT,TTGGAGTACGGA,2
263304,low,003edd44-6ec6-44be-a1ec-55d5e0d62aca,GGTGCGAGTCTGGTAT,TTGGAGTACGGA,2
81621,high,005150f7-b849-4c52-89c0-3859f04a9e9b,CAGGGGACAACTAGCG,ACCACACCGGGT,2
281785,low,005150f7-b849-4c52-89c0-3859f04a9e9b,AGGCGGACAACTAAGC,GACCACACCGGG,2


In [11]:
# Fraction of distinct reads that appear in the mismatch table.
n_mismatched_reads = mismatch['read_name'].nunique()
n_total_reads = result['read_name'].nunique()
n_mismatched_reads / n_total_reads

0.01591421682998911