In [28]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
# Read metadata
df_meta = pd.read_excel("/home/barbera/Downloads/VASAseq/celseq2_primers.xlsx", engine="openpyxl")
cols = ["Cell Barcode", "Well Position"]
df_meta = df_meta[cols]
df_meta = df_meta.rename(columns={'Cell Barcode': 'barcode'})
df_meta.head()

Unnamed: 0,barcode,Well Position
0,CGTCTAAT,A1
1,AGACTCGT,A2
2,GCACGTCA,A3
3,TCAACGAC,A4
4,ATTTAGCG,A5


In [30]:
mydir = "counts-VJC-R2/"
[x for x in os.listdir(mydir) if x.startswith('count-')]

['count-igl.txt',
 'count-IGH-VJC.txt',
 'count-sequences.txt',
 'count-alignments.txt',
 'count-tra.txt',
 'count-IGL-VJC.txt',
 'count-TRB-VJC.txt',
 'count-ufis.txt',
 'count-IGK-VJC.txt',
 'count-trb.txt',
 'count-igk.txt',
 'count-TRA-VJC.txt',
 'count-igh.txt',
 'count-assembly.txt']

In [31]:
# Read nr of fastq reads
df_reads = pd.read_csv(mydir + "count-sequences.txt", sep=" ", header=None)
df_reads = df_reads.rename(columns={0: 'barcode', 1: 'nr_reads'})
df_reads.head()

Unnamed: 0,barcode,nr_reads
0,AACTCTGG,678540
1,AAGACAGC,1051166
2,AAGCACAT,12065
3,AAGCGAGT,2876236
4,AAGCTGCA,1681712


In [32]:
# Read nr of aligned sequences
nrs = list()
filenames = list()
barcodes = list()
fh = open(mydir + "count-alignments.txt")
for line in fh:
    line = line.lstrip()
    line = line.rstrip()
    nr, filename = line.split(" ")
    if filename == "total":
        continue
    barcode, rest = filename.split("_S1_")
    nrs.append(int(nr))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_align = pd.DataFrame({'barcode': barcodes, 'nr_alignments': nrs})
df_align.head()

Unnamed: 0,barcode,nr_alignments
0,AAACAGGC,627
1,AAAGCGGA,984
2,AAAGGCTG,690
3,AACACGCA,29
4,AACATGGG,621


In [33]:
# Read nr of assembled sequences
nrs = list()
filenames = list()
barcodes = list()
fh = open(mydir + "count-assembly.txt")
for line in fh:
    line = line.rstrip()
    filename, rest, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    barcode = barcode.split("/")[-1]
    nr = nr.lstrip()
    nr = nr.replace(",", "")
    nr = nr.split(" ")
    nrs.append(int(nr[0]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_assembly = pd.DataFrame({'barcode': barcodes, 'nr_assembly': nrs})
df_assembly.head()

Unnamed: 0,barcode,nr_assembly
0,AAACAGGC,332
1,AAAGCGGA,421
2,AAAGGCTG,261
3,AACACGCA,10
4,AACATGGG,292


In [37]:
get_barcode = lambda x: x.split("_S1")[0]

In [39]:
# Read nr of IGH CDR3s
df_cdr3_igh = pd.read_csv(mydir + "count-IGH-VJC.txt", header=None, sep="\t")
df_cdr3_igh = df_cdr3_igh.rename(columns={0: 'filename', 1: 'nr_cdr3_igh', 2: 'perc'})
df_cdr3_igh['barcode'] = [x for x in map(get_barcode, df_cdr3_igh['filename'])]
cols = ['barcode', 'nr_cdr3_igh']
df_cdr3_igh = df_cdr3_igh[cols]
df_cdr3_igh.head()

Unnamed: 0,barcode,nr_cdr3_igh
0,AAACAGGC,6
1,AACCGCTT,2
2,AACCGGAA,3
3,AACCTGCT,3
4,AACGAGGT,2


In [40]:
# Read nr of TRB CDR3s
df_cdr3_trb = pd.read_csv(mydir + "count-TRB-VJC.txt", header=None, sep="\t")
df_cdr3_trb = df_cdr3_trb.rename(columns={0: 'filename', 1: 'nr_cdr3_trb', 2: 'perc'})
df_cdr3_trb['barcode'] = [x for x in map(get_barcode, df_cdr3_trb['filename'])]
cols = ['barcode', 'nr_cdr3_trb']
df_cdr3_trb = df_cdr3_trb[cols]
df_cdr3_trb.head()

Unnamed: 0,barcode,nr_cdr3_trb
0,AAAGCGGA,3
1,AAAGGCTG,2
2,AACATGGG,3
3,AACCTGCT,25
4,AATGGTGG,6


In [41]:
# Read nr of TRA CDR3s
df_cdr3_tra = pd.read_csv(mydir + "count-TRA-VJC.txt", header=None, sep="\t")
df_cdr3_tra = df_cdr3_tra.rename(columns={0: 'filename', 1: 'nr_cdr3_tra', 2: 'perc'})
df_cdr3_tra['barcode'] = [x for x in map(get_barcode, df_cdr3_tra['filename'])]
cols = ['barcode', 'nr_cdr3_tra']
df_cdr3_tra = df_cdr3_tra[cols]
df_cdr3_tra.head()

Unnamed: 0,barcode,nr_cdr3_tra
0,AAAGCGGA,3
1,AAAGGCTG,7
2,AACATGGG,4
3,AACCTGCT,1
4,AAGACAGC,2


In [42]:
# Read nr of IGL CDR3s
df_cdr3_igl = pd.read_csv(mydir + "count-IGL-VJC.txt", header=None, sep="\t")
df_cdr3_igl = df_cdr3_igl.rename(columns={0: 'filename', 1: 'nr_cdr3_igl', 2: 'perc'})
df_cdr3_igl['barcode'] = [x for x in map(get_barcode, df_cdr3_igl['filename'])]
cols = ['barcode', 'nr_cdr3_igl']
df_cdr3_igl = df_cdr3_igl[cols]
df_cdr3_igl.head()

Unnamed: 0,barcode,nr_cdr3_igl
0,AAACAGGC,6
1,AACATGGG,2
2,AACCGCTT,10
3,AACCTGCT,1
4,AACGAGGT,2


In [43]:
# Read nr of IGK CDR3s
df_cdr3_igk = pd.read_csv(mydir + "count-IGK-VJC.txt", header=None, sep="\t")
df_cdr3_igk = df_cdr3_igk.rename(columns={0: 'filename', 1: 'nr_cdr3_igk', 2: 'perc'})
df_cdr3_igk['barcode'] = [x for x in map(get_barcode, df_cdr3_igk['filename'])]
cols = ['barcode', 'nr_cdr3_igk']
df_cdr3_igk = df_cdr3_igk[cols]
df_cdr3_igk.head()

Unnamed: 0,barcode,nr_cdr3_igk
0,AACCGCTT,1
1,AACCGGAA,1
2,AACCTGCT,1
3,AACGAGGT,1
4,AAGACAGC,2


In [44]:
# Read UFIs
df_ufis = pd.read_csv(mydir + "count-ufis.txt", header=None, sep=" ")
df_ufis = df_ufis.rename(columns={0: "barcode", 1: "UFIs"})
df_ufis.head()

Unnamed: 0,barcode,UFIs
0,AAACAGGC,174
1,AAAGCGGA,530
2,AAAGGCTG,392
3,AACACGCA,26
4,AACATGGG,352


## Combine numbers in one table

In [45]:
df = pd.merge(df_meta, df_reads, how='left', on='barcode')
df = pd.merge(df, df_align, how='left', on='barcode')
df = pd.merge(df, df_ufis, how='left', on='barcode')
df = pd.merge(df, df_assembly, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igh, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igk, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igl, how='left', on='barcode')
df = pd.merge(df, df_cdr3_trb, how='left', on='barcode')
df = pd.merge(df, df_cdr3_tra, how='left', on='barcode')
df = df.fillna(0)
df.head()

Unnamed: 0,barcode,Well Position,nr_reads,nr_alignments,UFIs,nr_assembly,nr_cdr3_igh,nr_cdr3_igk,nr_cdr3_igl,nr_cdr3_trb,nr_cdr3_tra
0,CGTCTAAT,A1,429417.0,386.0,183.0,197.0,0.0,0.0,1.0,0.0,0.0
1,AGACTCGT,A2,146023.0,161.0,97.0,57.0,0.0,0.0,0.0,0.0,0.0
2,GCACGTCA,A3,46487.0,35.0,30.0,14.0,0.0,0.0,0.0,0.0,0.0
3,TCAACGAC,A4,652446.0,1178.0,368.0,628.0,0.0,0.0,5.0,0.0,0.0
4,ATTTAGCG,A5,200714.0,229.0,91.0,116.0,1.0,0.0,0.0,0.0,0.0


In [46]:
# Cells without CDR3
len(df[(df['nr_cdr3_igh'] < 1) & (df['nr_cdr3_igk'] < 1) & (df['nr_cdr3_igl'] < 1) & (df['nr_cdr3_trb'] < 1) & (df['nr_cdr3_tra'] < 1)])

59

In [47]:
df.to_excel("overview-vasaseq-vjc-r2.xlsx")
print("Wrote overview-vasaseq-vjc-r2.xlsx to disk")

Wrote overview-vasaseq-vjc-r2.xlsx to disk
