In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
[x for x in os.listdir() if x.startswith('count-')]

['count-IGH-VJC.txt',
 'count-TRA-VJC.txt',
 'count-IGK-VJC.txt',
 'count-TRB-VJC.txt',
 'count-IGL-VJC.txt',
 'count-alignments-VJC.txt',
 'count-assembly-VJC.txt']

In [3]:
# Read nr of aligned sequences
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-alignments-VJC.txt")
for line in fh:
    line = line.lstrip()
    line = line.rstrip()
    nr, filename = line.split(" ")
    if filename == "total":
        continue
    barcode, rest = filename.split("_S1_")
    nrs.append(int(nr))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_align = pd.DataFrame({'barcode': barcodes, 'nr_alignments': nrs})
df_align.head()

Unnamed: 0,barcode,nr_alignments
0,AAACAGGC,275
1,AAAGCGGA,328
2,AAAGGCTG,180
3,AACACGCA,12
4,AACATGGG,251


In [4]:
# Read nr of assembled sequences
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-assembly-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, rest, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.lstrip()
    nr = nr.replace(",", "")
    nr = nr.split(" ")
    nrs.append(int(nr[0]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_assembly = pd.DataFrame({'barcode': barcodes, 'nr_assembly': nrs})
df_assembly.head()

Unnamed: 0,barcode,nr_assembly
0,AAACAGGC,197
1,AAAGCGGA,219
2,AAAGGCTG,102
3,AACACGCA,5
4,AACATGGG,184


In [5]:
# Read nr of IGH CDR3s
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-IGH-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.split("\t")
    nrs.append(int(nr[1]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_cdr3_igh = pd.DataFrame({'barcode': barcodes, 'nr_cdr3_igh': nrs})
df_cdr3_igh.head()

Unnamed: 0,barcode,nr_cdr3_igh
0,AAACAGGC,6
1,AACCGCTT,2
2,AACCGGAA,3
3,AACCTGCT,3
4,AACGAGGT,2


In [6]:
# Read nr of TRB CDR3s
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-TRB-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.split("\t")
    nrs.append(int(nr[1]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_cdr3_trb = pd.DataFrame({'barcode': barcodes, 'nr_cdr3_trb': nrs})
df_cdr3_trb.head()

Unnamed: 0,barcode,nr_cdr3_trb
0,AAAGCGGA,3
1,AAAGGCTG,2
2,AACATGGG,3
3,AACCTGCT,25
4,AATGGTGG,6


In [7]:
# Read nr of TRA CDR3s
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-TRA-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.split("\t")
    nrs.append(int(nr[1]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_cdr3_tra = pd.DataFrame({'barcode': barcodes, 'nr_cdr3_tra': nrs})
df_cdr3_tra.head()

Unnamed: 0,barcode,nr_cdr3_tra
0,AAAGCGGA,3
1,AAAGGCTG,7
2,AACATGGG,4
3,AACCTGCT,1
4,AAGACAGC,2


In [8]:
# Read nr of IGL CDR3s
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-IGL-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.split("\t")
    nrs.append(int(nr[1]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_cdr3_igl = pd.DataFrame({'barcode': barcodes, 'nr_cdr3_igl': nrs})
df_cdr3_igl.head()

Unnamed: 0,barcode,nr_cdr3_igl
0,AAACAGGC,6
1,AACATGGG,2
2,AACCGCTT,10
3,AACCTGCT,1
4,AACGAGGT,2


In [9]:
# Read nr of IGK CDR3s
nrs = list()
filenames = list()
barcodes = list()
fh = open("count-IGK-VJC.txt")
for line in fh:
    line = line.rstrip()
    filename, nr = line.split(":")
    barcode, rest = filename.split("_S1")
    nr = nr.split("\t")
    nrs.append(int(nr[1]))
    filenames.append(filename)
    barcodes.append(barcode)
fh.close()
df_cdr3_igk = pd.DataFrame({'barcode': barcodes, 'nr_cdr3_igk': nrs})
df_cdr3_igk.head()

Unnamed: 0,barcode,nr_cdr3_igk
0,AACCGCTT,1
1,AACCGGAA,1
2,AACCTGCT,1
3,AACGAGGT,1
4,AAGACAGC,2


## Combine numbers in one table

In [10]:
df = pd.merge(df_align, df_assembly, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igh, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igk, how='left', on='barcode')
df = pd.merge(df, df_cdr3_igl, how='left', on='barcode')
df = pd.merge(df, df_cdr3_trb, how='left', on='barcode')
df = pd.merge(df, df_cdr3_tra, how='left', on='barcode')
df = df.fillna(0)
df

Unnamed: 0,barcode,nr_alignments,nr_assembly,nr_cdr3_igh,nr_cdr3_igk,nr_cdr3_igl,nr_cdr3_trb,nr_cdr3_tra
0,AAACAGGC,275,197.0,6.0,0.0,6.0,0.0,0.0
1,AAAGCGGA,328,219.0,0.0,0.0,0.0,3.0,3.0
2,AAAGGCTG,180,102.0,0.0,0.0,0.0,2.0,7.0
3,AACACGCA,12,5.0,0.0,0.0,0.0,0.0,0.0
4,AACATGGG,251,184.0,0.0,0.0,2.0,3.0,4.0
5,AACCCAAC,5,2.0,0.0,0.0,0.0,0.0,0.0
6,AACCGCTT,503,257.0,2.0,1.0,10.0,0.0,0.0
7,AACCGGAA,215,128.0,3.0,1.0,0.0,0.0,0.0
8,AACCTGCT,380,244.0,3.0,1.0,1.0,25.0,1.0
9,AACGAGGT,787,497.0,2.0,1.0,2.0,0.0,0.0


In [11]:
# Cells without CDR3
len(df[(df['nr_cdr3_igh'] < 1) & (df['nr_cdr3_igk'] < 1) & (df['nr_cdr3_igl'] < 1) & (df['nr_cdr3_trb'] < 1) & (df['nr_cdr3_tra'] < 1)])

59

In [12]:
df.to_excel("overview-vasaseq-vjc.xlsx")
print("Wrote overview-vasaseq-vjc.xlsx to disk")

Wrote overview-vasaseq-vjc.xlsx to disk
