In [1]:
import os
import pandas as pd

In [2]:
myfiles = [x for x in os.listdir() if x.endswith("-easy-import.txt")]
myfiles

['AMC-TC-x001-AK1285_S0_L001.assembled-IGHC_CH12_human-easy-import.txt',
 'AMC-TC-x001-AK1283_S0_L001.assembled-IGHC_CH12_human-easy-import.txt',
 'AMC-TC-x001-AK1286_S0_L001.assembled-IGHC_CH12_human-easy-import.txt',
 'AMC-TC-x001-AK1284_S0_L001.assembled-IGHC_CH12_human-easy-import.txt']

In [3]:
get_barcode = lambda x: x.split("_")[0]
get_contig = lambda x: x.split("_")[2]
get_cname = lambda x: x.split("|")[1].split("*")[0]

In [4]:
def readAlignment(f):
    df = pd.read_csv(f, sep="\t", header=None)
    df.columns = ["acc", "field2", "ref"]
    df["barcode"] = [x for x in map(get_barcode, df["acc"])]
    df["contig"] = [x for x in map(get_contig, df["acc"])]
    df["cregion"] = [x for x in map(get_cname, df["ref"])]
    return(df)

In [5]:
df = readAlignment(myfiles[0])
for myfile in myfiles[1:]:
    df = pd.concat([df, readAlignment(myfile)])
df.tail()

Unnamed: 0,acc,field2,ref,barcode,contig,cregion
4428,TGAGCCGAGGATGGTC-1_contig_2,0,J00221|IGHA2*01|CH1|H-CH2,TGAGCCGAGGATGGTC-1,2,IGHA2
4429,TTCCCAGCACGTAAGG-1_contig_1,0,J00221|IGHA2*01|CH1|H-CH2,TTCCCAGCACGTAAGG-1,1,IGHA2
4430,TTGAACGCATCACGAT-1_contig_2,0,J00221|IGHA2*01|CH1|H-CH2,TTGAACGCATCACGAT-1,2,IGHA2
4431,TATCAGGTCTGCTGTC-1_contig_2,0,AJ390279|IGHG3*19|CH1|CH2,TATCAGGTCTGCTGTC-1,2,IGHG3
4432,ACGCCAGAGCTGAACG-1_contig_2,0,AJ390254|IGHG3*14|CH1|CH2,ACGCCAGAGCTGAACG-1,2,IGHG3


In [6]:
concatNames = lambda x: ",".join(list(set(x)))
df_count_per_barcode = df.groupby("barcode").agg({'contig': "nunique", 'cregion': ["nunique", concatNames]})
df_count_per_barcode = df_count_per_barcode.reset_index()
df_count_per_barcode.head()

Unnamed: 0_level_0,barcode,contig,cregion,cregion
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,nunique,<lambda>
0,AAACCTGAGGCTAGGT-1,1,1,IGHM
1,AAACCTGAGTGCGATG-1,1,1,IGHM
2,AAACCTGCAATCTACG-1,2,1,IGHM
3,AAACCTGGTTCAGTAC-1,2,1,IGHM
4,AAACCTGGTTCATGGT-1,2,1,IGHM


In [7]:
df_count_per_barcode.columns = [' '.join(col).strip() for col in df_count_per_barcode.columns.values]
df_count_per_barcode = df_count_per_barcode.rename(columns={'cregion <lambda>': 'cregion name'})
df_count_per_barcode.head()

Unnamed: 0,barcode,contig nunique,cregion nunique,cregion name
0,AAACCTGAGGCTAGGT-1,1,1,IGHM
1,AAACCTGAGTGCGATG-1,1,1,IGHM
2,AAACCTGCAATCTACG-1,2,1,IGHM
3,AAACCTGGTTCAGTAC-1,2,1,IGHM
4,AAACCTGGTTCATGGT-1,2,1,IGHM


In [8]:
df_count_per_barcode = df_count_per_barcode.sort_values(['cregion nunique', 'contig nunique'], ascending=False)
df_count_per_barcode.head()

Unnamed: 0,barcode,contig nunique,cregion nunique,cregion name
3769,TGGCGCAGTTATTCTC-1,2,4,"IGHG3,IGHM,IGHD,IGHG1"
4029,TTTACTGTCGATCCCT-1,6,3,"IGHA1,IGHM,IGHD"
539,AGATTGCAGCTGCAAG-1,5,3,"IGHA1,IGHM,IGHD"
1730,CGTGTCTTCACAAACC-1,5,3,"IGHA1,IGHM,IGHD"
3242,TACTTACCAGGACGTA-1,4,3,"IGHG3,IGHM,IGHD"


In [9]:
# How many cregions were assigned per cell?
df_count_per_barcode.groupby('cregion nunique').agg({'barcode': "nunique"})

Unnamed: 0_level_0,barcode
cregion nunique,Unnamed: 1_level_1
1,3402
2,664
3,8
4,1


In [10]:
# Show the entries with 4 assigned c regions
df_count_per_barcode[df_count_per_barcode["cregion nunique"] == 4]

Unnamed: 0,barcode,contig nunique,cregion nunique,cregion name
3769,TGGCGCAGTTATTCTC-1,2,4,"IGHG3,IGHM,IGHD,IGHG1"


In [11]:
# Show the entries with 3 assigned c regions
df_count_per_barcode[df_count_per_barcode["cregion nunique"] == 3]

Unnamed: 0,barcode,contig nunique,cregion nunique,cregion name
4029,TTTACTGTCGATCCCT-1,6,3,"IGHA1,IGHM,IGHD"
539,AGATTGCAGCTGCAAG-1,5,3,"IGHA1,IGHM,IGHD"
1730,CGTGTCTTCACAAACC-1,5,3,"IGHA1,IGHM,IGHD"
3242,TACTTACCAGGACGTA-1,4,3,"IGHG3,IGHM,IGHD"
3453,TCCACACCACAAGACG-1,4,3,"IGHA1,IGHM,IGHD"
70,AACCATGGTAGGGTAC-1,3,3,"IGHG2,IGHM,IGHG1"
822,ATGAGGGTCTGATACG-1,3,3,"IGHG2,IGHM,IGHD"
2900,GTATTCTAGGCCGAAT-1,2,3,"IGHA1,IGHM,IGHD"


In [12]:
# Show the entries with 2 assigned c regions
df_count_per_barcode[df_count_per_barcode["cregion nunique"] == 2]

Unnamed: 0,barcode,contig nunique,cregion nunique,cregion name
1265,CCACTACTCCGCAGTG-1,8,2,"IGHM,IGHD"
3762,TGGCGCAAGTAGCCGA-1,8,2,"IGHM,IGHD"
1362,CCGTTCACAGTACACT-1,7,2,"IGHM,IGHD"
1516,CGAGCCAGTGTGACGA-1,7,2,"IGHM,IGHD"
1747,CGTTCTGCACAACGCC-1,7,2,"IGHM,IGHD"
2205,GACGGCTAGTATGACA-1,7,2,"IGHA1,IGHM"
2927,GTCACGGCAACACCTA-1,7,2,"IGHM,IGHD"
278,ACATGGTCACATGGGA-1,6,2,"IGHM,IGHD"
583,AGCGTCGAGCAACGGT-1,6,2,"IGHA1,IGHM"
1510,CGAGCACGTTTCGCTC-1,6,2,"IGHM,IGHD"


In [13]:
df_count_per_barcode.to_excel("count-c-region.xlsx")
print("Wrote count-c-region.xlsx to disk")

Wrote count-c-region.xlsx to disk
