In [1]:
import pandas as pd
import os

In [2]:
# Output files
outClones = "basta-clones.xlsx"
outAllinfo = "basta-allinfo.xlsx"
outAllinfoLong = "basta-allinfo-long.xlsx"

## Read CDR3 list

In [3]:
myfile = "autoantibody_sequences.xlsx"

In [4]:
xl = pd.ExcelFile(myfile)
sheet_names = xl.sheet_names  # see all sheet names
sheet_names

['B005', 'B007', 'B008', 'B009', 'B010', 'B011', '0843']

In [5]:
def read_excel_sheet(myfile, sheetname):
    df = pd.read_excel(myfile, sheet_name=sheetname, header=None)
    df.columns = ['sequence']
    df['sheet'] = sheetname
    return(df)

In [6]:
df_cdr3 = read_excel_sheet(myfile, sheet_names[0])
for sheetname in sheet_names[1:]:
    df_cdr3 = pd.concat([df_cdr3, read_excel_sheet(myfile, sheetname)])
df_cdr3 = df_cdr3.reset_index()
df_cdr3.tail()

Unnamed: 0,index,sequence,sheet
510,25,TCTVSGDSMSSHYWSWIRQPPGKGPEWIAYIYYSGSTNYNPSLRGR...,843
511,26,TCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNSNPSLKSR...,843
512,27,TCTVSGGSISSYYWSWIRQTPGKGLEWIGYIYDSGSTNYNPSLKSR...,843
513,28,CARKAAGGPFDLWGRGTLVT,843
514,29,>>0843_RF,843


In [7]:
df_cdr3.head()

Unnamed: 0,index,sequence,sheet
0,0,>>B005_anti-Ro52,B005
1,1,YYCAKESSIAAALYNRFDHWGQGTLVTVSSASTK,B005
2,2,LSCAASGFTFSLYAMNWVRQAPGKGLEWVSGLFGSGGGTDYADSVR...,B005
3,3,LSCAASGFTFSSLNMNWVRQAPGKGLEWVSSISSSSSYKYYVDSVK...,B005
4,4,LSCAASGFTFRSYWMHWVRQAPGKGLVWVSRINSDGINTYYADSVR...,B005


In [8]:
# Remove the ">>some-name" rows
patternDel = ">>"
filter = df_cdr3['sequence'].str.contains(patternDel)
df_cdr3 = df_cdr3[~filter]
df_cdr3.head()

Unnamed: 0,index,sequence,sheet
1,1,YYCAKESSIAAALYNRFDHWGQGTLVTVSSASTK,B005
2,2,LSCAASGFTFSLYAMNWVRQAPGKGLEWVSGLFGSGGGTDYADSVR...,B005
3,3,LSCAASGFTFSSLNMNWVRQAPGKGLEWVSSISSSSSYKYYVDSVK...,B005
4,4,LSCAASGFTFRSYWMHWVRQAPGKGLVWVSRINSDGINTYYADSVR...,B005
5,5,LSCAASEFSFSKYWIHWVRHVPGKGLVWVSRINSDGINTYYADSVR...,B005


In [9]:
def f1(x): # remove first two amino acids if it starts with YY
    if x.startswith("YY"):
        return(x[2:])
    else:
        return(x)

def f2(x): # remove everything after the "VT" if the pattern "VTVS" is in the sequence
    x = x.split("VTVS")
    if len(x) > 2:
        print("WARNING: multiple times VTVS in sequence", x)
    return(x[0] + "VT")
    
df_cdr3['sequence_without_YY'] = [x for x in map(f1, df_cdr3['sequence'])]
df_cdr3['cdr3pep'] = [x for x in map(f2, df_cdr3['sequence_without_YY'])]
df_cdr3.head()

Unnamed: 0,index,sequence,sheet,sequence_without_YY,cdr3pep
1,1,YYCAKESSIAAALYNRFDHWGQGTLVTVSSASTK,B005,CAKESSIAAALYNRFDHWGQGTLVTVSSASTK,CAKESSIAAALYNRFDHWGQGTLVT
2,2,LSCAASGFTFSLYAMNWVRQAPGKGLEWVSGLFGSGGGTDYADSVR...,B005,LSCAASGFTFSLYAMNWVRQAPGKGLEWVSGLFGSGGGTDYADSVR...,LSCAASGFTFSLYAMNWVRQAPGKGLEWVSGLFGSGGGTDYADSVR...
3,3,LSCAASGFTFSSLNMNWVRQAPGKGLEWVSSISSSSSYKYYVDSVK...,B005,LSCAASGFTFSSLNMNWVRQAPGKGLEWVSSISSSSSYKYYVDSVK...,LSCAASGFTFSSLNMNWVRQAPGKGLEWVSSISSSSSYKYYVDSVK...
4,4,LSCAASGFTFRSYWMHWVRQAPGKGLVWVSRINSDGINTYYADSVR...,B005,LSCAASGFTFRSYWMHWVRQAPGKGLVWVSRINSDGINTYYADSVR...,LSCAASGFTFRSYWMHWVRQAPGKGLVWVSRINSDGINTYYADSVR...
5,5,LSCAASEFSFSKYWIHWVRHVPGKGLVWVSRINSDGINTYYADSVR...,B005,LSCAASEFSFSKYWIHWVRHVPGKGLVWVSRINSDGINTYYADSVR...,LSCAASEFSFSKYWIHWVRHVPGKGLVWVSRINSDGINTYYADSVR...


## Read clone files and lookup CDR3s

In [10]:
clone_files = [x for x in os.listdir(".") if x.endswith("-clones-mut-sites-reassigned.csv")]
clone_files[:10]

['B005-SG-B_S1_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-BMDC-B_S4_L001.assembled-AGTCAGTC-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-naive_S206_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-Lnpre-B_S2_L001.assembled-ACTGACTG-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'BASTA-1467-B_S49_L001.assembled-ACTGACTG-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-Lnpost-B_S3_L001.assembled-AGCTAGCT-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-PBBMDC-1-B_S5_L001.assembled-ATCGATCG-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B007-PBBMDC-2-B_S6_L001.assembled-ATGCATGC-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B009-clone2-B_S56_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites-reassigned.csv',
 'B009-clone6-B_S57_L001.assembled-ACTGACTG-IGH_HUMAN-clones-mut-sites-reassigned.csv']

In [11]:
df_clones = pd.read_csv(clone_files[0], sep="\t")
sample_name, rest = clone_files[0].split("_L001")
df_clones["Sample"] = sample_name
df_clones.head()

Unnamed: 0,cdr3pep,V_sub,J_sub,freq,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.count_x.mode,mut.frac_x.mean,mut.count_y.sum,...,mut.count_y.mode,mut.frac_y.mean,nr_sites.sum,nr_sites.mean,nr_sites.mode,UMIs,cdr3nuc.nunique,cdr3nuc.min_mode,UMIs.frac,Sample
0,CARGAGDRELQKPSPFDYWGQGTLVT,IGHV1-69,IGHJ4,1806,1656,8041.0,6.00547,6.0,0.025487,45.0,...,0.0,0.00062,29.0,0.175989,0.0,1633,38,TGTGCGAGAGGGGCCGGGGATCGGGAACTACAAAAACCTTCTCCGT...,0.046215,B005-SG-B_S1
1,CAHSNDYVWGSYRYTFDSWGQGTLVT,IGHV2-5,"IGHJ4,IGHJ5",1315,988,2141.0,1.623012,1.0,0.006757,1321.0,...,1.0,0.019086,15.0,0.013175,0.0,968,13,TGTGCACACTCTAATGATTACGTTTGGGGCAGTTATCGTTATACCT...,0.027395,B005-SG-B_S1
2,CVRGQWLFDYWGQGTLVT,IGHV4-31,IGHJ4,1171,981,9403.0,7.728507,9.0,0.032042,8.0,...,0.0,2.9e-05,38.0,0.155655,0.0,861,14,TGTGTGAGAGGACAGTGGTTGTTTGACTACTGGGGCCAGGGAACCC...,0.024367,B005-SG-B_S1
3,CAKKLGSGLTPYDYWGQGTLVT,IGHV3-23,IGHJ4,700,564,8894.0,8.20041,14.0,0.037251,3.0,...,0.0,0.000425,18.0,0.019506,0.0,528,8,TGTGCGAAAAAGTTGGGAAGTGGTCTTACCCCCTATGACTACTGGG...,0.014943,B005-SG-B_S1
4,CAHSYDYVWGSYRYTFDYWGQGTLVT,IGHV2-5,IGHJ4,477,362,1593.0,3.043103,4.0,0.012687,3.0,...,0.0,4.9e-05,7.0,0.032088,0.0,360,11,TGTGCACACTCTTATGATTACGTTTGGGGGAGTTATCGTTATACCT...,0.010188,B005-SG-B_S1


In [12]:
for clone_file in clone_files[1:]:
    df_tmp = pd.read_csv(clone_file, sep="\t")
    sample_name, rest = clone_file.split("_L001")
    df_tmp["Sample"] = sample_name
    df_clones = pd.concat([df_clones, df_tmp])
df_clones.tail()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """


Unnamed: 0,J_sub,Sample,UMIs,UMIs.frac,V_sub,beforeMID.nunique,cdr3nuc.min_mode,cdr3nuc.nunique,cdr3pep,freq,...,mut.count_x.mode,mut.count_x.sum,mut.count_y.mean,mut.count_y.mode,mut.count_y.sum,mut.frac_x.mean,mut.frac_y.mean,nr_sites.mean,nr_sites.mode,nr_sites.sum
332,IGHJ6,BASTA-7M_S170,1,0.000675,IGHV3-9,1,TGTGCGGTCAATTGTGGTGGTGACTGCTACCTCGTGCCTGATGGGG...,1,CAVNCGGDCYLVPDGVWGQGTTVT,1,...,8.0,8.0,0.0,0.0,0.0,0.034188,0.0,0.0,0.0,0.0
333,IGHJ4,BASTA-7M_S170,1,0.000675,IGHV3-21,1,TGTGCCTATATAGCAGCAGCTTTTGACTACTGGGGCCAGGGAACCC...,1,CAYIAAAFDYWGQGTLVT,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
334,IGHJ4,BASTA-7M_S170,1,0.000675,IGHV3-11,1,TGTGCCTATACTATGATAGTAGTGGTTACCCCCTTTGACTACTGGG...,1,CAYTMIVVVTPFDYWGQGTLVT,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
335,IGHJ5,BASTA-7M_S170,1,0.000675,IGHV3-30,1,TGTGAGAAAGGGGTCAACTGGTTCGACCCCTGGGGCCAGGGAACCC...,1,CEKGVNWFDPWGQGTLVT,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
336,IGHJ4,BASTA-7M_S170,1,0.000675,IGHV3-15,1,TATACCACAGGAACGATTGCAGTGGTGGTATCTGCTACGCCTTTTG...,1,YTTGTIAVVVSATPFDYWGQGTLVT,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_lookup_clones = pd.merge(df_cdr3, df_clones, how='inner', on='cdr3pep')
df_lookup_clones

Unnamed: 0,index,sequence,sheet,sequence_without_YY,cdr3pep,J_sub,Sample,UMIs,UMIs.frac,V_sub,...,mut.count_x.mode,mut.count_x.sum,mut.count_y.mean,mut.count_y.mode,mut.count_y.sum,mut.frac_x.mean,mut.frac_y.mean,nr_sites.mean,nr_sites.mode,nr_sites.sum
0,21,YYCARAAAEITTIIKWKYFDLWGQGTLVTVSSASTK,B005,CARAAAEITTIIKWKYFDLWGQGTLVTVSSASTK,CARAAAEITTIIKWKYFDLWGQGTLVT,IGHJ4,SS3-lip_S201,1,,IGHV3-74,...,23.0,23.0,0.0,0.0,0.0,0.097046,0.0,0.0,0.0,0.0
1,21,YYCARAAAEITTIIKWKYFDLWGQGTLVTVSSASTK,B005,CARAAAEITTIIKWKYFDLWGQGTLVTVSSASTK,CARAAAEITTIIKWKYFDLWGQGTLVT,IGHJ5,SS3-lip_S201,1,,IGHV3-74,...,22.0,22.0,0.0,0.0,0.0,0.092827,0.0,0.0,0.0,0.0
2,91,YYCTRAWELLPQYWGQGTLVTVSSASTK,B005,CTRAWELLPQYWGQGTLVTVSSASTK,CTRAWELLPQYWGQGTLVT,IGHJ1,SS3-lung-1a-B_S110,5,0.000418,IGHV3-74,...,23.0,140.0,0.0,0.0,0.0,0.100649,0.0,1.0,1.0,6.0
3,91,YYCTRAWELLPQYWGQGTLVTVSSASTK,B005,CTRAWELLPQYWGQGTLVTVSSASTK,CTRAWELLPQYWGQGTLVT,IGHJ1,SS3-lung-1b-B_S111,2,0.000502,IGHV3-74,...,23.0,48.0,0.0,0.0,0.0,0.103448,0.0,1.0,1.0,2.0
4,105,YYCARDWSGTYRAVNWGQGTLVTVSSASTK,B005,CARDWSGTYRAVNWGQGTLVTVSSASTK,CARDWSGTYRAVNWGQGTLVT,IGHJ4,B005-SG-B_S1,3,8.5e-05,IGHV3-48,...,17.0,62.0,0.333333,0.0,1.0,0.086111,0.009259,1.0,1.0,3.0
5,105,YYCARDWSGTYRAVNWGQGTLVTVSSASTK,B005,CARDWSGTYRAVNWGQGTLVTVSSASTK,CARDWSGTYRAVNWGQGTLVT,IGHJ4,BASTA-1G_S161,1,0.000556,IGHV3-48,...,4.0,4.0,1.0,1.0,1.0,0.035714,0.027778,1.0,1.0,1.0
6,118,YYCARDWSGTYRAVNWGQGTLVTVSSASTK,B005,CARDWSGTYRAVNWGQGTLVTVSSASTK,CARDWSGTYRAVNWGQGTLVT,IGHJ4,B005-SG-B_S1,3,8.5e-05,IGHV3-48,...,17.0,62.0,0.333333,0.0,1.0,0.086111,0.009259,1.0,1.0,3.0
7,118,YYCARDWSGTYRAVNWGQGTLVTVSSASTK,B005,CARDWSGTYRAVNWGQGTLVTVSSASTK,CARDWSGTYRAVNWGQGTLVT,IGHJ4,BASTA-1G_S161,1,0.000556,IGHV3-48,...,4.0,4.0,1.0,1.0,1.0,0.035714,0.027778,1.0,1.0,1.0
8,11,YYCAKGTPFSATGVFDYWGQGTLVTVSSASTK,B007,CAKGTPFSATGVFDYWGQGTLVTVSSASTK,CAKGTPFSATGVFDYWGQGTLVT,IGHJ4,B007-Lnpre-B_S2,6,0.000456,IGHV3-23,...,5.0,74.0,0.0,0.0,0.0,0.028148,0.0,0.090909,0.0,1.0
9,11,YYCAKGTPFSATGVFDYWGQGTLVTVSSASTK,B007,CAKGTPFSATGVFDYWGQGTLVTVSSASTK,CAKGTPFSATGVFDYWGQGTLVT,IGHJ4,B007-Lnpost-B_S3,60,0.001687,IGHV3-23,...,6.0,968.0,0.0,0.0,0.0,0.060697,0.0,0.04529,0.0,3.0


In [14]:
df_lookup_clones.to_excel(outClones)
print("Wrote", outClones, "to disk")

Wrote basta-clones.xlsx to disk


## Read all info files and lookup CDR3s

In [15]:
allinfo_files = [x for x in os.listdir(".") if x.endswith(".all_info.csv")]
allinfo_files[:10]

['B005-SG-B_S1_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B007-BMDC-B_S4_L001.assembled-AGTCAGTC-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B007-Lnpre-B_S2_L001.assembled-ACTGACTG-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'BASTA-1467-B_S49_L001.assembled-ACTGACTG-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B007-Lnpost-B_S3_L001.assembled-AGCTAGCT-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B007-PBBMDC-1-B_S5_L001.assembled-ATCGATCG-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B007-PBBMDC-2-B_S6_L001.assembled-ATGCATGC-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B009-clone2-B_S56_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B009-clone6-B_S57_L001.assembled-ACTGACTG-IGH_HUMAN-all_info.csv.rr.all_info.csv',
 'B009-clone10-B_S58_L001.assembled-AGCTAGCT-IGH_HUMAN-all_info.csv.rr.all_info.csv']

In [16]:
def lookupInAllinfo(df_cdr3, df_allinfo):
    df_tmp = pd.merge(df_cdr3, df_allinfo, how='inner', on='cdr3pep')
    return(df_tmp)

In [17]:
df_allinfo = pd.read_csv(allinfo_files[0], sep="\t")
sample_name, rest = allinfo_files[0].split("_L001")
df_allinfo["Sample"] = sample_name
df_allinfo.head()

Unnamed: 0,acc,beforeMID,MID,afterMID,readingframe,cdr3pep,cdr3nuc,cdr3_qual_min,cdr3_qual_max,cdr3_qual_avg,...,acc2,nr_v_mains,nr_v_subs,nr_v_alleles,nr_j_subs,nr_j_alleles,acc3,readingframe2,nr_sites,Sample
0,M02984:466:000000000-BJD2F:1:1101:10002:8181,TACCCGTCT,ACGTACGT,CTTAC,4,CAKLTGYYYDSSGVRAFDIWGQGTMVT,TGTGCGAAACTCACTGGGTATTACTATGATAGTAGTGGCGTACGGG...,10,40,38.2,...,M02984:466:000000000-BJD2F:1:1101:10002:8181,1,1,1,1,1,,,0,B005-SG-B_S1
1,M02984:466:000000000-BJD2F:1:1101:10008:9626,GAACTCCCG,ACGTACGT,CTTAC,4,CAKDLQVAVAGTGVYWGQGTLVT,TGTGCGAAAGATCTTCAAGTAGCAGTGGCTGGTACGGGAGTCTACT...,40,40,40.0,...,M02984:466:000000000-BJD2F:1:1101:10008:9626,1,1,1,1,1,,,0,B005-SG-B_S1
2,M02984:466:000000000-BJD2F:1:1101:10009:16113,TTCCGAGAG,ACGTACGT,CTTAC,4,CARDNALLSGMDVWGQGTTVT,TGTGCGAGAGATAACGCTCTACTATCCGGTATGGACGTCTGGGGCC...,37,40,40.0,...,M02984:466:000000000-BJD2F:1:1101:10009:16113,1,1,1,1,1,,,0,B005-SG-B_S1
3,M02984:466:000000000-BJD2F:1:1101:10017:5969,AAATGTATT,ACGTACGT,CTTAC,4,CARHRRFLDLFFDIWDQGTMVT,TGTGCGAGACATCGACGATTTTTGGATCTTTTTTTTGATATCTGGG...,36,40,39.8,...,M02984:466:000000000-BJD2F:1:1101:10017:5969,1,1,1,1,1,,,0,B005-SG-B_S1
4,M02984:466:000000000-BJD2F:1:1101:10027:22362,TACCACTCT,ACGTACGT,CTTAC,4,CARFLVEDAFDIWGQGTLVT,TGTGCGCGATTCTTGGTGGAAGATGCTTTTGATATCTGGGGCCAAG...,40,40,40.0,...,M02984:466:000000000-BJD2F:1:1101:10027:22362,1,1,1,1,1,,,0,B005-SG-B_S1


In [18]:
df_lookup_allinfo = lookupInAllinfo(df_cdr3, df_allinfo)
print(sample_name, len(df_lookup_allinfo))

B005-SG-B_S1 304


In [19]:
for allinfo_file in allinfo_files[1:]:
    df_allinfo = pd.read_csv(allinfo_file, sep="\t")
    sample_name, rest = allinfo_file.split("_L001")
    df_allinfo["Sample"] = sample_name
    df_tmp = lookupInAllinfo(df_cdr3, df_allinfo)
    df_lookup_allinfo = pd.concat([df_lookup_allinfo, df_tmp])
    print(sample_name, len(df_tmp))
print("ALL", len(df_lookup_allinfo))

  interactivity=interactivity, compiler=compiler, result=result)


B007-BMDC-B_S4 0
B007-Lnpre-B_S2 110
BASTA-1467-B_S49 0
B007-Lnpost-B_S3 241
B007-PBBMDC-1-B_S5 0
B007-PBBMDC-2-B_S6 25
B009-clone2-B_S56 0


  interactivity=interactivity, compiler=compiler, result=result)


B009-clone6-B_S57 0
B009-clone10-B_S58 0
B009-clone17-B_S59 0
SS3-lung-1a-B_S110 55
SS3-lung-1b-B_S111 45
SS3-lung-1c-B_S112 30
BASTA-2172-B_S150 0
2172-mem_S166 0
3684-MZ_S181 1
BASTA-1467-B_S196 0
B007-naive_S206 0
2172-MZ_S167 0
3684-naive_S182 0
B007-PB_S207 0
1193-DN_S152 0
2172-naive_S168 0
3684-PB_S183 0
B005-PB_S197 0
1193-mem_S153 0
2172-PB_S169 0
1193-MZ_S154 0
1467-DN_S184 0
1193-naive_S155 0
3684-CD20p_S170 0
1467-mem_S185 0
2172-CD20p_S156 0
3684-DN_S171 0
1467-MZ_S186 0
2172-DN_S157 0
3684-mem_S172 0
1467-naive_S187 0
SS4-PE-slice_S57 2
2172-PE-block_S37 0
SS4-PE-block_S38 0
3684-PE-block_S39 79
1467-PE-block_S40 5
BASTA-0352_S195 0
SS3-lip_S201 76
B007-LN_S202 615
BASTA17_S198 0
BASTA18_S199 0
BASTA11_S176 0
BASTA19_S200 0
BASTA12_S177 0
BASTA20_S201 0
BASTA13_S178 0
BASTA14_S179 0
BASTA15_S180 0
BASTA16_S181 0
BASTA-1D_S160 0
BASTA-1G_S161 16
BASTA-1M_S162 0
BASTA-2G_S163 1
BASTA-2M_S164 0
BASTA-3G_S165 0
BASTA-3M_S171 0
BASTA-4M_S172 0
BASTA-6G_S166 0
BASTA-6M_S167 0
B

In [20]:
df_lookup_allinfo.to_excel(outAllinfo)
print("Wrote", outAllinfo, "to disk")

Wrote basta-allinfo.xlsx to disk


## Lookup sequences as sub sequence in the entire peptide sequence

In [21]:
cdr3_list = list(set(df_cdr3['sequence']))
len(cdr3_list)

408

In [22]:
def lookupSubSequence(pep):
    # cdr3_list is a global variable (list)
    hits = list()
    for cdr3 in cdr3_list:
        if cdr3 in pep:
            hits.append(cdr3)
    if len(hits) == 0:
        return(None)
    elif len(hits) == 1:
        return(hits[0])
    else:
        return(",".join(hits))

In [23]:
def countHits(x):
    if x is None:
        return(0)
    else:
        return(len(x.split(",")))

In [24]:
def lookupSubSequencePerFile(df_allinfo):
    df_allinfo['hits'] = [x for x in map(lookupSubSequence, df_allinfo['pep'])]
    df_allinfo['hit_count'] = [x for x in map(countHits, df_allinfo['hits'])]
    df_tmp = df_allinfo[df_allinfo['hits'].notna()]
    return(df_tmp)

In [25]:
df_allinfo = pd.read_csv(allinfo_files[0], sep="\t")
sample_name, rest = allinfo_files[0].split("_L001")
df_allinfo["Sample"] = sample_name
df_lookup_long = lookupSubSequencePerFile(df_allinfo)
print(sample_name, len(df_lookup_long))

B005-SG-B_S1 2196


In [26]:
for allinfo_file in allinfo_files[1:]:
    df_allinfo = pd.read_csv(allinfo_file, sep="\t")
    sample_name, rest = allinfo_file.split("_L001")
    df_allinfo["Sample"] = sample_name
    df_tmp = lookupSubSequencePerFile(df_allinfo)
    df_lookup_long = pd.concat([df_lookup_long, df_tmp])
    print(sample_name, len(df_tmp))
print("ALL", len(df_lookup_long))

  interactivity=interactivity, compiler=compiler, result=result)


B007-BMDC-B_S4 1112
B007-Lnpre-B_S2 500
BASTA-1467-B_S49 0
B007-Lnpost-B_S3 1074
B007-PBBMDC-1-B_S5 891
B007-PBBMDC-2-B_S6 439
B009-clone2-B_S56 0


  interactivity=interactivity, compiler=compiler, result=result)


B009-clone6-B_S57 0
B009-clone10-B_S58 0
B009-clone17-B_S59 0
SS3-lung-1a-B_S110 218
SS3-lung-1b-B_S111 92
SS3-lung-1c-B_S112 110
BASTA-2172-B_S150 0
2172-mem_S166 0
3684-MZ_S181 0
BASTA-1467-B_S196 13
B007-naive_S206 0
2172-MZ_S167 0
3684-naive_S182 0
B007-PB_S207 0
1193-DN_S152 0
2172-naive_S168 0
3684-PB_S183 0
B005-PB_S197 0
1193-mem_S153 0
2172-PB_S169 0
1193-MZ_S154 0
1467-DN_S184 22
1193-naive_S155 0
3684-CD20p_S170 0
1467-mem_S185 0
2172-CD20p_S156 1
3684-DN_S171 0
1467-MZ_S186 0
2172-DN_S157 0
3684-mem_S172 0
1467-naive_S187 1
SS4-PE-slice_S57 7
2172-PE-block_S37 0
SS4-PE-block_S38 4
3684-PE-block_S39 0
1467-PE-block_S40 38
BASTA-0352_S195 13
SS3-lip_S201 1537
B007-LN_S202 1454
BASTA17_S198 2
BASTA18_S199 0
BASTA11_S176 28
BASTA19_S200 0
BASTA12_S177 0
BASTA20_S201 0
BASTA13_S178 0
BASTA14_S179 0
BASTA15_S180 0
BASTA16_S181 17
BASTA-1D_S160 0
BASTA-1G_S161 8
BASTA-1M_S162 22
BASTA-2G_S163 0
BASTA-2M_S164 0
BASTA-3G_S165 0
BASTA-3M_S171 0
BASTA-4M_S172 151
BASTA-6G_S166 1
BASTA

In [27]:
df_lookup_long.to_excel(outAllinfoLong)
print("Wrote", outAllinfoLong, "to disk")

Wrote basta-allinfo-long.xlsx to disk
