In [1]:
import pandas as pd
import regex

## Read IgBlast output (AIRR format)

In [2]:
# The IgBlast AIRR table is tab-separated even though the file name ends in .csv.
igblast_path = "basta-allinfo-long-mut-nglyc-cdr3peplengthRT.igblast.csv"
df = pd.read_csv(igblast_path, sep="\t")
df.head()

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,v_call,d_call,...,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length
0,M02984:466:000000000-BJD2F:1:1105:4900:10042,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,119.0,232.0,,,233.0,280.0,CTACCC,6.0,GATGTAGCCCGA,12.0
1,M02984:466:000000000-BJD2F:1:1109:10088:21530,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,119.0,232.0,,,233.0,280.0,CTACCC,6.0,GATGTAGCCCGA,12.0
2,M02984:466:000000000-BJD2F:1:1112:22723:21149,ACTCTCCTGTGCAGCCTCTGGATTCACCTTAAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,119.0,232.0,,,233.0,280.0,CTACCC,6.0,GATGTAGCCCGA,12.0
3,M02984:466:000000000-BJD2F:1:1113:11675:23479,ACTCTCCTGTGCAGCCTCTGGAATCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,119.0,232.0,,,233.0,280.0,CTACCC,6.0,GATGTAGCCCGA,12.0
4,M02984:466:000000000-BJD2F:1:1115:15412:14634,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,119.0,232.0,,,233.0,280.0,CTACCC,6.0,GATGTAGCCCGA,12.0


In [3]:
# Show the column labels of the loaded table.
df.columns

Index(['sequence_id', 'sequence', 'locus', 'stop_codon', 'vj_in_frame',
       'productive', 'rev_comp', 'complete_vdj', 'v_call', 'd_call', 'j_call',
       'sequence_alignment', 'germline_alignment', 'sequence_alignment_aa',
       'germline_alignment_aa', 'v_alignment_start', 'v_alignment_end',
       'd_alignment_start', 'd_alignment_end', 'j_alignment_start',
       'j_alignment_end', 'v_sequence_alignment', 'v_sequence_alignment_aa',
       'v_germline_alignment', 'v_germline_alignment_aa',
       'd_sequence_alignment', 'd_sequence_alignment_aa',
       'd_germline_alignment', 'd_germline_alignment_aa',
       'j_sequence_alignment', 'j_sequence_alignment_aa',
       'j_germline_alignment', 'j_germline_alignment_aa', 'fwr1', 'fwr1_aa',
       'cdr1', 'cdr1_aa', 'fwr2', 'fwr2_aa', 'cdr2', 'cdr2_aa', 'fwr3',
       'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr3', 'cdr3_aa', 'junction',
       'junction_length', 'junction_aa', 'junction_aa_length', 'v_score',
       'd_score', 'j_score', 'v_c

In [4]:
# Collect the amino-acid columns of the framework (fwr*) and CDR (cdr*) regions,
# keeping the order in which they appear in the table.
region_prefixes = ("fwr", "cdr")
aa_cols = [
    name
    for name in df.columns
    if name.endswith("_aa") and name.startswith(region_prefixes)
]
aa_cols

['fwr1_aa', 'cdr1_aa', 'fwr2_aa', 'cdr2_aa', 'fwr3_aa', 'fwr4_aa', 'cdr3_aa']

## Search for motif in each of the aa columns

In [5]:
# Motif to look for: Asn, then any residue except Pro, then Ser or Thr
# (one-letter codes: N, not P, S or T). Compiled once here; findMotif reads
# it as a global.
P = regex.compile("N[^P][ST]")

In [6]:
# Search for motif in the protein translation
def findMotif(myseq, pattern=None):
    """Return every motif occurrence in ``myseq`` as a comma-separated string.

    Parameters
    ----------
    myseq : str or missing value
        Amino-acid sequence to scan. ``None`` and float NaN are treated as
        "no sequence"; any other value is converted with ``str()``.
    pattern : compiled pattern, optional
        Object exposing ``search(string, pos)``. Defaults to the
        notebook-level global ``P``.

    Returns
    -------
    str
        The matched substrings, ordered by start position and joined with
        ",". An empty string means no occurrence was found. Overlapping
        occurrences are all reported (e.g. "NNTS" yields "NNT,NTS").
    """
    if pattern is None:
        pattern = P  # global compiled in an earlier cell

    # A missing cell must not be scanned as the literal text "nan" / "None".
    if myseq is None or (isinstance(myseq, float) and myseq != myseq):
        return ""

    seq = str(myseq)
    found = []
    pos = 0
    # Restart the search one position after each hit's start so that a motif
    # beginning inside the previous hit is not skipped.
    while pos <= len(seq):
        hit = pattern.search(seq, pos)
        if hit is None:
            break
        found.append(hit.group(0))
        pos = hit.start() + 1
    return ",".join(found)
# Quick sanity check: one hit, a proline-blocked non-hit, and two hits.
for example in ("XXXNRSXXX", "XXXNPTXXX", "XXXNRSXXXNOTXXX"):
    print(findMotif(example))

NRS

NRS,NOT


In [7]:
# For every region column, store its motif hits in a sibling "<column>_nglyc" column.
for region_col in aa_cols:
    hits = [findMotif(residues) for residues in df[region_col]]
    df[f"{region_col}_nglyc"] = hits
df.head()

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,v_call,d_call,...,np1_length,np2,np2_length,fwr1_aa_nglyc,cdr1_aa_nglyc,fwr2_aa_nglyc,cdr2_aa_nglyc,fwr3_aa_nglyc,fwr4_aa_nglyc,cdr3_aa_nglyc
0,M02984:466:000000000-BJD2F:1:1105:4900:10042,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,6.0,GATGTAGCCCGA,12.0,,,,,,,
1,M02984:466:000000000-BJD2F:1:1109:10088:21530,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,6.0,GATGTAGCCCGA,12.0,,,,,,,
2,M02984:466:000000000-BJD2F:1:1112:22723:21149,ACTCTCCTGTGCAGCCTCTGGATTCACCTTAAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,6.0,GATGTAGCCCGA,12.0,,,,,,,
3,M02984:466:000000000-BJD2F:1:1113:11675:23479,ACTCTCCTGTGCAGCCTCTGGAATCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,6.0,GATGTAGCCCGA,12.0,,,,,,,
4,M02984:466:000000000-BJD2F:1:1115:15412:14634,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD4-17*01,...,6.0,GATGTAGCCCGA,12.0,,,,,,,


## How many sequences contain the motif?

In [8]:
# Total number of rows in df, for comparison with the motif-containing subset selected next.
len(df)

9663

In [9]:
# Keep the rows in which at least one region reported a motif hit (non-empty
# "<region>_aa_nglyc" value). The flag columns are derived from aa_cols so the
# filter stays in step with the annotation loop instead of repeating seven
# hard-coded column names.
nglyc_cols = [f"{col}_nglyc" for col in aa_cols]
df_nglyc = df[df[nglyc_cols].ne("").any(axis=1)]
df_nglyc

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,v_call,d_call,...,np1_length,np2,np2_length,fwr1_aa_nglyc,cdr1_aa_nglyc,fwr2_aa_nglyc,cdr2_aa_nglyc,fwr3_aa_nglyc,fwr4_aa_nglyc,cdr3_aa_nglyc
136,M02984:489:000000000-C42NT:1:2115:5498:6265,ACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATTACTGGATG...,IGH,F,T,T,T,F,"IGHV3-74*01,IGHV3-74*03",IGHD6-13*01,...,21.0,CCAAG,5.0,,,,NST,,,
137,M02984:489:000000000-C42NT:1:2115:5498:6265,ACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATTACTGGATG...,IGH,F,T,T,T,F,"IGHV3-74*01,IGHV3-74*03",IGHD6-13*01,...,21.0,CCAAG,5.0,,,,NST,,,
152,M02984:489:000000000-C42NT:1:1101:23301:15798,ACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
155,M02984:489:000000000-C42NT:1:1103:24386:5547,ACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23*05",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
156,M02984:489:000000000-C42NT:1:1104:23302:13854,ACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
157,M02984:489:000000000-C42NT:1:1106:12435:10086,ACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGCTATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
158,M02984:489:000000000-C42NT:1:1119:22550:20807,ACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
160,M02984:489:000000000-C42NT:1:2105:13570:6941,ACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
161,M02984:489:000000000-C42NT:1:2107:25010:14260,ACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,
163,M02984:489:000000000-C42NT:1:2108:10438:7151,ACTCACCTGTGCAGCCTCTGGATTCTCCTTTAGGAACTACGCCATG...,IGH,F,T,T,T,F,"IGHV3-23*01,IGHV3-23*04,IGHV3-23D*01",IGHD6-6*01,...,13.0,G,1.0,,,,,NKT,,


In [10]:
# Save the motif-containing rows; the file name is held once so the
# confirmation message cannot drift from the path actually written.
nglyc_xlsx = 'basta-allinfo-long-mut-nglyc-cdr3peplengthRT-igblast-nglyc.xlsx'
df_nglyc.to_excel(nglyc_xlsx)
print("Wrote " + nglyc_xlsx + " to disk")

Wrote basta-allinfo-long-mut-nglyc-cdr3peplengthRT-igblast-nglyc.xlsx to disk
