In [1]:
import pandas as pd
import regex

## Read IgBLAST output (AIRR format)

In [2]:
# The file carries a .csv extension but is parsed as tab-separated.
igblast_path = "basta-allinfo-long-mut-nglyc-cdr3peplength-VJ.igblast.csv"
df = pd.read_csv(igblast_path, sep="\t")
df.head()

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,v_call,d_call,...,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length
0,M02984:466:000000000-BJD2F:1:1101:10160:22921,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,120.0,233.0,,,234.0,284.0,GGGCCGGGGATCG,13,AAAAACCTTCTCCG,14.0
1,M02984:466:000000000-BJD2F:1:1101:10314:6196,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,120.0,233.0,,,234.0,284.0,GGGCCGGGGATCG,13,AAAAACCTTCTCCG,14.0
2,M02984:466:000000000-BJD2F:1:1101:11151:22824,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGTTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,120.0,233.0,,,234.0,284.0,GGGCCGGGGATCG,13,AAAAACCTTCTCCG,14.0
3,M02984:466:000000000-BJD2F:1:1101:11198:8015,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,120.0,233.0,,,234.0,284.0,GGGCCGGGGATCG,13,AAAAACCTTCTCCG,14.0
4,M02984:466:000000000-BJD2F:1:1101:11436:16814,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAACAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,120.0,233.0,,,234.0,284.0,GGGCCGGGGATCG,13,AAAAACCTTCTCCG,14.0


In [3]:
# Show the column labels of the loaded table.
df.columns

Index(['sequence_id', 'sequence', 'locus', 'stop_codon', 'vj_in_frame',
       'productive', 'rev_comp', 'complete_vdj', 'v_call', 'd_call', 'j_call',
       'sequence_alignment', 'germline_alignment', 'sequence_alignment_aa',
       'germline_alignment_aa', 'v_alignment_start', 'v_alignment_end',
       'd_alignment_start', 'd_alignment_end', 'j_alignment_start',
       'j_alignment_end', 'v_sequence_alignment', 'v_sequence_alignment_aa',
       'v_germline_alignment', 'v_germline_alignment_aa',
       'd_sequence_alignment', 'd_sequence_alignment_aa',
       'd_germline_alignment', 'd_germline_alignment_aa',
       'j_sequence_alignment', 'j_sequence_alignment_aa',
       'j_germline_alignment', 'j_germline_alignment_aa', 'fwr1', 'fwr1_aa',
       'cdr1', 'cdr1_aa', 'fwr2', 'fwr2_aa', 'cdr2', 'cdr2_aa', 'fwr3',
       'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr3', 'cdr3_aa', 'junction',
       'junction_length', 'junction_aa', 'junction_aa_length', 'v_score',
       'd_score', 'j_score', 'v_c

In [4]:
# Keep the amino-acid columns of the framework (fwr*) and CDR (cdr*) regions,
# i.e. names that start with "fwr" or "cdr" and end with "_aa".
aa_cols = [
    name
    for name in df.columns
    if name.startswith(("fwr", "cdr")) and name.endswith("_aa")
]
aa_cols

['fwr1_aa', 'cdr1_aa', 'fwr2_aa', 'cdr2_aa', 'fwr3_aa', 'fwr4_aa', 'cdr3_aa']

## Search for motif in each of the aa columns

In [5]:
# N-linked glycosylation motif Asn-X-Ser/Thr, where X is any residue except
# proline. Note that [^P] excludes only "P": any other character in the middle
# position matches. The pattern is case-sensitive (upper-case residues only).
P = regex.compile("N[^P][ST]")

In [6]:
# Search for motif in the protein translation
def findMotif(myseq): # P is a global variable
    """Return all motif occurrences in ``myseq`` as a comma-separated string.

    The sequence is scanned with the module-level compiled pattern ``P``.
    Overlapping occurrences are reported too: in ``"NNTS"`` both ``NNT`` and
    ``NTS`` are returned, whereas a plain non-overlapping scan would resume
    after the first hit and miss the second one.

    Parameters
    ----------
    myseq : object
        Amino-acid sequence. It is converted with ``str()`` before matching,
        so non-string cell values (e.g. NaN) do not raise.

    Returns
    -------
    str
        Matched substrings in order of position, joined by ``","``; the empty
        string when nothing matches.
    """
    # overlapped=True is a feature of the `regex` module (not stdlib `re`):
    # the search restarts one character after each match start.
    hits = P.finditer(str(myseq), overlapped=True)
    return ",".join(hit.group(0) for hit in hits)
# Quick sanity checks: one hit, a proline-blocked site (no hit), and two hits.
for demo_seq in ("XXXNRSXXX", "XXXNPTXXX", "XXXNRSXXXNOTXXX"):
    print(findMotif(demo_seq))

NRS

NRS,NOT


In [7]:
# For every selected region column, add a sibling "<column>_nglyc" column
# holding the comma-joined motif hits found in each row's sequence.
for aa_col in aa_cols:
    df[f"{aa_col}_nglyc"] = [findMotif(seq) for seq in df[aa_col]]
df.head()

Unnamed: 0,sequence_id,sequence,locus,stop_codon,vj_in_frame,productive,rev_comp,complete_vdj,v_call,d_call,...,np1_length,np2,np2_length,fwr1_aa_nglyc,cdr1_aa_nglyc,fwr2_aa_nglyc,cdr2_aa_nglyc,fwr3_aa_nglyc,fwr4_aa_nglyc,cdr3_aa_nglyc
0,M02984:466:000000000-BJD2F:1:1101:10160:22921,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,13,AAAAACCTTCTCCG,14.0,,,,,,,
1,M02984:466:000000000-BJD2F:1:1101:10314:6196,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,13,AAAAACCTTCTCCG,14.0,,,,,,,
2,M02984:466:000000000-BJD2F:1:1101:11151:22824,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGTTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,13,AAAAACCTTCTCCG,14.0,,,,,,,
3,M02984:466:000000000-BJD2F:1:1101:11198:8015,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,13,AAAAACCTTCTCCG,14.0,,,,,,,
4,M02984:466:000000000-BJD2F:1:1101:11436:16814,AGGTTTCCTGCAAGGCTTCTGGAGGCACCTTCAACAACTATGCTAT...,IGH,F,T,T,T,F,"IGHV1-69*06,IGHV1-69*14",IGHD1-7*01,...,13,AAAAACCTTCTCCG,14.0,,,,,,,
