In [1]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [2]:
#Create a selector for the intended size exclusion media
geldict = {
    "Q Media (Triethylamine +)" : -1,
    "S Media (Sulfite -)" : 1,
}
gelselect = Dropdown(options=geldict.keys(),description="Media")
display(gelselect)

pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1)
display(pHslider)




Dropdown(description='Media', options=('Q Media (Triethylamine +)', 'S Media (Sulfite -)'), value='Q Media (Tr…

FloatSlider(value=7.0, max=14.0)

In [62]:
#Generate a file path for the correct .fasta file
filename = input("Enter File Name")
datafile = os.path.join('data', filename)

Enter File Name Streptococcus_pyogenes.faa


In [63]:
data_folder = os.path.join('data', filename.replace(".faa","")+"_affinity_chromatography")

os.mkdir(data_folder)

In [3]:
def getHistPercent(protstring):
    Hcount = 0
    for AA in protstring:
        if AA == "H":
            Hcount += 1
    return(Hcount/len(protstring))

In [37]:
def findHistChains(protstring):
    chainDict = {}
    chainlen = 1
    lastAA = ""
    for AA in protstring:
        if AA == "H":
            if lastAA == "H":
                chainlen += 1
        elif chainlen > 0 and lastAA == "H":
            #print(f"Chain found, current length {chainlen}")
            if str(chainlen) in chainDict:
                chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
            else:
                chainDict[str(chainlen)] = 1
            chainlen = 1
        lastAA = AA
    if chainlen > 0:
        if str(chainlen) in chainDict:
            chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
        else:
            chainDict[str(chainlen)] = 1
    return(chainDict)
        
findHistChains("HAAHHHHBDFHHAHH")

{'1': 1, '4': 1, '2': 2}

In [46]:
def HistScore(protstring):
    histChains = findHistChains(protstring)
    score = 0
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
        #print(key, histChains[key], score)
    return(score)
        
HistScore("HAAHHHHBDFHHAHH")

25

In [64]:
unfiltered = []
wash = []
fractions = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        if HistScore(record.seq) < 8:
            wash.append(record)
        else:
            fractions.append(record)    
        
SeqIO.write(unfiltered,os.path.join(data_folder,filename.replace(".faa","")+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,filename.replace(".faa","")+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,filename.replace(".faa","")+"_fractions.faa"),"fasta")


706

In [79]:
datadict = {
    'sequence' : [],
    'length' : [],
    'isoelectric_point' : [],
    'hist_score' : [],
    'max_chain_length': [],
    'ID' : [],
    'description' : []
}

with open(os.path.join(data_folder,filename.replace(".faa","")+"_fractions.faa"),"r") as prot_file:
    for record in SeqIO.parse(prot_file,"fasta"):
        sequence = str(record.seq)  #It is not strictly necessary to record the sequence as a string, but it is easier to work with and displays better. (Ex:len() does not work on Seq objects)
        parameters = PA(sequence)
        datadict['sequence'].append(sequence)
        datadict['length'].append(len(sequence))
        datadict['isoelectric_point'].append(parameters.isoelectric_point())
        datadict['hist_score'].append(HistScore(sequence))
        datadict['max_chain_length'].append(max(findHistChains(sequence).keys()))
        datadict['ID'].append(record.id)
        datadict['description'].append(record.description)

df = pd.DataFrame.from_dict(datadict)
df.sort_values('hist_score',inplace = True)
df

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
230,MDKTVLYFGLVEISFDVPDFIFQKIKEQNNKIRYYSKSINILGSVK...,319,9.261222,8,1,ERL23115,ERL23115 pep supercontig:SpyogGA06023v1.0:cont...
551,MIRIKNITKSKFFGTAIILLQQLIALLILVYNRENLSLLFSEKVAL...,520,7.045045,8,1,ERL11869,ERL11869 pep supercontig:SpyogGA06023v1.0:cont...
104,MGLSYQEELTLPFELCDVKSDIKLPLLLDYCLMVSGRQSAQLGRSN...,250,5.167881,8,1,ERL21440,ERL21440 pep supercontig:SpyogGA06023v1.0:cont...
552,MPISFQLISSKSQKRKDLRMKYNKTKYPNIFWYITLKGKRYYIRRG...,380,9.600841,8,1,ERL22906,ERL22906 pep supercontig:SpyogGA06023v1.0:cont...
106,MKMKTLLGIIAGKAAQSILTKLGRGSTYPGRLALACDKDILKDLSK...,447,6.677240,8,1,ERL21431,ERL21431 pep supercontig:SpyogGA06023v1.0:cont...
...,...,...,...,...,...,...,...
393,MKKKVNQGSKRYQYLLKKWGIGFVIAATGTVVLGCTPSILTHQVAA...,1165,6.892433,33,2,ERL21600,ERL21600 pep supercontig:SpyogGA06023v1.0:cont...
372,MMITTVAADELTTMSEPTITNHAQQQAQRLTNTELSSAESKSQDTS...,1621,6.139543,34,1,ERL23382,ERL23382 pep supercontig:SpyogGA06023v1.0:cont...
565,MKKTYGYIGSVAAILLATHIGSYQLGKHHMGLATKDNQIAYIDDSK...,823,5.910369,37,2,ERL14902,ERL14902 pep supercontig:SpyogGA06023v1.0:cont...
345,MTSDSKEQTVTGYQYHYIDQEGRKQPFNQGWRFLMADVACAQDPSF...,1138,5.400636,37,2,ERL08986,ERL08986 pep supercontig:SpyogGA06023v1.0:cont...


In [80]:
nfractions = 7
fraclen = round(len(df)/nfractions)
for n in range(nfractions - 1):
    fracrecords = []
    tempdf = df.iloc[n*fraclen:(n+1)*fraclen]
    for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        fracrecords.append(record)
    SeqIO.write(fracrecords,os.path.join(data_folder,filename.replace(".faa","")+"_fraction"+ str(n+1) + ".faa"),"fasta")

tempdf = df.iloc[(nfractions-1)*fraclen:]
remainderrecord = []
for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        remainderrecord.append(record)
SeqIO.write(remainderrecord,os.path.join(data_folder,filename.replace(".faa","")+"_fraction"+ str(nfractions) + ".faa"),"fasta")
tempdf

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
2,MCGIVGVVGNRNATDILMQGLEKLEYRGYDSAGIFVANANQTNLIK...,604,5.102516,18,1,ERL19302,ERL19302 pep supercontig:SpyogGA06023v1.0:cont...
116,MSFDGFFLHHLTNELKENLLYGRIQKVNQPFERELVLTIRNHRKNY...,550,8.510230,18,2,ERL21443,ERL21443 pep supercontig:SpyogGA06023v1.0:cont...
617,MAAWSKMIKGLANLAKGCIINVIDLKMILLEKFIIHLKGAHYFSYR...,462,6.011087,18,1,ERL20683,ERL20683 pep supercontig:SpyogGA06023v1.0:cont...
454,MDKHLLVKRTLGCVCAATLMGAALATHHDSLNTVKAEEKTVQVQKK...,775,6.613126,18,2,ERL19734,ERL19734 pep supercontig:SpyogGA06023v1.0:cont...
335,MPASKKVTIIFILNLSFSLIEFIFGTLFFSGAILADAVHDFGDAIA...,291,6.337457,18,2,ERL12326,ERL12326 pep supercontig:SpyogGA06023v1.0:cont...
...,...,...,...,...,...,...,...
393,MKKKVNQGSKRYQYLLKKWGIGFVIAATGTVVLGCTPSILTHQVAA...,1165,6.892433,33,2,ERL21600,ERL21600 pep supercontig:SpyogGA06023v1.0:cont...
372,MMITTVAADELTTMSEPTITNHAQQQAQRLTNTELSSAESKSQDTS...,1621,6.139543,34,1,ERL23382,ERL23382 pep supercontig:SpyogGA06023v1.0:cont...
565,MKKTYGYIGSVAAILLATHIGSYQLGKHHMGLATKDNQIAYIDDSK...,823,5.910369,37,2,ERL14902,ERL14902 pep supercontig:SpyogGA06023v1.0:cont...
345,MTSDSKEQTVTGYQYHYIDQEGRKQPFNQGWRFLMADVACAQDPSF...,1138,5.400636,37,2,ERL08986,ERL08986 pep supercontig:SpyogGA06023v1.0:cont...
