In [2]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [3]:
def listtostr(strlist):
    """Return the strings in ``strlist`` joined by single underscores.

    An empty input yields an empty string.
    """
    separator = "_"
    return separator.join(strlist)
def simplifyname(orgname):
    """Abbreviate an underscore-separated organism name to its first initial.

    The first underscore-separated token is cut down to its first character
    and every remaining token is kept unchanged, e.g.
    ``"Yersinia_pestis"`` -> ``"Y_pestis"``.

    An empty first token (as in ``""`` or ``"_pestis"``) stays empty instead
    of raising ``IndexError``.
    """
    first, *rest = orgname.split("_")
    # Slicing, unlike indexing with [0], tolerates an empty first token.
    return "_".join([first[:1], *rest])

In [4]:
# Create the selectors for the ion-exchange run: the media and the buffer pH.
# geldict maps each media label to the sign that later cells multiply the
# protein charge by (-1 for Q media, +1 for S media).
geldict = {
    "Q Media (Triethylamine +)" : -1,
    "S Media (Sulfite -)" : 1,
}
gelselect = widgets.Dropdown(options=list(geldict), description="Media")
display(gelselect)

# Later cells read pHslider.value when they run, so set the slider first.
pHslider = widgets.FloatSlider(value=7.0, min=0, max=14, step=0.1, description="pH")
display(pHslider)



Dropdown(description='Media', options=('Q Media (Triethylamine +)', 'S Media (Sulfite -)'), value='Q Media (Tr…

FloatSlider(value=7.0, max=14.0)

In [62]:
# Ask for the FASTA file name (looked up inside the local "data" folder) and
# derive the organism name from it.
# strip() keeps accidental leading/trailing whitespace out of the path.
input_filename = input("Enter File Name").strip()
datafile = os.path.join('data', input_filename)

# Remove only a trailing ".faa"; str.replace would also delete ".faa"
# occurring elsewhere in the name.
fasta_ext = ".faa"
if input_filename.endswith(fasta_ext):
    organism_name = input_filename[:-len(fasta_ext)]
else:
    organism_name = input_filename

Enter File Name Yersinia_pestis.faa


In [63]:
# Folder that receives every FASTA file written for this organism.
data_folder = os.path.join('data', organism_name +"_ion_exchange")

# exist_ok=True makes the cell safe to re-run, and makedirs also creates the
# parent "data" folder if it does not exist yet.
os.makedirs(data_folder, exist_ok=True)

In [64]:
# Sort every protein of the input FASTA into one of two lists, keeping a full
# copy as well: records whose charge at the slider pH, multiplied by the
# selected media's sign, is negative go to `wash`; all others to `fractions`.
unfiltered, wash, fractions = [], [], []
with open(datafile) as fasta_handle:
    for protein in SeqIO.parse(fasta_handle, "fasta"):
        unfiltered.append(protein)
        net_charge = PA(str(protein.seq)).charge_at_pH(pHslider.value)
        media_sign = int(geldict[gelselect.value])
        destination = wash if net_charge*media_sign < 0 else fractions
        destination.append(protein)

# File prefix: first letter of the media label, the pH with "." replaced by
# "_", and the abbreviated organism name.
ph_tag = str(pHslider.value).replace(".","_")
data_name = gelselect.value[0] + ph_tag + "_" + simplifyname(organism_name)

for suffix, records in (("_total.faa", unfiltered), ("_wash.faa", wash)):
    SeqIO.write(records, os.path.join(data_folder, data_name+suffix), "fasta")
# Kept as the cell's last expression so the number of retained records is shown.
SeqIO.write(fractions, os.path.join(data_folder, data_name+"_fractions.faa"), "fasta")


533

In [65]:
# Read the retained proteins back from the "_fractions" FASTA file and
# tabulate them in a DataFrame, so the effect of the filter can be inspected.

# Snapshot the pH once so the column label and the computed charges always
# refer to the same value.
run_ph = pHslider.value
charge_col = f'charge at ph {run_ph}'

data = {
    'sequence' : [],
    'description' : [],
    charge_col : [],
    'ID' : []
}

with open(os.path.join(data_folder,data_name+"_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        sequence = str(record.seq)
        data['sequence'].append(sequence)
        data[charge_col].append(round(PA(sequence).charge_at_pH(run_ph),2))
        data['description'].append(record.description)
        data['ID'].append(record.id)

# Q media (sign -1) is sorted ascending, S media (sign +1) descending.
# Chaining instead of inplace=True keeps the cell idempotent on re-run.
df = (
    pd.DataFrame.from_dict(data)
    .sort_values(by=[charge_col], ascending=int(geldict[gelselect.value]) == -1)
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,description,charge at ph 6.0,ID
0,MAIVKCKPTSPGRRHVVKVVNPELHKGKPYAPLLEKLSKSGGRNNN...,sp|P60436|RL2_YERPE 50S ribosomal protein L2 O...,40.18,sp|P60436|RL2_YERPE
1,MSISLIQPERDLFSYQPYWAECYGTAPFLPMSREEMDILGWDSCDI...,sp|Q8CZS4|Y674_YERPE Putative UPF0313 protein ...,33.72,sp|Q8CZS4|Y674_YERPE
2,MGASATNSVTHPAFTLNVRPDNIGIITIDVVGDKVNTLKAEFADQI...,sp|Q8ZD45|FADJ_YERPE Fatty acid oxidation comp...,26.54,sp|Q8ZD45|FADJ_YERPE
3,MSGWRKIYYKLLNLPLKLLVKSKVIPADPVSELGLDPSRPILYVLP...,sp|Q8ZJ18|PLSB_YERPE Glycerol-3-phosphate acyl...,26.46,sp|Q8ZJ18|PLSB_YERPE
4,MNLLTMSTELIYIFLFSMAFLFVARKVAIKIGLVDKPNYRKRHQGL...,sp|Q8ZAE1|WECA_YERPE Undecaprenyl-phosphate al...,26.05,sp|Q8ZAE1|WECA_YERPE
...,...,...,...,...
528,MTQVYNFSAGPAMLPVEVLRRAEQELRNWHGLGTSVMEISHRSKEF...,sp|Q8ZGB4|SERC_YERPE Phosphoserine aminotransf...,0.08,sp|Q8ZGB4|SERC_YERPE
529,MAHVTSVTLGEHLTGFVGEMIQSGRYGNISEVLRDALRLMEAREQR...,sp|Q9ZGW3|PARD_YERPE Antitoxin ParD OS=Yersini...,0.08,sp|Q9ZGW3|PARD_YERPE
530,MISIERLSKTYPQGGLPMVALEEVSLEIPTGSVFGIVGRSGAGKST...,sp|Q7CHF8|METN2_YERPE Methionine import ATP-bi...,0.04,sp|Q7CHF8|METN2_YERPE
531,MINEIRKDAEVRMEKCLEAFQNHISKIRTGRASPSILDGIQVEYYG...,sp|Q8ZH63|RRF_YERPE Ribosome-recycling factor ...,0.01,sp|Q8ZH63|RRF_YERPE


In [74]:
# Split the charge-sorted proteins in `df` into `nfractions` consecutive
# fractions, write each fraction to its own FASTA file, and record which
# fractions contain the sequence of interest.
nfractions = 7
fraclen = round(len(df)/nfractions)
seqofinterest = "HHHHHH"

# seqhits maps the 1-based fraction number (as a string) to
# [number of hits, [df index labels of the hits]].
seqhits = {}

for n in range(nfractions):
    fracnum = n + 1
    # The last fraction takes every remaining row, so nothing is dropped when
    # len(df) is not an exact multiple of fraclen.
    stop = None if fracnum == nfractions else fracnum*fraclen
    tempdf = df.iloc[n*fraclen:stop]

    fracrecords = []
    rows = zip(tempdf.index, tempdf['sequence'], tempdf['ID'], tempdf['description'])
    for index, sequence, prot_id, description in rows:
        fracrecords.append(SeqRecord(Seq(sequence), id=str(prot_id), description=description))
        if seqofinterest in sequence:
            # Look up and update the same key (this fraction's own number),
            # so repeated hits accumulate instead of overwriting each other.
            hit_entry = seqhits.setdefault(str(fracnum), [0, []])
            hit_entry[0] += 1
            hit_entry[1].append(index)

    SeqIO.write(fracrecords,os.path.join(data_folder,data_name+"_fraction"+ str(fracnum) + ".faa"),"fasta")

# After the loop tempdf holds the last fraction; display it.
tempdf

Unnamed: 0,sequence,description,charge at ph 6.0,ID
456,MTQDASPILTSLLDTDAYKLHMQQAVFHHYRHITVAAEFRCRSDEL...,sp|Q8ZG93|PNCB_YERPE Nicotinate phosphoribosyl...,1.09,sp|Q8ZG93|PNCB_YERPE
457,MSGSQTLVVKLGTSVLTGGSRRLNRAHIVELVRQCAQQHAKGHRIV...,sp|Q8ZC08|PROB_YERPE Glutamate 5-kinase OS=Yer...,1.04,sp|Q8ZC08|PROB_YERPE
458,MSTFSLKIIRVGITVLVVVLAVIAIFNVWAFYTESPWTRDAKFTAD...,sp|Q8ZAU9|AAEA_YERPE p-hydroxybenzoic acid eff...,1.01,sp|Q8ZAU9|AAEA_YERPE
459,MPYLLEMKDITKQFGVVKAVDNISLTLEAGQVLSLCGENGSGKSTL...,sp|Q7CFR2|XYLG_YERPE Xylose import ATP-binding...,1.00,sp|Q7CFR2|XYLG_YERPE
460,MIPLQHGLILAAILFVLGLTGLLIRRNLLFMLISLEVMINAAALAF...,sp|Q7CJ87|NUOK_YERPE NADH-quinone oxidoreducta...,1.00,sp|Q7CJ87|NUOK_YERPE
...,...,...,...,...
528,MTQVYNFSAGPAMLPVEVLRRAEQELRNWHGLGTSVMEISHRSKEF...,sp|Q8ZGB4|SERC_YERPE Phosphoserine aminotransf...,0.08,sp|Q8ZGB4|SERC_YERPE
529,MAHVTSVTLGEHLTGFVGEMIQSGRYGNISEVLRDALRLMEAREQR...,sp|Q9ZGW3|PARD_YERPE Antitoxin ParD OS=Yersini...,0.08,sp|Q9ZGW3|PARD_YERPE
530,MISIERLSKTYPQGGLPMVALEEVSLEIPTGSVFGIVGRSGAGKST...,sp|Q7CHF8|METN2_YERPE Methionine import ATP-bi...,0.04,sp|Q7CHF8|METN2_YERPE
531,MINEIRKDAEVRMEKCLEAFQNHISKIRTGRASPSILDGIQVEYYG...,sp|Q8ZH63|RRF_YERPE Ribosome-recycling factor ...,0.01,sp|Q8ZH63|RRF_YERPE


In [75]:
# Report, fraction by fraction, the df index of every protein that contains
# the sequence of interest.
for fraction, hit_info in seqhits.items():
    print(f'Hits in fraction {fraction}')
    for hit_index in hit_info[1]:
        print(f'\tHit on index {hit_index}')

Hits in fraction 5
	Hit on index 355


sequence                                              MNRVQFNHHHHHHPD
description         sp|Q8D079|LPHI_YERPE his operon leader peptide...
charge at ph 6.0                                                 2.89
ID                                               sp|Q8D079|LPHI_YERPE
Name: 355, dtype: object

sequence                                              MNRVQFNHHHHHHPD
description         sp|Q8D079|LPHI_YERPE his operon leader peptide...
charge at ph 6.0                                                 2.89
ID                                               sp|Q8D079|LPHI_YERPE
Name: 355, dtype: object
