In [2]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [3]:
# Pick the ion-exchange media and the running pH for the simulated column.
# Each media label maps to a sign (-1 or 1); the filtering cell multiplies this
# sign by a protein's net charge to decide whether the protein is washed out.
geldict = {
    "Q Media (Triethylamine +)" : -1,
    "S Media (Sulfite -)" : 1,
}

# Build both controls first, then render them together (dropdown above slider).
gelselect = widgets.Dropdown(description="Media", options=tuple(geldict))
pHslider = widgets.FloatSlider(min=0, max=14, step=0.1, value=7.0)
display(gelselect, pHslider)

# Unfinished sketch, kept for reference: a "Confirm" button that would snapshot
# the widget values once instead of reading them live from the widgets.
#testbutton = widgets.Button(description="Confirm")
#output = widgets.Output()
#display(testbutton, output)
#def on_button_clicked(b):
#    with output:
#        runpH = pHslider.value
#        gelcharge = int(geldict[gelselect.value])

#testbutton.on_click(on_button_clicked)



Dropdown(description='Media', options=('Q Media (Triethylamine +)', 'S Media (Sulfite -)'), value='Q Media (Tr…

FloatSlider(value=7.0, max=14.0)

In [15]:
# Locate the input proteome (a protein FASTA, .faa) inside the local data/ folder.
# To ask for the name interactively instead, swap in the commented line below.
filename = "bacillus_halodurans.faa"
#filename = input("Enter File Name")

# Relative path to the input file, built with the platform's path separator.
datafile = os.path.join("data", filename)

In [16]:
# Output folder for this run: data/<input name without .faa>_ion_exchange_<X>,
# where <X> is the first character of the selected media label ("Q" or "S").
data_folder = os.path.join('data', filename.replace(".faa","")+"_ion_exchange_"+gelselect.value[0])

# makedirs(..., exist_ok=True) rather than mkdir: re-running this cell (or the
# whole notebook) must not raise FileExistsError once the folder already exists.
os.makedirs(data_folder, exist_ok=True)

In [32]:
# Simulate loading the column: every protein in the input FASTA is kept in
# `unfiltered`; those whose net charge at the chosen pH, multiplied by the media
# sign from `geldict`, is negative go to `wash`, all others go to `fractions`.
runpH = pHslider.value
gelcharge = int(geldict[gelselect.value])

# Common prefix of the three output files: <data_folder>/<input name without .faa>
outstem = os.path.join(data_folder, filename.replace(".faa", ""))

unfiltered, wash, fractions = [], [], []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        washed_out = protparams.charge_at_pH(runpH) * gelcharge < 0
        (wash if washed_out else fractions).append(record)

SeqIO.write(unfiltered, outstem + "_total.faa", "fasta")
SeqIO.write(wash, outstem + "_wash.faa", "fasta")
# Last expression of the cell: the number of retained records is shown as output.
SeqIO.write(fractions, outstem + "_fractions.faa", "fasta")


2740

In [33]:
# Read the retained ("fractions") FASTA back in and tabulate it as a DataFrame
# with one row per protein: sequence, net charge at the chosen pH, description
# and ID. The table is then sorted by charge so the filter's effect is visible.

charge_col = 'Charge at pH '+str(pHslider.value)
fractions_path = os.path.join(data_folder, filename.replace(".faa","")+"_fractions.faa")

rows = []
with open(fractions_path) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        # The FASTA description repeats the record id at its start. Drop only
        # that leading copy and the whitespace after it; a blanket
        # .replace(id, "") leaves a leading space and would also delete the id
        # text if it happened to appear later in the description.
        description = record.description
        if description.startswith(record.id):
            description = description[len(record.id):]
        rows.append({
            'sequence': sequence,
            charge_col: PA(sequence).charge_at_pH(pHslider.value),
            'description': description.strip(),
            'ID': str(record.id),
        })

# Explicit column list keeps the column order stable even when `rows` is empty.
df = pd.DataFrame(rows, columns=['sequence', charge_col, 'description', 'ID'])

# Ascending order when the selected media maps to -1 in geldict, descending
# otherwise. The original row labels are kept (no index reset).
df = df.sort_values(by=charge_col, ascending=(int(geldict[gelselect.value]) == -1))
df#.head(10)

Unnamed: 0,sequence,Charge at pH 6.9,description,ID
1362,MKHLSNKLLLLAMVAVLISHTFLASIGLPLQAIANATANVFTNVEL...,-230.287158,BH2014~unknown [Bacillus halodurans],gi|10174633|dbj|BAB05733.1|
228,MKTLKHISYKLRIFALIALLISQTLLTSLSLPFQAVANSEKSGLTN...,-182.611985,BH0361~unknown [Bacillus halodurans],gi|10172974|dbj|BAB04080.1|
2185,MVKSKFLVLFSVFSLLFGVFVVGFSHQGVKAEEERPMGTAFYESFD...,-129.620780,"endo-beta-1,3-1,4 glucanase (licheninase) [Ba...",gi|10175855|dbj|BAB06951.1|
2231,MKRKWQKLLSILSVWMILFASFAPSIAGAAEANSLANGEYTIDFKV...,-124.475843,BH3298~unknown [Bacillus halodurans],gi|10175921|dbj|BAB07017.1|
1503,MKALLIKLSIGLLLLLMFLPVALWYLKEPTELQVTILDKTVPDETY...,-121.537502,BH2230~unknown [Bacillus halodurans],gi|10174849|dbj|BAB05949.1|
...,...,...,...,...
44,MKIKEMIVVEGRDDTIAIQRAVNADTIETNGSAVNEETLKRIALAQ...,-0.057205,BH0056~unknown conserved protein [Bacillus ha...,gi|10172668|dbj|BAB03775.1|
243,MISFQQVTKKYRQKAALQEVNLELTRGKIIGLVGENGSGKSTTLKL...,-0.052113,ABC transporter (ATP-binding protein) [Bacill...,gi|10172995|dbj|BAB04101.1|
2003,MIEFKNVSLVYPNGTQGLKDVNLKINEGEFVVIVGLSGAGKSTLIR...,-0.049484,alkylphosphonate ABC tranporter (ATP-binding ...,gi|10175596|dbj|BAB06693.1|
2353,MQQQSFMQSGQQQGVMPQPPQVITVKDQLYITDMLSWNLLACKKAH...,-0.022177,BH3489~unknown [Bacillus halodurans],gi|10176113|dbj|BAB07208.1|


In [7]:
# Cut the sorted table into `nfractions` consecutive slices and write each slice
# to its own FASTA file (<input name>_fraction<k>.faa, k = 1..nfractions).
nfractions = 7


def _write_fraction(frame, number):
    """Write the rows of `frame` as FASTA records to fraction file `number`.

    `frame` must have 'sequence', 'ID' and 'description' columns. Returns the
    number of records written, as reported by SeqIO.write.
    """
    fracrecords = [
        SeqRecord(Seq(seq), id=str(prot_id), description=desc)
        for seq, prot_id, desc in zip(frame['sequence'], frame['ID'], frame['description'])
    ]
    outpath = os.path.join(data_folder, filename.replace(".faa","")+"_fraction"+ str(number) + ".faa")
    return SeqIO.write(fracrecords, outpath, "fasta")


# Even split: every slice gets `base` rows and the first `extra` slices get one
# more. Sizing slices with round(len(df)/nfractions) can round up, so the early
# slices consume rows meant for the later ones (11 rows would be split
# 2,2,2,2,2,1,0, leaving the last file empty).
base, extra = divmod(len(df), nfractions)
start = 0
for n in range(nfractions):
    stop = start + base + (1 if n < extra else 0)
    tempdf = df.iloc[start:stop]
    _write_fraction(tempdf, n + 1)
    start = stop

# Display the last fraction as the cell output.
tempdf

Unnamed: 0,sequence,MW,description,ID


In [1]:
# Scratch cell: prints a fixed string to check that the kernel is responding.
scratch_text = "test (in parantheses)"
print(scratch_text)

test (in parantheses)
