In [1]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [34]:
#Create a selector for the intended size exclusion media
# Catalogue of available media: each display label maps to its
# [lower, upper] molecular-weight limits. The labels quote the limits in kDa,
# the stored numbers are the same limits multiplied by 1000.
geldict = dict([
    ("Bio P 0.1-1.8 kDa", [100, 1800]),
    ("Bio P 0.8-4.0 kDa", [800, 4000]),
    ("Bio P 1.0-6.0 kDa", [1000, 6000]),
    ("Bio P 1.5-20.0 kDa", [1500, 20000]),
    ("Bio P 2.5-40.0 kDA", [2500, 40000]),
    ("Bio P 3.0-60.0 kDa", [3000, 60000]),
    ("Bio P 5.0-100 kDa", [5000, 100000]),
    ("S-X 0.4-14.0 kDa", [400, 14000]),
    ("S-X <2.0 kDA", [0, 2000]),
    ("S-X <0.4 kDA", [0, 400]),
    ("Bio A 10.0 - 1,500 kDA", [10000, 1500000]),
])
# Dropdown offering every media label defined in geldict.
gelselect = Dropdown(options=list(geldict.keys()), description="Media")
display(gelselect)

# Label holding the confirmed media name; keeps this placeholder text until
# the Confirm button is pressed.
filttype = Label(value = "Media: Waiting")

# Output area shown under the button. It is created here so the cell does not
# rely on an `output` object left over from an earlier kernel session.
output = widgets.Output(layout={'border': '1px solid black'})

testbutton = widgets.Button(description="Confirm")
display(testbutton, output)


def on_button_clicked(button=None):
    """Store the current dropdown selection in `filttype` and echo it in `output`.

    Parameters
    ----------
    button : ipywidgets.Button, optional
        The clicked button. ipywidgets passes it to every `on_click` handler;
        it is not used here and defaults to None so the function can also be
        called directly without arguments.
    """
    filttype.value = str(gelselect.value)
    # Replace whatever was shown before with the confirmed label and its range.
    output.clear_output(wait=True)
    with output:
        print(gelselect.value)
        print(geldict[gelselect.value])


testbutton.on_click(on_button_clicked)


Dropdown(description='Media', options=('Bio P 0.1-1.8 kDa', 'Bio P 0.8-4.0 kDa', 'Bio P 1.0-6.0 kDa', 'Bio P 1…

Button(description='Confirm', style=ButtonStyle())

Output(layout=Layout(border='1px solid black'), outputs=({'name': 'stdout', 'text': 'Bio P 0.1-1.8 kDa\n[100, …

In [35]:
# Show the label text; it reads "Media: Waiting" until the Confirm button
# callback overwrites it with the selected media name.
print(filttype.value)

Media: Waiting


In [14]:
# Bordered Output widget; text is added both through append_stdout and by
# printing while the widget is used as a context manager.
out = widgets.Output(layout=dict(border='1px solid black'))
out.append_stdout('Settings')

with out:
    for i in range(10):
        print(str(i) + ' Hello world!')

# Bare expression so the notebook renders the widget as the cell result.
out

Output(layout=Layout(border='1px solid black'))

In [16]:
from IPython.display import YouTubeVideo
# Display the video while `out` (the Output widget created in an earlier cell)
# is the active context manager.
with out:
    display(YouTubeVideo('eWzY2nGfkXk'))

# Rebind `out` to a new bordered Output widget and fill it through the
# append_* methods instead of the context manager.
out = widgets.Output(layout={'border': '1px solid black'})
out.append_stdout('Output appended with append_stdout')
out.append_display_data(YouTubeVideo('eWzY2nGfkXk'))
# Bare expression so the notebook renders the new widget as the cell result.
out

Output(layout=Layout(border='1px solid black'), outputs=({'output_type': 'stream', 'name': 'stdout', 'text': '…

In [3]:
#Generate a file path for the correct .fasta file
# `filename` is the FASTA file name (with its .faa extension); `datafile` is
# that name joined onto the relative 'data' directory.

filename = 'bacillus_halodurans.faa'
# Alternative kept for interactive use: ask for the file name instead of hard-coding it.
#filename = input("Enter File Name")
datafile = os.path.join('data', filename)

In [4]:
# Output directory for this run:
#   data/<file name without ".faa">_size_exclusion <selected media label>
# The label comes from the dropdown, so the folder changes with the selection.
data_folder = os.path.join('data', filename.replace(".faa","")+"_size_exclusion "+gelselect.value)

# exist_ok=True keeps the cell re-runnable: an already existing folder (from a
# previous run with the same media) is reused instead of raising FileExistsError.
os.makedirs(data_folder, exist_ok=True)

In [5]:
# Molecular-weight window of the selected media: geldict stores [lower, upper].
size_min, size_max = geldict[gelselect.value]

unfiltered = []  # every record read from the input file
wash = []        # weight below size_min
fractions = []   # size_min <= weight <= size_max
unpassed = []    # weight above size_max

with open(datafile) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        unfiltered.append(record)
        # Compute the weight once per record; it is compared against both limits.
        mol_weight = PA(str(record.seq)).molecular_weight()
        if mol_weight < size_min:
            wash.append(record)
        elif mol_weight <= size_max:
            fractions.append(record)
        else:
            unpassed.append(record)

# Common prefix of the four result files: <data_folder>/<file name without ".faa">
out_prefix = os.path.join(data_folder, filename.replace(".faa", ""))
SeqIO.write(unfiltered, out_prefix + "_total.faa", "fasta")
SeqIO.write(wash, out_prefix + "_wash.faa", "fasta")
SeqIO.write(fractions, out_prefix + "_fractions.faa", "fasta")
# Kept as the last expression so the notebook displays the value returned by
# this final write (the "unpassed" file).
SeqIO.write(unpassed, out_prefix + "_unpassed.faa", "fasta")

4016

In [6]:
#Converting modified fasta into a pd dataframe to show filter at work, preset file name used for simplicity

# One dict per protein read back from the "_total" file; the DataFrame is
# built from the whole list in a single step afterwards.
protein_rows = []
with open(os.path.join(data_folder, filename.replace(".faa", "") + "_total.faa")) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        protein_rows.append({
            'sequence': sequence,
            'MW': PA(sequence).molecular_weight(),
            # Description text with every occurrence of the record id removed.
            'description': str(record.description.replace(record.id, "")),
            'ID': str(record.id),
        })

# The explicit column list fixes the column order and keeps all four columns
# present even when the file contained no records.
df = pd.DataFrame(protein_rows, columns=['sequence', 'MW', 'description', 'ID'])

# Sort ascending by molecular weight without mutating in place; the original
# row positions remain as the index labels.
df = df.sort_values(by=["MW"])
df.head(10)

Unnamed: 0,sequence,MW,description,ID
3463,MDVKELLSGLD,1219.4048,BH3464~unknown [Bacillus halodurans],gi|10176087|dbj|BAB07183.1|
3861,MNYLLTKIIILQ,1462.8381,BH3862~unknown [Bacillus halodurans],gi|10176487|dbj|BAB07581.1|
837,MKDTVLANPAVIYE,1563.8129,BH0838~unknown [Bacillus halodurans],gi|10173453|dbj|BAB04557.1|
2710,MIQKRFSKPMIVK,1606.0516,BH2711~unknown [Bacillus halodurans],gi|10175332|dbj|BAB06430.1|
1031,MKDPKRMTVMIIHP,1697.1408,BH1032~unknown [Bacillus halodurans],gi|10173647|dbj|BAB04751.1|
1489,MIMYQKEEEEASSF,1721.9019,BH1490~unknown [Bacillus halodurans],gi|10174107|dbj|BAB05209.1|
3082,MNMADFQTWVLSYPI,1816.1039,BH3083~unknown [Bacillus halodurans],gi|10175705|dbj|BAB06802.1|
1723,MVHITQGKEPDVFHVV,1836.1186,BH1724~unknown [Bacillus halodurans],gi|10174341|dbj|BAB05443.1|
1155,MKRTIQLFKMMELEGG,1912.3439,BH1156~unknown [Bacillus halodurans],gi|10173772|dbj|BAB04875.1|
3306,MSEKGRALNDNKKQTNR,1990.206,BH3307~unknown [Bacillus halodurans],gi|10175930|dbj|BAB07026.1|


In [7]:
def _write_fraction(frame, path):
    """Write the rows of `frame` to `path` as FASTA records.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide the columns 'sequence', 'ID' and 'description'.
    path : str
        Destination file.

    Returns
    -------
    The value returned by SeqIO.write for this file.
    """
    records = [
        SeqRecord(Seq(row.sequence), id=str(row.ID), description=row.description)
        for row in frame.itertuples(index=False)
    ]
    return SeqIO.write(records, path, "fasta")


nfractions = 7
# Rows per fraction. Because of the rounding, the last fraction takes whatever
# is left and can be larger or smaller than the others.
fraclen = round(len(df)/nfractions)
fraction_prefix = os.path.join(data_folder, filename.replace(".faa", "") + "_fraction")

for n in range(nfractions):
    start = n * fraclen
    # The final fraction is open-ended so no trailing rows are dropped.
    stop = (n + 1) * fraclen if n < nfractions - 1 else None
    tempdf = df.iloc[start:stop]
    _write_fraction(tempdf, fraction_prefix + str(n + 1) + ".faa")

# After the loop `tempdf` holds the last fraction; display it as the cell result.
tempdf

Unnamed: 0,sequence,MW,description,ID
1922,MSIIQFPKEMKWGVATASYQIEGAINAGGRGASIWDVFAKTPGKVK...,51624.5481,beta-glucosidase [Bacillus halodurans],gi|10174541|dbj|BAB05642.1|
3271,MSDKMEINWLKEVKNRRDEIVYELREFLQYDSVYDEKTRKAGAPFG...,51661.9196,Xaa-His dipeptidase [Bacillus halodurans],gi|10175895|dbj|BAB06991.1|
731,MTTSSRKRWPLFRSLANVLERKMILPTPRMAWIAMTGALLVGVGYA...,51685.0841,BH0732~unknown conserved protein in others [B...,gi|10173346|dbj|BAB04451.1|
823,MDRKETAKLMENVMTELSEKEGFLGIEDDEKRKRIVLSAQEILTTT...,51740.0034,glutamate dehydrogenase [Bacillus halodurans],gi|10173438|dbj|BAB04543.1|
1425,MTDYRVERDLIGEKKVPKEAYYGIQTMRARENFPITGYPPHEELIR...,51766.7409,aspartate ammonia-lyase [Bacillus halodurans],gi|10174043|dbj|BAB05145.1|
...,...,...,...,...
2417,MNEEQRVRQERFKLLMEQLLIPEDVTANHLKDGKIEKLTIKKDERR...,162124.2311,DNA polymerase III (alpha subunit) [Bacillus ...,gi|10175038|dbj|BAB06137.1|
1727,MQNYGYPKKQGLYDPQFEHDNCGIGFLAHMKGQKSHKIVEDALHIL...,170065.0097,glutamate synthase (large subunit) [Bacillus ...,gi|10174346|dbj|BAB05447.1|
974,MSSTLWIFYDQYVQRATVDGLPTFIGPGEQATLRISSLKNDIQVRD...,170264.5877,BH0975~unknown conserved protein [Bacillus ha...,gi|10173590|dbj|BAB04694.1|
360,MKTLKHISYKLRIFALIALLISQTLLTSLSLPFQAVANSEKSGLTN...,183655.3749,BH0361~unknown [Bacillus halodurans],gi|10172974|dbj|BAB04080.1|
