In [99]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [100]:
# Create a selector for the intended size exclusion media.
# Each label maps to the medium's [lower, upper] fractionation limits; the numbers
# are the label's kDa figures multiplied by 1000 (i.e. daltons), which is the unit
# the later cells compare molecular weights against.
geldict = {
    "Bio P 0.1-1.8 kDa" : [100,1800],
    "Bio P 0.8-4.0 kDa" : [800,4000],
    "Bio P 1.0-6.0 kDa" : [1000,6000],
    "Bio P 1.5-20.0 kDa" : [1500,20000],
    "Bio P 2.5-40.0 kDA" : [2500,40000],
    "Bio P 3.0-60.0 kDa" : [3000,60000],
    "Bio P 5.0-100 kDa" : [5000,100000],
    "S-X 0.4-14.0 kDa" : [400,14000],
    "S-X <2.0 kDA" : [0,2000],
    "S-X <0.4 kDA" : [0,400],
    "Bio A 10.0 - 1,500 kDA" : [10000,1500000],
}
# NOTE(review): the labels are also used verbatim in the output directory name by a
# later cell, so they are intentionally left untouched here (including "kDA").
gelselect = widgets.Dropdown(options=list(geldict), description="Media")
display(gelselect)


testbutton = widgets.Button(description="Confirm")
output = widgets.Output()
display(testbutton, output)


def on_button_clicked(b):
    """Report the currently selected medium and its range in the output area.

    b: the Button that was clicked (passed in by ipywidgets; not used).

    Later cells read ``gelselect.value`` directly, so this callback only gives
    the user visible feedback about what they confirmed.
    """
    filttype = gelselect.value
    filtrange = geldict[filttype]
    # Replace any previous confirmation instead of stacking messages.
    output.clear_output()
    with output:
        print("Selected media:", filttype, "| range (Da):", filtrange[0], "-", filtrange[1])


testbutton.on_click(on_button_clicked)



Dropdown(description='Media', options=('Bio P 0.1-1.8 kDa', 'Bio P 0.8-4.0 kDa', 'Bio P 1.0-6.0 kDa', 'Bio P 1…

Button(description='Confirm', style=ButtonStyle())

Output()

In [101]:
# Locate the input proteome: a .faa (protein FASTA) file inside the local "data" directory.

filename = "bacillus_halodurans.faa"
# filename = input("Enter File Name")  # alternative: ask for the file name interactively
datafile = os.path.join("data", filename)

In [102]:
# Output directory for this run: data/<input name without ".faa">_size_exclusion <medium label>.
# NOTE(review): some medium labels contain "<", which is not a valid file-name
# character on Windows -- confirm the target platform.
data_folder = os.path.join('data', filename.replace(".faa","")+"_size_exclusion "+gelselect.value)
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError as soon
# as the directory was already present from a previous run.
os.makedirs(data_folder, exist_ok=True)

In [108]:
# Split the input proteome into three groups by molecular weight relative to the
# selected medium's range, then write each group (plus the full set) to FASTA.
size_min, size_max = geldict[gelselect.value]

unfiltered = []  # every record read from the input file
wash = []        # molecular weight below size_min
fractions = []   # size_min <= molecular weight <= size_max
unpassed = []    # molecular weight above size_max

with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        # Compute the weight once per record; it is needed for both comparisons.
        mol_weight = protparams.molecular_weight()
        if mol_weight < size_min:
            wash.append(record)
        elif mol_weight <= size_max:
            fractions.append(record)
        else:
            unpassed.append(record)

# All output files share the input file's base name and live in data_folder.
basename = filename.replace(".faa","")
SeqIO.write(unfiltered,os.path.join(data_folder,basename+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,basename+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,basename+"_fractions.faa"),"fasta")
# Left as the cell's last expression so the number of records written is displayed.
SeqIO.write(unpassed,os.path.join(data_folder,basename+"_unpassed.faa"),"fasta")

367

In [104]:
# Convert the size-filtered FASTA (the "_fractions" file) into a pandas DataFrame to
# show the filter at work; the preset file name is used for simplicity.

fractions_path = os.path.join(data_folder, filename.replace(".faa", "") + "_fractions.faa")

# One dict per protein; building the frame once from this list keeps the four
# columns aligned by construction instead of relying on parallel lists.
rows = []
with open(fractions_path) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        # Remove only the leading ID from the header text, then trim the whitespace
        # that separated it from the rest. A blanket replace() would also delete the
        # ID if it appeared later in the text, and would leave a leading space that
        # ends up in the headers when these rows are written back out as FASTA.
        description = record.description
        if description.startswith(record.id):
            description = description[len(record.id):]
        rows.append({
            "sequence": sequence,
            "MW": PA(sequence).molecular_weight(),
            "description": description.strip(),
            "ID": str(record.id),
        })

# Explicit columns keep the layout stable (and sortable) even if no record was read.
df = pd.DataFrame(rows, columns=["sequence", "MW", "description", "ID"])

# Sort ascending by molecular weight; the original row labels are kept as the index.
df = df.sort_values(by=["MW"])
df

Unnamed: 0,sequence,MW,description,ID
316,MTVFEALMFAVAFATLIIAVLSFHEKK,3028.6700,BH0344~unknown [Bacillus halodurans],gi|10172957|dbj|BAB04063.1|
1054,MSGAYHGGFALIVVLFILLVIVGAAWYY,3044.6490,BH1178~unknown [Bacillus halodurans],gi|10173794|dbj|BAB04897.1|
922,MNTILILGVCLAFLVSIFTAGYEDNPHKN,3194.7197,BH1024~unknown [Bacillus halodurans],gi|10173639|dbj|BAB04743.1|
2908,MDSESHDQGNMFISIGATVTIAAFSMMADK,3206.6015,BH3226~unknown [Bacillus halodurans],gi|10175849|dbj|BAB06945.1|
1323,MPRKFQQLIIYIMIITIVVGSLLMGVSIF,3325.1858,BH1467~unknown [Bacillus halodurans],gi|10174084|dbj|BAB05186.1|
...,...,...,...,...
3469,MKRRLFPLTFSAKMMGFIALLIIAMFVLLGVFLNEQYARTLEEQMG...,59813.1839,two-component sensor histidine kinase [Bacill...,gi|10176464|dbj|BAB07558.1|
3379,MDQQSIVSILMILATFGLYIGISIYNRARATSDFYVASRGVPPFWN...,59852.7093,Na+/solute symporter (Ssf family) [Bacillus h...,gi|10176367|dbj|BAB07462.1|
1750,MMNSPLLLSHFIERAERYFPTKQVISRTLNGKKSLTYQQIGERTRR...,59872.7326,medium-chain fatty acid-CoA ligase [Bacillus ...,gi|10174550|dbj|BAB05651.1|
2442,MHKIISMPKRLLVKEQNVQTKTTGERLRLFLEELGPTFVKMGQMAS...,59885.9180,ABC transporter [Bacillus halodurans],gi|10175329|dbj|BAB06427.1|


In [105]:
# Cut the MW-sorted table into consecutive slices and write each one to its own
# FASTA file ("<basename>_fraction<k>.faa", k = 1..nfractions) in data_folder.
nfractions = 7
# Rows per fraction for all but the last one, which takes whatever remains.
# NOTE(review): because of round(), the last fraction can be noticeably larger or
# smaller than the others (or empty) when len(df) is not a multiple of nfractions.
fraclen = round(len(df)/nfractions)


def _write_fraction(frame, number):
    """Write the rows of `frame` as FASTA records to the file for fraction `number`.

    frame:  DataFrame slice with 'sequence', 'ID' and 'description' columns.
    number: 1-based fraction number used in the output file name.
    Returns the value returned by SeqIO.write.
    """
    records = [
        SeqRecord(Seq(sequence), id=str(prot_id), description=description)
        for sequence, prot_id, description in zip(
            frame["sequence"], frame["ID"], frame["description"]
        )
    ]
    target = os.path.join(
        data_folder, filename.replace(".faa","")+"_fraction"+ str(number) + ".faa"
    )
    return SeqIO.write(records, target, "fasta")


for n in range(nfractions):
    start = n*fraclen
    # An open-ended slice for the final fraction picks up all remaining rows.
    stop = (n+1)*fraclen if n < nfractions - 1 else None
    tempdf = df.iloc[start:stop]
    _write_fraction(tempdf, n+1)

# After the loop, tempdf is the last fraction; display it.
tempdf

Unnamed: 0,sequence,MW,description,ID
3310,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,45809.6237,transposase (04) [Bacillus halodurans],gi|10176288|dbj|BAB07383.1|
1886,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,45809.6237,transposase (04) [Bacillus halodurans],gi|10174712|dbj|BAB05812.1|
286,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,45809.6237,transposase (04) [Bacillus halodurans],gi|10172924|dbj|BAB04030.1|
355,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,45809.6237,transposase (04) [Bacillus halodurans],gi|10173000|dbj|BAB04106.1|
328,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,45809.6237,transposase (04) [Bacillus halodurans],gi|10172969|dbj|BAB04075.1|
...,...,...,...,...
3469,MKRRLFPLTFSAKMMGFIALLIIAMFVLLGVFLNEQYARTLEEQMG...,59813.1839,two-component sensor histidine kinase [Bacill...,gi|10176464|dbj|BAB07558.1|
3379,MDQQSIVSILMILATFGLYIGISIYNRARATSDFYVASRGVPPFWN...,59852.7093,Na+/solute symporter (Ssf family) [Bacillus h...,gi|10176367|dbj|BAB07462.1|
1750,MMNSPLLLSHFIERAERYFPTKQVISRTLNGKKSLTYQQIGERTRR...,59872.7326,medium-chain fatty acid-CoA ligase [Bacillus ...,gi|10174550|dbj|BAB05651.1|
2442,MHKIISMPKRLLVKEQNVQTKTTGERLRLFLEELGPTFVKMGQMAS...,59885.9180,ABC transporter [Bacillus halodurans],gi|10175329|dbj|BAB06427.1|


Unnamed: 0,sequence,MW,description,ID
2170,MKKILIHGCVFAIILLMTYGAVQNPFSSQYIGQLKEEALPVAKMTD...,36681.5519,BH2406~unknown conserved protein [Bacillus ha...,gi|10175026|dbj|BAB06125.1|
1733,METVKEKAHSKIRVQRRKKILTTLKQQKYLYLMSLPFVAWVLVFNY...,36699.2142,transmembrane lipoprotein [Bacillus halodurans],gi|10174529|dbj|BAB05630.1|
504,MGDVVMNERELILAIETSCDETSAAVIENGTTILSNVVSSQIDSHK...,36707.2413,glycoprotein endopeptidase [Bacillus halodurans],gi|10173161|dbj|BAB04267.1|
2732,MFGGFSKDIGIDLGTANTLVYVKGKGIVLREPSVVAKRTDTGTIEA...,36714.7191,cell-shape determining protein [Bacillus halo...,gi|10175653|dbj|BAB06750.1|
713,MSELAKQNRKMAELESTPPPKKKKTYNPYGFSGSTNIFMHTLIGTF...,36741.0746,sugar transport system (permease) (binding pr...,gi|10173409|dbj|BAB04514.1|
...,...,...,...,...
1682,MNREVVIVGGGPAGMLMGLLLAKEKIQVTVLEKNTTFNRDFRGETI...,45693.7371,BH1851~unknown conserved protein in others [B...,gi|10174469|dbj|BAB05570.1|
1889,MGKSYFIWLAVHFVIWVFVLLHIEPSIHNSQLASICLFFIILFILP...,45694.6146,BH2096~unknown conserved protein [Bacillus ha...,gi|10174715|dbj|BAB05815.1|
553,MDRYFGFKEHGTTYGRESIAGLTTFLSMAYILFVNPLILGDAGMDV...,45709.7390,BH0608~unknown conserved protein [Bacillus ha...,gi|10173222|dbj|BAB04327.1|
2986,MLISRTVFDQVIERKQTASAKWDAVDQLFQGEDLLPMWVADMDFRA...,45716.9487,aminotransferase [Bacillus halodurans],gi|10175936|dbj|BAB07032.1|
