In [1]:
'''
PACKAGES
'''
#Bio packages for reading protein data
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import os #File reading and manipulation
import pandas as pd #Dataframes to keep track of, and display data
import ipywidgets as widgets #Interactable elements
from ipywidgets import *
import math #Simplifying certain operations

pd.set_option('display.max_rows', 150)

'''
FUNCTIONS
'''
def fasta_to_dataframe(fastaPath, includeDefaults=True, pH=7.0, attributes=None):
    '''
    Reads a FASTA formatted file of a proteome and returns a pandas dataframe with one row per record and one
    column per attribute function.

    fastaPath - File path to the FASTA file being read
    includeDefaults - Bool value on whether or not to include the predefined columns: Sequence, ID, Description,
                      Molecular Weight and "Charge at pH <pH>". Sequence, ID and Description are the ones needed
                      to rewrite proteins into another FASTA file.
    pH - pH used for the default charge column; it is also part of that column's label
    attributes - Optional extra columns, formatted as a dictionary where each key is the column label and each
                 value is a function that takes a record and returns the data for that column. These are added
                 after the defaults and replace a default that has the same label. The dictionary passed in is
                 never modified.

    Raises ValueError if includeDefaults is False and no attribute functions were given.
    '''
    def analysed(record):
        #X (any residue) is swapped for Q before the sequence is handed to ProteinAnalysis
        return PA(str(record.seq).replace('X', 'Q'))

    #Work on a private dictionary so neither the caller's dict nor a shared default is ever mutated
    columns = {}
    if includeDefaults:
        columns['Sequence'] = lambda record: str(record.seq)
        columns['ID'] = lambda record: record.id
        columns['Description'] = lambda record: record.description
        columns['Molecular Weight'] = lambda record: analysed(record).molecular_weight()
        columns['Charge at pH {}'.format(pH)] = lambda record: analysed(record).charge_at_pH(pH)
    if attributes:
        columns.update(attributes)
    if not columns:
        raise ValueError('No attribute functions, set includeDefaults = True or add attribute functions')

    #from_dict expects each key to be a column label bound to the list of that column's values
    dataDict = {label: [] for label in columns}
    for record in SeqIO.parse(fastaPath, 'fasta'): #SeqIO opens and closes the file itself when given a path
        for label, getter in columns.items():
            dataDict[label].append(getter(record))
    return pd.DataFrame.from_dict(dataDict)
                
def fractionate(df, numFractions, noise=0.0, newIndices=False):
    '''
    Splits a dataframe into consecutive row segments, returned as a list of sub-dataframes. Every segment but the
    last has the same base length; the last one always runs to the end of df so no rows are dropped.

    df - Dataframe being split (it is not modified)
    numFractions - Int value, how many subdivisions are made. Must be at least 1.
    noise - Float value determining how much subdivisions can bleed over into one another. (Ex: noise=0.1 would
            result in each subdivision growing 10% larger in either direction, overlapping that data with its neighbors)
    newIndices - Determines whether the index numbers for the subdivisions will be reset to start from 0 or retain
                 the indices of those entries in the original df

    Raises ValueError if numFractions is less than 1.
    '''
    if numFractions < 1:
        raise ValueError('numFractions must be at least 1')

    total = len(df)
    fracLen = round(total / numFractions) #Base length each sub-dataframe will be (excluding the last one)
    fuzz = fracLen * noise #How many additional entries will be added as "noise"
    dfFractions = []
    for n in range(numFractions):
        start = max(0, int(round(n * fracLen - fuzz)))
        if n == numFractions - 1: #the last fraction absorbs whatever rounding left over
            stop = total
        else:
            stop = min(total, int(round((n + 1) * fracLen + fuzz)))

        piece = df.iloc[start:stop]
        if newIndices:
            #reset_index returns a new frame, so the slice of the caller's dataframe is never altered in place
            piece = piece.reset_index(drop=True)
        dfFractions.append(piece)
    return dfFractions



'''
GLOBAL VARIABLES
'''
#Each media dictionary maps a dropdown label to a boolean function. The function receives the protein analysis
#object built for a sequence and reports whether that media separates the protein.
def _size_window(lower, upper):
    #Test for a size exclusion media: molecular weight strictly between lower and upper
    def retained(x):
        return bool(lower < x.molecular_weight() < upper)
    return retained

sizeDict = {
    label: _size_window(lower, upper)
    for label, lower, upper in (
        #("Bio-P 0.1-1.8 kDa", 100, 1800),      #Currently disabled because the size range is too small to be useful
        ("Bio-P 0.8-4.0 kDa", 800, 4000),
        ("Bio-P 1.0-6.0 kDa", 1000, 6000),
        ("Bio-P 1.5-20.0 kDa", 1500, 20000),
        ("Bio-P 2.5-40.0 kDA", 2500, 40000),
        ("Bio-P 3.0-60.0 kDa", 3000, 60000),
        ("Bio-P 5.0-100 kDa", 5000, 100000),
        ("S-X 0.4-14.0 kDa", 400, 14000),
        #("S-X <2.0 kDA", 0, 2000),             #Currently disabled because the size range is too small to be useful
        #("S-X <0.4 kDA", 0, 400),              #Currently disabled because the size range is too small to be useful
        ("Bio-A 10.0 - 500 kDA", 10000, 500000),
        ("Bio-A 10.0 - 1500 kDA", 10000, 1500000),
    )
}

#The ion exchange tests read the pH slider when they are called, so they always use its current value
def _binds_q_media(x):
    return bool(x.charge_at_pH(pHslider.value) <= -0.01)

def _binds_s_media(x):
    return bool(x.charge_at_pH(pHslider.value) >= 0.01)

ionDict = {}
ionDict["Q Media (Triethylamine +)"] = _binds_q_media
ionDict["S Media (Sulfite -)"] = _binds_s_media

affinityDict = dict() #Empty for now until more affinity separations are researched and implemented

#Method selection menu: each method label leads to the media dictionary for that method
methodDict = {}
methodDict['Size Exclusion'] = sizeDict
methodDict['Ion Exchange'] = ionDict
#methodDict['Affinity Chromatography'] = affinityDict      #Commented out until there's an algorithm implemented

#Matches each filtration method to the dataframe column proteins should be sorted by in the fractions
sortColumn = dict([
    ('Size Exclusion', 'Molecular Weight'),
    ('Ion Exchange', 'Charge at pH 7.0'),
    ('Affinity Chromatography', ''),
])
#Names of the files already in the data folder, offered as input suggestions for convenience.
#A missing data folder gives an empty list instead of stopping the notebook at start-up.
data = [name for name in os.listdir('data') if name != '.ipynb_checkpoints'] if os.path.isdir('data') else []
style = {'description_width': 'initial'} #shorthand for the widget description width setting
autoLayout = Layout(width='auto')

#Output Trackers: whether the init2 / init3 output areas have been displayed yet
init2active = False
init3active = False

#Current selections, filled in by the widget callbacks
inFile = None
outFolder = None
method = None

#Empty placeholder table until writeFASTAs builds the real one (an instance, not the DataFrame class itself)
main_df = pd.DataFrame()





'''
MEDIA & METHOD SELECTION
'''
def confirmMethod(_):
    #Observer for the method dropdown: swaps the media choices to those of the selected method and picks the first one
    with init:
        mediaChoices = methodDict[methodSelect.value]
        mediaSelect.options = mediaChoices.keys()
        firstChoice = mediaSelect.options[0]
        mediaSelect.value = firstChoice

def confirmMedia(_):
    #Observer for the media dropdown: stores the separation test of the selected method/media pair in the global 'method'
    global method
    with init:
        mediaChoices = methodDict[methodSelect.value]
        method = mediaChoices[mediaSelect.value]
def pHUpdate(_):
    #Observer for the pH slider: keeps the ion exchange sort column label in step with the slider value
    with init:
        chargeLabel = 'Charge at pH {}'.format(pHslider.value)
        sortColumn['Ion Exchange'] = chargeLabel
        
#Output areas: init holds the selection/input controls, init2 the output folder controls, init3 the data table
init = widgets.Output()
init2 = widgets.Output()
init3 = widgets.Output()
methodSelect = widgets.Dropdown(options=methodDict.keys(),description='Method:')
methodSelect.observe(confirmMethod,names='value')
#Dropdown has no continuous update setting, so the misspelled 'continuous_updates' argument was dropped
mediaSelect = widgets.Dropdown(options=methodDict[methodSelect.value].keys(),description='Media')
method = methodDict[methodSelect.value][mediaSelect.value] #Initial declaration of 'method' in case the user doesn't update mediaSelect
mediaSelect.observe(confirmMedia,names='value')
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1,description='pH',style=style)
pHslider.observe(pHUpdate,names='value') #only react to the slider value, not to every trait change
selectDisplay = TwoByTwoLayout(top_left=methodSelect,bottom_left=mediaSelect,bottom_right=pHslider,layout=Layout(width='50%'))


'''
FASTA INPUT SELECTION
'''
def confirmInput(_):
    '''
    Callback for the input file box and its confirm button. Validates the entered file name and, if it is
    acceptable, records it as the current input and reveals the output folder controls.

    A name is accepted when it is not empty, its final extension is a recognized FASTA extension (checked
    case-insensitively) and the file exists in the data folder. Otherwise a message is shown in errorText and
    nothing else changes.

    Sets the globals inFile (path of the accepted file) and init2active.
    '''
    global init2active
    global inFile
    with init:
        file = inputFile.value
        #Check for an empty entry first; everything below needs a real string
        if not file:
            errorText.value = 'Error: No file selected'
            return

        fastaEnds = ('fasta', 'fna', 'ffn', 'faa', 'frn', 'fa')
        #splitext takes the last extension, so names with no dot or with several dots are handled
        fileEnd = os.path.splitext(file)[1].lstrip('.').lower()
        if fileEnd not in fastaEnds:
            errorText.value = 'Error: \"{}\" is not recognized as a FASTA file'.format(file)
            return

        filePath = os.path.join('data', file)
        if not os.path.isfile(filePath):
            errorText.value = 'Error: \"{}\" was not found in the data folder'.format(file)
            return

        if file not in data: #remember files added to the folder after start-up as suggestions
            data.append(file)
        currentInput.value = file
        inputFile.options = data
        errorText.value = ''
        inFile = filePath
        if not init2active:
            display(init2)
            init2active = True

boxLayout = Layout(width='30%')

#Text box with suggestions taken from the data folder listing
inputFile = widgets.Combobox(value=None,placeholder='Enter a file to be separated',options=data,description='Unseparated data',style = style,layout = Layout(width='440px'),continuous_update=False)

inputButton = widgets.Button(description='Confirm File')
currentInput = widgets.HTML(value='No file selected',description='Current input:',style = style)
errorText = widgets.HTML(value=None,description='\t',style={'text_color':'#CC0000','font_size':'16px'})

#Both the button and a committed change of the text box validate the entry.
#on_click must not be given remove=True here: that unregisters the callback and leaves the button without any effect.
inputButton.on_click(confirmInput)
inputFile.observe(confirmInput,names=['value'])

#Layout: entry box and button on top, current input and error message underneath
inputTop = Box(children=[inputFile,inputButton],layout=boxLayout)
inputBottom = Box(children=[currentInput,errorText],layout=boxLayout)
inputDisplay=VBox([inputTop,inputBottom])

'''
OUTPUT FOLDER CREATION
'''

def checkFolder(_):
    '''
    Callback for the output folder box and its confirm button. If the named folder already exists under
    'outputs' the overwrite prompt is shown; otherwise the folder is created, the FASTA files are written and
    the data output area is revealed.

    Nothing happens while the name is empty or while the entry controls are disabled (a request is already
    being handled or waiting on the overwrite prompt). Without this, clearing the box would be treated as a
    request to overwrite the 'outputs' folder itself, and a button click arriving right after the text change
    would raise a second, spurious prompt.

    Sets the globals outFolder and init3active.
    '''
    global init3active
    global outFolder
    folderName = outputFile.value
    if not folderName or outputFile.disabled:
        return

    target = os.path.join('outputs', folderName)
    with init2:
        if os.path.isdir(target):
            header.description = f'\"{folderName}\" already exists. Some files may be replaced. Continue?'
            outputFile.disabled = outputButton.disabled = True
            display(overwriteCheck)
            return

        os.makedirs(target) #also creates 'outputs' itself when it does not exist yet
        outputFile.disabled = outputButton.disabled = True
        outFolder = target
        writeFASTAs(_)
        if not init3active:
            display(init3)
            init3active = True
        

def confirmOverwrite(_):
    #'Yes' on the overwrite prompt: restore the output controls, write into the existing folder
    #and reveal the data output area if it is not shown yet
    global init3active, outFolder
    with init2:
        init2.clear_output()
        display(outputDisplay)
        outFolder = os.path.join('outputs', outputFile.value)
        writeFASTAs(_)
        if init3active:
            return
        display(init3)
        init3active = True
    
def denyOverwrite(_):
    #'No' on the overwrite prompt: restore the output controls, clear the entered name and re-enable the entry
    with init2:
        init2.clear_output()
        display(outputDisplay)
        outputFile.value = ''
        outputFile.disabled = False
        outputButton.disabled = False

#Generate the output name entry: a text box for the folder label plus a confirm button, stacked under a caption
outputFile = widgets.Text(placeholder='Enter a label for the output folder.', style=style,disabled=False,continuous_update=False,)
outputButton = widgets.Button(description='Confirm',disabled=False)
outputLabel = widgets.Label(value='Select output location:')
outputDisplay = VBox([outputLabel,Box(children=[outputFile,outputButton])])

#Generate the overwrite prompt: a header line with Yes/No buttons underneath
confirmButton = widgets.Button(description='Yes',layout=Layout(width='auto'))
denyButton = widgets.Button(description='No',layout=Layout(width='auto'))
#The header text is formatted once here, while the box is still empty; checkFolder replaces it before showing the prompt
header = widgets.HTML(description=f'\"{outputFile.value}\" already exists. Continue?',layout=Layout(justify_content='center',width='auto'),style=style)
YesNo = Box(children=[confirmButton, denyButton],layout=Layout())
overwriteCheck = VBox([header,YesNo],layout=Layout(justify_content='center',),)

#Define button functionality: both the confirm button and a change of the text box value run checkFolder
outputButton.on_click(checkFolder)
outputFile.observe(checkFolder,names=['value'])
confirmButton.on_click(confirmOverwrite)
denyButton.on_click(denyOverwrite)

#Place the output controls inside init2; confirmInput displays init2 once an input file has been accepted
with init2:
    display(outputDisplay)

    
'''
WRITING OUTPUT FILES
'''

#Bind user inputs to named variables for simplicity
    #inFile = os.path.join('data',inputFile.value)                                 
    #outFolder = os.path.join('outputs',outputFile.value)
    #method = methodDict[methodSelect.value][mediaSelect.value]
#Create lists of SeqRecord objects that biopython can transform into .fasta files

def writeFASTAs(_, numFractions=7):
    '''
    Separates the proteins of the selected input file with the selected media and writes the results into the
    selected output folder.

    Files written (each named with the output label as prefix): '<label>_total' with every record,
    '<label>_separated' with the records the media test accepts, '<label>_wash' with the rest, and
    '<label>_fraction_1' ... '<label>_fraction_<numFractions>' holding the separated records after sorting by
    the column listed in sortColumn for the selected method.

    _ - Ignored; present so the function can be used directly as a widget callback
    numFractions - Int value, how many fraction files the separated proteins are split into

    Rebinds the global main_df to the dataframe of separated proteins.
    Raises Exception if no input file or output folder has been selected.
    '''
    global main_df
    if not (inFile and outFolder):
        raise Exception('Input/output locations not properly selected')

    label = outputFile.value

    def outPath(suffix):
        #Path of an output file inside the output folder
        return os.path.join(outFolder, '{}_{}'.format(label, suffix))

    totals = []
    wash = []
    separated = []
    #Separate proteins by the parameter; SeqIO opens and closes the file itself when given a path
    for record in SeqIO.parse(inFile, 'fasta'):
        totals.append(record)
        #X is a notation for any residue, biopython doesn't accept it though so for the protein analysis we
        #substituted Glutamine since it's close to the average for the properties we care about.
        protparams = PA(str(record.seq).replace("X", "Q"))
        if method(protparams):
            separated.append(record)
        else:
            wash.append(record)

    SeqIO.write(totals, outPath('total'), 'fasta')
    SeqIO.write(separated, outPath('separated'), 'fasta')
    SeqIO.write(wash, outPath('wash'), 'fasta')

    #Define a data frame to easily split into fractions
    main_df = fasta_to_dataframe(outPath('separated'), includeDefaults=True, pH=pHslider.value)
    main_df.index.name = 'Protein Index #'
    main_df.sort_values(by=sortColumn[methodSelect.value], inplace=True)

    #Rebuild SeqRecord objects from each fraction's columns so biopython can write them as FASTA
    for number, fraction in enumerate(fractionate(main_df, numFractions), start=1):
        records = [
            SeqRecord(seq=Seq(sequence), id=recordId, description=description)
            for sequence, recordId, description in zip(fraction['Sequence'], fraction['ID'], fraction['Description'])
        ]
        SeqIO.write(records, outPath('fraction_{}'.format(number)), 'fasta')

'''
PRESENTING DATA
'''
def pageUp(_):
    #'Next' button: advance one page unless already on the last one, refresh the button states, redraw the table
    with init3:
        if pageNum.value < pageNum.max:
            pageNum.value += 1
        nextButton.disabled = (pageNum.value == pageNum.max)
        prevButton.disabled = False

        init3.clear_output()
        ascending = bool(ascButton.icon == 'sort-amount-asc')
        temp_df = main_df.sort_values(by=sortBy.value, ascending=ascending)
        display(dfMenu)
        firstRow = (pageNum.value - 1) * pageSize.value
        lastRow = pageNum.value * pageSize.value
        display(temp_df.iloc[firstRow:lastRow])
def pageDown(_):
    #'Prev' button: go back one page unless already on the first one, refresh the button states, redraw the table.
    #The table is sorted in the direction shown on the sort button, the same as every other redraw.
    with init3:
        if pageNum.value > pageNum.min:
            pageNum.value -= 1
        prevButton.disabled = (pageNum.value == pageNum.min)
        nextButton.disabled = False

        init3.clear_output()
        temp_df = main_df.sort_values(by=sortBy.value,ascending=bool(ascButton.icon == 'sort-amount-asc'))
        display(dfMenu)
        display(temp_df.iloc[(pageNum.value-1)*pageSize.value:pageNum.value*pageSize.value])
        
def adjustMax(_):
    #Observer for the page size box: recompute the number of pages, return to page 1 and redraw the table
    with init3:
        pageNum.value = 1
        #Never let max fall below the page box minimum of 1, which would happen with an empty table
        pageNum.max = max(1, math.ceil(len(main_df)/pageSize.value))
        #Page 1 is showing again, so the navigation buttons have to match it
        prevButton.disabled = True
        nextButton.disabled = pageNum.max <= 1

        init3.clear_output()
        temp_df = main_df.sort_values(by=sortBy.value,ascending=bool(ascButton.icon == 'sort-amount-asc'))
        display(dfMenu)
        display(temp_df.iloc[(pageNum.value-1)*pageSize.value:pageNum.value*pageSize.value])

def defPageVars(_):
    #Fill the 'Sort By' dropdown from main_df (index name first, then every column except the text columns)
    #and set the number of pages for the current page size
    global sortCategories
    notSortable = ('ID','Description','Sequence') #column titles that can be ignored as sortable values
    sortCategories = [main_df.index.name]
    #Filtering instead of list.remove means a table without the default text columns does not raise
    sortCategories.extend(column for column in main_df.keys() if column not in notSortable)
    sortBy.options = sortCategories

    #Never let max fall below the page box minimum of 1, which would happen with an empty table
    pageNum.max = max(1, math.ceil(len(main_df)/pageSize.value))
        
pageSize = widgets.BoundedIntText(value=25,min=5,max=100,step=5,description='Items per Page',style=style,layout=Layout(width='30%'))
pageNum = widgets.BoundedIntText(value=1,min=1,max=1,description='Page:',style=style,layout=Layout(width='18%'))
pageSize.observe(adjustMax,names='value') #only a new page size should reset the paging, not every trait change


#define navigation buttons
prevButton = widgets.Button(description='Prev',style=style,layout=Layout(width='10%'),disabled=True)
prevButton.on_click(pageDown)
nextButton = widgets.Button(description='Next',style=style,layout=Layout(width='10%'))
nextButton.on_click(pageUp)

#Dropdown of the categories it makes sense to sort by; its options are filled in by defPageVars


sortBy = widgets.Dropdown(description='Sort By:')
def changeSortBy(_):
    #Observer for the 'Sort By' dropdown: redraw the current page sorted by the newly selected category
    with init3:
        init3.clear_output()
        ascending = bool(ascButton.icon == 'sort-amount-asc')
        temp_df = main_df.sort_values(by=sortBy.value, ascending=ascending)
        display(dfMenu)
        firstRow = (pageNum.value - 1) * pageSize.value
        lastRow = pageNum.value * pageSize.value
        display(temp_df.iloc[firstRow:lastRow])

#Redraw only when the selected category changes; observing every trait redraws several times per selection
sortBy.observe(changeSortBy,names='value')
#Sort direction toggle; the icon name doubles as the record of the current direction
ascButton = widgets.Button(icon='sort-amount-asc',layout=Layout(max_width='35px'))
def swapSort(_):
    #Sort direction button: flip the icon between ascending and descending, then redraw the current page
    with init3:
        wasAscending = ascButton.icon == 'sort-amount-asc'
        ascButton.icon = 'sort-amount-desc' if wasAscending else 'sort-amount-asc'
        init3.clear_output()
        ascending = bool(ascButton.icon == 'sort-amount-asc')
        temp_df = main_df.sort_values(by=sortBy.value, ascending=ascending)
        display(dfMenu)
        firstRow = (pageNum.value - 1) * pageSize.value
        lastRow = pageNum.value * pageSize.value
        display(temp_df.iloc[firstRow:lastRow])

    
#Clicking the sort direction button runs swapSort
ascButton.on_click(swapSort)

#Menu shown above the table: paging controls, then the sort category dropdown and the sort direction button
pageBox = Box(children=[prevButton,pageNum,nextButton,pageSize],layout=Layout(width='35%'))
dfMenu = Box(children=[pageBox,sortBy,ascButton])

   

'''
INITIALIZATION
'''
#Put the method/media selection and the input file controls into the first output area
with init:
    display(selectDisplay,inputDisplay)
#A bare widget on the last line of a cell is rendered as the cell's output
init

Output()

In [24]:
def foo(_):
    #Scratch observer: echo the current text of the 'text' box into init3
    with init3:
        entered = text.value
        print(entered)

#Scratch cell: checks when a Combobox with continuous_update=False reports a new value.
#NOTE: this rebinds init3, so after running it the name no longer refers to the Output created in the main cell
init3 = widgets.Output()
text = widgets.Combobox(continuous_update=False)
text.observe(foo,names=['value'])
with init3:
    display(text)
init3

Output()

<function __main__.fullDataOutput(_)>

In [30]:
#Scratch cell: an unlabelled button that runs writeFASTAs directly when clicked
b = widgets.Button()
b.on_click(writeFASTAs)
b

Button(style=ButtonStyle())

In [20]:
print(inFile,outFolder)

None outputs\Bacillus_subtilis


In [29]:
method

<function __main__.<lambda>(x)>

In [33]:
writeFASTAs(_)

In [18]:
mediaSelect.options[0]

'Q Media (Triethylamine +)'