## 1DE Separation Simulation

This program generates simulated protein separations in order to produce datasets that can be shown in the JBioFramework 1-Dimensional Electrophoresis (1DE) simulation. To use the program:
1. Select a separation method, separation media and a pH that the simulation will take place at.
2. Select a FASTA formatted file from those available in the 'data' folder.
3. Enter a name for the folder the data will be exported to.
4. Your simulated fractions will be exported to 'outputs/YOUR FOLDER NAME'; the filtered entries will be viewable in a dataframe below.

In [1]:
'''
PACKAGES
'''
#Bio packages for reading protein data
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import os #File reading and manipulation
import pandas as pd #Dataframes to keep track of, and display data
import ipywidgets as widgets #Interactable elements
from ipywidgets import *
import math #Simplifying certain operations (as of now it's just there for the ceil() operation)

#Let pandas render up to 150 dataframe rows before it truncates the displayed table
pd.set_option('display.max_rows', 150)

'''
FUNCTIONS
'''
def fasta_to_dataframe(fastaPath,includeDefaults = True, pH = 7.0, attributes=None):
    '''
    Reads a FASTA formatted file of a proteome and returns a pandas dataframe with one row per record and one column per attribute.

    fastaPath - File path to the FASTA file being read
    includeDefaults - Bool value on whether or not to include the predefined columns: Sequence, ID, Description, Molecular Weight
                      and 'Charge at pH <pH>'. Sequence, ID and Description are what is needed to rewrite a protein into another FASTA file.
    pH - pH used to compute the default charge column; it is also part of that column's label
    attributes - Optional extra columns, formatted as a dictionary where each key is the column label and each value is a function
                 that takes a record and returns the data for that column. The dictionary is copied and never modified.
                 A default column replaces a custom column that uses the same label.

    Returns the dataframe. Raises ValueError if includeDefaults is False and no attribute functions were supplied.
    '''
    #Work on a private copy. Writing the defaults into a shared default dict (or into the caller's dict) would make
    #columns from an earlier call, such as a 'Charge at pH ...' column for a different pH, show up in later calls.
    columns = dict(attributes) if attributes else {}

    if includeDefaults:
        #'X' residues are replaced by 'Q' before the protein analysis is built for the weight and charge columns
        columns['Sequence'] = lambda record : str(record.seq)
        columns['ID'] = lambda record : record.id
        columns['Description'] = lambda record : record.description
        columns['Molecular Weight'] = lambda record : PA(record.seq.replace('X','Q')).molecular_weight()
        columns['Charge at pH {}'.format(pH)] = lambda record : PA(record.seq.replace('X','Q')).charge_at_pH(pH)
    elif not columns:
        #Nothing to compute: the caller switched the defaults off without providing any column of their own
        raise ValueError('No attribute functions, set includeDefaults = True or add attribute functions')

    #Build one list per column, then let pandas turn the dictionary of lists into the dataframe
    dataDict = {label: [] for label in columns}
    for record in SeqIO.parse(fastaPath,'fasta'): #SeqIO opens (and closes) the file itself when given a path
        for label, getValue in columns.items():
            dataDict[label].append(getValue(record))
    return pd.DataFrame.from_dict(dataDict)
                
def fractionate(df,numFractions,noise=0.0,newIndices = False):
    '''
    Splits a dataframe into numFractions consecutive segments of near-equal length, returned as a list of sub-dataframes.
    When the row count does not divide evenly the remainder is spread over the fractions, so segment lengths differ by at most one row.

    df - Dataframe being split
    numFractions - Int value, how many subdivisions are made (must be at least 1).
    noise - Float value determining how much subdivisions can bleed over into one another. (Ex: noise=0.1 would result in each subdivision
            growing by 10% of the average segment length in either direction, overlapping that data with its neighbors)
    newIndices - Determines whether the index numbers for the subdivisions will be reset to start from 0 or retain the indices of those entries in the original df

    Returns a list of numFractions dataframes. Raises ValueError if numFractions is smaller than 1.
    '''
    if numFractions < 1:
        raise ValueError('numFractions must be at least 1')

    total = len(df)
    fuzz = (total / numFractions) * noise #How many additional entries will be added on each side as "noise"
    dfFractions = []
    for n in range(numFractions):
        #Integer boundaries n*total//numFractions keep every fraction within one row of the others,
        #instead of rounding one shared length and leaving the whole remainder to the last fraction
        baseMin = (n * total) // numFractions
        baseMax = ((n + 1) * total) // numFractions
        fuzzyMin = max(0,int(round(baseMin - fuzz))) #Determine the minimum index for this subdivision
        fuzzyMax = min(total,int(round(baseMax + fuzz))) #Determine the maximum index for this subdivision
        if n == numFractions - 1: #The last fraction always runs to the end so no data is cut off
            fuzzyMax = total

        output = df.iloc[fuzzyMin:fuzzyMax]
        if newIndices:
            output = output.reset_index(drop=True) #Returns a new frame rather than modifying a slice of df in place

        dfFractions.append(output)
    return dfFractions



'''
GLOBAL VARIABLES
'''
#Lookup tables that tie each dropdown label to a yes/no test deciding whether a protein is separated properly by that media.
#Every test takes a biopython protein analysis of a sequence and returns True when the protein is separated properly.
#A protein is either separated or it isn't: a size exclusion run, for instance, treats too-large and too-small proteins alike (both end up in the wash).
def _massWindow(lower, upper):
    '''Returns a test that is True when molecular_weight() lies strictly between lower and upper.'''
    def inWindow(analysis):
        return bool(lower < analysis.molecular_weight() < upper)
    return inWindow

def _chargeAtMost(limit):
    '''Returns a test that is True when the charge at the slider's current pH is at or below limit.'''
    def belowLimit(analysis):
        return bool(analysis.charge_at_pH(pHslider.value) <= limit)
    return belowLimit

def _chargeAtLeast(limit):
    '''Returns a test that is True when the charge at the slider's current pH is at or above limit.'''
    def aboveLimit(analysis):
        return bool(analysis.charge_at_pH(pHslider.value) >= limit)
    return aboveLimit

#(label, lower bound, upper bound): the bounds are the label's kDa range written out in full
_SIZE_MEDIA = (
    ("Bio-P 0.8-4.0 kDa", 800, 4000),
    ("Bio-P 1.0-6.0 kDa", 1000, 6000),
    ("Bio-P 1.5-20.0 kDa", 1500, 20000),
    ("Bio-P 2.5-40.0 kDA", 2500, 40000),
    ("Bio-P 3.0-60.0 kDa", 3000, 60000),
    ("Bio-P 5.0-100 kDa", 5000, 100000),
    ("S-X 0.4-14.0 kDa", 400, 14000),
    ("Bio-A 10.0 - 500 kDA", 10000, 500000),
    ("Bio-A 10.0 - 1500 kDA", 10000, 1500000),
)
sizeDict = {label: _massWindow(lower, upper) for label, lower, upper in _SIZE_MEDIA}

ionDict = {
    "Q Media (Triethylamine +)" : _chargeAtMost(-0.01),
    "S Media (Sulfite -)" : _chargeAtLeast(0.01),
}

affinityDict = {} #Empty for now until more affinity separations are researched and implemented

#Method selection menu: each method name maps to the media table defined above
methodDict = {
    'Size Exclusion' : sizeDict,
    'Ion Exchange' : ionDict,
    #'Affinity Chromatography' : affinityDict,                      #Left out until there is an algorithm implemented
}

#For each filtration method, the dataframe column that proteins are sorted by inside the fractions
sortColumn = {
    'Size Exclusion' : 'Molecular Weight',
    'Ion Exchange' : 'Charge at pH 7.0',
    'Affinity Chromatography' : '',
}
#Names of the entries in the 'data' folder, offered as suggestions in the input box.
#Jupyter's checkpoint folder is left out and the list is sorted so the suggestions appear in a stable order.
data = sorted(entry for entry in os.listdir('data') if entry != '.ipynb_checkpoints')
style = {'description_width': 'initial'} #shorthand for the widget description width
autoLayout = Layout(width='auto')

#Output Trackers: record whether init2 / init3 have already been displayed
init2active = False
init3active = False

#These variables are defined by user input so they're all blank for now
inFile = None
outFolder = None
method = None
main_df = pd.DataFrame() #Empty dataframe instance (not the DataFrame class) so len(main_df) works before any data is loaded

#This is just a shorthand for defining widget layouts
boxLayout = Layout(display='flex')



'''
MEDIA & METHOD SELECTION
'''
def confirmMethod(_):
    '''
    Swaps the media selection to match the chosen method: the media dropdown is refilled with the media of the
    selected method and its first entry becomes the selection. The argument passed by the widget is ignored.
    '''
    with init:
        availableMedia = methodDict[methodSelect.value]
        mediaSelect.options = availableMedia.keys()
        firstMedia = mediaSelect.options[0]
        mediaSelect.value = firstMedia

def confirmMedia(_):
    '''
    Points the global 'method' at the separation test belonging to the currently selected method and media.
    The argument passed by the widget is ignored.
    '''
    global method
    with init:
        mediaTable = methodDict[methodSelect.value]
        method = mediaTable[mediaSelect.value]
        
def pHUpdate(_):
    '''
    Rewrites the 'Ion Exchange' entry of sortColumn so that it names the charge column for the slider's current pH
    (rounded to 2 decimals). The argument passed by the widget is ignored.
    '''
    with init:
        roundedPH = round(pHslider.value,2)
        sortColumn['Ion Exchange'] = 'Charge at pH {}'.format(roundedPH)
#Define the jupyter outputs used to display everything, each one is displayed by the former as it contains elements that need previous user input
init = widgets.Output()
init2 = widgets.Output()
init3 = widgets.Output()

#A dropdown menu with the available separation methods
methodSelect = widgets.Dropdown(options=methodDict.keys(),description='Method:')
methodSelect.observe(confirmMethod,names='value')

#A dropdown menu with the available media for the selected method
#(no continuous-update keyword here: it is not a Dropdown trait, so passing one only produced an unrecognized-argument warning)
mediaSelect = widgets.Dropdown(options=methodDict[methodSelect.value].keys(),description='Media')
method = methodDict[methodSelect.value][mediaSelect.value] #Initial declaration of 'method' in case the user doesn't update mediaSelect
mediaSelect.observe(confirmMedia,names='value')

#A slider to select the pH of the simulation
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1,description='pH',style=style)
pHslider.observe(pHUpdate,names='value') #Only react to the value itself, not to every other trait change of the slider

#Box display to layout each of the above elements properly
selectDisplay = Box(children=[Box(children=[methodSelect,pHslider],layout=boxLayout),mediaSelect],layout=Layout(display='flex',flex_flow='column'))

'''
FASTA INPUT SELECTION
'''
def confirmInput(_):
    '''
    Confirms the validity of the entered file and then advances to init2 if the file is valid.

    A file is valid when a name was entered, the text after its last '.' is a known FASTA extension (compared
    case-insensitively) and the file exists in the 'data' folder. On failure an error message is shown and nothing
    else changes; on success the current-input display and the global inFile are updated and init2 is displayed
    the first time. The argument passed by the widget is ignored.
    '''
    global init2active
    global inFile
    fastaEnds = ('fasta', 'fna', 'ffn', 'faa', 'frn', 'fa')
    with init:
        file = inputFile.value
        if not file: #Checks that a selection is made; covers both None and an empty entry
            errorText.value = 'Error: No file selected'
            return

        #Take the text after the LAST dot so names containing extra dots (e.g. 'human.v2.fasta') are still recognized
        fileEnd = file.rsplit('.',1)[1].lower() if '.' in file else ''
        if fileEnd not in fastaEnds: #Checks the file is a valid FASTA format file
            errorText.value = 'Error: \"{}\" is not recognized as a FASTA file'.format(file)
            return

        filePath = os.path.join('data',file)
        if not os.path.isfile(filePath): #The box accepts free text, so make sure the file is really there
            errorText.value = 'Error: \"{}\" was not found in the data folder'.format(file)
            return

        currentInput.value = file #Update the current input display
        errorText.value = '' #Turn off the error display if it is on
        inFile = filePath #Update inFile with the selected file
        if not init2active: #Activate init2 if it isn't already active
            display(init2)
            init2active = True


#Text box with a dropdown menu of all the files in the 'data' folder
inputFile = widgets.Combobox(value=None,placeholder='Enter a file to be separated',options=data,description='Unseparated data',style = style,layout = Layout(width='440px'),continuous_update=False)

#Confirm button for the above text entry
inputButton = widgets.Button(description='Confirm File',layout=Layout(width='auto'))
#Text display of the currently selected file
currentInput = widgets.HTML(value='No file selected',description='Current input:',style = style)
#Error display for any invalid entries
errorText = widgets.HTML(value=None,description='\t',style={'text_color':'#CC0000','font_size':'16px'})

#Call to the confirmInput function any time the Confirm button is pressed or the text entry is updated (Enter key pressed or another element is interacted with after a change in the value of the box)
#on_click must be called without remove=True: that flag unregisters the callback instead of adding it
inputButton.on_click(confirmInput)
inputFile.observe(confirmInput,names=['value'])


#Box to properly layout elements
inputTop = Box(children=[inputFile,inputButton],layout=boxLayout)
inputBottom = Box(children=[currentInput,errorText],layout=boxLayout)
inputDisplay=VBox([inputTop,inputBottom])

'''
OUTPUT FOLDER CREATION
'''

def checkFolder(_):
    '''
    Checks whether the requested output name already has a folder under 'outputs'.

    If the folder exists, the overwrite prompt is shown and the entry elements are disabled until it is answered.
    Otherwise the folder is created, the entry elements are disabled, the global outFolder is set, the data is written
    and the dataframe viewer is prepared; init3 is displayed the first time this happens.
    Events that arrive while the entry is disabled, or with an empty name, are ignored.
    The argument passed by the widget is forwarded to writeFASTAs and defPageVars.
    '''
    global init3active
    global outFolder
    #The entry is disabled while a name is being handled, so a second event for the same entry
    #(value change followed by a button click) must not be processed again
    if outputFile.disabled:
        return
    folderName = outputFile.value
    #An empty name would resolve to the 'outputs' folder itself, which always "already exists"
    if not folderName.strip():
        return

    folderPath = os.path.join('outputs',folderName)
    with init2:
        if os.path.isdir(folderPath): #If a folder already exists an overwrite prompt is displayed to warn the user
            header.description=f'\"{folderName}\" already exists. Some files may be replaced. Continue?'
            outputFile.disabled = outputButton.disabled = True #Disables elements until overwrite prompt is handled
            display(overwriteCheck) #displays overwrite prompt
        else: #If the name is unique the program proceeds
            os.makedirs(folderPath) #Make the new folder (and 'outputs' itself if it is missing)
            outputFile.disabled = outputButton.disabled = True #Disables elements to prevent a new entry
            outFolder = folderPath #Defines outFolder as the path to the new folder
            writeFASTAs(_) #Generates the data to the new folder
            defPageVars(_) #Defines the necessary variables to display the dataframe
            if not init3active: #Activates init3 if it isn't already active
                display(init3)
                init3active = True
        

def confirmOverwrite(_):
    '''
    Runs when the user acknowledges the overwrite prompt and proceeds.

    The prompt is cleared and the (still disabled) output name box is shown again, outFolder is pointed at the existing
    folder, the data is written into it and the dataframe viewer is prepared. init3 is displayed the first time.
    The argument passed by the widget is forwarded to writeFASTAs and defPageVars.
    '''
    global init3active, outFolder
    with init2:
        init2.clear_output() #Removes the output name box and the overwrite prompt
        display(outputDisplay) #Shows the name box again, only to display the folder; its elements stay disabled
        outFolder = os.path.join('outputs',outputFile.value)
        writeFASTAs(_)
        defPageVars(_)
        if init3active: #init3 is already on screen, nothing left to do
            return
        display(init3)
        init3active = True
    
def denyOverwrite(_):
    '''
    Runs when the user declines the overwrite: the prompt is removed, the output name box is shown again,
    emptied and re-enabled so another name can be entered. The argument passed by the widget is ignored.
    '''
    with init2:
        init2.clear_output() #Removes the output name box and the overwrite prompt
        display(outputDisplay) #Shows the output name box again
        outputFile.value = '' #Empty the entry first ...
        outputFile.disabled = False #... then switch the entry and its button back on
        outputButton.disabled = False

#Output name entry: a text box and its confirm button on one row, with a caption above them
outputFile = widgets.Text(
    placeholder='Enter a label for the output folder.',
    style=style,
    disabled=False,
    continuous_update=False,
)
outputButton = widgets.Button(description='Confirm',disabled=False)
outputLabel = widgets.Label(value='Select output location:')
entryRow = Box(children=[outputFile,outputButton])
outputDisplay = VBox([outputLabel,entryRow])

#Overwrite prompt: a message with Yes/No buttons underneath; checkFolder replaces the message text before showing it
confirmButton = widgets.Button(description='Yes',layout=Layout(width='auto'))
denyButton = widgets.Button(description='No',layout=Layout(width='auto'))
header = widgets.HTML(
    description=f'\"{outputFile.value}\" already exists. Continue?',
    layout=Layout(justify_content='center',width='auto'),
    style=style,
)
YesNo = Box(children=[confirmButton, denyButton],layout=boxLayout)
overwriteCheck = Box(
    [header,YesNo],
    layout=Layout(display='flex',flex_flow='column',justify_content='center'),
)

#Button functionality: the name is checked on a click of Confirm or when the text value changes
outputButton.on_click(checkFolder)
outputFile.observe(checkFolder,names=['value'])
confirmButton.on_click(confirmOverwrite)
denyButton.on_click(denyOverwrite)

#Place the name entry inside the second output area
with init2:
    display(outputDisplay)

    
'''
WRITING OUTPUT FILES
'''

#Bind user inputs to named variables for simplicity
    #inFile = os.path.join('data',inputFile.value)                                 
    #outFolder = os.path.join('outputs',outputFile.value)
    #method = methodDict[methodSelect.value][mediaSelect.value]
#Create lists of SeqRecord objects that biopython can transform into .fasta files

def writeFASTAs(_):
    '''
    Runs the selected separation on the input file and writes the results to the output folder.

    Files written to outFolder (NAME is the text of the output name box):
      NAME_total      - every record of the input file
      NAME_separated  - records for which the selected method returns True
      NAME_wash       - all remaining records
      NAME_fraction_1 ... NAME_fraction_7 - the separated records, sorted by the method's sort column and split into 7 fractions

    Also rebuilds the global main_df from the separated file. The argument is ignored.
    Raises Exception if the input file or output folder has not been selected yet.
    '''
    global main_df
    if not (inFile and outFolder):
        raise Exception('Input/output locations not properly selected')

    label = outputFile.value
    totals = []
    wash = []
    separated = []
    #Separate proteins by the parameter (SeqIO opens and closes the file itself when given a path)
    for record in SeqIO.parse(inFile,'fasta'):
        totals.append(record)
        #X is a notation for any residue, biopython doesn't accept it though, so for the protein analysis Glutamine is
        #substituted since it's close to the average for the properties we care about.
        protparams = PA(str(record.seq).replace("X","Q"))
        if method(protparams):
            separated.append(record)
        else:
            wash.append(record)

    separatedPath = os.path.join(outFolder,'{}_separated'.format(label))
    SeqIO.write(totals,os.path.join(outFolder,'{}_total'.format(label)),'fasta')
    SeqIO.write(separated,separatedPath,'fasta')
    SeqIO.write(wash,os.path.join(outFolder,'{}_wash'.format(label)),'fasta')

    #Define a data frame to easily split into fractions
    main_df = fasta_to_dataframe(separatedPath,includeDefaults=True,pH=round(pHslider.value,2))
    main_df.index.name = 'Protein Index #'
    main_df.sort_values(by=sortColumn[methodSelect.value],inplace=True)

    #Turn every fraction back into SeqRecord objects that biopython can write as a FASTA file
    for number, fraction in enumerate(fractionate(main_df,7), start=1):
        records = [
            SeqRecord(seq = Seq(sequence),id=identifier,description=description)
            for sequence, identifier, description in zip(fraction['Sequence'],fraction['ID'],fraction['Description'])
        ]
        SeqIO.write(records,os.path.join(outFolder,'{}_fraction_{}'.format(label,number)),'fasta')

'''
PRESENTING DATA
'''
def pageUp(_):
    '''
    Moves the dataframe viewer one page forward (unless it is already on the last page), updates the state of the
    navigation buttons and redraws the menu and the current page. The argument passed by the button is ignored.
    '''
    with init3:
        if pageNum.value < pageNum.max: #Only advance while there is a page left
            pageNum.value += 1
        nextButton.disabled = (pageNum.value == pageNum.max) #Next is switched off on the last page and on otherwise
        prevButton.disabled = False #Prev is switched back on in case it was off

        init3.clear_output() #Redraw from scratch with the current sort settings
        sortAscending = (ascButton.icon == 'sort-amount-asc')
        temp_df = main_df.sort_values(by=sortBy.value,ascending=sortAscending)
        firstRow = (pageNum.value-1)*pageSize.value
        lastRow = pageNum.value*pageSize.value
        display(dfMenu)
        display(temp_df.iloc[firstRow:lastRow])
def pageDown(_):
    '''
    Moves the dataframe viewer one page back (unless it is already on the first page), updates the state of the
    navigation buttons and redraws the menu and the current page, honouring the ascending/descending toggle.
    The argument passed by the button is ignored.
    '''
    with init3:
        if (pageNum.value > pageNum.min): #Decrements page number as long as it isn't at min
            pageNum.value -= 1
        if pageNum.value == pageNum.min: #Disables the previous page button if on the first page
            prevButton.disabled = True
        else:                            #Ensures the button is enabled otherwise
            prevButton.disabled = False
        nextButton.disabled = False      #Enables the next page button if it was disabled

        init3.clear_output()             #Redefines and updates the displayed page
        #Use the same sort direction as every other refresh, otherwise paging back silently switches to ascending order
        temp_df = main_df.sort_values(by=sortBy.value,ascending=bool(ascButton.icon == 'sort-amount-asc'))
        display(dfMenu)
        display(temp_df.iloc[(pageNum.value-1)*pageSize.value:pageNum.value*pageSize.value])
        
def adjustMax(_):
    '''
    Adjusts the number of pages whenever the results per page are adjusted, jumps back to page 1,
    brings the navigation buttons in line with that page and redraws the menu and the page.
    The argument passed by the widget is ignored.
    '''
    with init3:
        #Never let the page count drop below 1 (an empty dataframe would otherwise push max below min)
        pageNum.max = max(1,math.ceil(len(main_df)/pageSize.value))
        pageNum.value = 1
        #Page 1 is showing again: nothing before it, and something after it only if there is more than one page
        prevButton.disabled = True
        nextButton.disabled = bool(pageNum.max <= 1)

        init3.clear_output()
        temp_df = main_df.sort_values(by=sortBy.value,ascending=bool(ascButton.icon == 'sort-amount-asc'))
        display(dfMenu)
        display(temp_df.iloc[(pageNum.value-1)*pageSize.value:pageNum.value*pageSize.value])

def defPageVars(_):
    '''
    Defines the necessary attributes of the dataframe viewer, stored as a function so it won't be called until the
    necessary data is entered by the user.

    Fills the sorting menu with the index name plus every dataframe column except ID, Description and Sequence,
    preselects the column that belongs to the chosen separation method, sets the number of pages and brings the
    navigation buttons in line with the current page. The argument is ignored.
    '''
    global sortCategories
    notSortable = ('ID','Description','Sequence') #Column titles that make no sense as sortable values
    #The index comes first, followed by the remaining columns of the dataframe
    sortCategories = [main_df.index.name] + [column for column in main_df.keys() if column not in notSortable]
    sortBy.options = sortCategories #Set the sorting menu options to the new list
    sortBy.value = sortColumn[methodSelect.value] #Define the initial sorting category by whatever is most relevant to the separation method

    #Establish how many pages the dataframe will be split up into; at least 1 so max never drops below min for an empty dataframe
    pageNum.max = max(1,math.ceil(len(main_df)/pageSize.value))
    prevButton.disabled = bool(pageNum.value <= pageNum.min)
    nextButton.disabled = bool(pageNum.value >= pageNum.max)
        
pageSize = widgets.BoundedIntText(value=25,min=5,max=100,step=5,description='Items per Page',style=style,layout=Layout(width='30%')) #Number selection of how many entries will be present on each viewed page
pageNum = widgets.BoundedIntText(value=1,min=1,max=1,description='Page:',style=style,layout=Layout(width='auto')) #Indicator of which page the user is on
pageSize.observe(adjustMax,names='value') #Adjusts the number of pages whenever the entries-per-page value changes (and only then)


#define navigation buttons
prevButton = widgets.Button(description='Prev',style=style,layout=Layout(width='auto'),disabled=True)
prevButton.on_click(pageDown)
nextButton = widgets.Button(description='Next',style=style,layout=Layout(width='auto'))
nextButton.on_click(pageUp)

#Dropdown menu of sortable options; it starts empty and is filled by defPageVars once the dataframe exists
sortBy = widgets.Dropdown(description='Sort By:')
def changeSortBy(_):
    '''
    Redraws the dataframe viewer after the sorting menu changed: the dataframe is sorted by the selected category in
    the direction shown on the ascending/descending button, then the menu and the current page are displayed again.
    The argument passed by the widget is ignored.
    '''
    with init3:
        init3.clear_output()
        sortAscending = (ascButton.icon == 'sort-amount-asc')
        temp_df = main_df.sort_values(by=sortBy.value,ascending=sortAscending)
        firstRow = (pageNum.value-1)*pageSize.value
        lastRow = pageNum.value*pageSize.value
        display(dfMenu)
        display(temp_df.iloc[firstRow:lastRow])

sortBy.observe(changeSortBy,names='value') #Redraw once per selection change instead of once for every trait of the dropdown that changes
ascButton = widgets.Button(icon='sort-amount-asc',layout=Layout(width='auto')) #Ascending/Descending toggle for sorting
def swapSort(_):
    '''
    Toggles between ascending and descending order. The button icon doubles as the stored state: the icon is flipped
    first, then the dataframe is sorted in the direction the new icon shows and the menu and current page are redrawn.
    The argument passed by the button is ignored.
    '''
    with init3:
        wasAscending = (ascButton.icon == 'sort-amount-asc')
        ascButton.icon = 'sort-amount-desc' if wasAscending else 'sort-amount-asc'

        init3.clear_output() #Refreshes output with the new sorting order
        temp_df = main_df.sort_values(by=sortBy.value,ascending=not wasAscending)
        firstRow = (pageNum.value-1)*pageSize.value
        lastRow = pageNum.value*pageSize.value
        display(dfMenu)
        display(temp_df.iloc[firstRow:lastRow])

    
ascButton.on_click(swapSort)

#Page controls grouped in one row, then the whole viewer menu (page controls, sort menu, sort direction) as one object
pageControls = [prevButton,pageNum,nextButton,pageSize]
pageBox = Box(children=pageControls,layout=boxLayout)
dfMenu = Box(children=[pageBox,sortBy,ascButton])

#Place the menu inside the third output area
with init3:
    display(dfMenu)

'''
INITIALIZATION
'''
#Place the method/media/pH selectors and the input file picker inside the first output area
with init:
    display(selectDisplay,inputDisplay)
init #Bare expression at the end of the cell so the notebook renders the first output area

Output()