In [1]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [2]:
def listtostr(strlist):
    output = ""
    for word in strlist:
        output += word + "_"
    return(output[:len(output)-1])
def simplifyname(orgname):
    firstname = [orgname.split("_")[0][0]]
    lastname = orgname.split("_")[1:]
    fullname = firstname + lastname
    return(listtostr(fullname))

In [3]:
#Generate a file path for the correct .fasta file
input_filename = input("Enter Input Name")
output_name = input("Enter Output Name")
datafile = os.path.join('data', input_filename)
#organism_name = input_filename.replace(".faa","")

Enter Input Name Samples Sets - E Coli Size and Ion separation_fraction_1
Enter Output Name Samples Sets - E Coli Three Separations


In [4]:
data_folder = os.path.join('data', output_name)

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)

In [5]:
def getHistPercent(protstring):
    Hcount = 0
    for AA in protstring:
        if AA == "H":
            Hcount += 1
    return(Hcount/len(protstring))

In [6]:
def findHistChains(protstring):
    chainDict = {}
    chainlen = 1
    lastAA = ""
    for AA in protstring:
        if AA == "H":
            if lastAA == "H":
                chainlen += 1
        elif chainlen > 1 and lastAA == "H":
            #print(f"Chain found, current length {chainlen}")
            if str(chainlen) in chainDict:
                chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
            else:
                chainDict[str(chainlen)] = 1
            chainlen = 1
        lastAA = AA
    if chainlen > 1:
        if str(chainlen) in chainDict:
            chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
        else:
            chainDict[str(chainlen)] = 1
    return(chainDict)

In [7]:
def HistScore(protstring):
    last4 = ""
    last8 = ""
    CD4 = 0
    CD8 = 0
    score = 0
    for AA in protstring:
        #iterate last4 & last8
        if len(last4) < 4:
            last4 += AA
        else:
            last4 = last4[1:] + AA
        if len(last8) < 8:
            last8 += AA
        else:
            last8 = last8[1:] + AA
        
        #scan for non-chain H groupings 
        if getHistPercent(last4) >= 0.5 and CD4 < 1:
            score += 2
            CD4 = 4
        else:
            CD4 -= 1
        if getHistPercent(last8) >= 0.375 and CD8 < 1:
            score += 6
            CD8 = 8
        else:
            CD8 -= 1
    
    #scan for chains
    histChains = findHistChains(protstring)
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
    return(score)

In [8]:
unfiltered = []
wash = []
fractions = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        if HistScore(record.seq) < 8:
            wash.append(record)
        else:
            fractions.append(record)    

#data_name = "Affinity_chromatography_" + simplifyname(organism_name)
data_name = output_name
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")


1

In [11]:
datadict = {
    'sequence' : [],
    'length' : [],
    'isoelectric_point' : [],
    'hist_score' : [],
    'max_chain_length': [],
    'ID' : [],
    'description' : []
}

with open(os.path.join(data_folder,data_name+"_fractions.faa"),"r") as prot_file:
    for record in SeqIO.parse(prot_file,"fasta"):
        sequence = str(record.seq)  #It is not strictly necessary to record the sequence as a string, but it is easier to work with and displays better. (Ex:len() does not work on Seq objects)
        parameters = PA(sequence)
        datadict['sequence'].append(sequence)
        datadict['length'].append(len(sequence))
        datadict['isoelectric_point'].append(parameters.isoelectric_point())
        datadict['hist_score'].append(HistScore(sequence))
        datadict['max_chain_length'].append(max(findHistChains(sequence).keys(),default = '0'))
        datadict['ID'].append(record.id)
        datadict['description'].append(record.description)

df = pd.DataFrame.from_dict(datadict)
df.sort_values('hist_score',inplace = True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
0,MTQSRLHAAQNALAKLHERRGNTFYPHFHLAPPAGWMNDPNGLIWF...,483,5.770374,64,6,sp|P40714|CSCA_ECOLX,sp|P40714|CSCA_ECOLX Sucrose-6-phosphate hydro...


In [12]:
def param_of_interest(protdata):
    output = bool(HistScore(protdata) >= 4)
    return(output)

nfractions = 7
fraclen = round(len(df)/nfractions)
seqofinterest = "HHHHHH"
seqhits = {}
noise = 0.10
for n in range(nfractions - 1):
    fracrecords = []
    #Generate min/max range values
    fuzzymin = int(round(n*fraclen-noise*fraclen,0))
    fuzzymax = int(round((n+1)*fraclen + noise*fraclen,0))
    if fuzzymin < 0:
        fuzzymin = 0
    if fuzzymax > len(df):
        fuzzymax = len(df)
    tempdf = df.iloc[fuzzymin:fuzzymax]
    for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        fracrecords.append(record)
        if param_of_interest(tempdf['sequence'][index]) == True:
            if n+1 in seqhits:
                seqhits[n+1][0] += 1
                seqhits[n+1][1].append(index)
            else:
                seqhits[n+1] = [1,[index]]
    SeqIO.write(fracrecords,os.path.join(data_folder,data_name+"_fraction"+ str(n+1) + ".faa"),"fasta")

tempdf = df.iloc[int(round((nfractions-1)*fraclen-noise*fraclen,0)):]
remainderrecord = []
for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        remainderrecord.append(record)
        if param_of_interest(tempdf['sequence'][index]) == True:
            if nfractions in seqhits:
                seqhits[nfractions][0] += 1
                seqhits[nfractions][1].append(index)
            else:
                seqhits[nfractions] = [1,[index]]
SeqIO.write(remainderrecord,os.path.join(data_folder,data_name+"_fraction"+ str(nfractions) + ".faa"),"fasta")
tempdf

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
0,MTQSRLHAAQNALAKLHERRGNTFYPHFHLAPPAGWMNDPNGLIWF...,483,5.770374,64,6,sp|P40714|CSCA_ECOLX,sp|P40714|CSCA_ECOLX Sucrose-6-phosphate hydro...


In [13]:
seqhits
for key in seqhits:
    print(f'Hits in fraction {key}')
    for hit in seqhits[key][1]:
        print(f'\tHit on index {hit}')

Hits in fraction 7
	Hit on index 0


In [14]:
df.iloc[10]['sequence']

IndexError: single positional indexer is out-of-bounds

In [None]:
data_folder