In [21]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [61]:
def listtostr(strlist):
    output = ""
    for word in strlist:
        output += word + "_"
    return(output[:len(output)-1])
def simplifyname(orgname):
    firstname = [orgname.split("_")[0][0]]
    lastname = orgname.split("_")[1:]
    fullname = firstname + lastname
    return(listtostr(fullname))

In [66]:
#Generate a file path for the correct .fasta file
input_filename = input("Enter File Name")
datafile = os.path.join('data', input_filename)
organism_name = input_filename.replace(".faa","")

Enter File Name Yersinia_pestis.faa


In [67]:
data_folder = os.path.join('data', organism_name +"_affinity_chromatography")

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)

In [68]:
def getHistPercent(protstring):
    Hcount = 0
    for AA in protstring:
        if AA == "H":
            Hcount += 1
    return(Hcount/len(protstring))

In [69]:
def findHistChains(protstring):
    chainDict = {}
    chainlen = 1
    lastAA = ""
    for AA in protstring:
        if AA == "H":
            if lastAA == "H":
                chainlen += 1
        elif chainlen > 1 and lastAA == "H":
            #print(f"Chain found, current length {chainlen}")
            if str(chainlen) in chainDict:
                chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
            else:
                chainDict[str(chainlen)] = 1
            chainlen = 1
        lastAA = AA
    if chainlen > 1:
        if str(chainlen) in chainDict:
            chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
        else:
            chainDict[str(chainlen)] = 1
    return(chainDict)
        
findHistChains("HAAHHHHBDFHHAHH")

{'4': 1, '2': 2}

In [70]:
def HistScore(protstring):
    last4 = ""
    last8 = ""
    CD4 = 0
    CD8 = 0
    score = 0
    for AA in protstring:
        #iterate last4 & last8
        if len(last4) < 4:
            last4 += AA
        else:
            last4 = last4[1:] + AA
        if len(last8) < 8:
            last8 += AA
        else:
            last8 = last8[1:] + AA
        
        #scan for non-chain H groupings 
        if getHistPercent(last4) >= 0.5 and CD4 < 1:
            score += 2
            CD4 = 4
        else:
            CD4 -= 1
        if getHistPercent(last8) >= 0.375 and CD8 < 1:
            score += 6
            CD8 = 8
        else:
            CD8 -= 1
    
    #scan for chains
    histChains = findHistChains(protstring)
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
    return(score)

            
HistScore('HAAHHHHBDFHHAHH')

42

In [71]:
unfiltered = []
wash = []
fractions = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        if HistScore(record.seq) < 9:
            wash.append(record)
        else:
            fractions.append(record)    
        
SeqIO.write(unfiltered,os.path.join(data_folder,filename.replace(".faa","")+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,filename.replace(".faa","")+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,filename.replace(".faa","")+"_fractions.faa"),"fasta")


125

In [83]:
datadict = {
    'sequence' : [],
    'length' : [],
    'isoelectric_point' : [],
    'hist_score' : [],
    'max_chain_length': [],
    'ID' : [],
    'description' : []
}

with open(os.path.join(data_folder,filename.replace(".faa","")+"_fractions.faa"),"r") as prot_file:
    for record in SeqIO.parse(prot_file,"fasta"):
        sequence = str(record.seq)  #It is not strictly necessary to record the sequence as a string, but it is easier to work with and displays better. (Ex:len() does not work on Seq objects)
        parameters = PA(sequence)
        datadict['sequence'].append(sequence)
        datadict['length'].append(len(sequence))
        datadict['isoelectric_point'].append(parameters.isoelectric_point())
        datadict['hist_score'].append(HistScore(sequence))
        datadict['max_chain_length'].append(max(findHistChains(sequence).keys(),default = '0'))
        datadict['ID'].append(record.id)
        datadict['description'].append(record.description)

df = pd.DataFrame.from_dict(datadict)
df.sort_values('hist_score',inplace = True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
0,MKMKLPPFIELYRALIATPSISAADSALDQSNEALINLLAGWFADL...,389,5.196585,10,2,sp|Q8ZA85|ARGE_YERPE,sp|Q8ZA85|ARGE_YERPE Acetylornithine deacetyla...
1,MHNTFTHTKNNTHTKNNTQAKNSGSQTKSNAVSLNKPRKLTEADVT...,366,9.211968,10,0,sp|Q8ZAW9|MSRP_YERPE,sp|Q8ZAW9|MSRP_YERPE Protein-methionine-sulfox...
2,MCGIVGAVAQRDIAEILIEGLRRLEYRGYDSAGLAVVDSEGHLTRL...,609,5.534947,10,2,sp|Q8Z9S8|GLMS_YERPE,sp|Q8Z9S8|GLMS_YERPE Glutamine--fructose-6-pho...
3,MSQFLTEDFLLDTEFARRLYHDYAKDQPIFDYHCHLPPEQIAENYR...,469,5.769408,10,2,sp|Q8ZIC6|UXAC_YERPE,sp|Q8ZIC6|UXAC_YERPE Uronate isomerase OS=Yers...
4,MNIQALLSDKVSQALIAAGAPADCEAQVRQSAKAQFGDYQANGVMA...,576,5.359542,10,2,sp|Q8ZEV7|SYR_YERPE,sp|Q8ZEV7|SYR_YERPE Arginine--tRNA ligase OS=Y...
...,...,...,...,...,...,...,...
120,MSTSRLQQQFIRLWQRYNGQSTETTLQALAEVLNCSRRHVRSLLGK...,553,6.233214,44,2,sp|Q74Q56|SGRR_YERPE,sp|Q74Q56|SGRR_YERPE HTH-type transcriptional ...
121,MNRVQFNHHHHHHPD,15,7.016626,46,6,sp|Q8D079|LPHI_YERPE,sp|Q8D079|LPHI_YERPE his operon leader peptide...
122,MKVTKDLVVSLAYQVRTEDGVLVDESPVSAPLDYLHGHGSLIAGLE...,195,4.719876,47,3,sp|Q7CFU4|SLYD_YERPE,sp|Q7CFU4|SLYD_YERPE FKBP-type peptidyl-prolyl...
123,MLIPSKLSRPVRLQNTVVRDRLLVKLSSAANYRLTLINCPAGYGKT...,903,5.990000,48,2,sp|Q8ZJI2|MALT_YERPE,sp|Q8ZJI2|MALT_YERPE HTH-type transcriptional ...


In [79]:
nfractions = 7
fraclen = round(len(df)/nfractions)
seqofinterest = "HHHHHH"
seqhits = {}
for n in range(nfractions - 1):
    fracrecords = []
    tempdf = df.iloc[n*fraclen:(n+1)*fraclen]
    for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        fracrecords.append(record)
        if seqofinterest in tempdf['sequence'][index]:
            if str(n) in seqhits:
                seqhits[str(n+1)][0] += 1
                seqhits[str(n+1)][1].append(index)
            else:
                seqhits[str(n+1)] = [1,[index]]
    SeqIO.write(fracrecords,os.path.join(data_folder,filename.replace(".faa","")+"_fraction"+ str(n+1) + ".faa"),"fasta")

tempdf = df.iloc[(nfractions-1)*fraclen:]
remainderrecord = []
for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        remainderrecord.append(record)
        if seqofinterest in tempdf['sequence'][index]:
            if str(n) in seqhits:
                seqhits[str(n+1)][0] += 1
                seqhits[str(n+1)][1].append(index)
            else:
                seqhits[str(n+1)] = [1,[index]]
SeqIO.write(remainderrecord,os.path.join(data_folder,filename.replace(".faa","")+"_fraction"+ str(nfractions) + ".faa"),"fasta")
tempdf

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
108,MTDNNKALKNAGLKVTLPRLKILEVLQNPACHHVSAEDLYKILIDI...,148,6.097710,23,3,sp|P33086|FUR_YERPE,sp|P33086|FUR_YERPE Ferric uptake regulation p...
109,MNLISIPAFQDNYIWLLANRQKHCVIVDPGESAPVLATLAQGQYVP...,251,6.093674,23,3,sp|Q0WHW5|GLO2_YERPE,sp|Q0WHW5|GLO2_YERPE Hydroxyacylglutathione hy...
110,MNKYDLIERMNTRFAELEVTLHQLHQQLDDLPLIAARVFSLPEIEK...,311,8.759851,24,2,sp|Q9L6X9|TUS_YERPE,sp|Q9L6X9|TUS_YERPE DNA replication terminus s...
111,MSISLIQPERDLFSYQPYWAECYGTAPFLPMSREEMDILGWDSCDI...,781,9.165422,24,2,sp|Q8CZS4|Y674_YERPE,sp|Q8CZS4|Y674_YERPE Putative UPF0313 protein ...
112,MSVLPDRQVINQLISGHYGDPFSILGMHETSQGLQICALLPDAREV...,727,5.780662,26,2,sp|Q8ZA75|GLGB_YERPE,"sp|Q8ZA75|GLGB_YERPE 1,4-alpha-glucan branchin..."
...,...,...,...,...,...,...,...
120,MSTSRLQQQFIRLWQRYNGQSTETTLQALAEVLNCSRRHVRSLLGK...,553,6.233214,44,2,sp|Q74Q56|SGRR_YERPE,sp|Q74Q56|SGRR_YERPE HTH-type transcriptional ...
121,MNRVQFNHHHHHHPD,15,7.016626,46,6,sp|Q8D079|LPHI_YERPE,sp|Q8D079|LPHI_YERPE his operon leader peptide...
122,MKVTKDLVVSLAYQVRTEDGVLVDESPVSAPLDYLHGHGSLIAGLE...,195,4.719876,47,3,sp|Q7CFU4|SLYD_YERPE,sp|Q7CFU4|SLYD_YERPE FKBP-type peptidyl-prolyl...
123,MLIPSKLSRPVRLQNTVVRDRLLVKLSSAANYRLTLINCPAGYGKT...,903,5.990000,48,2,sp|Q8ZJI2|MALT_YERPE,sp|Q8ZJI2|MALT_YERPE HTH-type transcriptional ...


In [85]:
seqhits
for key in seqhits:
    print(f'Hits in fraction {key}')
    for hit in seqhits[key][1]:
        print(f'\tHit on index {hit}')

Hits in fraction 6
	Hit on index 121
