# Creating new 1DE protein sets
The purpose of this notebook is to create new protein sets for separation on the 1DE simulation. Code will be created that analyzes proteins for molecular weight, isoelectric point and hydrophobicity for subsequent simulated separations by gel filtration, ion exchange and hydrophobic interaction chromatography.
For this notebook, we will need several libraries: os, Bio.SeqUtils, pandas, re, Bio.SeqIO, and collections.defaultdict.

| Library | Uses | Abbreviation |
| :------- | :----: | :------------: |
| os | file management in operating systems | os |
| pandas | data management | pd |
| Bio.SeqIO | parsing FASTA files | SeqIO |
| collections.defaultdict | not sure why | defaultdict |
| Bio.SeqUtils.ProtParam.ProteinAnalysis | protein calculations | PA |

In [2]:
# Import required libraries
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
import os

In [3]:
protein = IP('MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS')
print(protein.sequence)
print(protein.pi())
print(f'IEP of protein {protein.sequence} is {protein.pi():.2f}')
print(f'The charge on the protein at pH 7 is {protein.charge_at_pH(5.55):.2f}')

MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS
5.553590202331542
IEP of protein MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS is 5.55
The charge on the protein at pH 7 is 0.04


In [4]:
datafile = os.path.join('data', 'yersinia_pestis[632].fasta')

In [6]:
filename = 'yersinia_pestis[632].fasta'
filename = input("Enter File Name")
print(filename)
datafile = os.path.join('data', filename)
print(datafile)

Enter File Name bacillus_halodurans_size_exclusion Bio P 3.0-60.0 kDa/bacillus_halodurans_fraction7.faa


bacillus_halodurans_size_exclusion Bio P 3.0-60.0 kDa/bacillus_halodurans_fraction7.faa
data\bacillus_halodurans_size_exclusion Bio P 3.0-60.0 kDa/bacillus_halodurans_fraction7.faa


In [9]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict


data = defaultdict(list)
datalen = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length) 

df = pd.DataFrame.from_dict(data)
# df['lengths'] = data2
df['lengths'] = datalen
# print(data2)
df

Unnamed: 0,sequence,lengths
0,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,390
1,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,390
2,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,390
3,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,390
4,MSTSIGQSTFENQLDSMVREFVKEKLETIMKEEMESFFTHEHPELK...,390
...,...,...
518,MKRRLFPLTFSAKMMGFIALLIIAMFVLLGVFLNEQYARTLEEQMG...,538
519,MDQQSIVSILMILATFGLYIGISIYNRARATSDFYVASRGVPPFWN...,554
520,MMNSPLLLSHFIERAERYFPTKQVISRTLNGKKSLTYQQIGERTRR...,538
521,MHKIISMPKRLLVKEQNVQTKTTGERLRLFLEELGPTFVKMGQMAS...,526


In [8]:
import pandas as pd
import re
from Bio import SeqIO
from collections import defaultdict
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA

data = defaultdict(list)
datalen = []
protpI = []
protMW = []
prot_charge_at_pH = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length)
    protparams = PA(sequence)
    protpI.append(protparams.isoelectric_point())
    protMW.append(protparams.molecular_weight())
    prot_charge_at_pH.append(protparams.charge_at_pH(7))

df = pd.DataFrame.from_dict(data)

df['length'] = datalen

df['IEP'] = protpI

df['MW'] = protMW

df['charge at pH 7'] = prot_charge_at_pH

df

Unnamed: 0,length,IEP,MW,charge at pH 7


In [7]:
df.iloc[-2,0]

'MKKKHRIKKNDEFQTVFQKGKSNANRQFVVYQLDKEEQPNFRIGLSVSKKIGNAVVRNRIKRMIRQSITELKDEIDSGKDFVIIARKPCAEMTYEELKKSLIHVFKRSGMKRIKSSVRK'

In [8]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
datapI = PA(df.iloc[-2,0])
print(datapI.sequence)
print(datapI.isoelectric_point())
print(datapI.charge_at_pH(7))
print(datapI.molecular_weight())

MKKKHRIKKNDEFQTVFQKGKSNANRQFVVYQLDKEEQPNFRIGLSVSKKIGNAVVRNRIKRMIRQSITELKDEIDSGKDFVIIARKPCAEMTYEELKKSLIHVFKRSGMKRIKSSVRK
10.403085899353027
17.67080155355838
14026.344999999987


In [9]:
csv_filepath = "data\\CSVs\\" + filename.replace(".faa",".csv").replace(".fasta",".csv")
#print(csv_filepath)
df.to_csv(csv_filepath)