# Creating new 1DE protein sets
The purpose of this notebook is to create new protein sets for separation on the 1DE simulation. Code will be created that analyzes proteins for molecular weight, isoelectric point and hydrophobicity for subsequent simulated separations by gel filtration, ion exchange and hydrophobic interaction chromatography.
For this notebook, we will need several libraries: os, Bio.SeqUtils, pandas, re, Bio.SeqIO, and collections.defaultdict.

| Library | Uses | Abbreviation |
| :------- | :----: | :------------: |
| os | file management in operating systems | os |
| pandas | data management | pd |
| Bio.SeqIO | parsing FASTA files | SeqIO |
| collections.defaultdict | not sure why | defaultdict |
| Bio.SeqUtils.ProtParam.ProteinAnalysis | protein calculations | PA |

In [1]:
# Import required libraries
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
import os

In [2]:
protein = IP('MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS')
print(protein.sequence)
print(protein.pi())
print(f'IEP of protein {protein.sequence} is {protein.pi():.2f}')
print(f'The charge on the protein at pH 7 is {protein.charge_at_pH(5.55):.2f}')

MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS
5.553590202331542
IEP of protein MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS is 5.55
The charge on the protein at pH 7 is 0.04


In [3]:
datafile = os.path.join('data', 'yersinia_pestis[632].fasta')

In [4]:
filename = 'yersinia_pestis[632].fasta'
filename = input("Enter File Name")
print(filename)
datafile = os.path.join('data', filename)
print(datafile)

Enter File Name aechaeoglobus_fulgidus.faa


aechaeoglobus_fulgidus.faa
data\aechaeoglobus_fulgidus.faa


In [5]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict


data = defaultdict(list)
datalen = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length) 

df = pd.DataFrame.from_dict(data)
# df['lengths'] = data2
df['lengths'] = datalen
# print(data2)
df

Unnamed: 0,sequence,lengths
0,MQLSIPFWSDFNSAFEEFVKLFLALSIPFWSDFNSISVSISLSMFT...,126
1,MKAASYGVPFQSHFGLISTIRGNDGLPRVVVFFQSHFGLISTVKIV...,175
2,MKETIQLAIGVMLLAMLGCYIYITEFYHYESTEESSKAAIEYLNQL...,164
3,MPLRIAVVDRERCQPKKCGQECVKYCPRVRTGDETVKIEDKAVISE...,586
4,MKLFHRITISLSEDDMKLIEELRRDSGDSLSKIFREAIQLYYNLLK...,197
...,...,...
2402,MSSKKLSLEQIRKDLREFSRYEAVIYGSYVTGEYREGSDIDVAVIT...,147
2403,MEALSEIPERVKTPIEVSGVFYNLLTSIESAMDISAMLVKDLGGRV...,119
2404,MVVIVYVIVAYDVNVERVNRVKKFLRRYLNWVQNSLFEGELSSADL...,92
2405,MRKKNYYLVSDGKLRRHENTIYFENEDGKRPIPINSIYAIYALGSL...,322


In [6]:
import pandas as pd
import re
from Bio import SeqIO
from collections import defaultdict
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA

data = defaultdict(list)
datalen = []
protpI = []
protMW = []
prot_charge_at_pH = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length)
    protparams = PA(sequence)
    protpI.append(protparams.isoelectric_point())
    protMW.append(protparams.molecular_weight())
    prot_charge_at_pH.append(protparams.charge_at_pH(7))

df = pd.DataFrame.from_dict(data)

df['length'] = datalen

df['IEP'] = protpI

df['MW'] = protMW

df['charge at pH 7'] = prot_charge_at_pH

df

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MQLSIPFWSDFNSAFEEFVKLFLALSIPFWSDFNSISVSISLSMFT...,126,6.023080,14474.2517,-2.151053
1,MKAASYGVPFQSHFGLISTIRGNDGLPRVVVFFQSHFGLISTVKIV...,175,9.912095,19949.9286,6.024084
2,MKETIQLAIGVMLLAMLGCYIYITEFYHYESTEESSKAAIEYLNQL...,164,5.080974,18879.4461,-8.073724
3,MPLRIAVVDRERCQPKKCGQECVKYCPRVRTGDETVKIEDKAVISE...,586,6.148751,66091.6100,-3.868617
4,MKLFHRITISLSEDDMKLIEELRRDSGDSLSKIFREAIQLYYNLLK...,197,6.013474,22583.6651,-2.192692
...,...,...,...,...,...
2402,MSSKKLSLEQIRKDLREFSRYEAVIYGSYVTGEYREGSDIDVAVIT...,147,9.152915,17793.2953,3.794495
2403,MEALSEIPERVKTPIEVSGVFYNLLTSIESAMDISAMLVKDLGGRV...,119,4.650532,13548.5276,-9.378797
2404,MVVIVYVIVAYDVNVERVNRVKKFLRRYLNWVQNSLFEGELSSADL...,92,4.870158,10730.2915,-4.469387
2405,MRKKNYYLVSDGKLRRHENTIYFENEDGKRPIPINSIYAIYALGSL...,322,9.275276,37260.4586,9.485492


In [7]:
df.iloc[-2,0]

'MRKKNYYLVSDGKLRRHENTIYFENEDGKRPIPINSIYAIYALGSLSITSKAISLLAKEGVCIHFFNRYGYYIGSFYPRESLVSGEVVLRQAEHHLDSEKRLHLARAFVEGAILNMARVLKKAEQDDSDVIASLQHLSSAKSIVELMGAEAAARNAYYTKFDEILKNFEFGKRSRMPPENEVNAMISFGNSLLYSAVLSEIYHTQLNPAISYLHEPSERRFSLALDIAELFKPVIVDRLIFYLVNNGIVTESDFDSRLGGILLSEEGKKKFVRHFNERLEKTVKHRKLNRKVSYQRLIRLECYKLVKHFTAVEKYSPFVMWW'

In [8]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
datapI = PA(df.iloc[-2,0])
print(datapI.sequence)
print(datapI.isoelectric_point())
print(datapI.charge_at_pH(7))
print(datapI.molecular_weight())

MRKKNYYLVSDGKLRRHENTIYFENEDGKRPIPINSIYAIYALGSLSITSKAISLLAKEGVCIHFFNRYGYYIGSFYPRESLVSGEVVLRQAEHHLDSEKRLHLARAFVEGAILNMARVLKKAEQDDSDVIASLQHLSSAKSIVELMGAEAAARNAYYTKFDEILKNFEFGKRSRMPPENEVNAMISFGNSLLYSAVLSEIYHTQLNPAISYLHEPSERRFSLALDIAELFKPVIVDRLIFYLVNNGIVTESDFDSRLGGILLSEEGKKKFVRHFNERLEKTVKHRKLNRKVSYQRLIRLECYKLVKHFTAVEKYSPFVMWW
9.275275993347169
9.485491582100678
37260.45860000002
