# Query

Use pip to install `tables` (the PyPI name of PyTables) or conda to install `pytables`.

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight

In [2]:
# Load the precomputed weights and expose them as a two-column frame:
# the stored index becomes `peptide`, the stored values become `MW`.
df = (
    pd.read_hdf("peptide_weights.h5")
    .reset_index()
    .set_axis(["peptide", "MW"], axis=1)
)

In [3]:
# Peptides whose weight lies strictly between 1072 and 1073 and whose
# sequence contains at least one 'P'.
ans = df.loc[lambda d: (d["MW"] > 1072) & (d["MW"] < 1073)].loc[
    lambda d: d["peptide"].str.contains("P")
]
ans.shape

(6933, 2)

In [11]:
# Peptides with 438 < MW < 443 whose sequence contains the motif 'CCP'.
df.loc[
    lambda d: (d["MW"] > 438)
    & (d["MW"] < 443)
    & d["peptide"].str.contains("CCP")
]


Unnamed: 0,peptide,MW


In [84]:
# Export the current `ans` selection as comma-separated text without the index.
# NOTE(review): the file name says 441-442, but the visible cell that builds
# `ans` filters 1072 < MW < 1073 — confirm which range this export is meant for.
ans.to_csv("441_442_p.csv", index=False)

In [14]:
# Spot check: protein molecular weight of the three-letter sequence SPH.
molecular_weight(seq="SPH", seq_type="protein")

339.3471

# Generate

Use pip or conda to install `biopython`.

Run this section once, before the Query section above, to generate the `peptide_weights.h5` file that the Query section reads.

In [7]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight

In [8]:
# Collect every distinct substring of length 1..20 from every record in the
# FASTA file. A single trailing "*" is stripped from each sequence first.
peptides_all = set()
for seq_record in SeqIO.parse("salamander.fasta", "fasta"):
    seq = str(seq_record.seq).removesuffix("*")
    # For each start position, take every window ending at most 20 residues
    # later and never past the end of the sequence.
    peptides_all |= {
        seq[start:stop]
        for start in range(len(seq))
        for stop in range(start + 1, min(start + 20, len(seq)) + 1)
    }


In [9]:
# Count of distinct peptide strings collected into the set.
len(peptides_all)

26887953

In [10]:
# Map each peptide string to its protein molecular weight, indexed by the
# peptide itself, then order the series by ascending weight.
peptide_weights = pd.Series(
    {p: molecular_weight(p, seq_type="protein") for p in peptides_all}
).sort_values()

In [11]:
# Peek at the first five entries of the weight series.
peptide_weights.head(5)

G     75.0666
A     89.0932
S    105.0926
P    115.1305
V    117.1463
dtype: float64

In [12]:
# Persist the series under key "MW"; mode="w" starts a fresh file rather than
# appending to an existing one.
peptide_weights.to_hdf(path_or_buf="peptide_weights.h5", key="MW", mode="w")
