In [1]:
import re

def classify_residue(residue):
    """Classify a one-letter residue code as polar ('P') or non-polar ('N').

    Args:
        residue: Residue symbol to look up (matched case-sensitively).

    Returns:
        str: 'N' for a residue in the non-polar group 'GAVLIMFWP', otherwise
        'P'. Symbols in neither group (for example 'C', 'X' or lowercase
        letters) are deliberately reported as polar, the safer default.
    """
    # Lookup table built from the two residue groups; anything missing from it
    # falls through to the polar default in the .get() call below.
    polarity_by_residue = {symbol: 'P' for symbol in 'RKEDQNHSTY'}
    polarity_by_residue.update({symbol: 'N' for symbol in 'GAVLIMFWP'})
    return polarity_by_residue.get(residue, 'P')

def find_polar_switch(seq):
    """Locate the positions where the polarity class (polar/non-polar) changes.

    Args:
        seq: Iterable of residue symbols, classified one by one with
            ``classify_residue``.

    Returns:
        list[int]: Every index ``i >= 1`` whose residue has a different class
        from the residue at ``i - 1``, in ascending order. Empty when the
        sequence has fewer than two residues or never changes class.
    """
    classes = [classify_residue(aa) for aa in seq]
    # Pair each class with its successor; the index reported is the
    # successor's position, hence the enumeration starts at 1.
    return [
        position
        for position, (previous, current) in enumerate(zip(classes, classes[1:]), start=1)
        if previous != current
    ]

def split_peptide(seq, min_len=10, max_len=50, *, keep_remainder=False):
    """Split a peptide into fragments, cutting at polar/non-polar switch points.

    A sequence no longer than ``max_len`` is returned whole, even when it is
    shorter than ``min_len``. Otherwise the switch points reported by
    ``find_polar_switch`` are scanned left to right and a cut is made at the
    first one that yields a fragment of ``min_len`` to ``max_len`` residues.
    Whatever follows the last cut is appended if it fits that length window,
    or chopped into consecutive ``max_len``-sized windows if it is too long.

    Args:
        seq: Peptide sequence of one-letter residue codes.
        min_len: Minimum length of a fragment produced by cutting.
        max_len: Maximum length of any fragment.
        keep_remainder: When False (the default), trailing residues that
            cannot form a fragment of at least ``min_len`` are discarded, so
            the fragments may not cover the whole sequence. When True, that
            leftover is appended as a final, shorter fragment instead, so
            that (for a positive ``max_len``) the fragments concatenate back
            to ``seq``.

    Returns:
        list: Fragments in sequence order.
    """
    if len(seq) <= max_len:
        return [seq]

    fragments = []
    last_cut = 0

    for p in find_polar_switch(seq):
        # Greedy: accept the first switch point whose distance from the
        # previous cut lies inside the [min_len, max_len] window.
        if min_len <= p - last_cut <= max_len:
            fragments.append(seq[last_cut:p])
            last_cut = p

    # Handle whatever is left after the last accepted cut.
    tail = seq[last_cut:]
    if len(tail) >= min_len:
        if len(tail) <= max_len:
            fragments.append(tail)
        else:
            # The tail is still longer than max_len: fall back to fixed-size
            # windows. Only the final window can come out short.
            for start in range(last_cut, len(seq), max_len):
                window = seq[start:start + max_len]
                if len(window) >= min_len or keep_remainder:
                    fragments.append(window)
    elif keep_remainder and tail:
        # Too short to stand as a regular fragment; kept only on request so
        # that no residues are lost.
        fragments.append(tail)
    return fragments

def process_peptides(peptide_list, min_len=20, max_len=30):
    """Fragment every peptide in a list and label the results.

    Args:
        peptide_list: Sequences to split, in the order they should be numbered.
        min_len: Minimum fragment length forwarded to ``split_peptide``.
        max_len: Maximum fragment length forwarded to ``split_peptide``.

    Returns:
        dict: Maps ``"Peptide_<n>"`` (n counted from 1 in input order) to the
        list of fragments returned by ``split_peptide`` for that sequence.
    """
    return {
        f"Peptide_{number}": split_peptide(peptide, min_len, max_len)
        for number, peptide in enumerate(peptide_list, start=1)
    }


In [2]:
# Input sequences (one-letter residue codes) to be fragmented.
peptides = [
    "MKKKKYLIDLFSGCGGLSFGFEQAGFECLIGVDIEQSALNTFAHNHKHAKALNLDLSEDESISKIIEEIGNKNIEIIVAGPPCQ",
    "MLNTIELFAGCGGLLDGFKQTNKYNLLAAVEWQKAQVNTLIERLKSKYNTKDANEKVFYFDIQRTDELIYGWKDDEIYSSSKGIDHQIAGQNVDVITGGPPCQ",
]

# The 20-30 residue window is spelled out here because split_peptide's own
# defaults (10-50) differ from process_peptides' defaults.
results = process_peptides(peptides, min_len=20, max_len=30)

In [None]:
# Report the fragments of each peptide: a header line with the peptide label,
# one numbered line per fragment with its length, then a 40-dash separator.
for pep_id, frags in results.items():
    print(f"== {pep_id} ==")
    # Fragment numbering is 1-based for display.
    for i, frag in enumerate(frags, 1):
        print(f"Fragment {i}: {frag} (Length: {len(frag)})")
    print("-"*40)

== Peptide_1 ==
Fragment 1: MKKKKYLIDLFSGCGGLSFGF (Length: 21)
Fragment 2: EQAGFECLIGVDIEQSALNT (Length: 20)
Fragment 3: FAHNHKHAKALNLDLSEDES (Length: 20)
Fragment 4: ISKIIEEIGNKNIEIIVAGPP (Length: 21)
----------------------------------------
== Peptide_2 ==
Fragment 1: MLNTIELFAGCGGLLDGFKQTNKYN (Length: 25)
Fragment 2: LLAAVEWQKAQVNTLIERLKSKYNTKD (Length: 27)
Fragment 3: ANEKVFYFDIQRTDELIYGW (Length: 20)
Fragment 4: KDDEIYSSSKGIDHQIAGQN (Length: 20)
----------------------------------------
