In [1]:
import re

In [None]:
# One-letter amino-acid codes grouped by polarity. Built once at definition
# time instead of on every call.
_POLAR_RESIDUES = frozenset('RKEDQNHSTY')
_NONPOLAR_RESIDUES = frozenset('GAVLIMFWP')


def classify_residue(residue):
    """Classify a one-letter amino-acid code as polar or non-polar.

    Matching is case-insensitive, so lowercase codes are classified the same
    way as their uppercase form.

    Args:
        residue: A single-character residue code.

    Returns:
        'N' if the residue is in the non-polar set, otherwise 'P'. Any code
        that is in neither set (for example 'C', 'X' or a gap character)
        also yields 'P'.
    """
    code = residue.upper() if isinstance(residue, str) else residue

    if code in _POLAR_RESIDUES:
        return 'P'  # Polar
    if code in _NONPOLAR_RESIDUES:
        return 'N'  # Non-polar
    # Default: treat unrecognised residues as polar to be on the safe side.
    return 'P'

def find_polar_switch(seq):
    """Return the indices at which the residue class changes along ``seq``.

    Each residue is mapped through ``classify_residue``. Index ``i`` is
    reported when the class of ``seq[i]`` differs from the class of
    ``seq[i - 1]``, so index 0 is never reported.

    Args:
        seq: Iterable of residue codes (typically a string).

    Returns:
        List of integer positions in ascending order; empty when the
        sequence has fewer than two residues or a single class throughout.
    """
    classes = list(map(classify_residue, seq))
    return [
        position
        for position, (previous, current) in enumerate(zip(classes, classes[1:]), start=1)
        if previous != current
    ]

In [None]:
def split_peptide(seq, min_len=10, max_len=50):
    """Split a peptide into fragments, cutting at polarity switch points.

    Behaviour:
      * A sequence no longer than ``max_len`` is returned whole as a
        one-element list, even if it is shorter than ``min_len``.
      * Otherwise the switch points from ``find_polar_switch`` are scanned in
        order. A cut is made at a switch point only when the piece since the
        previous cut has a length between ``min_len`` and ``max_len``
        inclusive; if not, the switch point is skipped and the cut position
        does not advance.
      * The remainder after the last cut is dropped when shorter than
        ``min_len``, kept whole when it fits in ``max_len``, and otherwise
        sliced into consecutive ``max_len``-sized windows, discarding any
        window shorter than ``min_len``.

    Args:
        seq: Peptide sequence.
        min_len: Minimum fragment length to keep.
        max_len: Maximum fragment length.

    Returns:
        List of fragments in sequence order. Residues belonging to dropped
        pieces are not present in the result.
    """
    total = len(seq)
    if total <= max_len:
        return [seq]

    pieces = []
    start = 0

    for boundary in find_polar_switch(seq):
        if boundary - start < min_len:
            continue  # too short so far; wait for a later switch point
        piece = seq[start:boundary]
        if len(piece) > max_len:
            continue  # too long; leave `start` where it is
        pieces.append(piece)
        start = boundary

    if total - start < min_len:
        return pieces  # remainder is too short to keep

    tail = seq[start:]
    if len(tail) <= max_len:
        pieces.append(tail)
        return pieces

    # Remainder is still too long: fall back to fixed-size windows.
    windows = (seq[offset:offset + max_len] for offset in range(start, total, max_len))
    pieces.extend(window for window in windows if len(window) >= min_len)
    return pieces

def parse_fasta(fasta_str):
    """Parse FASTA-formatted text into ``(header, sequence)`` tuples.

    A line starting with '>' opens a new record; its header is the rest of
    that line with the '>' removed. All following lines up to the next
    header are stripped and concatenated into the record's sequence. Blank
    lines are skipped, and lines that appear before the first header are
    ignored.

    Args:
        fasta_str: The FASTA text.

    Returns:
        List of ``(header, sequence)`` tuples in file order. A record whose
        header line is just '>' is kept with an empty-string header.
    """
    entries = []
    header = None  # None means "no record opened yet"; '' is a valid header
    seq_lines = []

    for raw_line in fasta_str.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # Compare against None, not truthiness, so that a record with an
            # empty header is still flushed instead of being lost.
            if header is not None:
                entries.append((header, ''.join(seq_lines)))
            header = line[1:]
            seq_lines = []
        elif header is not None:
            seq_lines.append(line)

    if header is not None:
        entries.append((header, ''.join(seq_lines)))
    return entries

def load_fasta_file(file_path):
    """Read the file at ``file_path`` in text mode and return its full contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The entire file as a single string.
    """
    with open(file_path, 'r') as fasta_file:
        contents = fasta_file.read()
    return contents

def process_peptides(peptide_list, min_len=20, max_len=30):
    """Split every peptide and collect the fragments per header.

    Args:
        peptide_list: Iterable of ``(header, sequence)`` pairs.
        min_len: Minimum fragment length, passed to ``split_peptide``.
        max_len: Maximum fragment length, passed to ``split_peptide``.

    Returns:
        Dict mapping each header to its list of fragments, in first-seen
        header order. When several records share a header, their fragments
        are appended to the same list in input order, so no record is lost.
    """
    all_fragments = {}
    for header, seq in peptide_list:
        fragments = split_peptide(seq, min_len, max_len)
        # Accumulate rather than assign so that a repeated header does not
        # overwrite the fragments of an earlier record.
        all_fragments.setdefault(header, []).extend(fragments)
    return all_fragments

def save_fasta(results, output_path="output_fragments.fasta"):
    """Write fragments to ``output_path`` as FASTA records.

    Every fragment becomes its own two-line record: a '>' line carrying the
    header of the peptide it came from, then the fragment itself. Fragments
    from the same peptide therefore share an identical header line. Headers
    with an empty fragment list produce no output. An existing file at
    ``output_path`` is overwritten.

    Args:
        results: Mapping of header to a list of fragment strings.
        output_path: Destination file path.
    """
    with open(output_path, "w") as out_file:
        out_file.writelines(
            f">{header}\n{frag}\n"
            for header, frags in results.items()
            for frag in frags
        )

In [4]:

# Machine-specific input location: point this at your own FASTA file.
input_path = r"C:\Users\anhkh\Downloads\amp_genbank (1).fasta"
# Destination is named once so the save call and the status message agree.
output_path = "output_fragments_284.fasta"

fasta_data = load_fasta_file(input_path)

peptides = parse_fasta(fasta_data)
results = process_peptides(peptides)  # uses process_peptides' default length bounds

save_fasta(results, output_path=output_path)
print(f"Fragments have been saved to {output_path}")

Fragments have been saved to output_fragments.fasta
