In [1]:
import pandas as pd
from collections import defaultdict
import math
from Bio import SeqIO

In [2]:
# Path to the input FASTA file; it is passed to fasta_to_dataframe further down.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
fasta_file = r"D:\PEPTIDE_PROJECT_OFF\MyWork\MyNoteBook\CD-HIT\DataSet_final.fasta"

In [3]:
def fasta_to_dataframe(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Headers are expected to look like ``>ID LABEL ...``: the first
    whitespace-separated token of the record id becomes ``ID`` and the second
    whitespace-separated token of the full description becomes ``Label``.

    Parameters
    ----------
    fasta_file
        Passed unchanged to ``SeqIO.parse(fasta_file, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Label`` and ``Sequence``, in that order. The columns
        are present even when the file contains no records. ``Label`` is a
        missing value for records whose header has no second token (such a
        header previously aborted the whole parse with ``IndexError``).
    """
    columns = ["ID", "Label", "Sequence"]
    rows = []

    # Walk over every record in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        id_tokens = record.id.split()
        header_tokens = record.description.split()

        rows.append({
            # Fall back to the raw id when it contains no token at all
            "ID": id_tokens[0] if id_tokens else record.id,
            # A header without a label yields a missing value instead of crashing
            "Label": header_tokens[1] if len(header_tokens) > 1 else None,
            "Sequence": str(record.seq),
        })

    # Passing `columns` keeps the schema stable for an empty input file
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Gọi hàm để chuyển đổi tệp FASTA thành DataFrame
df = fasta_to_dataframe(fasta_file)

# Xem dữ liệu
df.columns

Index(['ID', 'Label', 'Sequence'], dtype='object')

In [6]:
def generate_distribution_features(sequence):
    """Compute position-distribution descriptors for one amino-acid sequence.

    Seven physicochemical properties each split the amino acids into three
    classes. For every class, the 0-based positions of its residues in
    ``sequence`` are collected in order, and six marks are reported: for each
    fraction in (0, 0.2, 0.4, 0.6, 0.8, 1.0) the occurrence at index
    ``floor(total * fraction)`` (clamped to the last occurrence) is taken and
    its 1-based position is expressed as a percentage of the sequence length.

    Parameters
    ----------
    sequence : str
        Sequence in one-letter codes. Matching is case-sensitive (uppercase).

    Returns
    -------
    dict
        126 entries (7 properties x 3 classes x 6 marks) keyed
        ``"{property}_{class}_{mark}"`` with mark in
        ``first, 20%, 40%, 60%, 80%, 100%``. A class with no residue in the
        sequence (including the empty sequence) yields 0 for all six marks.
    """
    properties = {
        "hydrophobicity": {
            "polar": "RKEDQN",
            "neutral": "GASTPHY",
            "hydrophobic": "CLVIMFW"
        },
        "vdw_volume": {
            "small": "GASTPDC",
            "medium": "NVEQIL",
            "large": "MHKFRYW"
        },
        # NOTE(review): these three classes contain exactly the same residues as
        # the hydrophobicity classes above, so the 18 polarity features duplicate
        # the 18 hydrophobicity features. Confirm whether a distinct polarity
        # grouping was intended.
        "polarity": {
            "polar": "EDQNKR",
            "neutral": "GASTPHY",
            "nonpolar": "CLVIMFW"
        },
        "polarizability": {
            "low": "GASDT",
            "medium": "CPNVEQIL",
            "high": "KMHFRYW"
        },
        "charge": {
            "negative": "DE",
            "neutral": "ACFGHILMNPQSTVWY",
            "positive": "KR"
        },
        "secondary_structure": {
            "helix": "EALMQKRH",
            # 'F' was missing here, leaving phenylalanine in no class of this
            # property (every other property covers all 20 residues).
            "sheet": "VIYCWFT",
            "coil": "GNPSD"
        },
        "solvent_accessibility": {
            "buried": "ALFCGIVW",
            "intermediate": "MPSTHY",
            "exposed": "DEKNQR"
        }
    }

    # (feature suffix, fraction of the class occurrences)
    marks = (
        ("first", 0.0),
        ("20%", 0.2),
        ("40%", 0.4),
        ("60%", 0.6),
        ("80%", 0.8),
        ("100%", 1.0),
    )
    length = len(sequence)

    def calculate_distribution(amino_class):
        # 0-based positions of every residue belonging to this class
        occurrences = [i for i, aa in enumerate(sequence) if aa in amino_class]
        total = len(occurrences)

        if total == 0:
            # Also covers the empty sequence, so `length` is never a zero divisor
            return {label: 0 for label, _ in marks}

        distribution = {}
        for label, fraction in marks:
            # Clamp so that fraction 1.0 selects the last occurrence
            idx = min(math.floor(total * fraction), total - 1)
            # 1-based position as a percentage of the sequence length
            distribution[label] = (occurrences[idx] + 1) / length * 100
        return distribution

    features = {}
    for prop, classes in properties.items():
        for class_name, amino_class in classes.items():
            for label, value in calculate_distribution(amino_class).items():
                features[f"{prop}_{class_name}_{label}"] = value

    return features

In [7]:
# Áp dụng hàm generate_distribution_features cho từng chuỗi trong cột 'Sequence'
features_list = df['Sequence'].apply(generate_distribution_features)

# Chuyển đổi danh sách các đặc trưng thành DataFrame
features_df = pd.DataFrame(features_list.tolist())

# Nối DataFrame gốc với các đặc trưng mới
df_with_features = pd.concat([df, features_df], axis=1)

# Lưu DataFrame kết quả vào tệp CSV
output_file = r'D:\PEPTIDE_PROJECT_OFF\MyWork\MyNoteBook\CD-HIT\DataSet_final.csv'
df_with_features.to_csv(output_file, index=False)

print(f"Kết quả đã được lưu vào {output_file}")

Kết quả đã được lưu vào D:\PEPTIDE_PROJECT_OFF\MyWork\MyNoteBook\CD-HIT\DataSet_final.csv


#### Anh Khoa custom ft. ChatGPT prod. by DeepSeek

In [10]:
import pandas as pd
import numpy as np
import math
from Bio import SeqIO

In [6]:
def generate_vector(sequence):
    """Compute position-distribution descriptors as a (7, 18) matrix.

    Each row corresponds to one physicochemical property (in the order they
    appear in the table below); each row holds 3 classes x 6 marks. For every
    class, the 0-based positions of its residues in ``sequence`` are collected
    in order; for each fraction in (0.0, 0.2, 0.4, 0.6, 0.8, 1.0) the
    occurrence at index ``floor(total * fraction)`` (clamped to the last
    occurrence) is taken and its 1-based position is expressed as a percentage
    of the sequence length.

    Parameters
    ----------
    sequence : str
        Sequence in one-letter codes. Matching is case-sensitive (uppercase).

    Returns
    -------
    numpy.ndarray
        Array of shape (7, 18). A class with no residue in the sequence
        (including the empty sequence) contributes six zeros.
    """
    properties = {
        "hydrophobicity": {
            "polar": "RKEDQN",
            "neutral": "GASTPHY",
            "hydrophobic": "CLVIMFW"
        },
        "vdw_volume": {
            "small": "GASTPDC",
            "medium": "NVEQIL",
            "large": "MHKFRYW"
        },
        # NOTE(review): these three classes contain exactly the same residues as
        # the hydrophobicity classes above, so row 2 of the result always equals
        # row 0. Confirm whether a distinct polarity grouping was intended.
        "polarity": {
            "polar": "EDQNKR",
            "neutral": "GASTPHY",
            "nonpolar": "CLVIMFW"
        },
        "polarizability": {
            "low": "GASDT",
            "medium": "CPNVEQIL",
            "high": "KMHFRYW"
        },
        "charge": {
            "negative": "DE",
            "neutral": "ACFGHILMNPQSTVWY",
            "positive": "KR"
        },
        "secondary_structure": {
            "helix": "EALMQKRH",
            # 'F' was missing here, leaving phenylalanine in no class of this
            # property (every other property covers all 20 residues).
            "sheet": "VIYCWFT",
            "coil": "GNPSD"
        },
        "solvent_accessibility": {
            "buried": "ALFCGIVW",
            "intermediate": "MPSTHY",
            "exposed": "DEKNQR"
        }
    }

    fractions = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
    length = len(sequence)

    def calculate_distribution(amino_class):
        # 0-based positions of every residue belonging to this class
        occurrences = [i for i, aa in enumerate(sequence) if aa in amino_class]
        total = len(occurrences)

        if total == 0:
            # Also covers the empty sequence, so `length` is never a zero divisor
            return [0.0] * len(fractions)

        last = total - 1
        # Clamp the index so fraction 1.0 selects the last occurrence, then
        # convert the 1-based position to a percentage of the sequence length
        return [
            (occurrences[min(math.floor(total * fraction), last)] + 1) / length * 100
            for fraction in fractions
        ]

    matrix = []
    for classes in properties.values():
        row = []
        for amino_class in classes.values():
            row.extend(calculate_distribution(amino_class))  # 6 marks per class
        matrix.append(row)  # 3 classes -> 18 features per property

    return np.array(matrix)  # shape (7, 18)


In [None]:
# Quick check of generate_vector on one example sequence: print the matrix shape, then the values
vector = generate_vector("MKKKKYLIDLFSGCGGLSFGF")
print(vector.shape)  # (7, 18)
print(vector)

(7, 18)
[[  9.52380952  14.28571429  19.04761905  23.80952381  42.85714286
   42.85714286  28.57142857  57.14285714  61.9047619   76.19047619
   85.71428571  95.23809524   4.76190476  33.33333333  47.61904762
   66.66666667  90.47619048 100.        ]
 [ 42.85714286  57.14285714  66.66666667  71.42857143  85.71428571
   95.23809524  33.33333333  33.33333333  38.0952381   47.61904762
   80.95238095  80.95238095   4.76190476   9.52380952  19.04761905
   28.57142857  90.47619048 100.        ]
 [  9.52380952  14.28571429  19.04761905  23.80952381  42.85714286
   42.85714286  28.57142857  57.14285714  61.9047619   76.19047619
   85.71428571  95.23809524   4.76190476  33.33333333  47.61904762
   66.66666667  90.47619048 100.        ]
 [ 42.85714286  57.14285714  61.9047619   76.19047619  85.71428571
   95.23809524  33.33333333  38.0952381   47.61904762  66.66666667
   80.95238095  80.95238095   4.76190476   9.52380952  19.04761905
   28.57142857  90.47619048 100.        ]
 [ 42.85714286  42.8

In [11]:
def fasta_to_dataframe(fasta_file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Headers are expected to look like ``>ID LABEL ...``: the first
    whitespace-separated token of the record id becomes ``ID`` and the second
    whitespace-separated token of the full description becomes ``Label``.

    Parameters
    ----------
    fasta_file
        Passed unchanged to ``SeqIO.parse(fasta_file, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Label`` and ``Sequence``, in that order. The columns
        are present even when the file contains no records. ``Label`` is a
        missing value for records whose header has no second token (such a
        header previously aborted the whole parse with ``IndexError``).
    """
    columns = ["ID", "Label", "Sequence"]
    rows = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        id_tokens = record.id.split()
        header_tokens = record.description.split()
        rows.append({
            # Fall back to the raw id when it contains no token at all
            "ID": id_tokens[0] if id_tokens else record.id,
            # A header without a label yields a missing value instead of crashing
            "Label": header_tokens[1] if len(header_tokens) > 1 else None,
            "Sequence": str(record.seq),
        })
    # Passing `columns` keeps the schema stable for an empty input file
    return pd.DataFrame(rows, columns=columns)

In [12]:
# Parse the FASTA dataset into a DataFrame; this rebinds the notebook-level name `df`.
df = fasta_to_dataframe(r"D:\PEPTIDE_PROJECT_OFF\MyWork\MyNoteBook\CD-HIT\DataSet_final.fasta")

In [16]:
# Encode the class label as an integer: 'nAMP' -> 0, 'AMP' -> 1.
# The codes also map to themselves so that re-running this cell is a no-op;
# with only the string keys, a second run would turn every already-encoded
# label into NaN. Any other value still becomes NaN, as before.
df['Label'] = df['Label'].map({'nAMP': 0, 'AMP': 1, 0: 0, 1: 1})

In [17]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,ID,Label,Sequence
0,WGE87526.1,1,MKKKKYLIDLFSGCGGLSFGF
1,WGE87526.1,1,EQAGFECLIGVDIEQSALNT
2,WGE87526.1,1,FAHNHKHAKALNLDLSEDES
3,WGE87526.1,1,ISKIIEEIGNKNIEIIVAGPP
4,WGE87526.1,1,CQGFSLTGKRNENDKRNKLF


In [18]:
# Preview the last rows of the DataFrame
df.tail()

Unnamed: 0,ID,Label,Sequence
167791,tr|A0A0H3BQ69|A0A0H3BQ69_SALNS,0,MGVTHCHLYIMVPQYFLKRQYVATGHHKVSGESMAQDMC
167792,tr|A0A0H3BQF0|A0A0H3BQF0_SALNS,0,MVTFNIEISSRGLFQWCYNTLRLTKAFSFRRQHICPGVFL
167793,tr|A0A0H3BQI1|A0A0H3BQI1_SALNS,0,MAGIALRAEAGIYCIFPAPERIPVALLWSWVSNEAKRSGDGVSWR
167794,tr|A0A0H3BQK2|A0A0H3BQK2_SALNS,0,MPFSRQQNDSTLNLLIKRCNIVLMLKIAGMTVEPQRFGGF
167795,tr|A0A0H3BQK3|A0A0H3BQK3_SALNS,0,MYLALFLIAEDKIVTYSWLYIKNGFICVFFYAILIFYSPLIIFSMI
