In [5]:
import os
import glob
import pandas as pd
from Bio import SeqIO

def read_chains_to_df(directory: str) -> pd.DataFrame:
    """
    Collect per-chain sequences from FASTA files into one row per PDB entry.

    Every ``*.fasta`` file directly inside ``directory`` is parsed with
    ``SeqIO.parse``. A record's PDB ID and chain label come from its header
    ID when that ID contains ``|`` (e.g. ``6X97|H``); otherwise they are taken
    from a file name of the form ``<pdb>_<chain>.fasta``. Records that match
    neither form, or that yield an empty PDB ID or chain label, are skipped.

    Files are visited in sorted path order, so when the same (PDB ID, chain)
    pair occurs more than once, the sequence that is kept does not depend on
    the order in which the file system happens to list the directory.

    Args:
        directory: Folder searched (non-recursively) for ``*.fasta`` files.

    Returns:
        A DataFrame with a ``pdb_id`` column (upper-cased) and one column per
        chain label holding the sequence string, NaN where an entry has no
        such chain. Chains ``H``, ``L`` and ``A`` are named ``Hchain``,
        ``Lchain`` and ``Achain``; these three columns are always present,
        even when no parsed record carries that chain.

    Raises:
        ValueError: If no record could be parsed from ``directory``.
    """
    chain_columns = {'H': 'Hchain', 'L': 'Lchain', 'A': 'Achain'}

    records = []
    # glob returns paths in arbitrary order; sort so that the "first" duplicate
    # picked by the pivot below is reproducible across machines and runs.
    for fp in sorted(glob.glob(os.path.join(directory, '*.fasta'))):
        for rec in SeqIO.parse(fp, 'fasta'):
            # rec.id example: "6X97|H"
            header = rec.id
            if '|' in header:
                pdb_id, chain = header.split('|', 1)
            else:
                # Header is not in the expected form: fall back to the file name.
                name = os.path.splitext(os.path.basename(fp))[0]
                parts = name.split('_')
                if len(parts) != 2:
                    # Neither the header nor the file name could be parsed.
                    continue
                pdb_id, chain = parts

            pdb_id = pdb_id.strip().upper()
            chain = chain.strip().upper()
            if not pdb_id or not chain:
                # e.g. "6X97|" or "|H": an empty key would become a blank
                # index entry or a blank column name, so treat it as unparseable.
                continue

            records.append({
                'pdb_id': pdb_id,
                'chain': chain,
                'sequence': str(rec.seq)
            })

    df = pd.DataFrame(records)
    if df.empty:
        raise ValueError(f"No records parsed from FASTA files in {directory!r}")

    # pivot_table: keep the first value for duplicated (pdb_id, chain) pairs;
    # chains an entry does not have become NaN.
    df_wide = df.pivot_table(
        index='pdb_id',
        columns='chain',
        values='sequence',
        aggfunc='first'
    ).rename(columns=chain_columns)

    # Guarantee the three documented columns; existing columns keep their
    # position and any missing one is appended, filled with NaN.
    missing = [col for col in chain_columns.values() if col not in df_wide.columns]
    if missing:
        df_wide = df_wide.reindex(columns=[*df_wide.columns, *missing])

    df_wide = df_wide.reset_index()
    # Drop the leftover "chain" label on the column axis.
    df_wide.columns.name = None
    return df_wide

# Example usage: build the chain table from the FASTA files under the ABAG
# "Complex" directory, show it in the notebook, and save it as a CSV file.
if __name__ == '__main__':
    directory = '/home/cseomoon/project/ABAG/2025_H_L_A/Complex'
    df_chains = read_chains_to_df(directory)
    display(df_chains)
    # index=False keeps the DataFrame's row index out of the written CSV.
    df_chains.to_csv('ABAG_native_complex.csv', index=False)

Unnamed: 0,pdb_id,Achain,Hchain,Lchain
0,6X97,LWVTVYYGVPVWKDAETTLFCASDAKAYETEKHNVWATHACVPTDP...,QLVESGGGLVKPGTSLSLTCKASGFDFSDNYYICWVRQAPGKGLEW...,IVMTQTPASVEAAVGGTVTIKCQASQRIGSHVSWYQQKPGQRPKLL...
1,7EW5,KVVATDAYVTRTNIFYHASSSRLLAVGHPYFSIKRANKTVVPKVSG...,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYAIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...
2,7KF0,EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCG...,VQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEW...,DIQMTQSPSSLSASVGDRVTITCRASQDIPRSISGYVAWYQQKPGK...
3,7KF1,EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCG...,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDIPRSISGYVAWYQQKPGK...
4,7NP1,PFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGV...,EVQLVQSGGGLVKPGGSLRLSCAASGITVSSNYMSWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISRYLNWYQQKPGKAPKL...
5,7NX3,DPTVHWLFTTCGASGPHGPTQAQCNNAYQNSNLSVEVGSEGPLKGI...,VQLQQSGAELVKPGASVKISCKASGYAFSSYWVNWVKQRPGKGLEW...,DIVLTQSPASLAVSLGQRATISCRASESVDNYGISFMNWFQQKPGQ...
6,7O9W,PTVSVFSMFRYSNWLDKLYMVVGTLAAIIHGAGLPLMMLVFGEMTD...,EVQLQESGPELVKTGASVKISCKASGYSFSNYYIHWVKQSHGKSLE...,QVVMTQSPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...
7,7R40,QCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFF...,EVQLLESGGGLVQPGGSLRLSCVASGFTFSSYVMSWVRQAPGKGLE...,EIVLTQSPATLSLSPGERATLSCRASQSFHNYLAWYQQKPGRAPRL...
8,7S0E,SSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLP...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYYMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...
9,7SBG,DDKMSWQAYVDDHLMCEIEGNHLSAAAIIGQDGSVWAQSANFPQFK...,EVQLVESGGGLVQPKGSLKLSCAASGFTFNTYAMNWVRQAPGKGLE...,DIQMTQSPASLSASVGETVTITCRASGNIHNYLAWFQQKQGKSPQL...


In [6]:
# Same steps as the previous cell, applied to the AbNb benchmark "Positive"
# FASTA directory. Note: this rebinds `directory` and `df_chains`, so after
# this cell runs they no longer refer to the ABAG data.
if __name__ == '__main__':
    directory = '/home/cseomoon/project/ABAG/AbNb_benchmark/model_classification/Positive/fasta'
    df_chains = read_chains_to_df(directory)
    display(df_chains)
    # index=False keeps the DataFrame's row index out of the written CSV.
    df_chains.to_csv('AbNb_native_complex.csv', index=False)

Unnamed: 0,pdb_id,Achain,Hchain,Lchain
0,7Q0I,SQCVNFTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPF...,EVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,SYELTQPPSVSVAPGKTATITCGGNNIGTKSVHWYQQKPGQAPVLV...
1,7Q6C,HCQLGQKQSGSECICMSPEEDCSHHSEDLCVFDTDSNDYFTSPACK...,EVQLVESDGGLVQPGGSLKLSCAASGFTFSDYYMAWVRQGPGKGLE...,DVVLTQTPSTLSVTPGQPASISCRSSQSLLNDVGNTYLYWYLQKPG...
2,7QNW,TNLCPFDEVFNATRFASVYAWNRKRISNCVADYSVLYNLAPFFTFK...,QVQLQESGPGLVKPSETLSLTCTVSGDSISSSRYYWGWIRQPPGKG...,AIRMTQSPSTLSASVGDRVTIACRASQSISAWLAWYQQKPGKAPKL...
3,7R58,GPLPKPSLQALPSSLVPLEKPVTLRCQGPPGVDLYRLEKLSSSRYQ...,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYNMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSLENSNGNTYLNWYQQKPG...
4,7SO5,LVNRKQLEKMANVRFRTQEDEYVAILDALEEYHNMSENTVVEKYLK...,QVQLLQGGAGLLKPSETLSLTCAVYGGSFSEHYWSWIRQPPGKGLE...,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHTNGNNYLVWYLQKPG...
...,...,...,...,...
68,8QRG,TNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFK...,EVQLVESGGGLVEPGGSLRLSCAASGITVSSNYMHWVRQAPGRGLE...,DIQMTQSPSSLSASVGDRVTITCQASQDINKYLNWYQQKPGKAPNL...
69,8R1D,VNLITRTQSYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTW...,QVQLVESGGGLVQPGGSLRLSCTASGVTFSSYAMSWVRQAPGKGLE...,PSALTQPPSVSVSPGQTASITCSGDKLGNKYAYWYQQKPGQSPVLV...
70,8SDG,NLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKC...,EVQLVESGGGLVQPGGSLKLSCSASGFTLSDSAMHWVRQASGKGLE...,DIQLTQSPSTLSASVGDRVIITCRASQSISTWLAWYQQRPGQAPKL...
71,8SGN,VCQYTIQSLIHLTGEDPGFFNVEIPEFPFYPTCNVCTADVNVTINF...,QLQLQESGPGVVKPSETLSLTCTISGGSFSTYYWTWIRQPPGKGLE...,QAALTQPPSVSGSPGQSVTISCTGTSSDIGGYNYVSWYQQHPGKAP...


In [1]:
# Display the current chain table. `df_chains` is only defined once one of the
# cells that assigns it has run in this kernel session; otherwise this cell
# raises NameError.
df_chains

NameError: name 'df_chains' is not defined