In [28]:
import os
import pandas as pd

def rename_chains_in_pdb(pdb_file, mapping):
    """
    Rewrite the chain IDs of a PDB file in place.

    For every ATOM / HETATM / TER / ANISOU record that is at least 22
    characters long, the character at index 21 (the chain ID column) is
    replaced according to ``mapping``. Each line is translated exactly once,
    so swaps such as ``{'A': 'B', 'B': 'A'}`` behave as expected.

    Args:
        pdb_file: Path of the PDB file to modify.
        mapping: dict ``{old_chain: new_chain}``. Every new chain ID must be
            a single-character string, because the chain ID occupies exactly
            one column of the fixed-width record.

    Raises:
        ValueError: if a replacement chain ID is not a one-character string.
            The check runs before the file is touched.

    The new content is written to a temporary file in the same directory and
    then moved over the original, so a failure while writing cannot leave a
    truncated PDB behind. Files that need no change are not rewritten.
    """
    import shutil
    import tempfile

    # Validate first: a longer (or non-string) replacement would shift the
    # fixed-width columns or fail half-way through the rewrite.
    for old_id, new_id in mapping.items():
        if not isinstance(new_id, str) or len(new_id) != 1:
            raise ValueError(
                f"Replacement for chain {old_id!r} must be a single character, got {new_id!r}"
            )

    chain_records = ('ATOM  ', 'HETATM', 'TER   ', 'ANISOU')

    # 1) Read the original into memory
    with open(pdb_file, 'r') as f:
        lines = f.readlines()

    # 2) Build the renamed content completely before writing anything
    renamed = []
    changed = False
    for line in lines:
        if line.startswith(chain_records) and len(line) >= 22:
            old = line[21]
            if old in mapping and mapping[old] != old:
                line = line[:21] + mapping[old] + line[22:]
                changed = True
        renamed.append(line)

    if not changed:
        return

    # 3) Replace the file atomically (resolve symlinks so the link itself survives)
    target = os.path.realpath(pdb_file)
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(target), suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as tmp:
            tmp.writelines(renamed)
        # mkstemp creates the file with restrictive permissions; keep the original mode
        shutil.copymode(target, tmp_path)
        os.replace(tmp_path, target)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# ————————— Settings —————————
pdb_dir = '/home/cseomoon/project/ABAG/2025_H_L_A/original_pdb'
tsv_path = '/home/cseomoon/appl/af_analysis-0.1.4/data/data_generation/ABAG-SabDab_final_dataset_240729.tsv'
# ——————————————————————————

# Load the metadata; dtype=str keeps every column as text (empty cells become NaN)
df_meta = pd.read_csv(tsv_path, sep='\t', dtype=str)

# NOTE: this loop rewrites the files in pdb_dir in place. Re-running it applies
# the TSV mapping again to chains that were already renamed, so keep a backup
# of the untouched structures.
for fname in os.listdir(pdb_dir):
    if not fname.endswith('_ABAG.pdb'):
        continue
    # Strip only the suffix (str.replace would remove every occurrence)
    pdb_id   = fname[:-len('_ABAG.pdb')]
    fullpath = os.path.join(pdb_dir, fname)

    # Look up the matching row in the TSV (first match wins)
    match = df_meta[df_meta['pdb'] == pdb_id]
    if match.empty:
        print(f"[WARN] Metadata missing for {pdb_id}, skipping.")
        continue
    row = match.iloc[0]

    # Target label -> chain ID recorded in the metadata
    source_by_target = {
        'H': row['Hchain'],
        'L': row['Lchain'],
        'A': row['antigen_chain'],
    }
    sources = list(source_by_target.values())

    # A usable source ID is one character (it must match a single PDB column)
    # and distinct from the others; otherwise the dict below would silently
    # drop or never match an entry and the file would be mislabelled.
    unusable = [s for s in sources if not (isinstance(s, str) and len(s) == 1)]
    if unusable or len(set(sources)) != len(sources):
        print(f"[WARN] Unusable chain IDs for {pdb_id}: {source_by_target}, skipping.")
        continue

    # Build the {old_chain: new_chain} mapping
    mapping = {old: new for new, old in source_by_target.items()}

    # Replace the chain IDs inside the file
    rename_chains_in_pdb(fullpath, mapping)
    print(f"→ {pdb_id}: {mapping}")

print("All done.")

→ 7r40: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7vng: {'H': 'H', 'L': 'L', 'D': 'A'}
→ 7s0e: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7tfo: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7nx3: {'B': 'H', 'C': 'L', 'F': 'A'}
→ 7yqx: {'H': 'H', 'F': 'L', 'B': 'A'}
→ 7t25: {'H': 'H', 'L': 'L', 'E': 'A'}
→ 7ew5: {'Q': 'H', 'R': 'L', 'b': 'A'}
→ 7shy: {'C': 'H', 'D': 'L', 'A': 'A'}
→ 7sjn: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7sjo: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7su1: {'H': 'H', 'L': 'L', 'C': 'A'}
→ 7su0: {'H': 'H', 'L': 'L', 'C': 'A'}
→ 7zf9: {'H': 'H', 'L': 'L', 'E': 'A'}
→ 7np1: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7sbg: {'H': 'H', 'L': 'L', 'C': 'A'}
→ 7ued: {'H': 'H', 'L': 'L', 'M': 'A'}
→ 7wo5: {'F': 'H', 'G': 'L', 'A': 'A'}
→ 7sgm: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7z2m: {'H': 'H', 'L': 'L', 'I': 'A'}
→ 7vyr: {'H': 'H', 'L': 'L', 'R': 'A'}
→ 6x97: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7kf0: {'H': 'H', 'L': 'L', 'V': 'A'}
→ 7kf1: {'H': 'H', 'L': 'L', 'V': 'A'}
→ 7soc: {'H': 'H', 'L': 'L', 'A': 'A'}
→ 7yqz: {'I': 'H', 'G': '

In [1]:
import os
import pandas as pd

# 1) Path configuration: directory scanned for *_ABAG.pdb files and the metadata TSV
pdb_dir = '/home/cseomoon/project/ABAG/2025_H_L_A/original_pdb'
tsv_path = '/home/cseomoon/appl/af_analysis-0.1.4/data/data_generation/ABAG-SabDab_final_dataset_240729.tsv'

import os
import pandas as pd

def get_chain_ids_simple(pdb_file):
    """
    Collect the distinct chain IDs that occur in a PDB file.

    Only ATOM and HETATM records of at least 22 characters are inspected;
    the chain ID is the character at index 21 of such a line.

    Returns:
        A sorted list of the one-character chain IDs found.
    """
    coordinate_records = ('ATOM  ', 'HETATM')
    with open(pdb_file, 'r') as handle:
        found = {
            record[21]
            for record in handle
            if record.startswith(coordinate_records) and len(record) >= 22
        }
    return sorted(found)

# 2) Read the metadata TSV (every column as text)
df_meta = pd.read_csv(tsv_path, sep='\t', dtype=str)

results = []
for fname in os.listdir(pdb_dir):
    if fname.endswith('_ABAG.pdb'):
        pdb_id = fname[:-len('_ABAG.pdb')]
        full_path = os.path.join(pdb_dir, fname)

        # 3) Find the TSV row for this structure (first match is used)
        match = df_meta.loc[df_meta['pdb'] == pdb_id]
        if len(match) == 0:
            print(f"[WARNING] TSV에 '{pdb_id}' 항목이 없습니다.")
            continue
        row = match.iloc[0]

        # 4) Chain IDs actually present in the PDB file
        chains = get_chain_ids_simple(full_path)

        # One record per structure: metadata chains next to the chains found on disk
        results.append(dict(
            pdb_id=pdb_id,
            Hchain=row['Hchain'],
            Lchain=row['Lchain'],
            antigen_chain=row['antigen_chain'],
            all_chain_ids=', '.join(chains),
        ))

# 5) Assemble the records into a DataFrame and print it
result_df = pd.DataFrame(results)
print(result_df)

   pdb_id Hchain Lchain antigen_chain all_chain_ids
0    7r40      H      L             A       A, H, L
1    7vng      H      L             D       A, H, L
2    7s0e      H      L             A       A, H, L
3    7tfo      H      L             A       A, H, L
4    7nx3      B      C             F       A, H, L
5    7yqx      H      F             B       A, H, L
6    7t25      H      L             E       A, H, L
7    7ew5      Q      R             b       A, H, L
8    7shy      C      D             A       A, H, L
9    7sjn      H      L             A       A, H, L
10   7sjo      H      L             A       A, H, L
11   7su1      H      L             C       A, H, L
12   7su0      H      L             C       A, H, L
13   7zf9      H      L             E       A, H, L
14   7np1      H      L             A       A, H, L
15   7sbg      H      L             C       A, H, L
16   7ued      H      L             M       A, H, L
17   7wo5      F      G             A       A, H, L
18   7sgm   

In [None]:
# Scratch cell: a bare string literal (no assignment) holding a path to a TSV with the
# same file name as the metadata table used by the other cells.
# NOTE(review): the /Users/... prefix presumably belongs to a different machine — confirm or delete this cell.
'/Users/cseomoon/ABAG/AF3/DB/AF3_structure/240923_missing_residue_processing/2024_0919/ABAG-SabDab_final_dataset_240729.tsv'

In [22]:
import py3Dmol

def visualize_pdb_with_py3dmol(pdb_file, width=800, height=600):
    """
    Build a py3Dmol viewer for a PDB file, drawn as a cartoon coloured by chain.

    Args:
        pdb_file: Path of the PDB file to read.
        width: Viewer width passed to ``py3Dmol.view``.
        height: Viewer height passed to ``py3Dmol.view``.

    Returns:
        The configured viewer. It is not displayed here; the caller decides
        when to show it.
    """
    # Read the whole structure as text
    with open(pdb_file, 'r') as handle:
        structure_text = handle.read()

    # Create the viewer and hand it the model in PDB format
    viewer = py3Dmol.view(width=width, height=height)
    viewer.addModel(structure_text, 'pdb')

    # Cartoon representation using the 'chain' colour scheme
    viewer.setStyle({'cartoon': {'colorscheme': 'chain'}})

    # Fit the camera to the loaded structure
    viewer.zoomTo()

    return viewer

# Usage example
# NOTE(review): this cell uses `os` but only imports py3Dmol; it relies on `os`
# having been imported by another cell of the notebook.
directory = "/home/cseomoon/project/ABAG/2025_H_L_A/original_pdb"
# Despite its name, pdb_id holds the file name, not just the PDB ID.
pdb_id='7nx3_ABAG.pdb'
pdb_path = os.path.join(directory, pdb_id)
view = visualize_pdb_with_py3dmol(pdb_path)
view.show()

In [1]:
import af_analysis
from af_analysis import analysis
from af_analysis import data
import pdb_numpy
import numpy as np
import json
import pandas as pd
# Display settings: no limit on column width or on the number of columns shown
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None) 

%matplotlib inline

In [3]:
def dockq_calculation(my_data):
    """
    Prepare `my_data` for DockQ and run the DockQ calculation on it.

    Chain 'A' is used as receptor and chain 'H' as ligand, for both the
    model and the native structure.

    NOTE(review): `prep_dockq()` is called without `verbose=False` here,
    while other calls in this notebook pass it — confirm which is intended.
    """
    my_data.prep_dockq()
    chain_selection = dict(
        rec_chains='A',
        lig_chains='H',
        native_rec_chains='A',
        native_lig_chains='H',
    )
    # verbose=False: output is best switched off when running in parallel
    analysis.calculate_dockQ(my_data, **chain_selection, verbose=False)
    
def calculate_metrics(my_data):
    """
    Run the af_analysis scoring routines on `my_data`, each with verbose=False.

    Called in this order: pdockq, pdockq2, mpdockq, LIS_matrix,
    add_interface_metrics. Presumably each stores its result on `my_data` —
    confirm against the af_analysis documentation.
    """
    metric_names = ('pdockq', 'pdockq2', 'mpdockq', 'LIS_matrix', 'add_interface_metrics')
    for metric_name in metric_names:
        # Look the routine up right before calling it, so the call order is preserved
        getattr(analysis, metric_name)(my_data, verbose=False)


def calculate_rmsd(my_data):
    """
    Add the chain RMSD and then the RMSD scale to `my_data`.

    `add_chain_rmsd` is called with chain 'A' as the alignment chain and
    chain 'H' as the chain whose RMSD is measured.
    """
    align_on, measure_on = 'A', 'H'
    my_data.add_chain_rmsd(align_chain=align_on, rmsd_chain=measure_on)
    my_data.add_rmsd_scale()

def process_pdb_id(pdb_id, results_root="/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results"):
    """
    Load the AF3 results of a single PDB ID and run the chain-level analysis.

    Only `analyze_chains` and `extract_chain_columns` are run here. DockQ,
    pDockQ/LIS/interface metrics and RMSD are available through the helper
    functions `dockq_calculation`, `calculate_metrics` and `calculate_rmsd`.

    Args:
        pdb_id: Name of the sub-directory of `results_root` to load.
        results_root: Directory that contains one result folder per PDB ID.

    Returns:
        The populated `af_analysis.data.Data` object, or None when the result
        directory is missing or any step raises. The reason is printed.
    """
    import os  # local import: the cell defining this function does not import os

    try:
        print(f"Processing {pdb_id}")
        base_dir = f"{results_root}/{pdb_id}"
        # Report a missing folder explicitly instead of a library error
        if not os.path.isdir(base_dir):
            print(f"Directory not found for {pdb_id}")
            return None

        my_data = af_analysis.data.Data(directory=base_dir)
        my_data.analyze_chains(verbose=False)
        my_data.extract_chain_columns(verbose=False)
        return my_data
    except Exception as e:
        # Best effort: one failing entry must not stop an interactive session
        print(f"Error processing {pdb_id}: {str(e)}")
        return None
# NOTE(review): assigning to `data` rebinds the name brought in by
# `from af_analysis import data` in the import cell; consider a different variable name.
# process_pdb_id returns None on failure, in which case `data.df` raises AttributeError.
data=process_pdb_id('6x97')
data.df

Processing 6x97


Unnamed: 0,pdb,query,seed,sample,data_file,chain_iptm,chain_pair_iptm,chain_pair_pae_min,chain_ptm,fraction_disordered,has_clash,ipTM,pTM,ranking_confidence,format,chain_plddt_A,chain_pae_A,chain_plddt_H,chain_pae_H,chain_plddt_L,chain_pae_L,chain_pair_pae_AH,chain_pair_pae_AL,chain_pair_pae_HL,avg_model_plddt,avg_internal_pae,avg_pair_pae,iptm_A,iptm_H,iptm_L,ptm_A,ptm_H,ptm_L,chain_pair_iptm_AH,chain_pair_iptm_AL,chain_pair_iptm_HL,chain_pair_pae_min_AH,chain_pair_pae_min_AL,chain_pair_pae_min_HL
0,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-0/model.cif,6x97,1,0,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-0/confidences.json,"[0.23, 0.54, 0.52]","[[0.84, 0.25, 0.21], [0.25, 0.84, 0.84], [0.21, 0.84, 0.87]]","[[0.76, 20.41, 21.09], [20.2, 0.76, 1.33], [21.22, 1.3, 0.76]]","[0.84, 0.84, 0.87]",0.0,0.0,0.33,0.65,0.39,AF3,79.728819,9.222435,83.471033,4.123866,84.552683,3.210606,26.452228,27.048712,5.645469,82.584179,5.518969,19.71547,0.23,0.54,0.52,0.84,0.84,0.87,0.25,0.21,0.84,20.305,21.155,1.315
1,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-1/model.cif,6x97,1,1,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-1/confidences.json,"[0.22, 0.54, 0.52]","[[0.84, 0.24, 0.21], [0.24, 0.85, 0.83], [0.21, 0.83, 0.87]]","[[0.76, 21.08, 21.55], [20.79, 0.76, 1.34], [21.58, 1.33, 0.76]]","[0.84, 0.85, 0.87]",0.0,0.0,0.32,0.65,0.38,AF3,80.031424,9.04465,83.478344,4.107489,84.522472,3.228272,26.858756,27.417137,5.718797,82.677413,5.460137,19.99823,0.22,0.54,0.52,0.84,0.85,0.87,0.24,0.21,0.83,20.935,21.565,1.335
2,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-2/model.cif,6x97,1,2,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-2/confidences.json,"[0.1, 0.46, 0.46]","[[0.84, 0.1, 0.1], [0.1, 0.84, 0.82], [0.1, 0.82, 0.86]]","[[0.76, 29.23, 29.71], [21.44, 0.76, 1.43], [22.74, 1.44, 0.76]]","[0.84, 0.84, 0.86]",0.0,0.0,0.27,0.61,0.33,AF3,79.242034,9.238223,81.915233,4.262661,83.971689,3.325717,30.116749,30.22303,6.07587,81.709652,5.608867,22.13855,0.1,0.46,0.46,0.84,0.84,0.86,0.1,0.1,0.82,25.335,26.225,1.435
3,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-3/model.cif,6x97,1,3,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-3/confidences.json,"[0.21, 0.53, 0.51]","[[0.84, 0.22, 0.19], [0.22, 0.84, 0.84], [0.19, 0.84, 0.87]]","[[0.76, 21.65, 22.03], [21.18, 0.76, 1.33], [22.08, 1.31, 0.76]]","[0.84, 0.84, 0.87]",0.0,0.0,0.31,0.64,0.38,AF3,79.78357,9.128988,83.323911,4.173722,84.376807,3.236151,27.215455,27.798692,5.701218,82.494763,5.512953,20.238455,0.21,0.53,0.51,0.84,0.84,0.87,0.22,0.19,0.84,21.415,22.055,1.32
4,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-4/model.cif,6x97,1,4,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-1_sample-4/confidences.json,"[0.22, 0.54, 0.52]","[[0.84, 0.24, 0.2], [0.24, 0.84, 0.83], [0.2, 0.83, 0.87]]","[[0.76, 20.74, 21.29], [20.76, 0.76, 1.35], [21.73, 1.32, 0.76]]","[0.84, 0.84, 0.87]",0.0,0.0,0.32,0.64,0.39,AF3,79.631775,9.213288,83.443522,4.116138,84.399851,3.245897,26.749573,27.363007,5.691247,82.491716,5.525107,19.934609,0.22,0.54,0.52,0.84,0.84,0.87,0.24,0.2,0.83,20.75,21.51,1.335
5,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-0/model.cif,6x97,2,0,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-0/confidences.json,"[0.2, 0.53, 0.51]","[[0.84, 0.22, 0.19], [0.22, 0.85, 0.83], [0.19, 0.83, 0.86]]","[[0.76, 21.27, 21.91], [20.24, 0.76, 1.37], [21.03, 1.37, 0.76]]","[0.84, 0.85, 0.86]",0.0,0.0,0.32,0.65,0.38,AF3,79.978604,9.264953,83.6775,4.035782,84.222621,3.383997,26.83873,27.377678,5.751882,82.626242,5.561577,19.98943,0.2,0.53,0.51,0.84,0.85,0.86,0.22,0.19,0.83,20.755,21.47,1.37
6,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-1/model.cif,6x97,2,1,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-1/confidences.json,"[0.22, 0.53, 0.52]","[[0.84, 0.23, 0.21], [0.23, 0.85, 0.83], [0.21, 0.83, 0.86]]","[[0.76, 21.12, 21.34], [20.07, 0.76, 1.35], [20.8, 1.37, 0.76]]","[0.84, 0.85, 0.86]",0.0,0.0,0.32,0.65,0.38,AF3,80.215528,9.136378,83.834689,4.038918,84.468348,3.332572,26.62891,27.16728,5.703543,82.839521,5.502622,19.833244,0.22,0.53,0.52,0.84,0.85,0.86,0.23,0.21,0.83,20.595,21.07,1.36
7,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-2/model.cif,6x97,2,2,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-2/confidences.json,"[0.25, 0.55, 0.53]","[[0.84, 0.26, 0.23], [0.26, 0.84, 0.83], [0.23, 0.83, 0.86]]","[[0.76, 20.08, 20.48], [18.94, 0.76, 1.35], [19.92, 1.42, 0.76]]","[0.84, 0.84, 0.86]",0.0,0.0,0.34,0.66,0.4,AF3,80.286493,9.175648,83.697489,4.245212,84.462,3.395176,25.903292,26.456047,5.875519,82.815327,5.605346,19.41162,0.25,0.55,0.53,0.84,0.84,0.86,0.26,0.23,0.83,19.51,20.2,1.385
8,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-3/model.cif,6x97,2,3,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-3/confidences.json,"[0.21, 0.53, 0.52]","[[0.84, 0.23, 0.19], [0.23, 0.85, 0.84], [0.19, 0.84, 0.86]]","[[0.76, 21.52, 21.85], [20.08, 0.76, 1.34], [21.18, 1.39, 0.76]]","[0.84, 0.85, 0.86]",0.0,0.0,0.32,0.65,0.38,AF3,80.253327,9.119526,83.820889,4.017405,84.364671,3.331671,26.839577,27.382532,5.709062,82.812962,5.489534,19.977057,0.21,0.53,0.52,0.84,0.85,0.86,0.23,0.19,0.84,20.8,21.515,1.365
9,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-4/model.cif,6x97,2,4,/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/6x97/seed-2_sample-4/confidences.json,"[0.14, 0.48, 0.49]","[[0.84, 0.14, 0.15], [0.14, 0.85, 0.83], [0.15, 0.83, 0.86]]","[[0.76, 21.86, 21.64], [21.2, 0.76, 1.5], [22.45, 1.43, 0.76]]","[0.84, 0.85, 0.86]",0.0,0.0,0.28,0.63,0.35,AF3,80.048975,9.110465,82.882167,4.116674,84.609528,3.412645,28.638802,28.621579,5.946698,82.513556,5.546595,21.069026,0.14,0.48,0.49,0.84,0.85,0.86,0.14,0.15,0.83,21.53,22.045,1.465


In [None]:
import af_analysis
from af_analysis import analysis
import pandas as pd
import numpy as np
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import time
from datetime import datetime
import sys

def process_pdb_id(pdb_id):
    """
    Worker: compute the full metric set for one PDB ID.

    Loads the AF3 result folder of `pdb_id`, runs DockQ (receptor chain 'A',
    ligand chain 'H'), the pDockQ / pDockQ2 / mpDockQ / LIS / interface
    metrics, the chain analysis and the chain RMSD with its scale.

    Returns:
        A pair ``(data, error)``: ``(my_data, None)`` on success, otherwise
        ``(None, message)``, where the message reports either a missing
        directory or the exception that was raised.
    """
    try:
        print(f"Processing {pdb_id}")
        results_dir = f"/home/cseomoon/project/ABAG/2025_H_L_A/20250504_seeds_10/af3_results/{pdb_id}"

        if os.path.exists(results_dir):
            model_set = af_analysis.data.Data(directory=results_dir)

            # DockQ against the native structure
            model_set.prep_dockq(verbose=False)
            dockq_chains = dict(
                rec_chains='A',
                lig_chains='H',
                native_rec_chains='A',
                native_lig_chains='H',
            )
            analysis.calculate_dockQ(model_set, **dockq_chains, verbose=False)

            # Scoring routines, looked up and called one after another
            for metric_name in ('pdockq', 'pdockq2', 'mpdockq', 'LIS_matrix', 'add_interface_metrics'):
                getattr(analysis, metric_name)(model_set, verbose=False)

            model_set.analyze_chains()
            model_set.add_chain_rmsd(align_chain='A', rmsd_chain='H')
            model_set.add_rmsd_scale()
            outcome = (model_set, None)
        else:
            outcome = (None, f"Directory not found for {pdb_id}")

        return outcome
    except Exception as failure:
        return None, f"Error processing {pdb_id}: {str(failure)}"

# Main script: process every PDB ID of the input table in parallel and save the merged metrics
if __name__ == "__main__":
    start_time = time.time()
    
    # Check the input file
    input_file = '/home/cseomoon/project/ABAG/AbNb_benchmark/datastructure/input_datastructure.tsv'
    if not os.path.exists(input_file):
        print(f"Input file not found: {input_file}")
        sys.exit(1)
        
    # Make sure the output directory exists (exist_ok avoids a check-then-create race)
    output_dir = "/home/cseomoon/project/ABAG/AbNb_benchmark/datastructure/new_dockq"
    os.makedirs(output_dir, exist_ok=True)
    
    # Output file name carrying a timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"{output_dir}/chain_metrics_results_{timestamp}.csv"
    
    try:
        df = pd.read_csv(input_file, sep='\t')
        pdb_ids = list(np.unique(df['PDB_ID']))
        total_pdb_ids = len(pdb_ids)
        
        print(f"Found {total_pdb_ids} unique PDB IDs to process")
        
        # Number of worker processes: 75% of the available cores, at least one
        num_workers = max(1, int(multiprocessing.cpu_count() * 0.75))
        print(f"Using {num_workers} workers for parallel processing")
        
        # Run in parallel
        results = []
        errors = []
        completed = 0
        
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = {executor.submit(process_pdb_id, pdb_id): pdb_id for pdb_id in pdb_ids}
            
            for future in as_completed(futures):
                pdb_id = futures[future]
                # process_pdb_id reports its own failures, but the future itself can
                # still raise (crashed worker, result that cannot be sent back).
                # Record that as an error for this ID instead of aborting the run
                # and losing the results collected so far.
                try:
                    result, error = future.result()
                except Exception as e:
                    result, error = None, f"Error processing {pdb_id}: {str(e)}"
                
                completed += 1
                progress = (completed / total_pdb_ids) * 100
                elapsed = time.time() - start_time
                # completed >= 1 here, so the average time per item is well defined
                eta = (elapsed / completed) * (total_pdb_ids - completed)
                
                print(f"Progress: {progress:.1f}% ({completed}/{total_pdb_ids}) - ETA: {eta:.1f} seconds")
                
                if result is not None:
                    results.append(result)
                    print(f"Completed {pdb_id}")
                else:
                    errors.append(error)
                    print(error)
        
        # Merge the per-ID results and save them
        if results:
            data_list = af_analysis.data.concat_data(results)
            data_list.df.to_csv(output_file, index=False)
            print(f"Successfully processed {len(results)} out of {total_pdb_ids} folders")
            print(f"Results saved to {output_file}")
        else:
            print("No valid results to save")
            
        # Error summary
        if errors:
            print(f"Encountered {len(errors)} errors:")
            for error in errors:
                print(f"  - {error}")
                
        # Total run time
        total_time = time.time() - start_time
        print(f"Total execution time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
            
    except Exception as e:
        print(f"An error occurred during execution: {str(e)}")
        sys.exit(1)

In [8]:
import pandas as pd

def compare_dataframe_columns(df1: pd.DataFrame, df2: pd.DataFrame):
    """
    Report which column names two DataFrames do not share.

    Returns:
        A tuple of two sets: the columns present in df1 but absent from df2,
        followed by the columns present in df2 but absent from df1.
    """
    first, second = set(df1.columns), set(df2.columns)
    return first.difference(second), second.difference(first)

# Usage example
# df_a and df_b are the two pandas DataFrames being compared; they are loaded from CSV below

df_a= pd.read_csv('/home/cseomoon/appl/af_analysis-0.1.4/data/ABAG_final_test_dataset_20250512.csv')
df_b= pd.read_csv('/home/cseomoon/appl/af_analysis-0.1.4/data/final_data_with_rosetta_scaledRMSD_20250423.csv')

# First set: columns only in df_a; second set: columns only in df_b
missing_in_b, missing_in_a = compare_dataframe_columns(df_a, df_b)

print("df_a에는 있지만 df_b에는 없는 컬럼:", missing_in_b)
print("df_b에는 있지만 df_a에는 없는 컬럼:", missing_in_a)

df_a에는 있지만 df_b에는 없는 컬럼: {'LIS'}
df_b에는 있지만 df_a에는 없는 컬럼: set()
