In [3]:
import af_analysis
import pandas as pd

In [4]:
df=pd.read_csv('negative_sorted_decoy_set_train.csv')
df

Unnamed: 0,antibody,antigen,label,neg_type
0,7q0i,7tl0,0,easy
1,7q0i,7upb,0,easy
2,7q0i,7vux,0,easy
3,7q0i,8oxw,0,easy
4,7q0i,7x6a,0,hard
...,...,...,...,...
360,8tbq,7uzc,0,easy
361,8tbq,8heb,0,easy
362,8tbq,7x6a,0,easy
363,8tbq,7unb,0,easy


In [None]:
# ── ① 맨 위쪽 import 구역 ───────────────────────────────────────────
import random          # ← 새로 추가
import argparse        # ← 새로 추가


In [13]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import os, sys, time, multiprocessing
from datetime import datetime
import pandas as pd
import af_analysis

def process_pdb_id(antibody_id,antigen_id):
    """개별 PDB ID 처리 함수"""
    try:
        pair = f"{antibody_id}_{antigen_id}"
        print(f"Processing {pair}")
        base_dir = f"/home/cseomoon/project/ABAG/AbNb_benchmark/AF3_results/{pair}"
        if not os.path.exists(base_dir):
            return None, f"Directory not found for {pair}"
            
        my_data = af_analysis.data.Data(directory=base_dir)
        my_data.prep_dockq(verbose=False)
        # analysis.calculate_dockQ(my_data,
        #             rec_chains='A', lig_chains='H', 
        #             native_rec_chains='A', native_lig_chains='H', 
        #             verbose=False)
        
        analysis.pdockq(my_data, verbose=False)
        analysis.pdockq2(my_data, verbose=False)
        analysis.mpdockq(my_data, verbose=False)
        analysis.LIS_matrix(my_data, verbose=False)
        analysis.add_interface_metrics(my_data, verbose=False)
        my_data.analyze_chains()   
        my_data.add_chain_rmsd(align_chain='A', rmsd_chain='H')
        my_data.add_rmsd_scale() 
        return my_data, None
    except Exception as e:
        return None, f"Error processing {pair}: {e}"
        
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pilot", type=int, default=None,
                        help="테스트용으로 처리할 (무작위) 페어 개수")
    args = parser.parse_args()
    start_time = time.time()

    # 입력 파일 확인
    input_file = '/home/cseomoon/appl/af_analysis-0.1.4/data/sequence_classification/metric_calculation/negative_sorted_decoy_set_train.csv'
    if not os.path.exists(input_file):
        print(f"Input file not found: {input_file}")
        sys.exit(1)

    # 출력 디렉토리 준비
    output_dir = "/home/cseomoon/appl/af_analysis-0.1.4/data/sequence_classification/metric_calculation"
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"{output_dir}/chain_metrics_results_{timestamp}.csv"

    # negative_pairs 생성
    df = pd.read_csv(input_file)
    negative_pairs = [(row['antibody'], row['antigen']) for _, row in df.iterrows()]
    total_pairs = len(negative_pairs)


    # pilot 옵션이 있으면 무작위 샘플링
    if args.pilot is not None and 0 < args.pilot < total_pairs:
        random.seed(42)                       # 재현 가능하도록 고정(선택)
        negative_pairs = random.sample(negative_pairs, args.pilot)
        total_pairs = len(negative_pairs)
        print(f"[PILOT] {total_pairs} pairs만 선택해 실행합니다.")

    print(f"Found {total_pairs} pairs to process")
    print(f"Found {total_pairs} negative pairs to process")

    
   # 병렬 처리 설정
    num_workers = min(4, int(multiprocessing.cpu_count() * 0.75))
    print(f"Using {num_workers} workers for parallel processing")

    results = []
    errors = []
    completed = 0

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # futures: { Future: (antibody, antigen) }
        futures = {
            executor.submit(process_pdb_id, ab, ag): (ab, ag)
            for ab, ag in negative_pairs
        }

        for future in as_completed(futures):
            ab, ag = futures[future]
            pair = f"{ab}_{ag}"
            result, error = future.result()

            completed += 1
            progress = (completed / total_pairs) * 100
            elapsed = time.time() - start_time
            eta = (elapsed / completed) * (total_pairs - completed) if completed else 0
            print(f"Progress: {progress:.1f}% ({completed}/{total_pairs}) - ETA: {eta:.1f}s")

            if result is not None:
                results.append(result)
                print(f"Completed {pair}")
            else:
                errors.append(error)
                print(error)

    # 결과 저장
    if results:
        merged = af_analysis.data.concat_data(results)
        merged.df.to_csv(output_file, index=False)
        print(f"Successfully processed {len(results)}/{total_pairs} folders")
        print(f"Results saved to {output_file}")
    else:
        print("No valid results to save")

    if errors:
        print(f"Encountered {len(errors)} errors:")
        for err in errors:
            print("  -", err)

    total_time = time.time() - start_time
    print(f"Total execution time: {total_time:.1f}s ({total_time/60:.1f}min)")

Found 365 negative pairs to process
Using 4 workers for parallel processing
Processing 7q0i_7upbProcessing 7q0i_7tl0Processing 7q0i_7vuxProcessing 7q0i_8oxw



Processing 7q0i_7x6a
Progress: 0.3% (1/365) - ETA: 74.2s
Error processing 7q0i_7vux: name 'analysis' is not defined
Processing 7q6c_7tl0
Processing 7q6c_7uz7
Processing 7q6c_7xdb
Processing 7q6c_7wg3
Processing 7q6c_7u2d
Processing 7qnw_7t7b
Processing 7qnw_8dg9
Progress: 0.5% (2/365) - ETA: 42.6s
Error processing 7q0i_7upb: name 'analysis' is not defined
Progress: 0.8% (3/365) - ETA: 29.9s
Error processing 7q0i_7tl0: name 'analysis' is not defined
Progress: 1.1% (4/365) - ETA: 23.3s
Error processing 7q0i_8oxw: name 'analysis' is not defined
Progress: 1.4% (5/365) - ETA: 23.0s
Error processing 7q6c_7tl0: name 'analysis' is not defined
Progress: 1.6% (6/365) - ETA: 21.5s
Error processing 7q0i_7x6a: name 'analysis' is not defined
Progress: 1.9% (7/365) - ETA: 19.9s
Error processing 7q6c_7wg3: name 'analysis' is not defined
Progres

KeyboardInterrupt: 