In [1]:
import os
import pandas as pd
import numpy as np
import multiprocessing as mp
from tqdm import tqdm
import af_analysis
import time
from Bio.PDB import MMCIFParser, PDBIO

def convert_cif_to_pdb(cif_path):
    """Convert an mmCIF file to a PDB file stored next to it.

    The output path is ``cif_path`` with its extension replaced by ``.pdb``.
    If that file already exists it is reused and no conversion is done.

    The structure is first written to a temporary sibling file and only moved
    to the final ``.pdb`` path after saving succeeded. A failed or interrupted
    save therefore never leaves a truncated ``.pdb`` behind that a later call
    would mistake for a finished conversion.

    Args:
        cif_path: Path to the input mmCIF file.

    Returns:
        The path of the PDB file, or ``None`` if the conversion failed (the
        error is printed, not raised).
    """
    tmp_path = None
    try:
        pdb_path = os.path.splitext(cif_path)[0] + '.pdb'
        if os.path.exists(pdb_path):
            return pdb_path
        parser = MMCIFParser()
        structure = parser.get_structure('structure', cif_path)
        io = PDBIO()
        io.set_structure(structure)
        # The pid keeps temp names distinct when several worker processes
        # convert files at the same time.
        tmp_path = f"{pdb_path}.{os.getpid()}.tmp"
        io.save(tmp_path)
        os.replace(tmp_path, pdb_path)
        print(f"Converted {cif_path} to {pdb_path}")
        return pdb_path
    except Exception as e:
        print(f"Error converting {cif_path} to PDB: {e}")
        return None
    finally:
        # After a successful os.replace the temp file no longer exists, so
        # this only removes leftovers from a failed save.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def process_model(file_path):
    """Compute the ``del_G_B`` binding energy for a single model file.

    A ``.cif`` input is first converted to PDB with ``convert_cif_to_pdb``;
    any other path is used as is. The energy is obtained by building an
    ``af_analysis.data.Data`` object for the directory two levels above the
    model file, replacing its ``df`` with a one-row frame for this model and
    calling ``calculate_binding_energy``.

    Args:
        file_path: Path to the model file (``.cif`` or an already usable
            structure file).

    Returns:
        A ``(file_path, energy, error)`` tuple. ``file_path`` is always the
        path that was passed in -- never the converted ``.pdb`` path -- so
        callers can match the result back to their own records. ``energy``
        is ``np.nan`` and ``error`` is a message when anything failed;
        otherwise ``error`` is ``None``.
    """
    try:
        if file_path.endswith('.cif'):
            pdb_path = convert_cif_to_pdb(file_path)
            if not pdb_path:
                return file_path, np.nan, "CIF to PDB conversion failed"
            model_path = pdb_path
        else:
            model_path = file_path

        # The Data object is created for the grandparent directory of the model.
        model_dir = os.path.dirname(os.path.dirname(model_path))
        temp_data = af_analysis.data.Data(directory=model_dir)

        # Replace whatever was loaded with a single-row table for this model.
        temp_data.df = pd.DataFrame({
            'model_path': [model_path],
            'pdb': [os.path.basename(model_dir)]
        })

        temp_data.calculate_binding_energy(n_jobs=1, verbose=False)

        if 'del_G_B' in temp_data.df.columns and not pd.isna(temp_data.df.loc[0, 'del_G_B']):
            return file_path, temp_data.df.loc[0, 'del_G_B'], None
        return file_path, np.nan, "Energy calculation failed"
    except Exception as e:
        return file_path, np.nan, str(e)

def run_rosetta_energy_calculation(csv_file, node_id=0, total_nodes=1):
    """Fill in missing ``del_G_B`` binding energies for the models listed in a CSV.

    The CSV must contain a ``model_path`` column (preferred) or a ``pdb``
    column holding file paths. Paths that exist on disk are split round-robin
    over ``total_nodes``; this call handles the share of ``node_id``. Only
    files whose row has no ``del_G_B`` value yet are passed to
    ``process_model``, in a multiprocessing pool.

    Args:
        csv_file: Path to the input CSV.
        node_id: Zero-based index of this node, ``0 <= node_id < total_nodes``.
        total_nodes: Number of nodes the work is divided between (>= 1).

    Returns:
        The DataFrame with ``del_G_B`` updated for the processed files, or
        ``None`` if the CSV could not be read, the node arguments are out of
        range, or no path column was found. Nothing is written to disk.
    """
    try:
        df = pd.read_csv(csv_file)
        print(f"Loaded {len(df)} rows from {csv_file}")
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        return None

    # Reject invalid node arguments up front instead of failing later with an
    # IndexError (or silently wrapping around for negative ids).
    if total_nodes < 1 or not 0 <= node_id < total_nodes:
        print(f"Error: node_id {node_id} is out of range for total_nodes {total_nodes}")
        return None

    if 'del_G_B' not in df.columns:
        df['del_G_B'] = np.nan

    if 'model_path' in df.columns:
        path_column = 'model_path'
    elif 'pdb' in df.columns:
        path_column = 'pdb'
    else:
        print("Error: No path column found")
        return None

    valid_paths = df.loc[df[path_column].notna(), path_column].tolist()
    files_to_process = [path for path in valid_paths if os.path.exists(path)]
    print(f"Found {len(files_to_process)} valid file paths")

    # Round-robin share of this node.
    my_files = files_to_process[node_id::total_nodes]
    print(f"Node {node_id} processing {len(my_files)} files")

    # Label of the first row holding each path, so every lookup is O(1)
    # instead of a full scan of the frame.
    first_row = {}
    for row_label, path in df[path_column].dropna().items():
        first_row.setdefault(path, row_label)

    files_to_calculate = [
        path for path in my_files
        if pd.isna(df.loc[first_row[path], 'del_G_B'])
    ]

    print(f"Need to calculate {len(files_to_calculate)} files")

    if not files_to_calculate:
        print("No files to process.")
        return df

    # Leave two cores free for the rest of the system.
    num_cores = max(1, mp.cpu_count() - 2)
    print(f"Using {num_cores} CPU cores")

    start_time = time.time()
    results = []
    errors = []

    with mp.Pool(processes=num_cores) as pool:
        # Ordered imap zipped with the inputs: each result is attributed to the
        # path that was submitted, not to the path the worker reports back.
        # The worker may report a converted .pdb path for a .cif input, which
        # would not match any row of the table.
        outcomes = pool.imap(process_model, files_to_calculate)
        for file_path, (_, energy, error) in tqdm(
            zip(files_to_calculate, outcomes),
            total=len(files_to_calculate),
            desc="Calculating binding energy"
        ):
            df.loc[first_row[file_path], 'del_G_B'] = energy

            if error:
                errors.append(f"{file_path}: {error}")
            else:
                results.append(file_path)

    print(f"Success: {len(results)} / {len(files_to_calculate)}")
    if errors:
        print("Errors (top 5):")
        for msg in errors[:5]:
            print(f"  {msg}")

    elapsed = time.time() - start_time
    print(f"Completed in {elapsed:.2f} seconds")

    return df  # returned so the result can be inspected in Jupyter

In [2]:
# Input table listing the models to score; run_rosetta_energy_calculation reads
# the file paths from its 'model_path' (or 'pdb') column.
csv_path = "/home/cseomoon/appl/af_analysis-0.1.4/data/sequence_classification/train_set_AbNb/native/AbNb_final_h3_l3_plddt_20250522.csv"
# Single-node run: node 0 of 1 handles every file.
node_id = 0
total_nodes = 1

# Run the calculation. The result is the updated DataFrame, or None when the
# input could not be used (e.g. unreadable CSV or no path column).
result_df = run_rosetta_energy_calculation(csv_path, node_id=node_id, total_nodes=total_nodes)

Loaded 3650 rows from /home/cseomoon/appl/af_analysis-0.1.4/data/sequence_classification/train_set_AbNb/native/AbNb_final_h3_l3_plddt_20250522.csv
Found 3650 valid file paths
Node 0 processing 3650 files
Need to calculate 0 files
No files to process.


In [None]:
 import glob
 import os

 import pandas as pd

 # Merge the per-run energy result CSVs into one table and save it.
 RESULTS_GLOB = "/home/cseomoon/appl/af_analysis-0.1.4/results/energy_calc/energy_result_*.csv"

 # This cell was pasted from a shell heredoc in which ${FINAL_OUTPUT} was a
 # shell variable; inside a notebook it was just a literal file name. The
 # output path is now read from the FINAL_OUTPUT environment variable, falling
 # back to a file next to the inputs whose name does not match RESULTS_GLOB
 # (so a re-run never merges its own output).
 # NOTE(review): the default file name is a placeholder -- confirm the
 # intended output location.
 FINAL_OUTPUT = os.environ.get(
     "FINAL_OUTPUT",
     os.path.join(os.path.dirname(RESULTS_GLOB), "merged_energy_results.csv"),
 )

 # Find the result files (sorted so the merge order is reproducible).
 result_files = sorted(glob.glob(RESULTS_GLOB))
 print(f"Found {len(result_files)} result files to merge")

 # Raise instead of calling exit(1): exit() would shut down the notebook
 # kernel, while an exception just stops this cell (and a "Run All").
 if not result_files:
     raise FileNotFoundError("No result files found!")

 # Read every result file; unreadable files are reported and skipped.
 dfs = []
 for result_file in result_files:
     try:
         node_df = pd.read_csv(result_file)
         dfs.append(node_df)
         print(f"Loaded {result_file}: {len(node_df)} rows")
     except Exception as e:
         print(f"Error loading {result_file}: {e}")

 if not dfs:
     raise ValueError("No valid data frames to merge!")

 # Concatenate; ignore_index avoids duplicated row labels in the merged frame.
 all_results = pd.concat(dfs, ignore_index=True)
 print(f"Combined data frame has {len(all_results)} rows")

 # Save the merged result.
 all_results.to_csv(FINAL_OUTPUT, index=False)
 print(f"Results saved to {FINAL_OUTPUT}")
 print("Merge completed")