In [9]:
from pathlib import Path
# Both data directories live two levels above the notebook's working directory.
_project_root = Path.cwd().parent.parent
path_spec_data = _project_root / "spec_data"
path_benchmark_data = _project_root / "benchmark_for_masstplus"

# Create the directories (and any missing parents) if they are not there yet.
for _data_dir in (path_spec_data, path_benchmark_data):
    _data_dir.mkdir(parents=True, exist_ok=True)

In [10]:
# Collect the files
import re
import os
# File-name patterns of the per-batch benchmark MGF files, one per ion mode.
# The capture group is the batch number, so it is extracted by the regex itself
# instead of by slicing at a hard-coded prefix length.
pattern_pos=r"spectra-charge_1_batch_(\d+)\.mgf"
pattern_neg=r"spectra-charge_-1_batch_(\d+)\.mgf"
files=os.listdir(Path.cwd().parent.parent/"spec_data/benchmark_spec")


def _collect_batch_numbers(pattern, file_names):
    """Return the batch numbers (as strings) of the names matching ``pattern``.

    ``re.fullmatch`` is used so that names with a trailing suffix
    (e.g. ``spectra-charge_1_batch_5.mgf.bak``) are rejected rather than
    producing a garbage batch number. The order of ``file_names`` is preserved.
    """
    matches=(re.fullmatch(pattern, name) for name in file_names)
    return [m.group(1) for m in matches if m is not None]


batch_mgf_pos=_collect_batch_numbers(pattern_pos, files)
batch_mgf_neg=_collect_batch_numbers(pattern_neg, files)

print(batch_mgf_pos)
print(batch_mgf_neg)

['29', '92', '68', '10', '91', '5', '67', '47', '14', '96']
['56', '32', '67', '70', '87', '98', '54', '75', '88', '21']


In [11]:
# Only record hybrid search

# --- What to benchmark -------------------------------------------------------
original_library_size = [1_000_000]
query_size = 100
ion_mode = [-1, 1]

# --- Values forwarded to the dynamic search script on its command line --------
num_per_group = 10_000_000
cache_list_threshold = 1_000_000

# --- Steps executed, in order, for every (library size, ion mode) pair --------
steps = [
    "build",
    "open_search",
    "neutral_loss_search",
    "hybrid_search",
]

# --- Index states exercised for DynamicEntropySearch --------------------------
dynamic_search_types = ["fast_search", "fast_update", "convert_to_flash"]

# --- Scripts launched as subprocesses ------------------------------------------
dynamic_script_path = "19_dynamic_entropy_search_library_scale_hybrid_fast_update_mode_every_step.py"
flash_script_path = "23_flash_entropy_search_library_scale_hybrid_every_step.py"

In [None]:
import os
import pickle
import shlex
import shutil
import subprocess
import time
import warnings
from typing import Union

import numpy as np

# from dynamic_entropy_search.dynamic_entropy_search import DynamicEntropySearch
from dynamic_entropy_search.convert_to_mgf import _write_spec


def run_usrbintime_by_arguments(
          arguments:list[str], 
          if_output:bool=False, 
          output_memory_file:Union[str,Path,None]=None, 
          output_time_file:Union[str,Path,None]=None):
    """Run ``python <arguments>`` under ``/usr/bin/time -v`` in the current directory.

    Parameters
    ----------
    arguments
        The script path followed by its command-line arguments
        (e.g. script_path, str(charge), step).
    if_output
        When True, the child's stderr (which also carries the
        ``/usr/bin/time -v`` report) is written to ``output_memory_file`` and
        its stdout to ``output_time_file``. When False both streams are
        discarded.
    output_memory_file, output_time_file
        Destination files; required when ``if_output`` is True, ignored
        otherwise. Existing files are overwritten.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``if_output`` is True but one of the output files is None.
    """
    # Validate up front so that no half-created output file is left behind and
    # the error names the missing argument instead of failing inside open().
    if if_output and (output_memory_file is None or output_time_file is None):
        raise TypeError(
            "output_memory_file and output_time_file are required when if_output is True"
        )

    # arguments: script_path, str(charge), step
    command=["/usr/bin/time","-v","python"] + list(arguments)

    if if_output: # Output to files as record
        with open(output_memory_file, "w") as f1, open(output_time_file, "w") as f2:
            completed=subprocess.run(command, stderr=f1, stdout=f2, cwd=Path.cwd(), env=os.environ.copy())

    else: # Output is not needed
        completed=subprocess.run(command, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, cwd=Path.cwd(), env=os.environ.copy())

    # A failed child would otherwise be indistinguishable from a valid
    # measurement; warn (rather than raise) so a long benchmark keeps running.
    if completed.returncode != 0:
        warnings.warn(f"Command {command} exited with status {completed.returncode}")

    return

def perform_search_and_record_one_by_one(
          script_path:Path, 
          search_type:str,
          charge:int,
          library_size:int,
          step:str,
          query_spectra_list:list,
          num_per_group:int,
          cache_list_threshold:int,
          method:str,
          if_output:bool=True, 
          ):
    """Benchmark ``script_path`` once per query spectrum.

    Each spectrum in ``query_spectra_list`` is pickled to a per-charge
    temporary file under ``path_spec_data/benchmark_spec`` (overwritten on
    every iteration) and the script is then launched through
    ``run_usrbintime_by_arguments``. The per-query memory and time records go
    to ``path_benchmark_data``.

    When ``search_type`` is not None, ``num_per_group`` and
    ``cache_list_threshold`` are forwarded to the script and ``search_type``
    becomes part of the record file names. When it is None, neither happens.
    ``if_output`` is passed straight through to ``run_usrbintime_by_arguments``.
    """
    for i, single_spectrum in enumerate(query_spectra_list):
        # Persist the current query so the child process can read it.
        single_query_pkl=path_spec_data/f"benchmark_spec/query_spectra-charge_{charge}_temp.pkl"
        with open(single_query_pkl, "wb") as pkl_handle:
            pickle.dump(single_spectrum, pkl_handle)

        if search_type is None:
            # Short command line; record names carry no search type.
            cli_arguments=[script_path, str(charge), single_query_pkl, step]
            memory_record=path_benchmark_data/f"{method}_{charge}_{library_size}_memory_usage_{step}_step_query_{i}.txt"
            time_record=path_benchmark_data/f"{method}_{charge}_{library_size}_compare_time_{step}_step_query_{i}.txt"
        else:
            # Long command line with the index parameters; record names include the search type.
            cli_arguments=[script_path, str(charge), str(num_per_group), str(cache_list_threshold), single_query_pkl, step]
            memory_record=path_benchmark_data/f"{method}_{search_type}_{charge}_{library_size}_memory_usage_{step}_step_query_{i}.txt"
            time_record=path_benchmark_data/f"{method}_{search_type}_{charge}_{library_size}_compare_time_{step}_step_query_{i}.txt"

        # Run the search and capture its memory usage and timing output.
        run_usrbintime_by_arguments(
            arguments=cli_arguments,
            if_output=if_output,
            output_memory_file=memory_record,
            output_time_file=time_record,
        )


# Benchmark driver.
# For every library size and ion mode the steps in `steps` run in order:
#   * "build"  - pick one random reference batch and (re)build the MASST+ library;
#   * search steps - load the query spectra; for "hybrid_search", run the MASST+
#     `search` tool once per query under `/usr/bin/time -v` and record the
#     memory report, wall-clock time and the tool's own stdout.
# The DynamicEntropySearch / FlashEntropySearch sections are kept commented out
# so they can be re-enabled to benchmark those methods inside the same loop.
search_steps=("open_search", "neutral_loss_search", "hybrid_search")
batches_by_charge={-1: batch_mgf_neg, 1: batch_mgf_pos}

for library_size in original_library_size:
    # Number of reference batches to draw; assumes one batch file holds
    # 1,000,000 spectra - TODO confirm. Only the first drawn batch is used below.
    file_num=library_size//1000000

    for charge in ion_mode:
        # MASST+ tools directory and the per-charge library it builds and searches.
        path_masstp=Path.cwd().parent/"library/MASSTplus/build/masst_plus/tools/"
        path_masstp_lib=path_masstp/f"library-{charge}/"

        for step in steps:
            if step=="build":
                # Fail loudly on an unexpected ion mode instead of reusing the
                # batch list left over from the previous charge.
                if charge not in batches_by_charge:
                    raise ValueError(f"Unsupported ion mode: {charge!r}")
                random_file=batches_by_charge[charge]

                # NOTE: the draw is unseeded, so the chosen batch differs between runs.
                # random_for_use=np.random.choice(100, file_num, replace=False)
                random_for_use=np.random.choice(random_file, file_num, replace=False)

                # spec_bin is only consumed by the commented-out Dynamic/Flash builds below.
                spec_bin=path_spec_data/f"35_random_export_ms2/charge_{charge}/batch_{random_for_use[0]}.bin"
                reference_mgf=path_spec_data/f"benchmark_spec/spectra-charge_{charge}_batch_{random_for_use[0]}.mgf"

                # ### DynamicEntropySearch ###
                # # Remove the old index
                # path_comparison_dynamic_data=Path.cwd().parent.parent/f"comparison_data/dynamic/charge-{charge}"
                # if path_comparison_dynamic_data.exists():
                #     shutil.rmtree(path_comparison_dynamic_data)

                # path_comparison_dynamic_data_backup=Path.cwd().parent.parent/f"comparison_data/dynamic-backup/charge-{charge}"
                # if path_comparison_dynamic_data_backup.exists():
                #     shutil.rmtree(path_comparison_dynamic_data_backup)

                # arguments=[dynamic_script_path, str(charge), str(num_per_group), str(cache_list_threshold), spec_bin, step]
                
                # run_usrbintime_by_arguments(arguments=arguments, if_output=False)
                
                # path_comparison_dynamic_data_backup=Path.cwd().parent.parent/f"comparison_data/dynamic-backup/charge-{charge}"
                # if not path_comparison_dynamic_data_backup.exists():
                #     shutil.copytree(src=path_comparison_dynamic_data, dst=path_comparison_dynamic_data_backup)
                
                # ### FlashEntropySearch ###
                # # Remove the old index
                # path_comparison_flash_data=Path.cwd().parent.parent/f"comparison_data/flash/charge-{charge}"
                # if path_comparison_flash_data.exists():
                #     shutil.rmtree(path_comparison_flash_data)

                # arguments=[flash_script_path, str(charge), spec_bin, step]

                # run_usrbintime_by_arguments(arguments=arguments, if_output=False)
                
                ### MASST+ ###
                # Remove the old index
                if path_masstp_lib.exists():
                    shutil.rmtree(path_masstp_lib)

                # Quote the paths: the command line is interpreted by `sh -c`.
                command_str=f"./load {shlex.quote(str(reference_mgf))} -l {shlex.quote(str(path_masstp_lib))}"

                result=subprocess.run(
                        ["/usr/bin/time", "-v", "sh", "-c", command_str],
                        stderr=subprocess.DEVNULL,
                        stdout=subprocess.PIPE,
                        env=os.environ.copy(),
                        cwd=path_masstp,
                        text=True
                    )
                # stderr is discarded, so the exit status is the only sign of a failed build.
                if result.returncode!=0:
                    warnings.warn(
                        f"MASST+ build for charge {charge} exited with status {result.returncode}; "
                        "the following search measurements may be invalid."
                    )

            elif step in search_steps:

                query_pkl=path_spec_data/f"benchmark_spec/query_spectra-charge_{charge}-number_100.pkl"
                # NOTE: unpickling can execute arbitrary code; only load files generated by this project.
                with open(query_pkl, "rb") as query_file:
                    query_spectra=pickle.load(query_file)

                # ### DynamicEntropySearch ###
                # method="dynamic"
                # for search_type in dynamic_search_types:
                    
                #     if search_type=="fast_search":
                #         perform_search_and_record_one_by_one(
                #             script_path=dynamic_script_path,
                #             search_type=search_type,
                #             charge=charge,
                #             library_size=library_size,
                #             step=step,
                #             query_spectra_list=query_spectra,
                #             num_per_group=num_per_group,
                #             cache_list_threshold=cache_list_threshold,
                #             method=method,
                #             if_output=True
                #         )
                    
                #     if search_type=="fast_update":
                #         # Simulate the out-of-order block condition by setting is_sorted to False
                #         dtype_block_info = np.dtype(
                #                             [
                #                                 ("start_idx", np.uint64),  # The start index of the block.
                #                                 ("data_len", np.uint32),  # The length of data in the block.
                #                                 ("reserved_len", np.uint32),  # The reserved length of the block.
                #                                 ("is_sorted", bool),  # Whether the block is sorted or not
                #                             ]
                #                         )
                #         path_comparison_dynamic_data=Path.cwd().parent.parent/f"comparison_data/dynamic/charge-{charge}"
                #         dynamic_entropy_search=DynamicEntropySearch(path_data=path_comparison_dynamic_data, num_per_group=num_per_group, cache_list_threshold=cache_list_threshold)
                #         dynamic_entropy_search.read()
                #         group_num=len(dynamic_entropy_search.group_start)

                #         for i in range(group_num):

                #             block_ion_info=np.memmap(path_comparison_dynamic_data/f"{i}/block_ions_info.bin", dtype=dtype_block_info, mode="r+")
                #             block_ion_info["is_sorted"].fill(False)
                #             block_ion_info.flush()

                #             block_nl_info=np.memmap(path_comparison_dynamic_data/f"{i}/block_nl_info.bin", dtype=dtype_block_info, mode="r+")
                #             block_nl_info["is_sorted"].fill(False)
                #             block_nl_info.flush()

                #         perform_search_and_record_one_by_one(
                #             script_path=dynamic_script_path,
                #             search_type=search_type,
                #             charge=charge,
                #             library_size=library_size,
                #             step=step,
                #             query_spectra_list=query_spectra,
                #             num_per_group=num_per_group,
                #             cache_list_threshold=cache_list_threshold,
                #             method=method,
                #             if_output=True)

                #     if search_type=="convert_to_flash":

                #         # Manually convert dynamic index to flash index
                #         path_comparison_dynamic_data=Path.cwd().parent.parent/f"comparison_data/dynamic/charge-{charge}"
                #         dynamic_entropy_search=DynamicEntropySearch(path_data=path_comparison_dynamic_data, num_per_group=num_per_group, cache_list_threshold=cache_list_threshold)
                #         dynamic_entropy_search.convert_to_fast_search()
                #         dynamic_entropy_search.read()
                #         group_num=len(dynamic_entropy_search.group_start)
                #         for i in range(group_num):

                #             inner_path=path_comparison_dynamic_data/f"{i}"
                #             dynamic_entropy_search.entropy_search.path_data=inner_path
                #             dynamic_entropy_search.entropy_search.read()
                #             dynamic_entropy_search.convert_current_index_to_flash()

                #         perform_search_and_record_one_by_one(
                #             script_path=dynamic_script_path,
                #             search_type=search_type,
                #             charge=charge,
                #             library_size=library_size,
                #             step=step,
                #             query_spectra_list=query_spectra,
                #             num_per_group=num_per_group,
                #             cache_list_threshold=cache_list_threshold,
                #             method=method,
                #             if_output=True
                #         )

                # shutil.rmtree(path_comparison_dynamic_data)
                # shutil.copytree(src=path_comparison_dynamic_data_backup, dst=path_comparison_dynamic_data)
                
                # ### FlashEntropySearch ###
                # method="flash"
                # perform_search_and_record_one_by_one(
                #     script_path=flash_script_path,
                #     search_type=None,
                #     charge=charge,
                #     library_size=library_size,
                #     step=step,
                #     query_spectra_list=query_spectra,
                #     num_per_group=None,
                #     cache_list_threshold=None,
                #     method=method,
                #     if_output=True
                # )

                if step=="hybrid_search":
                    ### MASST+ ###
                    method="masstplus"

                    # The single-spectrum query file is rewritten for every query,
                    # so its path and the search command line are loop-invariant.
                    temp_spec_mgf=path_spec_data/"benchmark_spec/temp_spec.mgf"
                    # arguments: file_to_use, step
                    command_str=f"./search {shlex.quote(str(temp_spec_mgf))} -a -x -l {shlex.quote(str(path_masstp_lib))} -p 0.02 -t 0.0"

                    # Create single spectrum files
                    for i, spec in enumerate(query_spectra):
                        with open(temp_spec_mgf, "w") as temp1:
                            #Create temp mgf
                            _write_spec(spec, spec["peaks"], temp1)

                        with open(path_benchmark_data / f"{method}_{charge}_{library_size}_memory_usage_{step}_step_query_{i}.txt", "w") as f1,\
                            open(path_benchmark_data / f"{method}_{charge}_{library_size}_compare_time_{step}_step_query_{i}.txt", "w") as f2:
                            # perf_counter is monotonic, so the interval cannot be
                            # distorted by system clock adjustments.
                            start_time=time.perf_counter()
                            result=subprocess.run(
                                    ["/usr/bin/time", "-v", "sh", "-c", command_str],
                                    stderr=f1,
                                    stdout=subprocess.PIPE,
                                    env=os.environ.copy(),
                                    cwd=path_masstp,
                                    text=True
                                )
                            elapsed_time=time.perf_counter()-start_time
                            f2.write(str(elapsed_time))
                            print(result.stdout)

                        # Accumulate the tool's own stdout for all queries of this charge.
                        with open(path_benchmark_data / f"masstplus_internal_time_record_{charge}.txt", "a") as f:
                            f.write(result.stdout)


KeyboardInterrupt: 