In [29]:
from pathlib import Path

# Both data directories live two levels above the notebook's working directory.
_project_root = Path.cwd().parent.parent
path_spec_data = _project_root / "spec_data"
path_benchmark_data = _project_root / "benchmark_for_test"

# Create them up front (no error if they already exist).
for _data_dir in (path_spec_data, path_benchmark_data):
    _data_dir.mkdir(parents=True, exist_ok=True)

In [30]:
# Only record build and update

# --- What to benchmark -------------------------------------------------------
library_size = 1_000_000      # number of spectra in the initially built library
ion_mode = [-1, 1]            # charges to run: negative and positive mode
add_size = [                  # number of spectra inserted in each update run
    100,
    1_000,
    10_000,
    100_000,
    1_000_000,
]
steps = [                     # benchmark phases, executed in this order
    "build",
    "update",
]

# --- Derived / tool parameters -----------------------------------------------
# Number of batch files drawn for the library (presumably one batch file holds
# 1,000,000 spectra -- TODO confirm against the data export).
file_num = library_size // 1_000_000

# Forwarded verbatim as command-line arguments to the dynamic entropy search script.
num_per_group = 10_000_000
cache_list_threshold = 1_000_000

# Scripts executed under /usr/bin/time, resolved relative to the working directory.
dynamic_script_path = "19_dynamic_entropy_search_library_scale_hybrid_fast_update_mode_every_step.py"
flash_script_path = "23_flash_entropy_search_library_scale_hybrid_every_step.py"

In [31]:
# Collect the files
import re
import os
# Batch files are named "spectra-charge_<charge>_batch_<id>.mgf".
# The capture group extracts <id>, so no hard-coded slice offsets are needed.
pattern_pos=r"spectra-charge_1_batch_(\d+)\.mgf"
pattern_neg=r"spectra-charge_-1_batch_(\d+)\.mgf"
files=os.listdir(Path.cwd().parent.parent/"spec_data/benchmark_spec")


def _collect_batch_ids(file_names, pattern):
    """Return the batch ids (as strings) of the file names that match ``pattern``.

    The whole name must match (``re.fullmatch``), so stray files such as
    ``spectra-charge_1_batch_5.mgf.bak`` are ignored instead of producing a
    bogus id. Ids are returned in the order the names were given.
    """
    candidates=(re.fullmatch(pattern, file_name) for file_name in file_names)
    return [found.group(1) for found in candidates if found]


batch_mgf_pos=_collect_batch_ids(files, pattern_pos)
batch_mgf_neg=_collect_batch_ids(files, pattern_neg)

print(batch_mgf_pos)
print(batch_mgf_neg)

['29', '92', '68', '10', '91', '5', '67', '47', '14', '96']
['56', '32', '67', '70', '87', '98', '54', '75', '88', '21']


In [None]:
import os
import pickle
import shlex
import shutil
import subprocess
import time
import warnings
from typing import Union

import msgpack
import numpy as np

def run_usrbintime_by_arguments(
          arguments:list[str], 
          if_output:bool=False, 
          output_memory_file:Union[str,Path,None]=None, 
          output_time_file:Union[str,Path,None]=None):
    """Run ``python <arguments...>`` under ``/usr/bin/time -v`` in the current directory.

    Parameters
    ----------
    arguments:
        Command-line items placed after ``python`` (script path first, then the
        script's own arguments). Each item is converted with ``str`` so that
        ``Path`` objects are accepted as well.
    if_output:
        When True, the child's stderr is written to ``output_memory_file`` and
        its stdout to ``output_time_file``. GNU ``time -v`` prints its resource
        report on stderr, so that report ends up in ``output_memory_file``.
        When False, both streams are discarded.
    output_memory_file, output_time_file:
        Destination files; both are required when ``if_output`` is True and are
        overwritten if they exist.

    Raises
    ------
    ValueError
        If ``if_output`` is True but one of the output files is missing.

    Notes
    -----
    A non-zero exit status does not raise; it is reported as a
    ``RuntimeWarning`` so a failed run is not mistaken for a valid measurement.
    """
    if if_output and (output_memory_file is None or output_time_file is None):
        raise ValueError(
            "output_memory_file and output_time_file must both be given when if_output=True"
        )

    # arguments: script_path, str(charge), step
    command=["/usr/bin/time","-v","python"] + [str(argument) for argument in arguments]
    run_options={"cwd": Path.cwd(), "env": os.environ.copy()}

    if if_output: # Output to files as record
        with open(output_memory_file, "w") as f1, open(output_time_file, "w") as f2:
            completed=subprocess.run(command, stderr=f1, stdout=f2, **run_options)

    else: # Output is not needed
        completed=subprocess.run(command, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, **run_options)

    if completed.returncode != 0:
        # Output may be discarded above, so this is the only visible sign of a failed run.
        warnings.warn(
            f"Command exited with status {completed.returncode}: {' '.join(command)}",
            RuntimeWarning,
            stacklevel=2,
        )

    return




# Benchmark driver.
# For every ion mode:
#   * "build":  wipe the old indexes, then build the Dynamic Entropy Search,
#               Flash Entropy Search and MASST+ indexes from one randomly
#               chosen library batch (no measurements are recorded).
#   * "update": for each size in `add_size`, snapshot the freshly built index,
#               insert the extra spectra while recording the /usr/bin/time
#               report and timing output, then restore the index from the
#               snapshot so every insertion starts from the same library.

def _remove_dir_if_exists(directory):
    """Recursively delete ``directory`` when it exists; otherwise do nothing."""
    if directory.exists():
        shutil.rmtree(directory)


def _ensure_backup(index_dir, backup_dir):
    """Copy ``index_dir`` to ``backup_dir`` unless the backup is already there."""
    if not backup_dir.exists():
        shutil.copytree(src=index_dir, dst=backup_dir)


def _restore_from_backup(index_dir, backup_dir):
    """Throw away the updated ``index_dir`` and recreate it from ``backup_dir``."""
    shutil.rmtree(index_dir)
    shutil.copytree(src=backup_dir, dst=index_dir)


def _masstp_load_command(mgf_file, library_dir):
    """Return the ``/usr/bin/time -v sh -c "./load ..."`` argv for MASST+.

    Both paths are shell-quoted so directories containing spaces or shell
    metacharacters do not break (or alter) the command.
    """
    command_str=f"./load {shlex.quote(str(mgf_file))} -l {shlex.quote(str(library_dir))}"
    return ["/usr/bin/time", "-v", "sh", "-c", command_str]


def _benchmark_output_files(tool, charge, added_size, step):
    """Return ``(memory_file, time_file)`` under ``path_benchmark_data`` for one measurement."""
    memory_file=path_benchmark_data/f"{tool}_{charge}_{added_size}_added_to_{library_size}_memory_usage_{step}_step_options_added_size.txt"
    time_file=path_benchmark_data/f"{tool}_{charge}_{added_size}_added_to_{library_size}_compare_time_{step}_step_options_added_size.txt"
    return memory_file, time_file


# Directory that holds the MASST+ `load` binary; also used as its working directory.
path_masstp=Path.cwd().parent/"library/MASSTplus/build/masst_plus/tools/"

for charge in ion_mode:
    if charge==-1:
        random_num=batch_mgf_neg
    elif charge==1:
        random_num=batch_mgf_pos
    else:
        raise ValueError(f"Unsupported ion mode {charge!r}; expected -1 or 1")

    random_select_num=np.random.choice(random_num, file_num, replace=False)

    # Reference (library) spectra. Defined before the step loop so that the
    # "update" step does not depend on the "build" branch having run first.
    # NOTE(review): only the first selected batch is used even if file_num > 1.
    spectra_mgf=path_spec_data/f"benchmark_spec/spectra-charge_{charge}_batch_{random_select_num[0]}.mgf"
    spectra_bin=path_spec_data/f"35_random_export_ms2/charge_{charge}/batch_{random_select_num[0]}.bin"

    # Index directories and their "-add" snapshots for the three tools.
    path_comparison_dynamic_data=Path.cwd().parent.parent/f"comparison_data/dynamic/charge-{charge}"
    path_comparison_dynamic_add=Path.cwd().parent.parent/f"comparison_data/dynamic/dynamic-charge-{charge}-add"
    path_comparison_flash_data=Path.cwd().parent.parent/f"comparison_data/flash/charge-{charge}"
    path_comparison_flash_add=Path.cwd().parent.parent/f"comparison_data/flash/flash-charge-{charge}-add"
    path_masstp_lib=path_masstp/f"library-{charge}"
    path_masstp_add=Path.cwd().parent/f"library/MASSTplus/build/masst_plus/tools/library-{charge}-add"

    for step in steps:
        if step=="build":

            # Remove the old index
            for stale_dir in (
                path_comparison_dynamic_data,
                path_comparison_dynamic_add,
                path_comparison_flash_data,
                path_comparison_flash_add,
                path_masstp_lib,
                path_masstp_add,
            ):
                _remove_dir_if_exists(stale_dir)

            # Perform build
            ### Dynamic Entropy Search ###
            arguments=[dynamic_script_path, str(charge), str(num_per_group), str(cache_list_threshold), str(spectra_bin), step]
            run_usrbintime_by_arguments(arguments=arguments,
                                        if_output=False)

            #############################
            ### Flash Entropy Search ###
            arguments=[flash_script_path, str(charge), str(spectra_bin), step]
            run_usrbintime_by_arguments(arguments=arguments,
                                        if_output=False)

            #############################
            ### MASST+ ###
            subprocess.run(
                        _masstp_load_command(spectra_mgf, path_masstp_lib),
                        stderr=subprocess.DEVNULL,
                        stdout=subprocess.DEVNULL,
                        env=os.environ.copy(),
                        cwd=path_masstp
                    )
            #############################

        elif step=="update":
            for added_size in add_size:
                # Assign the update spectra
                if added_size < 1_000_000:

                    insert_spectra_mgf=path_spec_data/f"benchmark_spec/spectra-charge_{charge}-number_{added_size}.mgf"
                    insert_spectra_bin=path_spec_data/f"benchmark_spec/spectra-charge_{charge}-number_{added_size}.pkl"

                else:
                    # NOTE(review): the draw is independent of the library batch, so the
                    # same batch may be picked again, and only the first pick is used.
                    add_file_num=added_size//1_000_000
                    random_add_select_num=np.random.choice(random_num, add_file_num, replace=False)

                    insert_spectra_mgf=path_spec_data/f"benchmark_spec/spectra-charge_{charge}_batch_{random_add_select_num[0]}.mgf"
                    insert_spectra_bin=path_spec_data/f"35_random_export_ms2/charge_{charge}/batch_{random_add_select_num[0]}.bin"

                # For every tool:
                #   1. snapshot the initial index (once) into its "-add" directory
                #   2. run the update against the live index and record the measurements
                #   3. + 4. delete the updated index and restore it from the snapshot

                ### Dynamic Entropy Search ###
                _ensure_backup(path_comparison_dynamic_data, path_comparison_dynamic_add)

                arguments=[dynamic_script_path, str(charge), str(num_per_group), str(cache_list_threshold), str(insert_spectra_bin), step]
                output_memory_file, output_time_file=_benchmark_output_files("dynamic_fast_update", charge, added_size, step)
                run_usrbintime_by_arguments(arguments=arguments,
                                            if_output=True,
                                            output_memory_file=output_memory_file,
                                            output_time_file=output_time_file)

                _restore_from_backup(path_comparison_dynamic_data, path_comparison_dynamic_add)

                #############################

                ### Flash Entropy Search ###
                _ensure_backup(path_comparison_flash_data, path_comparison_flash_add)

                arguments=[flash_script_path, str(charge), str(spectra_bin), str(insert_spectra_bin), step]
                output_memory_file, output_time_file=_benchmark_output_files("flash", charge, added_size, step)
                run_usrbintime_by_arguments(arguments=arguments,
                                            if_output=True,
                                            output_memory_file=output_memory_file,
                                            output_time_file=output_time_file)

                _restore_from_backup(path_comparison_flash_data, path_comparison_flash_add)

                #############################

                ### MASST+ ###
                _ensure_backup(path_masstp_lib, path_masstp_add)

                output_memory_file, output_time_file=_benchmark_output_files("masstplus", charge, added_size, step)
                with open(output_memory_file, "w") as f1, open(output_time_file, "w") as f2:
                    # perf_counter is monotonic, so clock adjustments cannot skew the interval.
                    start_time=time.perf_counter()
                    subprocess.run(
                            _masstp_load_command(insert_spectra_mgf, path_masstp_lib),
                            stderr=f1,
                            stdout=subprocess.DEVNULL,
                            env=os.environ.copy(),
                            cwd=path_masstp
                        )
                    elapsed_time=time.perf_counter()-start_time

                    # MASST+ output is discarded, so the elapsed seconds are recorded here instead.
                    f2.write(str(elapsed_time))

                _restore_from_backup(path_masstp_lib, path_masstp_add)

                #############################
                    


     
# Final cleanup: delete every index directory and "-add" snapshot left by the benchmark
for charge in ion_mode:
    path_masstp=Path.cwd().parent/"library/MASSTplus/build/masst_plus/tools/"

    leftover_dirs=(
        Path.cwd().parent.parent/f"comparison_data/dynamic/charge-{charge}",
        Path.cwd().parent.parent/f"comparison_data/dynamic/dynamic-charge-{charge}-add",
        Path.cwd().parent.parent/f"comparison_data/flash/charge-{charge}",
        Path.cwd().parent.parent/f"comparison_data/flash/flash-charge-{charge}-add",
        path_masstp/f"library-{charge}",
        Path.cwd().parent/f"library/MASSTplus/build/masst_plus/tools/library-{charge}-add",
    )

    # Only directories that are actually present are removed
    for leftover in leftover_dirs:
        if leftover.exists():
            shutil.rmtree(leftover)

            