In [1]:
import pastaq as pq
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors

In [2]:
# Raw MS2 inputs: one {'raw_path': ...} dict per file, covering every CE value
# (20-60) crossed with every exc value (0, 2, 10), ordered by CE and then exc.
input_files = [
    {'raw_path': rf"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE{ce}_exc{exc}-p1_2.ms2"}
    for ce in (20, 30, 40, 50, 60)
    for exc in (0, 2, 10)
]

In [3]:
# Read feature_clusters_annotations.csv into a DataFrame. Despite the `_csv`
# suffix, this name holds the parsed DataFrame, not a file path.
feature_clusters_annotations_csv = pd.read_csv(r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\quant\feature_clusters_annotations.csv")

In [4]:
# Root directory of the pastaq output; the functions in this notebook read
# <output_dir>/raw/<stem>.ms2 and <output_dir>/features/<stem>.features.
output_dir = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID"

In [5]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir):
    """Pair every annotated MS2 scan with its sorted and centroided spectrum.

    Parameters
    ----------
    feature_clusters_annotations_csv : pandas.DataFrame
        Annotation table. The columns ``file_id``, ``msms_id``, ``cluster_id``,
        ``feature_id``, ``peak_id`` and ``charge_state`` are read. Rows whose
        ``msms_id`` is null are ignored.
    input_files : list of dict
        One dict per sample. When ``'stem'`` is missing it is derived from the
        base name of ``'raw_path'`` (extension stripped) and stored back into
        the dict.
    output_dir : str
        Directory that contains ``raw/<stem>.ms2`` for every sample.

    Returns
    -------
    list of dict
        One record per (scan, annotation row) pair with the keys
        ``cluster_id``, ``file_id``, ``feature_id``, ``peak_id``, ``msms_id``,
        ``ms2_rt``, ``charge_state``, ``cent_mz``, ``cent_int`` (tuples taken
        from the cleaned spectrum) and ``ms2_peaks`` (float32 array of
        m/z-sorted ``[mz, intensity]`` rows before cleaning).

    Samples whose raw file is missing are skipped with a warning. Scans with
    no usable peaks, or whose cleaned spectrum is empty, are skipped.
    """
    # Index the annotation rows by (file_id, msms_id) once so that every scan
    # is resolved with a dictionary lookup instead of a DataFrame filter.
    annotations_lookup = defaultdict(list)
    for _, row in feature_clusters_annotations_csv.iterrows():
        if pd.notnull(row['msms_id']):
            annotations_lookup[(row['file_id'], row['msms_id'])].append(row)

    combined_multiple_samples = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        stem = file['stem']
        in_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")

        if not os.path.exists(in_path):
            # Make the skip visible instead of silently dropping the sample.
            print(f"Warning: Missing raw data file: {in_path}")
            continue

        raw_data = pq.read_raw_data(in_path)

        for scan in raw_data.scans:
            annotations = annotations_lookup.get((stem, scan.scan_number))
            if not annotations:
                continue

            ms2_mz = scan.mz
            ms2_intensity = scan.intensity
            ms2_rt = scan.retention_time

            # len() based checks work for both lists and numpy arrays; the
            # truth value of a multi-element array is ambiguous.
            if ms2_mz is None or ms2_intensity is None:
                continue
            if len(ms2_mz) == 0 or len(ms2_mz) != len(ms2_intensity):
                continue

            # Sort the peaks by m/z and pack them as an (N, 2) float32 array.
            mz_array = np.array(ms2_mz)
            intensity_array = np.array(ms2_intensity)
            order = np.argsort(mz_array)
            ms2_peaks = np.column_stack(
                (mz_array[order], intensity_array[order])
            ).astype(np.float32)

            centroided_peaks = me.clean_spectrum(
                ms2_peaks,
                min_ms2_difference_in_da=0.02,
                normalize_intensity=False,
            )
            # Cleaning can remove every peak; unpacking zip(*...) of an empty
            # spectrum would raise ValueError, so skip such scans.
            if len(centroided_peaks) == 0:
                continue
            cent_mz, cent_int = zip(*centroided_peaks)

            for row in annotations:
                combined_multiple_samples.append({
                    'cluster_id': row['cluster_id'],
                    'file_id': row['file_id'],
                    'feature_id': row['feature_id'],
                    'peak_id': row['peak_id'],
                    'msms_id': row['msms_id'],
                    'ms2_rt': ms2_rt,
                    'charge_state': row['charge_state'],
                    'cent_mz': cent_mz,
                    'cent_int': cent_int,
                    'ms2_peaks': ms2_peaks,
                })

    return combined_multiple_samples

In [6]:
# Run the MS2 extraction for every entry of input_files; the result is a list
# of dicts, one per (scan, annotation row) pair.
combined_multiple_samples = combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir)

In [6]:
# Build the combined MS2 records and persist them as CSV.
combined_multiple_samples = combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir)
from pathlib import Path
filepath = Path('CID_metadata/CID_CORRECT/combined_multiple_samples.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
combined_multiple_samples_df = pd.DataFrame(combined_multiple_samples)
# index=False belongs to to_csv(). It used to be passed to Path(), where it is
# ignored or rejected depending on the Python version, so the index column was
# still written.
combined_multiple_samples_df.to_csv(filepath, index=False)

In [7]:
# Path of the MSP file that later cells hand to pq.read_msp_file.
msp_file = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\Msp_CID_2025_03_21_12_30_01_AlignmentResult_2025_03_19_11_06_05.msp"

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me

def link_features(combined_multiple_samples_csv, input_files, output_dir):
    """Attach pastaq feature measurements to the combined MS2 records.

    Parameters
    ----------
    combined_multiple_samples_csv : pandas.DataFrame
        Table of combined MS2 records. The columns ``file_id``, ``feature_id``,
        ``cluster_id``, ``peak_id``, ``msms_id``, ``ms2_rt``, ``charge_state``,
        ``cent_mz``, ``cent_int`` and ``ms2_peaks`` are read. Rows whose
        ``feature_id`` is null are ignored.
    input_files : list of dict
        One dict per sample. When ``'stem'`` is missing it is derived from the
        base name of ``'raw_path'`` and stored back into the dict.
    output_dir : str
        Directory that contains ``features/<stem>.features``.

    Returns
    -------
    list of dict
        One record per (feature, record row) pair: the row's own fields plus
        ``precursor_intensity``, ``precursor_rt``, ``precursor_mz``,
        ``precursor_vol``, ``average_ms1_mz`` and ``average_rt`` read from the
        matching feature.
    """
    # Index the record rows by (file_id, feature_id) for O(1) lookups.
    annotations_lookup = defaultdict(list)
    for _, row in combined_multiple_samples_csv.iterrows():
        if pd.notnull(row['feature_id']):
            annotations_lookup[(row['file_id'], row['feature_id'])].append(row)

    # Results go into a fresh local list; the earlier draft appended to (and
    # returned) the notebook-global `combined_multiple_samples`.
    linked_features = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        stem = file['stem']
        in_path_features = os.path.join(output_dir, 'features', f"{stem}.features")

        if not os.path.exists(in_path_features):
            print(f"Warning: Missing features file: {in_path_features}")
            continue

        for feature in pq.read_features(in_path_features):
            annotations = annotations_lookup.get((stem, feature.id))
            if not annotations:
                continue

            for row in annotations:
                linked_features.append({
                    'cluster_id': row['cluster_id'],
                    'file_id': row['file_id'],
                    'feature_id': row['feature_id'],
                    'peak_id': row['peak_id'],
                    'msms_id': row['msms_id'],
                    # The MS2 values come from the record row; they were never
                    # defined as locals in this function.
                    'ms2_rt': row['ms2_rt'],
                    'charge_state': row['charge_state'],
                    'cent_mz': row['cent_mz'],
                    'cent_int': row['cent_int'],
                    'ms2_peaks': row['ms2_peaks'],
                    'precursor_intensity': feature.monoisotopic_height,
                    'precursor_rt': feature.monoisotopic_rt,
                    'precursor_mz': feature.monoisotopic_mz,
                    'precursor_vol': feature.monoisotopic_volume,
                    'average_ms1_mz': feature.average_mz,
                    'average_rt': feature.average_rt,
                })

    return linked_features

In [7]:
# Re-assigns output_dir to the same path used earlier in the notebook.
output_dir = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID"

In [None]:
            # Scratch excerpt: copies the precursor / MS1 summary attributes of a
            # feature object into local names.
            # NOTE(review): the indentation and the unbound `feature` mean this cell
            # cannot run on its own; it looks like a snippet lifted from a loop body.
            precursor_intensity = feature.monoisotopic_height
            precursor_rt = feature.monoisotopic_rt
            # Attribute is `monoisotopic_mz` (was misspelled `monoistotopic_mz`).
            precursor_mz = feature.monoisotopic_mz
            precursor_vol = feature.monoisotopic_volume
            average_ms1_mz = feature.average_mz
            average_rt = feature.average_rt
            charge_state = feature.charge_state

In [8]:
# Feature-file inputs: same CE (20-60) x exc (0, 2, 10) grid and ordering as the
# raw list, but pointing at the .features files. Only the file stem is derived
# from 'raw_path' by the functions in this notebook.
input_files = [
    {'raw_path': rf"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\features\p_CE{ce}_exc{exc}-p1_2.features"}
    for ce in (20, 30, 40, 50, 60)
    for exc in (0, 2, 10)
]

In [10]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import ms_entropy as me
from sklearn.metrics import root_mean_squared_error

def link_msp(
    msp_file, input_files, output_dir, combined_multiple_samples,
    mz_tolerance=0.025, rt_tolerance=8.0
):
    """Match annotated features against MSP library entries.

    Parameters
    ----------
    msp_file : str
        Path handed to ``pq.read_msp_file``. Entries are expected to behave
        like dicts with ``precursor_mz``, ``retention_time`` and ``peaks``.
    input_files : list of dict
        One dict per sample. When ``'stem'`` is missing it is derived from the
        base name of ``'raw_path'`` and stored back into the dict.
    output_dir : str
        Directory that contains ``features/<stem>.features``.
    combined_multiple_samples : iterable of dict
        Records with ``file_id``, ``feature_id``, ``cluster_id``, ``msms_id``
        and ``ms2_peaks``. Records with a null ``feature_id`` are ignored.
    mz_tolerance, rt_tolerance : float
        Maximum absolute precursor m/z and retention-time distances for a
        library entry to be considered.

    Returns
    -------
    list of dict
        One record per (feature, sample record) pair that has at least one
        library entry within tolerance. ``matches`` holds only the entry with
        the lowest score ``mz_dist * 20 + rt_dist * 0.0625``.
    """
    annotations_lookup = defaultdict(list)
    for ann in combined_multiple_samples:
        if pd.notnull(ann['feature_id']):
            annotations_lookup[(ann['file_id'], ann['feature_id'])].append(ann)

    # Parse the library exactly once (it used to be re-read for every sample
    # record) and keep only entries that can be scored. Entries without peaks
    # would otherwise crash the zip(*peaks) unpacking further down.
    msp_data = [
        entry for entry in pq.read_msp_file(msp_file)
        if 'precursor_mz' in entry
        and 'retention_time' in entry
        and entry.get('peaks') is not None
        and len(entry['peaks']) > 0
    ]

    linked_msp = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        stem = file['stem']
        in_path_features = os.path.join(output_dir, 'features', f"{stem}.features")

        if not os.path.exists(in_path_features):
            print(f"Warning: Missing features file: {in_path_features}")
            continue

        for feature in pq.read_features(in_path_features):
            feature_id = feature.id
            annotations = annotations_lookup.get((stem, feature_id))
            if not annotations:
                continue

            precursor_mz = feature.monoisotopic_mz
            precursor_rt = feature.monoisotopic_rt

            for annotation in annotations:
                ms2_sample_peaks = annotation['ms2_peaks']

                # Centroid the sample MS2 spectrum.
                centroided_peaks = me.clean_spectrum(
                    ms2_sample_peaks,
                    min_ms2_difference_in_da=0.02,
                    normalize_intensity=False
                )
                centroided_arr = np.array(centroided_peaks)
                if len(centroided_arr) == 0:
                    # Nothing left to compare after cleaning.
                    continue
                centroided_arr_list = centroided_arr.tolist()

                sample_mz, sample_intensity = zip(*centroided_peaks)
                sample_mz = np.array(sample_mz)
                sample_intensity = np.array(sample_intensity, dtype=np.float32)

                best_match = None
                annotated_feature = None

                for data in msp_data:
                    mz_dist = np.abs(precursor_mz - data['precursor_mz'])
                    rt_dist = np.abs(precursor_rt - data['retention_time'])
                    if not (mz_dist <= mz_tolerance and rt_dist <= rt_tolerance):
                        continue

                    # Lower score means a closer precursor match.
                    score = (mz_dist * 20) + (rt_dist * 0.0625)
                    if best_match is not None and not (score < best_match['score']):
                        # Cannot replace the current best, so skip the
                        # similarity calculations for this candidate.
                        continue

                    mass_error_ppm = (
                        (data['precursor_mz'] - precursor_mz) / data['precursor_mz']
                    ) * 1e6
                    # The RMSE of a single (true, predicted) pair is the
                    # absolute difference, so no sklearn call is needed.
                    rmse_mz = float(mz_dist)

                    reference_peaks = data['peaks']
                    ref_mz, ref_intensity = zip(*reference_peaks)
                    ref_mz = np.array(ref_mz)
                    ref_intensity = np.array(ref_intensity, dtype=np.float32)

                    # Dot product.
                    # NOTE(review): peaks are paired by searchsorted insertion
                    # position, not by an m/z tolerance; confirm this is the
                    # intended matching rule.
                    if len(centroided_peaks) < 3 or len(reference_peaks) < 3:
                        dot_product = None
                    else:
                        matched_idx = np.searchsorted(ref_mz, sample_mz)
                        matched_idx = matched_idx[matched_idx < len(ref_mz)]
                        sample_spectrum = sample_intensity[:len(matched_idx)]
                        reference_spectrum = ref_intensity[matched_idx]
                        sample_norm = sample_spectrum / np.linalg.norm(sample_spectrum)
                        reference_norm = reference_spectrum / np.linalg.norm(reference_spectrum)
                        dot_product = np.dot(sample_norm, reference_norm)

                    similarity = me.calculate_entropy_similarity(
                        np.array(ms2_sample_peaks, dtype=np.float32),
                        np.array(reference_peaks, dtype=np.float32)
                    )
                    unweighted_similarity = me.calculate_unweighted_entropy_similarity(
                        np.array(ms2_sample_peaks, dtype=np.float32),
                        np.array(reference_peaks, dtype=np.float32)
                    )

                    best_match = {
                        'score': float(score),
                        'mass_error_ppm': float(mass_error_ppm),
                        'name': data.get('name'),
                        'retention_time': data.get('retention_time'),
                        'precursor_mz': data.get('precursor_mz'),
                        'precursor_type': data.get('precursor_type'),
                        'smiles': data.get('smiles'),
                        'saturation': data.get('saturation'),
                        'msp_peaks': reference_peaks
                    }

                    annotated_feature = {
                        'cluster_id': annotation['cluster_id'],
                        'file_id': annotation['file_id'],
                        'feature_id': feature_id,
                        'msms_id': annotation['msms_id'],
                        'peak_ids': feature.peak_ids,
                        'dot_product': float(dot_product) if dot_product is not None else 'NA',
                        'similarity': float(similarity),
                        'unweighted_similarity': float(unweighted_similarity),
                        'centroided_peaks': centroided_arr_list,
                        'rmse_mz': rmse_mz,
                        'mass_error_ppm': mass_error_ppm,
                        'precursor_intensity': feature.monoisotopic_height,
                        'precursor_rt': precursor_rt,
                        'precursor_vol': feature.monoisotopic_volume,
                        'average_ms1_mz': feature.average_mz,
                        'average_rt': feature.average_rt,
                        'charge_state': feature.charge_state,
                        'matches': [best_match]
                    }

                if best_match is not None:
                    linked_msp.append(annotated_feature)

    return linked_msp


In [None]:
import os
import pandas as pd
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
import pickle  # for shared data serialization
import pastaq as pq  

def build_annotations_lookup(combined_multiple_samples):
    """Group sample records by their ``(file_id, feature_id)`` pair.

    Records whose ``feature_id`` is null are left out. The result is a
    ``defaultdict(list)`` mapping each pair to its records in input order.
    """
    grouped = defaultdict(list)
    for record in combined_multiple_samples:
        feature_id = record['feature_id']
        if pd.isnull(feature_id):
            continue
        grouped[(record['file_id'], feature_id)].append(record)
    return grouped

# This function processes ONE file
def process_file(file, output_dir, annotations_lookup):
    """Link the features of a single sample to its annotation records.

    ``file`` is one input dict; a missing ``'stem'`` is derived from the base
    name of ``'raw_path'`` and stored back. Features are read from
    ``<output_dir>/features/<stem>.features``; when that file does not exist a
    warning is printed and an empty list is returned. Every feature whose
    ``(stem, feature.id)`` key is present in ``annotations_lookup`` yields one
    result dict per annotation stored under that key.
    """
    if 'stem' not in file:
        raw_name = os.path.basename(file['raw_path'])
        file['stem'] = os.path.splitext(raw_name)[0]
    stem = file['stem']
    in_path_features = os.path.join(output_dir, 'features', f"{stem}.features")

    if not os.path.exists(in_path_features):
        print(f"Warning: Missing features file: {in_path_features}")
        return []

    results = []
    for feature in pq.read_features(in_path_features):
        feature_id = feature.id
        matched = annotations_lookup.get((stem, feature_id))
        if not matched:
            continue

        results.extend(
            {
                'cluster_id': annotation['cluster_id'],
                'file_id': annotation['file_id'],
                'feature_id': feature_id,
                'msms_id': annotation['msms_id'],
                'peak_ids': feature.peak_ids,
                'precursor_intensity': feature.monoisotopic_height,
                'precursor_rt': feature.monoisotopic_rt,
                'precursor_vol': feature.monoisotopic_volume,
                'average_ms1_mz': feature.average_mz,
                'average_rt': feature.average_rt,
                'charge_state': feature.charge_state,
                'ms2_sample_peaks': annotation['ms2_peaks'],
            }
            for annotation in matched
        )

    return results


In [None]:
def link_features(input_files, output_dir, combined_multiple_samples):
    """Link features to sample records for all input files concurrently.

    Builds the ``(file_id, feature_id)`` lookup once with
    ``build_annotations_lookup`` and runs ``process_file`` for every entry of
    ``input_files``.

    Returns
    -------
    list
        The per-file results concatenated in the order of ``input_files``.
    """
    # Threads instead of processes: worker processes must be able to import
    # the submitted function, which fails for functions defined in a notebook
    # (__main__) on spawn-based platforms, and every task would also pickle
    # the complete lookup. Threads share the lookup in memory.
    # NOTE(review): assumes pq.read_features may be called from several
    # threads at once - confirm against pastaq.
    from concurrent.futures import ThreadPoolExecutor

    annotations_lookup = build_annotations_lookup(combined_multiple_samples)

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_file, file, output_dir, annotations_lookup)
            for file in input_files
        ]

        # Collect in submission order so the output order is deterministic.
        linked_features = []
        for future in futures:
            linked_features.extend(future.result())

    return linked_features


In [None]:
# Link features to the combined MS2 records and persist the result as CSV.
linked_features = link_features(input_files, output_dir, combined_multiple_samples=combined_multiple_samples)
from pathlib import Path
filepath = Path('CID_metadata/CID_CORRECT/linked_features.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
linked_features_df = pd.DataFrame(linked_features)
# index=False belongs to to_csv(). It used to be passed to Path(), where it is
# ignored or rejected depending on the Python version, so the index column was
# still written.
linked_features_df.to_csv(filepath, index=False)

In [None]:
from sklearn.metrics import root_mean_squared_error
from scipy.spatial import KDTree
import ms_entropy as me

# Annotate a table of linked features with the best-matching MSP library entry.
# Default tolerances: 0.025 for m/z and 8 for retention time.
def link_msp(linked_features_csv, msp_data, mz_tolerance=0.025, rt_tolerance=8.0):
    """Pick the closest MSP library entry for every row of a features table.

    Parameters
    ----------
    linked_features_csv : pandas.DataFrame
        One row per feature. The columns ``average_mz``, ``total_height``,
        ``average_rt``, ``total_volume``, ``monoisotopic_mz``,
        ``monoisotopic_height``, ``monoisotopic_volume``, ``monoisotopic_rt``,
        ``charge_state``, ``feature_id`` and ``peak_ids`` are read.
    msp_data : iterable of dict
        Parsed library entries. Only entries that contain ``peaks``,
        ``precursor_mz`` and ``retention_time`` are considered.
    mz_tolerance, rt_tolerance : float
        Maximum absolute precursor m/z and retention-time distances.

    Returns
    -------
    list of dict
        One record per input row, in row order. ``matches`` is a one-element
        list holding the candidate with the lowest score
        ``rt_distance * 0.025 + mz_distance * 20``, or ``None`` when no
        library entry lies within both tolerances.
    """
    # Filter the library and build the m/z search tree once, up front.
    library = [
        entry for entry in msp_data
        if 'peaks' in entry and 'precursor_mz' in entry and 'retention_time' in entry
    ]
    tree = None
    if library:
        lib_mzs = np.array([entry['precursor_mz'] for entry in library], dtype=float)
        tree = KDTree(lib_mzs.reshape(-1, 1))

    annotated_features_csv = []

    for row in linked_features_csv.itertuples(index=False):
        precursor_mz = row.monoisotopic_mz
        precursor_rt = row.monoisotopic_rt
        precursor_volume = row.monoisotopic_volume

        # Library entries whose precursor m/z lies within the tolerance radius.
        if tree is not None and pd.notnull(precursor_mz):
            candidate_idx = tree.query_ball_point([precursor_mz], r=mz_tolerance)
        else:
            candidate_idx = []

        best_match = None
        # Sorted so that ties are resolved in library order.
        for idx in sorted(candidate_idx):
            candidate = library[idx]
            mz_distance = np.abs(precursor_mz - candidate['precursor_mz'])
            rt_distance = np.abs(precursor_rt - candidate['retention_time'])
            if not (mz_distance <= mz_tolerance and rt_distance <= rt_tolerance):
                continue

            # Lower score means a closer match.
            match_score = float((rt_distance * 0.025) + (mz_distance * 20))
            if best_match is not None and not (match_score < best_match['score']):
                continue

            mass_error_ppm = (
                (candidate['precursor_mz'] - precursor_mz) / candidate['precursor_mz']
            ) * 10**6

            best_match = {
                'score': match_score,
                'mass_error_ppm': mass_error_ppm,
                # The RMSE of a single (true, predicted) pair is the absolute
                # difference.
                'rmse_mz': float(mz_distance),
                'name': candidate.get('name', None),
                'saturation': candidate.get('saturation', None),
                'retention_time': candidate.get('retention_time', None),
                'precursor_mz': candidate.get('precursor_mz', None),
                'precursor_type': candidate.get('precursor_type', None),
                'smiles': candidate.get('smiles', None),
                'msp_peaks': candidate.get('peaks', None),
            }

        annotated_features_csv.append({
            'average_mz': row.average_mz,
            'total_intensity': row.total_height,
            'average_rt': row.average_rt,
            'total_volume': row.total_volume,
            'precursor_mz': precursor_mz,
            'precursor_intensity': row.monoisotopic_height,
            'precursor_rt': precursor_rt,
            'precursor_volume': precursor_volume,
            # NOTE(review): 114.7977026 is an unexplained scale factor carried
            # over from the draft - confirm its origin.
            'normalized_area': precursor_volume * 114.7977026,
            'charge_state': row.charge_state,
            'feature_id': row.feature_id,
            'feature_peak_ids': row.peak_ids,
            'matches': [best_match] if best_match is not None else None,
        })

    return annotated_features_csv

In [None]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict

def link_features(input_files, output_dir, combined_multiple_samples):
    """Link pastaq features to the combined MS2 sample records, file by file.

    Parameters
    ----------
    input_files : list of dict
        One dict per sample. When ``'stem'`` is missing it is derived from the
        base name of ``'raw_path'`` and stored back into the dict.
    output_dir : str
        Directory that contains ``features/<stem>.features``.
    combined_multiple_samples : iterable of dict
        Records with ``file_id``, ``feature_id``, ``cluster_id``, ``msms_id``
        and ``ms2_peaks``. Records with a null ``feature_id`` are ignored.

    Returns
    -------
    list of dict
        One record per (feature, sample record) pair. Files without a
        features file are skipped with a warning.
    """
    # Build the (file_id, feature_id) lookup before any file is processed.
    annotations_lookup = defaultdict(list)
    for ann in combined_multiple_samples:
        if pd.notnull(ann['feature_id']):
            annotations_lookup[(ann['file_id'], ann['feature_id'])].append(ann)

    linked_features = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        # Resolve the stem and path for every file, including those that
        # already carry a 'stem' entry.
        stem = file['stem']
        in_path_features = os.path.join(output_dir, 'features', f"{stem}.features")

        if not os.path.exists(in_path_features):
            print(f"Warning: Missing features file: {in_path_features}")
            continue

        for feature in pq.read_features(in_path_features):
            feature_id = feature.id
            annotations = annotations_lookup.get((stem, feature_id))
            if not annotations:
                continue

            for annotation in annotations:
                linked_features.append({
                    'cluster_id': annotation['cluster_id'],
                    'file_id': annotation['file_id'],
                    'feature_id': feature_id,
                    'msms_id': annotation['msms_id'],
                    'peak_ids': feature.peak_ids,
                    'precursor_intensity': feature.monoisotopic_height,
                    'precursor_rt': feature.monoisotopic_rt,
                    'precursor_vol': feature.monoisotopic_volume,
                    'average_ms1_mz': feature.average_mz,
                    'average_rt': feature.average_rt,
                    'charge_state': feature.charge_state,
                    'ms2_sample_peaks': annotation['ms2_peaks'],
                })

    return linked_features


In [10]:
# Link features to the combined MS2 records and persist the result as CSV.
linked_features = link_features(input_files, output_dir, combined_multiple_samples=combined_multiple_samples)
from pathlib import Path
filepath = Path('CID_metadata/CID_CORRECT/linked_features.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
linked_features_df = pd.DataFrame(linked_features)
# index=False belongs to to_csv(). It used to be passed to Path(), where it is
# ignored or rejected depending on the Python version, so the index column was
# still written.
linked_features_df.to_csv(filepath, index=False)


KeyboardInterrupt: 

In [None]:
            # NOTE(review): this cell is an indented excerpt with no enclosing block, so it
            # cannot run as-is. It refers to `pastaq`, `_custom_log`, `logger` and
            # `in_path_features`, none of which are defined in this cell (the notebook
            # imports the library as `pq`) — confirm before reusing.
            features = pastaq.read_features(in_path_features)

            _custom_log("Generating features quantitative table", logger)
            # One row per feature; every column is read from the feature attribute of the
            # same name, except `peak_id`, which is filled from `feature.peak_ids`.
            features_df = pd.DataFrame({
                'feature_id': [feature.id for feature in features],
                'average_mz': [feature.average_mz for feature in features],
                'average_mz_sigma': [feature.average_mz_sigma for feature in features],
                'average_rt': [feature.average_rt for feature in features],
                'average_rt_sigma': [feature.average_rt_sigma for feature in features],
                'average_rt_delta': [feature.average_rt_delta for feature in features],
                'total_height': [feature.total_height for feature in features],
                'total_volume': [feature.total_volume for feature in features],
                'monoisotopic_mz': [feature.monoisotopic_mz for feature in features],
                'monoisotopic_rt' : [feature.monoisotopic_rt for feature in features],
                'monoisotopic_height': [feature.monoisotopic_height for feature in features],
                'monoisotopic_volume': [feature.monoisotopic_volume for feature in features],
                'charge_state': [feature.charge_state for feature in features],
                'peak_id': [feature.peak_ids for feature in features],
            })

In [None]:
import numpy as np
from scipy.spatial import KDTree

# Read the MSP library once, keeping only entries that carry peaks, a precursor
# m/z and a retention time.
msp_entries = [
    entry for entry in pq.read_msp_file(msp_file)
    if all(field in entry for field in ('peaks', 'precursor_mz', 'retention_time'))
]
# One row per library entry, one column holding its precursor m/z.
precursor_array = np.reshape(np.array([entry['precursor_mz'] for entry in msp_entries]), (-1, 1))

# Index the precursor m/z values so radius queries do not scan the whole library.
tree = KDTree(precursor_array)

# The tree holds one-column points, so the query is a single one-column row.
query_point = np.array([[precursor_mz]])
idxs = tree.query_ball_point(x=query_point, r=mz_tolerance)[0]  # entries within mz_tolerance

candidates = [msp_entries[hit] for hit in idxs]
# Only these candidates need the detailed matching (RT, entropy, etc.)


In [None]:
# KDTree queries take 2-D input: one row per query point, one column per dimension.
query_point = np.array([[precursor_mz]])
# Row 0 of the answer lists every library index within mz_tolerance of the query.
idxs = tree.query_ball_point(x=query_point, r=mz_tolerance)[0]

candidates = [msp_entries[hit] for hit in idxs]
# Detailed matching (RT, entropy, etc.) only has to run on these candidates


In [None]:
# NOTE(review): template only — `[...]` below is a literal list holding Ellipsis,
# so running this cell as-is would replace `msp_entries` with placeholder data.
# Substitute the real library entries and precursor m/z values before use.
# 1. Prepare tree
msp_entries = [...]
precursor_array = np.array([...]).reshape(-1,1)
tree = KDTree(precursor_array)

# 2. Inside your loop, for each peak:
# `ms1_mz` and `mz_tolerance` are not defined in this cell.
query = np.array([[ms1_mz]])
idxs = tree.query_ball_point(query, r=mz_tolerance)[0]
for idx in idxs:
    ann = msp_entries[idx]
    # Check rt tolerance, compute scores...


In [None]:
# Run link_msp with the chosen m/z and RT tolerances and save its result as a CSV table.
linked_msp = link_msp(msp_file=msp_file, output_dir=output_dir, input_files=input_files, combined_multiple_samples=combined_multiple_samples, mz_tolerance=0.025, rt_tolerance=8.0)
from pathlib import Path
filepath = Path('CID_metadata/CID_CORRECT/linked_msp_matches.csv')
# Create the target folder first so to_csv does not fail on a missing directory.
filepath.parent.mkdir(parents=True, exist_ok=True)
linked_msp_df = pd.DataFrame(linked_msp)
# index=False is a to_csv option; it was previously handed to Path(), so the
# row index was still written to the file.
linked_msp_df.to_csv(filepath, index=False)

In [None]:
                            # NOTE(review): indented excerpt with no enclosing block; `sample_norm` and
                            # `reference_norm` are not defined in this cell — presumably aligned,
                            # normalised intensity arrays of equal length; confirm against the caller.
                            # Calculate geometric mean weights
                            weights = np.sqrt(sample_norm * reference_norm)

                            # Compute weighted dot product
                            # Element-wise product of the weights and both inputs, summed to one number.
                            weighted_dot_product = np.sum(weights * sample_norm * reference_norm)

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
# from concurrent.futures import ProcessPoolExecutor  # Uncomment for parallelism

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir):
    """Pair every annotated MS2 scan with its annotation rows.

    Parameters
    ----------
    feature_clusters_annotations_csv : pandas.DataFrame
        Annotation table. The columns ``file_id``, ``msms_id``, ``cluster_id``,
        ``feature_id``, ``peak_id`` and ``charge_state`` are read; rows with a
        null ``msms_id`` are ignored.
    input_files : list of dict
        One dict per sample holding ``raw_path``. A ``stem`` entry (file name
        without extension) is added in place when it is missing.
    output_dir : str
        Folder whose ``raw`` subfolder is searched for ``<stem>.ms2``; samples
        without that file are skipped.

    Returns
    -------
    list of dict
        One record per (scan, annotation row) match, carrying the annotation
        ids plus the scan's retention time, precursor information and its
        (m/z, intensity) pairs ordered by m/z.
    """
    # Index the annotation rows by (file_id, msms_id) once, so each scan costs a
    # single dictionary lookup instead of a DataFrame filter.
    rows_by_scan = defaultdict(list)
    for _, annotation in feature_clusters_annotations_csv.iterrows():
        if pd.isnull(annotation['msms_id']):
            continue
        rows_by_scan[(annotation['file_id'], annotation['msms_id'])].append(annotation)

    records = []
    for file in input_files:
        if 'stem' not in file:
            raw_name = os.path.basename(file['raw_path'])
            file['stem'] = os.path.splitext(raw_name)[0]
        stem = file['stem']

        ms2_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")
        if not os.path.exists(ms2_path):
            continue

        raw_data = pq.read_raw_data(ms2_path)
        for scan in raw_data.scans:
            matches = rows_by_scan.get((stem, scan.scan_number))
            if not matches:
                continue

            ms2_mz, ms2_intensity = scan.mz, scan.intensity
            ms2_rt = scan.retention_time
            ms2_precursor_info = scan.precursor_information

            # Skip scans with no peaks or with mismatched m/z and intensity lengths.
            if not (ms2_mz and ms2_intensity and len(ms2_mz) == len(ms2_intensity)):
                continue

            # Order the peaks by ascending m/z, keeping each intensity with its m/z.
            mz_array = np.array(ms2_mz)
            intensity_array = np.array(ms2_intensity)
            order = np.argsort(mz_array)
            mz_intensity_pairs = list(zip(mz_array[order], intensity_array[order]))

            records.extend(
                {
                    'cluster_id': annotation['cluster_id'],
                    'file_id': annotation['file_id'],
                    'feature_id': annotation['feature_id'],
                    'peak_id': annotation['peak_id'],
                    'msms_id': annotation['msms_id'],
                    'ms2_rt': ms2_rt,
                    'ms2_mz_intensity_pairs': mz_intensity_pairs,
                    'charge_state': annotation['charge_state'],
                    'ms2_precursor_info': ms2_precursor_info
                }
                for annotation in matches
            )

    return records


In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

def build_annotations_lookup(df):
    """Group annotation rows by ``(file_id, msms_id)``.

    Rows whose ``msms_id`` is null are left out. Each kept row is stored as a
    plain dict (rather than a pandas Series) so the lookup can be serialized.

    Returns a ``defaultdict(list)`` mapping ``(file_id, msms_id)`` to the list
    of row dicts sharing that key.
    """
    rows_by_scan = defaultdict(list)
    for _, annotation in df.iterrows():
        if pd.isnull(annotation['msms_id']):
            continue
        scan_key = (annotation['file_id'], annotation['msms_id'])
        rows_by_scan[scan_key].append(annotation.to_dict())
    return rows_by_scan

def process_file(file, output_dir, annotations_lookup):
    """Build the MS2 records for a single sample file.

    Parameters
    ----------
    file : dict
        Sample description with ``raw_path``; ``stem`` is added in place when
        it is missing.
    output_dir : str
        Folder whose ``raw`` subfolder is expected to hold ``<stem>.ms2``.
    annotations_lookup : mapping
        Maps ``(stem, scan_number)`` to a list of annotation rows.

    Returns
    -------
    list of dict
        One record per (scan, annotation row) match; empty when the ``.ms2``
        file does not exist.
    """
    # Imported inside the function — presumably so each worker process loads
    # the library itself; confirm before moving it to module level.
    import pastaq as pq

    if 'stem' not in file:
        raw_name = os.path.basename(file['raw_path'])
        file['stem'] = os.path.splitext(raw_name)[0]
    stem = file['stem']

    ms2_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")
    if not os.path.exists(ms2_path):
        return []

    records = []
    raw_data = pq.read_raw_data(ms2_path)
    for scan in raw_data.scans:
        matches = annotations_lookup.get((stem, scan.scan_number))
        if not matches:
            continue

        ms2_mz = scan.mz
        ms2_intensity = scan.intensity
        ms2_rt = scan.retention_time
        ms2_precursor_info = scan.precursor_information

        # Skip scans with no peaks or with mismatched m/z and intensity lengths.
        if not (ms2_mz and ms2_intensity and len(ms2_mz) == len(ms2_intensity)):
            continue

        # Order the peaks by ascending m/z, keeping each intensity with its m/z.
        mz_array = np.array(ms2_mz)
        intensity_array = np.array(ms2_intensity)
        order = np.argsort(mz_array)
        mz_intensity_pairs = list(zip(mz_array[order], intensity_array[order]))

        records += [
            {
                'cluster_id': annotation['cluster_id'],
                'file_id': annotation['file_id'],
                'feature_id': annotation['feature_id'],
                'peak_id': annotation['peak_id'],
                'msms_id': annotation['msms_id'],
                'ms2_rt': ms2_rt,
                'ms2_mz_intensity_pairs': mz_intensity_pairs,
                'charge_state': annotation['charge_state'],
                'ms2_precursor_info': ms2_precursor_info
            }
            for annotation in matches
        ]
    return records

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir, max_workers=4):
    """Build the MS2 records for every input file using a process pool.

    Parameters
    ----------
    feature_clusters_annotations_csv : pandas.DataFrame
        Annotation table handed to ``build_annotations_lookup``.
    input_files : list of dict
        Sample descriptions; each one is processed by ``process_file``.
    output_dir : str
        Folder passed through to ``process_file``.
    max_workers : int, default 4
        Size of the process pool.

    Returns
    -------
    list of dict
        The records of all files, concatenated in the order of ``input_files``.
    """
    annotations_lookup = build_annotations_lookup(feature_clusters_annotations_csv)

    combined_multiple_samples = []
    # NOTE(review): the concurrent.futures documentation states that
    # ProcessPoolExecutor needs the submitted function to be importable by the
    # worker processes and does not work from the interactive interpreter —
    # confirm this runs from the notebook on the target platform.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_file, file, output_dir, annotations_lookup)
            for file in input_files
        ]

        # Collect in submission order rather than completion order: the files
        # are still processed in parallel, but the combined list no longer
        # depends on which worker happens to finish first.
        for future in futures:
            combined_multiple_samples.extend(future.result())

    return combined_multiple_samples


In [None]:
import numpy as np

# Preprocess reference data once. np.searchsorted requires an ascending array,
# so the reference peaks are ordered by m/z with their intensities moved along.
reference_mz = np.array(reference_mz)
reference_intensity = np.array(reference_intensity, dtype=np.float32)
reference_order = np.argsort(reference_mz, kind='stable')
reference_mz = reference_mz[reference_order]
reference_intensity = reference_intensity[reference_order]

# The sample m/z values are deliberately NOT sorted: searchsorted accepts query
# values in any order, and sorting them on their own would detach each m/z from
# its entry in sample_maxima_intensities.
sample_mz_values = np.array(sample_mz_values)

# Insertion position of every sample m/z in the sorted reference.
# NOTE(review): this is the first reference peak at or above the sample m/z,
# not necessarily the nearest one, and no m/z tolerance is applied — confirm
# this matching rule is intended.
matched_indices = np.searchsorted(reference_mz, sample_mz_values)

# Clip indices to ensure they are within bounds
matched_indices = np.clip(matched_indices, 0, len(reference_mz) - 1)

# Extract corresponding intensities (sample intensities stay in their original
# order, which is the order of sample_mz_values)
sample_spectrum = np.array(sample_maxima_intensities[:len(matched_indices)])
reference_spectrum = reference_intensity[matched_indices]

# Normalize spectra; an all-zero spectrum has no direction, so it is kept as
# zeros instead of being divided by a zero norm (which would yield NaN).
sample_length = np.linalg.norm(sample_spectrum)
reference_length = np.linalg.norm(reference_spectrum)
sample_spectrum_norm = (
    sample_spectrum / sample_length if sample_length
    else np.zeros_like(sample_spectrum, dtype=float)
)
reference_spectrum_norm = (
    reference_spectrum / reference_length if reference_length
    else np.zeros_like(reference_spectrum, dtype=float)
)

# Compute dot product
dot_product = np.dot(sample_spectrum_norm, reference_spectrum_norm)


In [None]:
# Load the MSP library and keep only the entries that provide peaks, a
# precursor m/z and a retention time.
msp_data = pq.read_msp_file(msp_file)
filtered_msp = [
    entry for entry in msp_data
    if all(field in entry for field in ('peaks', 'precursor_mz', 'retention_time'))
]
# Precursor m/z of every kept entry, in the same order as filtered_msp.
lib_mzs = np.array([entry['precursor_mz'] for entry in filtered_msp])


In [None]:
# Load the MSP library and keep only the entries that provide peaks, a
# precursor m/z and a retention time.
msp_data = pq.read_msp_file(msp_file)
filtered_msp = [
    entry for entry in msp_data
    if all(field in entry for field in ('peaks', 'precursor_mz', 'retention_time'))
]
# Precursor m/z of every kept entry, in the same order as filtered_msp.
lib_mzs = np.array([entry['precursor_mz'] for entry in filtered_msp])

from scipy.spatial import KDTree
# Index the precursor m/z values as one-column points.
tree = KDTree(np.reshape(lib_mzs, (-1, 1)))

# Positions (into filtered_msp) of the entries within mz_tolerance of ms1_mz.
idxs = tree.query_ball_point(x=[[ms1_mz]], r=mz_tolerance)[0]
candidates = [filtered_msp[hit] for hit in idxs]

In [None]:
from scipy.spatial import KDTree
# Index the library precursor m/z values as one-column points for radius queries.
tree = KDTree(lib_mzs.reshape((-1, 1)))


In [None]:
# Positions (into filtered_msp) of the entries within mz_tolerance of ms1_mz;
# the query is one row, so the matches sit at position 0 of the answer.
idxs = tree.query_ball_point(x=[[ms1_mz]], r=mz_tolerance)[0]
candidates = [filtered_msp[hit] for hit in idxs]


In [None]:
# Load the MSP library and keep only the entries that provide peaks, a
# precursor m/z and a retention time.
msp_data = pq.read_msp_file(msp_file)
filtered_msp = [
    entry for entry in msp_data
    if all(field in entry for field in ('peaks', 'precursor_mz', 'retention_time'))
]

# Test every library entry against both tolerance windows at once.
precursors = np.array([entry['precursor_mz'] for entry in filtered_msp])
rts = np.array([entry['retention_time'] for entry in filtered_msp])
mask = np.logical_and(
    np.abs(precursors - ms1_mz) <= mz_tolerance,
    np.abs(rts - ms1_rt) <= rt_tolerance,
)
# Entries that pass both the m/z and the retention-time test.
candidates = [filtered_msp[hit] for hit in np.nonzero(mask)[0]]


In [None]:
# Read the library and discard entries lacking peaks, precursor m/z or retention time.
msp_data = pq.read_msp_file(msp_file)
required_fields = ('peaks', 'precursor_mz', 'retention_time')
filtered_msp = [
    entry for entry in msp_data
    if all(field in entry for field in required_fields)
]

# Per-entry precursor m/z and retention time, aligned with filtered_msp.
precursors = np.array([entry['precursor_mz'] for entry in filtered_msp])
rts = np.array([entry['retention_time'] for entry in filtered_msp])
# True where an entry lies inside both the m/z and the retention-time window.
within_mz = np.abs(precursors - ms1_mz) <= mz_tolerance
within_rt = np.abs(rts - ms1_rt) <= rt_tolerance
mask = within_mz & within_rt
candidates = [filtered_msp[hit] for hit in np.nonzero(mask)[0]]

In [None]:
# Per-entry precursor m/z and retention time, aligned with filtered_msp.
precursors = np.array([entry['precursor_mz'] for entry in filtered_msp])
rts = np.array([entry['retention_time'] for entry in filtered_msp])
# Keep the entries that fall inside both tolerance windows.
mask = np.logical_and(
    np.abs(precursors - ms1_mz) <= mz_tolerance,
    np.abs(rts - ms1_rt) <= rt_tolerance,
)
candidates = [filtered_msp[hit] for hit in np.nonzero(mask)[0]]


In [None]:
# NOTE(review): sketch — the literal `...` stands in for the remaining
# clean_spectrum arguments, and `me` is not imported in this cell; fill both in
# before running.
centroided_peaks = me.clean_spectrum(ms2_sample_peaks, ...)
# Transpose the cleaned peaks into two parallel tuples (assumes each peak is an
# (m/z, intensity) pair and at least one peak remains — TODO confirm).
cent_mz, cent_int = zip(*centroided_peaks)
# reuse cent_mz, cent_int for all candidate MSP annotations
