In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir):
    """Attach the MS2 spectrum of each annotated scan to its annotation rows.

    Parameters
    ----------
    feature_clusters_annotations_csv : pandas.DataFrame
        Annotation table. The columns read here are ``file_id``, ``msms_id``,
        ``cluster_id``, ``feature_id``, ``peak_id`` and ``charge_state``.
        Rows whose ``msms_id`` is null are ignored.
    input_files : list of dict
        One dict per sample. ``stem`` is used when present; otherwise it is
        derived from the basename of ``raw_path`` (extension removed) and
        written back into the dict.
    output_dir : str
        Spectra are read from ``<output_dir>/raw/<stem>.ms2``. Samples whose
        file does not exist are skipped silently.

    Returns
    -------
    list of dict
        One record per (scan, annotation row) pair with the keys
        ``cluster_id``, ``file_id``, ``feature_id``, ``peak_id``, ``msms_id``,
        ``ms2_rt``, ``charge_state`` and ``ms2_peaks``. ``ms2_peaks`` is an
        ``(n, 2)`` float32 array of (m/z, intensity) rows sorted by m/z and is
        shared between records that come from the same scan.

    Notes
    -----
    ``pq`` is not imported in this cell; it must already exist in the
    notebook namespace and provide ``read_raw_data``.
    """
    # Index annotation rows by (file_id, msms_id) so each scan is a dict lookup
    # instead of a DataFrame filter.
    annotations_lookup = defaultdict(list)
    for _, row in feature_clusters_annotations_csv.iterrows():
        if pd.notnull(row['msms_id']):
            annotations_lookup[(row['file_id'], row['msms_id'])].append(row)

    combined_multiple_samples = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        stem = file['stem']
        in_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")

        if not os.path.exists(in_path):
            continue

        raw_data = pq.read_raw_data(in_path)

        for scan in raw_data.scans:
            annotations = annotations_lookup.get((stem, scan.scan_number))
            if not annotations:
                continue

            ms2_mz = scan.mz
            ms2_intensity = scan.intensity
            ms2_rt = scan.retention_time

            # Length-based checks work for lists and numpy arrays alike;
            # `not array` raises for arrays with more than one element.
            if ms2_mz is None or ms2_intensity is None:
                continue
            if len(ms2_mz) == 0 or len(ms2_mz) != len(ms2_intensity):
                continue

            mz_array = np.asarray(ms2_mz)
            intensity_array = np.asarray(ms2_intensity)
            order = np.argsort(mz_array)
            ms2_peaks = np.column_stack(
                (mz_array[order], intensity_array[order])
            ).astype(np.float32)

            for row in annotations:
                combined_multiple_samples.append({
                    'cluster_id': row['cluster_id'],
                    'file_id': row['file_id'],
                    'feature_id': row['feature_id'],
                    'peak_id': row['peak_id'],
                    'msms_id': row['msms_id'],
                    'ms2_rt': ms2_rt,
                    'charge_state': row['charge_state'],
                    'ms2_peaks': ms2_peaks,
                })

    return combined_multiple_samples

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me
from pathlib import Path

# Fast & working!
def link_features(combined_multiple_samples, input_files, output_dir):
    """Join MS2 records with the feature objects stored per sample.

    Parameters
    ----------
    combined_multiple_samples : list of dict
        Records carrying at least ``file_id``, ``feature_id``, ``cluster_id``,
        ``peak_id``, ``msms_id`` and ``ms2_peaks``. Records with a null
        ``feature_id`` are ignored.
    input_files : list of dict
        One dict per sample. A missing ``stem`` is derived from the basename
        of ``raw_path`` (extension removed) and written back into the dict.
    output_dir : str
        Features are read from ``<output_dir>/features/<stem>.features``.
        A missing file prints a notice and the sample is skipped.

    Returns
    -------
    list of dict
        One record per (feature, MS2 record) pair, combining the feature's
        precursor/total/average attributes with the MS2 record's identifiers
        and peaks (stored under ``ms2_sample_peaks``).

    Notes
    -----
    ``pq`` is not imported in this cell; it must already exist in the
    notebook namespace and provide ``read_features``.
    """
    # Group the MS2 records by (file_id, feature_id) for constant-time lookup.
    records_by_feature = defaultdict(list)
    for record in combined_multiple_samples:
        if pd.notnull(record['feature_id']):
            records_by_feature[(record['file_id'], record['feature_id'])].append(record)

    linked_features = []

    for sample in input_files:
        if 'stem' not in sample:
            raw_name = os.path.basename(sample['raw_path'])
            sample['stem'] = os.path.splitext(raw_name)[0]
        stem = sample['stem']

        features_path = os.path.join(output_dir, 'features', f"{stem}.features")
        if not os.path.exists(features_path):
            print('missing feature file/s')
            continue

        for feature in pq.read_features(features_path):
            feature_id = feature.id
            matches = records_by_feature.get((stem, feature_id))
            if not matches:
                continue

            # `matches` is always a non-empty list at this point.
            for record in matches:
                linked_features.append({
                    'cluster_id': record['cluster_id'],
                    'file_id': record['file_id'],
                    'feature_id': feature_id,
                    'feature_peak_ids': feature.peak_ids,
                    'peak_id': record['peak_id'],
                    'msms_id': record['msms_id'],
                    'precursor_mz': feature.monoisotopic_mz,
                    'precursor_rt': feature.monoisotopic_rt,
                    'precursor_intensity': feature.monoisotopic_height,
                    'precursor_vol': feature.monoisotopic_volume,
                    'total_intensity': feature.total_height,
                    'total_volume': feature.total_volume,
                    'average_ms1_mz': feature.average_mz,
                    'average_rt': feature.average_rt,
                    'charge_state': feature.charge_state,
                    'ms2_sample_peaks': record['ms2_peaks'],
                })

    return linked_features


In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me
from pathlib import Path
from sklearn.metrics import root_mean_squared_error
import ast

# Fast and working!
def _searchsorted_dot_product(sample_peaks, reference_peaks):
    """Cosine-style score between a sample and a reference spectrum.

    Each sample peak is paired with the reference peak found at its
    ``np.searchsorted`` insertion position; pairs that fall past the end of
    the reference are dropped. Both paired intensity vectors are scaled to
    unit length before the dot product is taken.

    NOTE(review): the pairing uses no m/z tolerance and relies on the
    reference m/z values being in ascending order - confirm this is the
    intended matching rule.

    Returns None when either spectrum has fewer than three peaks or when a
    paired intensity vector has zero norm (the cosine is undefined there).
    """
    sample = np.asarray(sample_peaks, dtype=np.float32)
    reference = np.asarray(reference_peaks, dtype=np.float32)
    if sample.ndim != 2 or reference.ndim != 2:
        return None
    if len(sample) < 3 or len(reference) < 3:
        return None

    reference_mz = reference[:, 0]
    matched_indices = np.searchsorted(reference_mz, sample[:, 0])
    matched_indices = matched_indices[matched_indices < len(reference_mz)]
    if len(matched_indices) == 0:
        return 0.0

    sample_spectrum = sample[:len(matched_indices), 1]
    reference_spectrum = reference[matched_indices, 1]
    sample_norm = np.linalg.norm(sample_spectrum)
    reference_norm = np.linalg.norm(reference_spectrum)
    if sample_norm == 0 or reference_norm == 0:
        return None

    return float(np.dot(sample_spectrum / sample_norm,
                        reference_spectrum / reference_norm))


def link_msp(linked_features, msp_data, mz_tolerance=0.025, rt_tolerance=8.0):
    """Annotate each linked feature with its closest MSP library entry.

    For every feature, the library entries whose precursor m/z and retention
    time are both within tolerance are scored with
    ``rt_distance * 0.025 + mz_distance * 20`` (lower is better) and the
    lowest-scoring entry is kept. Exactly one record is emitted per input
    feature, whether or not a match was found.

    Parameters
    ----------
    linked_features : list of dict
        Records with the keys read below (``cluster_id``, ``file_id``,
        ``feature_id``, ``feature_peak_ids``, ``peak_id``, ``msms_id``,
        ``precursor_mz``, ``precursor_rt``, ``precursor_intensity``,
        ``precursor_vol``, ``average_ms1_mz``, ``average_rt``,
        ``charge_state``, ``ms2_sample_peaks``, ``total_intensity``,
        ``total_volume``).
    msp_data : list of dict
        Library entries. Entries lacking ``precursor_mz`` or
        ``retention_time`` are skipped; ``peaks`` is used for the similarity
        scores.
    mz_tolerance : float
        Maximum absolute precursor m/z difference.
    rt_tolerance : float
        Maximum absolute retention-time difference.

    Returns
    -------
    list of dict
        One record per feature. When a match exists, ``matches`` is a
        one-element list holding the best entry and ``mass_error_ppm``,
        ``rmse_mz``, ``unweighted_entropy_similarity``,
        ``entropy_similarity`` and ``dot_product`` describe that entry.
        Without a match those metrics are None, ``dot_product`` is ``'NA'``
        and ``matches`` is None.

    Notes
    -----
    A later cell of this notebook defines ``link_msp`` again; after a
    top-to-bottom run that later definition is the one in effect.
    """
    linked_msp_features = []

    for feature in linked_features:
        precursor_mz = feature['precursor_mz']
        precursor_rt = feature['precursor_rt']
        precursor_volume = feature['precursor_vol']
        ms2_sample_peaks = feature['ms2_sample_peaks']

        # Centroid the MS2 spectrum
        centroided_peaks = me.clean_spectrum(
            ms2_sample_peaks,
            min_ms2_difference_in_da=0.02,
            normalize_intensity=False
        )
        centroided_arr_list = np.array(centroided_peaks).tolist()

        # The query array does not depend on the library entry, so build it once.
        peaks_query = np.array(ms2_sample_peaks, dtype=np.float32)

        # Metrics always describe `best_match`; they stay None when nothing matches.
        best_match = None
        mass_error_ppm = None
        rmse_mz = None
        unweighted_similarity = None
        similarity = None
        dot_product = None

        for annotation in msp_data:
            if 'precursor_mz' not in annotation or 'retention_time' not in annotation:
                continue

            mz_distance = np.abs(precursor_mz - annotation['precursor_mz'])
            rt_distance = np.abs(precursor_rt - annotation['retention_time'])
            if not (mz_distance <= mz_tolerance and rt_distance <= rt_tolerance):
                continue

            # Lower score is a closer match (weighted sum of both distances).
            match_score = (rt_distance * 0.025) + (mz_distance * 20)
            if not (best_match is None or match_score < best_match['score']):
                continue  # not an improvement; skip the expensive similarity work

            mass_error_ppm = float(
                ((annotation['precursor_mz'] - precursor_mz) / annotation['precursor_mz']) * 10**6
            )
            rmse_mz = float(
                root_mean_squared_error([annotation.get('precursor_mz')], [precursor_mz])
            )

            peaks_reference = np.array([annotation.get('peaks')], dtype=np.float32)
            if peaks_reference.ndim == 3 and peaks_reference.shape[0] == 1:
                peaks_reference = peaks_reference[0]  # drop the wrapping dimension

            unweighted_similarity = me.calculate_unweighted_entropy_similarity(peaks_query, peaks_reference)
            similarity = me.calculate_entropy_similarity(peaks_query, peaks_reference)
            dot_product = _searchsorted_dot_product(centroided_peaks, peaks_reference)

            best_match = {
                'score': float(match_score),
                'name': annotation.get('name', None),
                'saturation': annotation.get('saturation', None),
                'retention_time': annotation.get('retention_time', None),
                'precursor_mz': annotation.get('precursor_mz', None),
                'precursor_type': annotation.get('precursor_type', None),
                'smiles': annotation.get('smiles', None),
                'msp_peaks': annotation.get('peaks', None),
            }

        # Build a fresh record for every feature so that an unmatched feature
        # never reuses (or mutates) the record of a previous one.
        linked_msp_features.append({
            'cluster_id': feature['cluster_id'],
            'file_id': feature['file_id'],
            'feature_id': feature['feature_id'],
            'feature_peak_ids': feature['feature_peak_ids'],
            'peak_id': feature['peak_id'],
            'msms_id': feature['msms_id'],
            'average_ms1_mz': feature['average_ms1_mz'],
            'total_intensity': feature['total_intensity'],
            'average_rt': feature['average_rt'],
            'total_volume': feature['total_volume'],
            'precursor_mz': precursor_mz,
            'precursor_intensity': feature['precursor_intensity'],
            'precursor_rt': precursor_rt,
            'precursor_volume': precursor_volume,
            # Scaling constant kept from the original notebook; its derivation
            # is not shown in this file.
            'normalized_area': precursor_volume * 114.7977026,
            'charge_state': feature['charge_state'],
            'centroided_ms2_peaks': centroided_arr_list,
            'mass_error_ppm': mass_error_ppm,
            'rmse_mz': rmse_mz,
            'unweighted_entropy_similarity': unweighted_similarity,
            'entropy_similarity': similarity,
            'dot_product': float(dot_product) if dot_product is not None else 'NA',
            'matches': [best_match] if best_match else None,
        })

    return linked_msp_features

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me
from pathlib import Path
from sklearn.metrics import root_mean_squared_error
import ast

# Fast and working!
def _searchsorted_dot_product(sample_peaks, reference_peaks):
    """Cosine-style score between a sample and a reference spectrum.

    Each sample peak is paired with the reference peak found at its
    ``np.searchsorted`` insertion position; pairs that fall past the end of
    the reference are dropped. Both paired intensity vectors are scaled to
    unit length before the dot product is taken.

    NOTE(review): the pairing uses no m/z tolerance and relies on the
    reference m/z values being in ascending order - confirm this is the
    intended matching rule.

    Returns None when either spectrum has fewer than three peaks or when a
    paired intensity vector has zero norm (the cosine is undefined there).
    """
    sample = np.asarray(sample_peaks, dtype=np.float32)
    reference = np.asarray(reference_peaks, dtype=np.float32)
    if sample.ndim != 2 or reference.ndim != 2:
        return None
    if len(sample) < 3 or len(reference) < 3:
        return None

    reference_mz = reference[:, 0]
    matched_indices = np.searchsorted(reference_mz, sample[:, 0])
    matched_indices = matched_indices[matched_indices < len(reference_mz)]
    if len(matched_indices) == 0:
        return 0.0

    sample_spectrum = sample[:len(matched_indices), 1]
    reference_spectrum = reference[matched_indices, 1]
    sample_norm = np.linalg.norm(sample_spectrum)
    reference_norm = np.linalg.norm(reference_spectrum)
    if sample_norm == 0 or reference_norm == 0:
        return None

    return float(np.dot(sample_spectrum / sample_norm,
                        reference_spectrum / reference_norm))


def link_msp(linked_features, msp_data, mz_tolerance=0.025, rt_tolerance=8.0):
    """Annotate each linked feature with its closest MSP library entry.

    For every feature, the library entries whose precursor m/z and retention
    time are both within tolerance are scored with
    ``rt_distance * 0.025 + mz_distance * 20`` (lower is better) and the
    lowest-scoring entry is kept. Exactly one record is emitted per input
    feature, whether or not a match was found.

    Parameters
    ----------
    linked_features : list of dict
        Records with the keys read below (``cluster_id``, ``file_id``,
        ``feature_id``, ``feature_peak_ids``, ``peak_id``, ``msms_id``,
        ``precursor_mz``, ``precursor_rt``, ``precursor_intensity``,
        ``precursor_vol``, ``average_ms1_mz``, ``average_rt``,
        ``charge_state``, ``ms2_sample_peaks``, ``total_intensity``,
        ``total_volume``).
    msp_data : list of dict
        Library entries. Entries lacking ``precursor_mz`` or
        ``retention_time`` are skipped; ``peaks`` is used for the similarity
        scores.
    mz_tolerance : float
        Maximum absolute precursor m/z difference.
    rt_tolerance : float
        Maximum absolute retention-time difference.

    Returns
    -------
    list of dict
        One record per feature. ``raw_ms2_peaks`` is always the feature's own
        spectrum as a float32 array. When a match exists, ``matches`` is a
        one-element list holding the best entry, and ``mass_error_ppm``,
        ``rmse_mz``, ``unweighted_entropy_similarity``,
        ``entropy_similarity``, ``dot_product`` and ``msp_peaks`` all
        describe that same entry. Without a match those metrics and
        ``msp_peaks`` are None, ``dot_product`` is ``'NA'`` and ``matches``
        is None.

    Notes
    -----
    An earlier cell of this notebook also defines ``link_msp``; this
    definition replaces it once the cell has run.
    """
    linked_msp_features = []

    for feature in linked_features:
        precursor_mz = feature['precursor_mz']
        precursor_rt = feature['precursor_rt']
        precursor_volume = feature['precursor_vol']
        ms2_sample_peaks = feature['ms2_sample_peaks']

        # Centroid the MS2 spectrum
        centroided_peaks = me.clean_spectrum(
            ms2_sample_peaks,
            min_ms2_difference_in_da=0.02,
            normalize_intensity=False
        )
        centroided_arr_list = np.array(centroided_peaks).tolist()

        # The query array belongs to the feature, not to a library entry, so
        # it is built once here and reported even when nothing matches.
        peaks_query = np.array(ms2_sample_peaks, dtype=np.float32)

        # Everything below is snapshotted together with `best_match`, so the
        # reported metrics can never come from a different (or previous) entry.
        best_match = None
        best_reference_peaks = None
        mass_error_ppm = None
        rmse_mz = None
        unweighted_similarity = None
        similarity = None
        dot_product = None

        for annotation in msp_data:
            if 'precursor_mz' not in annotation or 'retention_time' not in annotation:
                continue

            mz_distance = np.abs(precursor_mz - annotation['precursor_mz'])
            rt_distance = np.abs(precursor_rt - annotation['retention_time'])
            if not (mz_distance <= mz_tolerance and rt_distance <= rt_tolerance):
                continue

            # Lower score is a closer match (weighted sum of both distances).
            match_score = (rt_distance * 0.025) + (mz_distance * 20)
            if not (best_match is None or match_score < best_match['score']):
                continue  # not an improvement; skip the expensive similarity work

            mass_error_ppm = float(
                ((annotation['precursor_mz'] - precursor_mz) / annotation['precursor_mz']) * 1e6
            )
            rmse_mz = float(
                root_mean_squared_error([annotation.get('precursor_mz')], [precursor_mz])
            )

            peaks_reference = np.array([annotation.get('peaks')], dtype=np.float32)
            if peaks_reference.ndim == 3 and peaks_reference.shape[0] == 1:
                peaks_reference = peaks_reference[0]  # drop the wrapping dimension

            unweighted_similarity = me.calculate_unweighted_entropy_similarity(peaks_query, peaks_reference)
            similarity = me.calculate_entropy_similarity(peaks_query, peaks_reference)
            dot_product = _searchsorted_dot_product(centroided_peaks, peaks_reference)
            best_reference_peaks = peaks_reference

            best_match = {
                'score': float(match_score),
                'name': annotation.get('name', None),
                'saturation': annotation.get('saturation', None),
                'retention_time': annotation.get('retention_time', None),
                'precursor_mz': annotation.get('precursor_mz', None),
                'precursor_type': annotation.get('precursor_type', None),
                'smiles': annotation.get('smiles', None),
                'msp_peaks': annotation.get('peaks', None),
            }

        linked_msp_features.append({
            'cluster_id': feature['cluster_id'],
            'file_id': feature['file_id'],
            'feature_id': feature['feature_id'],
            'feature_peak_ids': feature['feature_peak_ids'],
            'peak_id': feature['peak_id'],
            'msms_id': feature['msms_id'],
            'average_ms1_mz': feature['average_ms1_mz'],
            'total_intensity': feature['total_intensity'],
            'average_rt': feature['average_rt'],
            'total_volume': feature['total_volume'],
            'precursor_mz': precursor_mz,
            'precursor_intensity': feature['precursor_intensity'],
            'precursor_rt': precursor_rt,
            'precursor_volume': precursor_volume,
            # Scaling constant kept from the original notebook; its derivation
            # is not shown in this file.
            'normalized_area': precursor_volume * 114.7977026,
            'charge_state': feature['charge_state'],
            'centroided_ms2_peaks': centroided_arr_list,
            'mass_error_ppm': mass_error_ppm,
            'rmse_mz': rmse_mz,
            'unweighted_entropy_similarity': unweighted_similarity,
            'entropy_similarity': similarity,
            'dot_product': float(dot_product) if dot_product is not None else 'NA',
            'matches': [best_match] if best_match else None,
            'raw_ms2_peaks': peaks_query,
            'msp_peaks': best_reference_peaks,
        })

    return linked_msp_features


In [None]:
import pandas as pd
import numpy as np
import re
from itertools import combinations

def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    """Cosine similarity of two spectra over peaks matched within ``tol``.

    Peaks are matched greedily in input order: each peak of the first
    spectrum takes the first not-yet-used peak of the second spectrum whose
    m/z differs by at most ``tol``. Only the matched intensities enter the
    score; each matched vector is scaled to unit length before the dot
    product, so unmatched peaks do not lower the result.

    Parameters
    ----------
    mz1, int1 : sequence of numbers
        m/z values and intensities of the first spectrum (same length).
    mz2, int2 : sequence of numbers
        m/z values and intensities of the second spectrum (same length).
    tol : float
        Maximum absolute m/z difference for two peaks to match.

    Returns
    -------
    float
        The cosine of the matched intensity vectors, or 0.0 when no peaks
        match or a matched vector has zero norm.
    """
    matched1, matched2 = [], []
    used = set()  # indices of the second spectrum that are already paired

    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if j in used:
                continue
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                used.add(j)
                break  # one partner per peak, on both sides

    if not matched1:
        return 0.0

    # Float arrays: integer intensities cannot be divided in place.
    s1 = np.asarray(matched1, dtype=float)
    s2 = np.asarray(matched2, dtype=float)
    norm1 = np.linalg.norm(s1)
    norm2 = np.linalg.norm(s2)
    if norm1 == 0 or norm2 == 0:
        return 0.0  # cosine is undefined for an all-zero vector

    return float(np.dot(s1 / norm1, s2 / norm2))

# Pairwise spectrum comparison within each peak_id group.
# NOTE: `df` and `me` are not defined in this cell; they must already exist in
# the notebook namespace, as must `dot_product_with_tolerance`.
def _compare_spectra(peak_id, row1, row2):
    """Build one result record comparing two MS2 spectra that share ``peak_id``."""
    return {
        'dot_product': dot_product_with_tolerance(
            row1['cent_mz'], row1['cent_intensity'],
            row2['cent_mz'], row2['cent_intensity'],
        ),
        'unweighted_entropy_similarity': me.calculate_unweighted_entropy_similarity(
            row1['ms2_peaks'], row2['ms2_peaks']
        ),
        'entropy_similarity': me.calculate_entropy_similarity(
            row1['ms2_peaks'], row2['ms2_peaks']
        ),
        'peak_id': peak_id,
        'msms_id_1': row1['msms_id'],
        'mz1': row1['cent_mz'],
        'int1': row1['cent_intensity'],
        'feature_id1': row1['feature_id'],
        'cluster_id1': row1['cluster_id'],
        'file_id1': row1['file_id'],
        'msms_id_2': row2['msms_id'],
        'mz2': row2['cent_mz'],
        'int2': row2['cent_intensity'],
        'feature_id2': row2['feature_id'],
        'cluster_id2': row2['cluster_id'],
        'file_id2': row2['file_id'],
    }


results = []
for peak_id, group in df.groupby('peak_id'):
    # A single spectrum has nothing to be compared with.
    if len(group) >= 2:
        group_rows = [row for _, row in group.iterrows()]
        results.extend(
            _compare_spectra(peak_id, first, second)
            for first, second in combinations(group_rows, 2)
        )

# Collect every pairwise comparison into one table.
dot_product_df = pd.DataFrame(results)
print(dot_product_df.head())

 

In [None]:
import numpy as np
import re

# WORKING - not debug
# Optional sign, then digits with an optional fraction (or a bare ".5"),
# then an optional exponent.
_NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?")


def parse_array_from_string(s):
    """Extract every number found in a string as a float array.

    Signed integers, decimals and exponent notation are recognised, e.g.
    ``"[1.5, -2, 3e-2]"`` yields ``array([1.5, -2.0, 0.03])``.

    Parameters
    ----------
    s : object
        Text to scan. Anything that is not a ``str`` yields an empty array.

    Returns
    -------
    numpy.ndarray
        1-D float array of the numbers in order of appearance; empty when
        none are found.
    """
    if not isinstance(s, str):
        return np.array([])
    return np.array([float(token) for token in _NUMBER_PATTERN.findall(s)])

def _parse_msp_peak(line):
    """Return ``(mz, intensity)`` for a peak line, or None if it is not one.

    Only the first two whitespace-separated tokens are read, so a trailing
    annotation column does not prevent the peak from being recognised.
    """
    tokens = line.split()
    if len(tokens) < 2:
        return None
    try:
        return float(tokens[0]), float(tokens[1])
    except ValueError:
        return None


def read_msp_file(file_path):
    """Parse an MSP spectral library into a list of spectrum dicts.

    Header keys are matched case-insensitively. Recognised keys and the dict
    entries they produce:

    * ``NAME`` -> ``name``; when the value contains ``|`` the text after the
      first ``|`` is stored under ``saturation``
    * ``PRECURSORMZ`` -> ``precursor_mz`` (float)
    * ``PRECURSORTYPE`` -> ``precursor_type``
    * ``IONMODE`` -> ``ion_mode``
    * ``RETENTIONTIME`` -> ``retention_time``, multiplied by 60 (the file
      value is presumably in minutes - confirm against the library source)
    * ``FORMULA``, ``INCHIKEY``, ``SMILES``, ``COMMENT`` -> ``formula``,
      ``inchi_key``, ``smiles``, ``comment``

    A line starting with ``Num Peaks`` switches to peak parsing; each peak
    line is appended to ``peaks`` as an ``(mz, intensity)`` tuple. A record
    ends at the first line that is not a peak (blank line or the header of
    the next record) or when a second ``NAME`` line is seen. A header line
    that ends a peak list is still parsed as the start of the next record.

    Parameters
    ----------
    file_path : str
        Path of the MSP file.

    Returns
    -------
    list of dict
        One dict per spectrum, in file order. Only the keys present in the
        file appear in each dict.

    Raises
    ------
    ValueError
        If a ``PRECURSORMZ`` value is not a number.
    """
    simple_fields = {
        'PRECURSORTYPE': 'precursor_type',
        'IONMODE': 'ion_mode',
        'FORMULA': 'formula',
        'INCHIKEY': 'inchi_key',
        'SMILES': 'smiles',
        'COMMENT': 'comment',
    }

    spectra_data = []
    current_spectrum = {}
    peak_data_started = False

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()

            if line.upper().startswith("NUM PEAKS"):
                peak_data_started = True
                continue

            if peak_data_started:
                peak = _parse_msp_peak(line)
                if peak is not None:
                    current_spectrum.setdefault('peaks', []).append(peak)
                    continue
                # First non-peak line: the record is complete.
                if current_spectrum:
                    spectra_data.append(current_spectrum)
                current_spectrum = {}
                peak_data_started = False
                # Fall through so that a header line which ended the peak
                # list is parsed as part of the next record, not discarded.

            key, separator, value = line.partition(':')
            if not separator:
                continue  # blank line or text without a "KEY:" prefix
            key = key.strip().upper()
            value = value.strip()

            if key == 'NAME':
                # A second name without an intervening peak list starts a new record.
                if 'name' in current_spectrum:
                    spectra_data.append(current_spectrum)
                    current_spectrum = {}
                if '|' in value:
                    parts = value.split('|')
                    current_spectrum['name'] = parts[0].strip()
                    current_spectrum['saturation'] = parts[1].strip()
                else:
                    current_spectrum['name'] = value
            elif key == 'PRECURSORMZ':
                current_spectrum['precursor_mz'] = float(value)
            elif key == 'RETENTIONTIME':
                if not value:
                    print("[SKIP] empty retention_time")
                    continue
                try:
                    rt = float(value)
                except ValueError:
                    # Values such as "[5.2]": accept them only when exactly
                    # one number can be extracted.
                    rt_parsed = parse_array_from_string(value)
                    if len(rt_parsed) == 1:
                        rt = float(rt_parsed[0])
                    else:
                        print("[SKIP] array invalid, skipping")
                        continue
                current_spectrum['retention_time'] = rt * 60  # convert to seconds
            elif key in simple_fields:
                current_spectrum[simple_fields[key]] = value

    # Add last spectrum if it exists
    if current_spectrum:
        spectra_data.append(current_spectrum)

    return spectra_data