In [1]:
import pastaq as pq
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors

In [2]:
# Raw .ms2 inputs: one entry per CE{20,30,40,50,60} x exc{0,2,10} file name,
# ordered CE-major (all exc values for CE20 first, then CE30, ...).
# NOTE(review): absolute, machine-specific location - consider deriving it from
# a configurable data directory.
input_files = [
    {
        'raw_path': r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE{}_exc{}-p1_2.ms2".format(energy, exc)
    }
    for energy in (20, 30, 40, 50, 60)
    for exc in (0, 2, 10)
]

In [3]:
# Root of the processing output tree. The functions below look for
# `raw/<stem>.ms2` and `features/<stem>.features` underneath this directory.
# NOTE(review): absolute, machine-specific path.
output_dir = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID"

In [4]:
# Load the feature-cluster annotation table. Despite the `_csv` suffix this is a
# pandas DataFrame; `combine_multiple_samples` reads its `file_id`, `msms_id`,
# `cluster_id`, `feature_id`, `peak_id` and `charge_state` columns.
feature_clusters_annotations_csv = pd.read_csv(r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\quant\feature_clusters_annotations.csv")

In [5]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir):
    """Attach MS2 peak lists to the annotation rows that reference each scan.

    Parameters
    ----------
    feature_clusters_annotations_csv : pandas.DataFrame
        Annotation table. The columns ``file_id``, ``msms_id``, ``cluster_id``,
        ``feature_id``, ``peak_id`` and ``charge_state`` are read; rows whose
        ``msms_id`` is null are ignored.
    input_files : list of dict
        One dict per sample holding a ``'raw_path'`` entry. When a dict has no
        ``'stem'`` entry, one is added in place (file name without extension).
    output_dir : str
        Directory that contains ``raw/<stem>.ms2`` for every sample.

    Returns
    -------
    list of dict
        One record per (scan, annotation row) pair with the keys
        ``cluster_id``, ``file_id``, ``feature_id``, ``peak_id``, ``msms_id``,
        ``ms2_rt``, ``charge_state`` and ``ms2_peaks``. ``ms2_peaks`` is an
        ``(n, 2)`` float32 array of (m/z, intensity) rows sorted by m/z; rows
        annotating the same scan share the same array object.
    """
    # Index the annotation rows by (file_id, msms_id) once so that every scan
    # lookup is a dictionary access. A single vectorised filter followed by
    # `to_dict('records')` avoids building one Series per row via `iterrows`.
    table = feature_clusters_annotations_csv
    annotated_rows = table[table['msms_id'].notna()]
    annotations_lookup = defaultdict(list)
    for row in annotated_rows.to_dict('records'):
        annotations_lookup[(row['file_id'], row['msms_id'])].append(row)

    combined_multiple_samples = []

    for file in input_files:
        if 'stem' not in file:
            file['stem'] = os.path.splitext(os.path.basename(file['raw_path']))[0]
        stem = file['stem']
        in_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")

        if not os.path.exists(in_path):
            # Report the skipped sample (as link_features does for feature
            # files) instead of dropping it from the analysis silently.
            print(f"missing raw file: {in_path}")
            continue

        raw_data = pq.read_raw_data(in_path)

        for scan in raw_data.scans:
            annotations = annotations_lookup.get((stem, scan.scan_number))
            if not annotations:
                continue

            ms2_mz = scan.mz
            ms2_intensity = scan.intensity

            # Explicit None/length checks: truthiness (`not ms2_mz`) works for
            # lists but raises on NumPy arrays with more than one element.
            if ms2_mz is None or ms2_intensity is None:
                continue
            if len(ms2_mz) == 0 or len(ms2_mz) != len(ms2_intensity):
                continue

            # Sort the peaks by m/z and store them as (m/z, intensity) rows.
            mz_array = np.asarray(ms2_mz)
            intensity_array = np.asarray(ms2_intensity)
            order = np.argsort(mz_array)
            ms2_peaks = np.column_stack(
                (mz_array[order], intensity_array[order])
            ).astype(np.float32)

            for row in annotations:
                combined_multiple_samples.append({
                    'cluster_id': row['cluster_id'],
                    'file_id': row['file_id'],
                    'feature_id': row['feature_id'],
                    'peak_id': row['peak_id'],
                    'msms_id': row['msms_id'],
                    'ms2_rt': scan.retention_time,
                    'charge_state': row['charge_state'],
                    'ms2_peaks': ms2_peaks,
                })

    return combined_multiple_samples

In [6]:
# Build one record per (MS2 scan, annotation row) pair across all raw files.
combined_multiple_samples = combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir)

In [7]:
# Feature-file inputs: one entry per CE{20,30,40,50,60} x exc{0,2,10} file name,
# ordered CE-major.
# NOTE: this rebinds `input_files`, replacing the raw-file list defined earlier
# in the notebook; cells above must not be re-run after this one without
# re-running the raw-file cell first.
input_files = [
    {
        'raw_path': r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\features\p_CE{}_exc{}-p1_2.features".format(energy, exc)
    }
    for energy in (20, 30, 40, 50, 60)
    for exc in (0, 2, 10)
]

In [8]:
# Same output root as assigned earlier in the notebook (the value is identical);
# `link_features` reads `features/<stem>.features` underneath it.
output_dir = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID"

In [9]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me
from pathlib import Path

# Fast & working!
def link_features(combined_multiple_samples, input_files, output_dir):
    """Join the MS2 records with the feature objects stored per sample.

    Parameters
    ----------
    combined_multiple_samples : list of dict
        Records carrying at least ``file_id``, ``feature_id``, ``cluster_id``,
        ``peak_id``, ``msms_id`` and ``ms2_peaks``. Records whose
        ``feature_id`` is null are ignored.
    input_files : list of dict
        One dict per sample holding a ``'raw_path'`` entry. When a dict has no
        ``'stem'`` entry, one is added in place (file name without extension).
    output_dir : str
        Directory that contains ``features/<stem>.features`` for every sample.

    Returns
    -------
    list of dict
        One entry per (feature, MS2 record) pair, combining the identifiers of
        the record with the precursor/total/average values read from the
        feature object and the record's peak list (``ms2_sample_peaks``).
    """
    # Index the MS2 records by (file_id, feature_id) for constant-time lookup.
    records_by_feature = defaultdict(list)
    for record in combined_multiple_samples:
        if pd.isnull(record['feature_id']):
            continue
        records_by_feature[(record['file_id'], record['feature_id'])].append(record)

    linked_features = []

    for entry in input_files:
        if 'stem' not in entry:
            file_name = os.path.basename(entry['raw_path'])
            entry['stem'] = os.path.splitext(file_name)[0]
        stem = entry['stem']
        features_path = os.path.join(output_dir, 'features', f"{stem}.features")

        if not os.path.exists(features_path):
            print('missing feature file/s')
            continue

        for feature in pq.read_features(features_path):
            feature_id = feature.id
            matching_records = records_by_feature.get((stem, feature_id))
            if not matching_records:
                continue

            # One output entry per MS2 record attached to this feature.
            linked_features.extend([
                {
                    'cluster_id': record['cluster_id'],
                    'file_id': record['file_id'],
                    'feature_id': feature_id,
                    'feature_peak_ids': feature.peak_ids,
                    'peak_id': record['peak_id'],
                    'msms_id': record['msms_id'],
                    'precursor_mz': feature.monoisotopic_mz,
                    'precursor_rt': feature.monoisotopic_rt,
                    'precursor_intensity': feature.monoisotopic_height,
                    'precursor_vol': feature.monoisotopic_volume,
                    'total_intensity': feature.total_height,
                    'total_volume': feature.total_volume,
                    'average_ms1_mz': feature.average_mz,
                    'average_rt': feature.average_rt,
                    'charge_state': feature.charge_state,
                    'ms2_sample_peaks': record['ms2_peaks'],
                }
                for record in matching_records
            ])

    return linked_features


In [10]:
# Combine each MS2 record with the values of the feature it belongs to.
linked_features = link_features(combined_multiple_samples=combined_multiple_samples, input_files=input_files, output_dir=output_dir)

In [11]:
# Location of the MSP file that is parsed in the next cell and matched against
# the linked features.
# NOTE(review): absolute, machine-specific path.
msp_file = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\Msp_CID_2025_03_21_12_30_01_AlignmentResult_2025_03_19_11_06_05.msp"

In [12]:
# Parse the MSP file. `link_msp` iterates over the result and reads each entry
# like a dict (keys such as 'precursor_mz', 'retention_time', 'name', 'peaks').
msp_data = pq.read_msp_file(msp_file)

In [13]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me
from pathlib import Path
from sklearn.metrics import root_mean_squared_error
import ast

def _select_best_msp_annotation(precursor_mz, precursor_rt, msp_data, mz_tolerance, rt_tolerance):
    """Return ``(annotation, score)`` of the closest in-tolerance entry, else ``(None, None)``."""
    best_annotation = None
    best_score = None

    for annotation in msp_data:
        # Entries lacking either coordinate cannot be compared.
        if 'precursor_mz' not in annotation or 'retention_time' not in annotation:
            continue

        mz_distance = np.abs(precursor_mz - annotation['precursor_mz'])
        rt_distance = np.abs(precursor_rt - annotation['retention_time'])
        if not (mz_distance <= mz_tolerance and rt_distance <= rt_tolerance):
            continue

        # Weighted distance, lower is better. The strict `<` below keeps the
        # first of several equally scored entries.
        score = (rt_distance * 0.025) + (mz_distance * 20)
        if best_score is None or score < best_score:
            best_annotation, best_score = annotation, score

    return best_annotation, best_score


def _positional_dot_product(sample_peaks, reference_peaks):
    """Return the normalised dot product of two peak arrays, or ``None``.

    ``None`` is returned when either spectrum is not a 2-D (m/z, intensity)
    array, has fewer than three peaks, or when one of the compared intensity
    vectors has zero length or zero norm.

    NOTE(review): peaks are paired through ``np.searchsorted`` insertion
    positions, not through an m/z tolerance window, and ``searchsorted``
    assumes the reference m/z values are sorted. The value is therefore not a
    conventional tolerance-matched cosine score - confirm this is intended.
    """
    if sample_peaks.ndim != 2 or reference_peaks.ndim != 2:
        return None
    if len(sample_peaks) < 3 or len(reference_peaks) < 3:
        return None

    sample_mz = sample_peaks[:, 0]
    sample_intensity = sample_peaks[:, 1].astype(np.float32)
    reference_mz = reference_peaks[:, 0]
    reference_intensity = reference_peaks[:, 1].astype(np.float32)

    # Insertion position of every sample m/z in the reference m/z axis;
    # positions past the end of the reference are dropped.
    positions = np.searchsorted(reference_mz, sample_mz)
    positions = positions[positions < len(reference_mz)]

    sample_vector = sample_intensity[:len(positions)]
    reference_vector = reference_intensity[positions]

    sample_norm = np.linalg.norm(sample_vector)
    reference_norm = np.linalg.norm(reference_vector)
    if sample_norm == 0 or reference_norm == 0:
        # Avoid a division by zero that would yield NaN.
        return None

    return float(np.dot(sample_vector / sample_norm, reference_vector / reference_norm))


def link_msp(linked_features, msp_data, mz_tolerance=0.025, rt_tolerance=8.0,
             area_normalization_factor=114.7977026):
    """Annotate every linked feature with its closest MSP library entry.

    For each feature the library entry with the lowest weighted precursor
    m/z / retention-time distance inside both tolerances is selected, and the
    match metrics are computed once for that entry.

    Every input feature produces exactly one output record. A feature without
    an in-tolerance library entry gets ``matches = None``, ``None`` for
    ``mass_error_ppm``, ``rmse_mz`` and both entropy similarities, and ``'NA'``
    for ``dot_product``.

    Parameters
    ----------
    linked_features : list of dict
        Records with the keys ``cluster_id``, ``file_id``, ``feature_id``,
        ``feature_peak_ids``, ``peak_id``, ``msms_id``, ``precursor_mz``,
        ``precursor_rt``, ``precursor_intensity``, ``precursor_vol``,
        ``average_ms1_mz``, ``average_rt``, ``charge_state``,
        ``ms2_sample_peaks``, ``total_intensity`` and ``total_volume``.
    msp_data : iterable of dict-like
        Library entries. Entries without ``precursor_mz`` or
        ``retention_time`` are skipped; ``peaks`` is used for the spectral
        comparison when present.
    mz_tolerance : float, optional
        Largest accepted absolute precursor m/z difference.
    rt_tolerance : float, optional
        Largest accepted absolute retention-time difference.
    area_normalization_factor : float, optional
        Multiplier applied to the precursor volume to obtain
        ``normalized_area``. The default is the constant that used to be
        hard-coded here; its derivation is not documented in this notebook -
        TODO confirm.

    Returns
    -------
    list of dict
        One record per input feature, in input order. ``matches`` is a
        one-element list holding the best library entry (keys ``score``,
        ``name``, ``saturation``, ``retention_time``, ``precursor_mz``,
        ``precursor_type``, ``smiles``, ``msp_peaks``) or ``None``.
    """
    linked_msp_features = []

    for feature in linked_features:
        precursor_mz = feature['precursor_mz']
        precursor_rt = feature['precursor_rt']
        precursor_volume = feature['precursor_vol']
        ms2_sample_peaks = feature['ms2_sample_peaks']

        # Centroid the MS2 spectrum.
        centroided_arr = np.array(me.clean_spectrum(
            ms2_sample_peaks,
            min_ms2_difference_in_da=0.02,
            normalize_intensity=False,
        ))

        annotation, match_score = _select_best_msp_annotation(
            precursor_mz, precursor_rt, msp_data, mz_tolerance, rt_tolerance
        )

        # Defaults describe "no library match"; they are overwritten below.
        mass_error_ppm = None
        rmse_mz = None
        unweighted_similarity = None
        similarity = None
        dot_product = None
        matches = None

        if annotation is not None:
            library_mz = annotation['precursor_mz']
            mass_error_ppm = float(((library_mz - precursor_mz) / library_mz) * 10**6)
            # The RMSE of a single (true, predicted) pair is the absolute
            # difference, so no external metric function is needed.
            rmse_mz = float(np.abs(library_mz - precursor_mz))

            peaks_query = np.array(ms2_sample_peaks, dtype=np.float32)
            peaks_reference = np.array(annotation.get('peaks'), dtype=np.float32)

            # Spectral metrics need a non-empty 2-D reference peak list.
            if peaks_reference.ndim == 2 and peaks_reference.shape[0] > 0:
                unweighted_similarity = me.calculate_unweighted_entropy_similarity(
                    peaks_query, peaks_reference
                )
                similarity = me.calculate_entropy_similarity(peaks_query, peaks_reference)
                dot_product = _positional_dot_product(centroided_arr, peaks_reference)

            matches = [{
                'score': float(match_score),
                'name': annotation.get('name', None),
                'saturation': annotation.get('saturation', None),
                'retention_time': annotation.get('retention_time', None),
                'precursor_mz': annotation.get('precursor_mz', None),
                'precursor_type': annotation.get('precursor_type', None),
                'smiles': annotation.get('smiles', None),
                'msp_peaks': annotation.get('peaks', None),
            }]

        # Build a fresh record for every feature so that an unmatched feature
        # can neither fail on an undefined record nor overwrite and duplicate
        # the record of the feature processed before it.
        linked_msp_features.append({
            'cluster_id': feature['cluster_id'],
            'file_id': feature['file_id'],
            'peak_id': feature['peak_id'],
            'msms_id': feature['msms_id'],
            'average_ms1_mz': feature['average_ms1_mz'],
            'total_intensity': feature['total_intensity'],
            'average_rt': feature['average_rt'],
            'total_volume': feature['total_volume'],
            'precursor_mz': precursor_mz,
            'precursor_intensity': feature['precursor_intensity'],
            'precursor_rt': precursor_rt,
            'precursor_volume': precursor_volume,
            'normalized_area': precursor_volume * area_normalization_factor,
            'charge_state': feature['charge_state'],
            'feature_id': feature['feature_id'],
            'feature_peak_ids': feature['feature_peak_ids'],
            'centroided_ms2_peaks': centroided_arr.tolist(),
            'mass_error_ppm': mass_error_ppm,
            'rmse_mz': rmse_mz,
            'unweighted_entropy_similarity': unweighted_similarity,
            'entropy_similarity': similarity,
            'dot_product': dot_product if dot_product is not None else 'NA',
            'matches': matches,
        })

    return linked_msp_features

In [14]:
# Match the linked features against the MSP entries. The tolerances passed here
# equal the defaults declared in the `link_msp` signature.
linked_msp = link_msp(linked_features, msp_data, mz_tolerance=0.025, rt_tolerance=8.0)

In [15]:
from pathlib import Path

# Write the linked results to CSV, creating the target directory when needed.
filepath = Path('CID_metadata/CID_CORRECT/linked_msp.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
linked_msp_df = pd.DataFrame(linked_msp)
# `index=False` is an argument of `to_csv`. It used to be passed to `Path(...)`,
# where it never reached pandas, so the index column was still written.
linked_msp_df.to_csv(filepath, index=False)