In [3]:
import pastaq as pq
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors

In [4]:
input_files = [{'raw_path': r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE20_exc0-p1_2.ms2"},
               { 'raw_path': r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE20_exc2-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE20_exc10-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE30_exc0-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE30_exc2-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE30_exc10-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE40_exc0-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE40_exc2-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE40_exc10-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE50_exc0-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE50_exc2-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE50_exc10-p1_2.ms2"},
                {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE60_exc0-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE60_exc2-p1_2.ms2"},
               {'raw_path' : r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\raw\p_CE60_exc10-p1_2.ms2"},
               ]

In [5]:
output_dir = r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID"

In [6]:
feature_clusters_annotations_csv = pd.read_csv(r"C:\Users\diego.DESKTOP-7OSFK5B\Documents\MSc_Research_Project1\CID_files\pastaq_CID\quant\feature_clusters_annotations.csv")

In [7]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import ms_entropy as me

def combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir):
    # Preprocess: build a lookup dictionary to avoid filtering the DataFrame each time
    annotations_lookup = defaultdict(list)

    for _, row in feature_clusters_annotations_csv.iterrows():
        if pd.notnull(row['msms_id']):
            key = (row['file_id'], row['msms_id'])
            annotations_lookup[key].append(row)

    combined_multiple_samples = []

    for file in input_files:
        if 'stem' not in file:
            base_name = os.path.splitext(os.path.basename(file['raw_path']))[0]
            file['stem'] = base_name
        stem = file['stem']
        in_path = os.path.join(output_dir, 'raw', f"{stem}.ms2")

        if not os.path.exists(in_path):
            continue

        raw_data = pq.read_raw_data(in_path)

        for scan in raw_data.scans:
            scan_number = scan.scan_number
            key = (stem, scan_number)
            annotations = annotations_lookup.get(key)

            if not annotations:
                continue

            ms2_mz = scan.mz
            ms2_intensity = scan.intensity
            ms2_rt = scan.retention_time

            if not ms2_mz or not ms2_intensity or len(ms2_mz) != len(ms2_intensity):
                continue

            # Convert to numpy array for faster sorting
            mz_array = np.array(ms2_mz)
            intensity_array = np.array(ms2_intensity)
            sorted_indices = np.argsort(mz_array)
            mz_intensity_pairs = list(zip(mz_array[sorted_indices], intensity_array[sorted_indices]))
            ms2_peaks = np.array(mz_intensity_pairs, dtype=np.float32)
            centroided_peaks = me.clean_spectrum(
                    ms2_peaks,
                    min_ms2_difference_in_da=0.02,
                    normalize_intensity=False
                )
            cent_mz, cent_intensity = zip(*centroided_peaks)
            cent_mz_arr = np.array(cent_mz)
            cent_mz_arr_list = cent_mz_arr.tolist()
            cent_intensity_arr = np.array(cent_intensity)
            cent_intensity_arr_list = cent_intensity_arr.tolist()

            for row in annotations:
                combined_multiple_samples.append({
                    'cluster_id': row['cluster_id'],
                    'file_id': row['file_id'],
                    'feature_id': row['feature_id'],
                    'peak_id': row['peak_id'],
                    'msms_id': row['msms_id'],
                    'ms2_rt': ms2_rt,
                    'charge_state': row['charge_state'],
                    'ms2_peaks' : ms2_peaks,
                    'cent_mz' : cent_mz_arr_list,
                    'cent_intensity' : cent_intensity_arr_list
                })


    return combined_multiple_samples

In [8]:
combined_multiple_samples = combine_multiple_samples(feature_clusters_annotations_csv, input_files, output_dir)


In [9]:
df = pd.DataFrame(combined_multiple_samples)

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
import ms_entropy as me
#Working version 1
def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    matched1, matched2 = [], []
    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                break  # prevent duplicate matching
    if not matched1 or not matched2:
        return 0.0
    s1 = np.array(matched1)
    s2 = np.array(matched2)
    s1 /= np.linalg.norm(s1)
    s2 /= np.linalg.norm(s2)
    return float(np.dot(s1, s2))

def process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01):
    required_columns = ['peak_id', 'cluster_id', 'file_id', 'feature_id', 'msms_id',
                        'ms2_rt', 'charge_state', 'ms2_peaks', 'cent_mz', 'cent_intensity']
    assert all(col in df.columns for col in required_columns), "Missing required columns in df"

    results = []

    for peak_id, group in df.groupby('peak_id'):
        if len(group) < 2:
            row = group.iloc[0]
            cent_mz_intensity_pairs = list(zip(row['cent_mz'], row['cent_intensity']))
            results.append({
                'cluster_id': row['cluster_id'],
                'feature_id': row['feature_id'],
                'peak_id': peak_id,
                'avg_ms2_retention_time': row['ms2_rt'],
                'num_msms': 1,
                'charge_state': row['charge_state'],
                'avg_ms2_cent_peaks': cent_mz_intensity_pairs,
                'raw_ms2_sample_peaks': row['ms2_peaks']
            })
        else:
            msms_list = []
            for (i1, row1), (i2, row2) in combinations(group.iterrows(), 2):
                dot_product = dot_product_with_tolerance(row1['cent_mz'], row1['cent_intensity'],
                                                         row2['cent_mz'], row2['cent_intensity'],
                                                         tol=mz_tolerance)
                entropy_sim = me.calculate_entropy_similarity(row1['ms2_peaks'], row2['ms2_peaks'])

                msms_list.append({
                    'dot_product': dot_product,
                    'entropy_similarity': entropy_sim,
                    'ms2_retention_time': row1['ms2_rt'],
                    'cent_mz_intensity_pairs': list(zip(row1['cent_intensity'], row1['cent_mz']))  # (intensity, mz)
                })

            if not msms_list:
                continue

            sorted_msms = sorted(msms_list, key=lambda x: x['entropy_similarity'], reverse=True)
            top_msms = sorted_msms[:top_n]

            dot_product_list = [msms['dot_product'] for msms in top_msms]
            entropy_similarity_list = [msms['entropy_similarity'] for msms in top_msms]
            avg_dot_product = np.mean(dot_product_list)
            avg_entropy_similarity = np.mean(entropy_similarity_list)
            avg_retention_time = np.mean([msms['ms2_retention_time'] for msms in top_msms])

            # Merge centroids from top MS/MS
            all_cent_ms2_peaks = [np.array(pairs) for msms in top_msms for pairs in [msms['cent_mz_intensity_pairs']]]
            combined_cent_peaks = np.concatenate(all_cent_ms2_peaks, axis=0)
            sorted_arr = combined_cent_peaks[combined_cent_peaks[:, 1].argsort()]

            groups = []
            current_group = [sorted_arr[0]]
            for row in sorted_arr[1:]:
                if abs(row[1] - current_group[-1][1]) <= mz_merge_thresh:
                    current_group.append(row)
                else:
                    groups.append(np.array(current_group))
                    current_group = [row]
            groups.append(np.array(current_group))  # Add last group

            averaged_results = []
            for group in groups:
                avg_intensity = np.mean(group[:, 0])
                avg_mz = np.mean(group[:, 1])
                averaged_results.append([avg_intensity, avg_mz])
            avg_ms2_cent_peaks = np.array(averaged_results)

            results.append({
                'peak_id': peak_id,
                'dot_product_list': dot_product_list,
                'avg_dot_product': avg_dot_product,
                'avg_ms2_retention_time': avg_retention_time,
                'num_msms': len(top_msms),
                'entropy_similarity_list': entropy_similarity_list,
                'avg_entropy_similarity': avg_entropy_similarity,
                'avg_ms2_cent_peaks': avg_ms2_cent_peaks.tolist()
            })

    return pd.DataFrame(results)


In [8]:
import numpy as np
import pandas as pd
from itertools import combinations
import ms_entropy as me
#Version 2
def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    matched1, matched2 = [], []
    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                break  # prevent duplicate matching
    if not matched1 or not matched2:
        return 0.0
    s1 = np.array(matched1)
    s2 = np.array(matched2)
    s1 /= np.linalg.norm(s1)
    s2 /= np.linalg.norm(s2)
    return float(np.dot(s1, s2))

def process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01):
    required_columns = ['peak_id', 'cluster_id', 'file_id', 'feature_id', 'msms_id',
                        'ms2_rt', 'charge_state', 'ms2_peaks', 'cent_mz', 'cent_intensity']
    assert all(col in df.columns for col in required_columns), "Missing required columns in df"

    results = []

    for peak_id, group in df.groupby('peak_id'):
        if len(group) < 2:
            row = group.iloc[0]
            cent_mz_intensity_pairs = list(zip(row['cent_mz'], row['cent_intensity']))
            results.append({
                'cluster_id': row['cluster_id'],
                'feature_id': row['feature_id'],
                'peak_id': peak_id,
                'avg_ms2_retention_time': row['ms2_rt'],
                'num_msms': 1,
                'charge_state': row['charge_state'],
                'avg_ms2_cent_peaks': cent_mz_intensity_pairs,
                'raw_ms2_sample_peaks': row['ms2_peaks']
            })
        else:
            msms_list = []
            for (i1, row1), (i2, row2) in combinations(group.iterrows(), 2):
                dot_product = dot_product_with_tolerance(row1['cent_mz'], row1['cent_intensity'],
                                                         row2['cent_mz'], row2['cent_intensity'],
                                                         tol=mz_tolerance)
                entropy_sim = me.calculate_entropy_similarity(row1['ms2_peaks'], row2['ms2_peaks'])

                msms_list.append({
                    'cluster_id' : row1['cluster_id'],
                    'feature_id' : row1['feature_id'],
                    'dot_product': dot_product,
                    'entropy_similarity': entropy_sim,
                    'ms2_retention_time': row1['ms2_rt'],
                    'cent_mz_intensity_pairs': list(zip(row1['cent_mz'], row1['cent_intensity']))  #Fixed to (mz, intensity) from (intensity, mz)
                })

            if not msms_list:
                continue

            sorted_msms = sorted(msms_list, key=lambda x: x['entropy_similarity'], reverse=True)
            top_msms = sorted_msms[:top_n]

            cluster_id = [msms['cluster_id'] for msms in top_msms]
            feature_id = [msms['feature_id'] for msms in top_msms]
            dot_product_list = [msms['dot_product'] for msms in top_msms]
            entropy_similarity_list = [msms['entropy_similarity'] for msms in top_msms]
            avg_dot_product = np.mean(dot_product_list)
            avg_entropy_similarity = np.mean(entropy_similarity_list)
            avg_retention_time = np.mean([msms['ms2_retention_time'] for msms in top_msms])

            # Merge centroids from top MS/MS
            all_cent_ms2_peaks = [np.array(pairs) for msms in top_msms for pairs in [msms['cent_mz_intensity_pairs']]]
            combined_cent_peaks = np.concatenate(all_cent_ms2_peaks, axis=0)
            sorted_arr = combined_cent_peaks[combined_cent_peaks[:, 0].argsort()]

            groups = []
            current_group = [sorted_arr[0]]
            for row in sorted_arr[0:]:
                if abs(row[0] - current_group[1][0]) <= mz_merge_thresh:
                    current_group.append(row)
                else:
                    groups.append(np.array(current_group))
                    current_group = [row]
            groups.append(np.array(current_group))  # Add last group

            averaged_results = []
            for group in groups:
                avg_intensity = np.mean(group[:, 1])
                avg_mz = np.mean(group[:, 0])
                averaged_results.append([avg_mz, avg_intensity])
            avg_ms2_cent_peaks = np.array(averaged_results)

            results.append({
                'cluster_id' : cluster_id,
                'feature_id' : feature_id,
                'peak_id': peak_id,
                'dot_product_list': dot_product_list,
                'avg_dot_product': avg_dot_product,
                'avg_ms2_retention_time': avg_retention_time,
                'num_msms': len(msms_list), # changed 
                'entropy_similarity_list': entropy_similarity_list,
                'avg_entropy_similarity': avg_entropy_similarity,
                'avg_ms2_cent_peaks': avg_ms2_cent_peaks.tolist()
            })

    return pd.DataFrame(results)


In [8]:
import numpy as np
import pandas as pd
from itertools import combinations
import ms_entropy as me
# Version3 (chatgpt)
def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    matched1, matched2 = [], []
    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                break
    if not matched1:
        return 0.0
    s1 = np.array(matched1); s2 = np.array(matched2)
    if np.linalg.norm(s1) == 0 or np.linalg.norm(s2) == 0:
        return 0.0
    s1 /= np.linalg.norm(s1); s2 /= np.linalg.norm(s2)
    return float(np.dot(s1, s2))

def process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01):
    required = ['peak_id','cluster_id','file_id','feature_id','msms_id',
                'ms2_rt','charge_state','ms2_peaks','cent_mz','cent_intensity']
    assert all(c in df.columns for c in required), "Missing required columns"

    results = []
    for peak_id, group in df.groupby('peak_id'):
        if len(group) < 2:
            row = group.iloc[0]
            results.append({
                'cluster_id': row['cluster_id'],
                'feature_id': row['feature_id'],
                'peak_id': peak_id,
                'avg_ms2_retention_time': row['ms2_rt'],
                'num_msms': 1,
                'charge_state': row['charge_state'],
                'avg_ms2_cent_peaks': list(zip(row['cent_mz'], row['cent_intensity'])),
                'raw_ms2_sample_peaks': row['ms2_peaks']
            })
            continue

        msms_list = []
        for (_, r1), (_, r2) in combinations(group.iterrows(), 2):
            dp = dot_product_with_tolerance(r1['cent_mz'], r1['cent_intensity'],
                                            r2['cent_mz'], r2['cent_intensity'],
                                            tol=mz_tolerance)
            try:
                ent = me.calculate_entropy_similarity(r1['ms2_peaks'], r2['ms2_peaks'])
            except Exception:
                ent = None
            msms_list.append({
                'cluster_id': r1['cluster_id'],
                'feature_id': r1['feature_id'],
                'dot_product': dp,
                'entropy_similarity': ent,
                'ms2_rt': r1['ms2_rt'],
                'cent_pairs': list(zip(r1['cent_mz'], r1['cent_intensity']))
            })

        # filter out missing entropies
        msms_list = [m for m in msms_list if m['entropy_similarity'] is not None]
        if not msms_list:
            continue

        msms_list.sort(key=lambda x: x['entropy_similarity'], reverse=True)
        top_msms = msms_list[:top_n]

        cp = np.concatenate([np.array(m['cent_pairs']) for m in top_msms], axis=0)
        cp = cp[cp[:,0].argsort()]

        groups_c = []
        current = [cp[0]]
        for mz_i, intensity_i in cp[1:]:
            if abs(mz_i - current[-1][0]) <= mz_merge_thresh:
                current.append([mz_i, intensity_i])
            else:
                groups_c.append(np.array(current))
                current = [[mz_i, intensity_i]]
        groups_c.append(np.array(current))

        avg_peaks = [[g[:,0].mean(), g[:,1].mean()] for g in groups_c]

        results.append({
            'cluster_id': [m['cluster_id'] for m in top_msms],
            'feature_id': [m['feature_id'] for m in top_msms],
            'peak_id': peak_id,
            'avg_retention_time': np.mean([m['ms2_rt'] for m in top_msms]),
            'num_msms': len(top_msms),
            'dot_product_list': [m['dot_product'] for m in top_msms],
            'avg_dot_product': np.mean([m['dot_product'] for m in top_msms]),
            'entropy_similarity_list': [m['entropy_similarity'] for m in top_msms],
            'avg_entropy_similarity': np.mean([m['entropy_similarity'] for m in top_msms]),
            'avg_ms2_cent_peaks': avg_peaks
        })

    return pd.DataFrame(results)


In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
import ms_entropy as me
# Version3 - fixed
def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    matched1, matched2 = [], []
    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                break
    if not matched1:
        return 0.0
    s1 = np.array(matched1); s2 = np.array(matched2)
    if np.linalg.norm(s1) == 0 or np.linalg.norm(s2) == 0:
        return 0.0
    s1 /= np.linalg.norm(s1); s2 /= np.linalg.norm(s2)
    return float(np.dot(s1, s2))

def process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01):
    required = ['peak_id','cluster_id','file_id','feature_id','msms_id',
                'ms2_rt','charge_state','ms2_peaks','cent_mz','cent_intensity']
    assert all(c in df.columns for c in required), "Missing required columns"

    results = []
    for peak_id, group in df.groupby('peak_id'):
        if len(group) < 2:
            row = group.iloc[0]
            results.append({
                'cluster_id': row['cluster_id'],
                'feature_id': row['feature_id'],
                'peak_id': peak_id,
                'avg_ms2_retention_time': row['ms2_rt'],
                'num_msms': 1,
                'charge_state': row['charge_state'],
                'avg_ms2_cent_peaks': list(zip(row['cent_mz'], row['cent_intensity'])),
                'raw_ms2_sample_peaks': row['ms2_peaks']
            })
            continue

        msms_list = []
        for (_, r1), (_, r2) in combinations(group.iterrows(), 2):
            dot_product = dot_product_with_tolerance(r1['cent_mz'], r1['cent_intensity'],
                                            r2['cent_mz'], r2['cent_intensity'],
                                            tol=mz_tolerance)
            try:
                entropy_similarity = me.calculate_entropy_similarity(r1['ms2_peaks'], r2['ms2_peaks'])
            except Exception:
                entropy_similarity = None
            msms_list.append({
                'cluster_id': r1['cluster_id'],
                'feature_id': r1['feature_id'],
                'dot_product': dot_product,
                'entropy_similarity': entropy_similarity,
                'ms2_rt': r1['ms2_rt'],
                'cent_pairs': list(zip(r1['cent_mz'], r1['cent_intensity']))
            })

        # filter out missing entropies
        msms_list = [m for m in msms_list if m['entropy_similarity'] is not None]
        if not msms_list:
            continue

        msms_list.sort(key=lambda x: x['entropy_similarity'], reverse=True)
        top_msms = msms_list[:top_n]

        all_cent_peaks = np.concatenate([np.array(m['cent_pairs']) for m in top_msms], axis=0)
        sorted_all_cent_peaks = all_cent_peaks[all_cent_peaks[:,0].argsort()]

        groups_current = []
        current = [sorted_all_cent_peaks[0]]
        for mz_i, intensity_i in sorted_all_cent_peaks[1:]:
            if abs(mz_i - current[-1][0]) <= mz_merge_thresh:
                current.append([mz_i, intensity_i])
            else:
                groups_current.append(np.array(current))
                current = [[mz_i, intensity_i]]
        groups_current.append(np.array(current))

        avg_cent_peaks = [[g[:,0].mean(), g[:,1].mean()] for g in groups_current]

        results.append({
            'cluster_id': [m['cluster_id'] for m in top_msms],
            'feature_id': [m['feature_id'] for m in top_msms],
            'peak_id': peak_id,
            'avg_retention_time': np.mean([m['ms2_rt'] for m in top_msms]),
            'total_num_msms': len(msms_list), # changed from (top_msms)
            'dot_product_list': [m['dot_product'] for m in top_msms],
            'avg_dot_product': np.mean([m['dot_product'] for m in top_msms]),
            'entropy_similarity_list': [m['entropy_similarity'] for m in top_msms],
            'avg_entropy_similarity': np.mean([m['entropy_similarity'] for m in top_msms]),
            'avg_ms2_cent_peaks': avg_cent_peaks
        })

    return pd.DataFrame(results)


In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
import ms_entropy as me
# Version 4 - adding grouping by peak AND cluster - not working correctly 
def dot_product_with_tolerance(mz1, int1, mz2, int2, tol=0.02):
    matched1, matched2 = [], []
    for i, m1 in enumerate(mz1):
        for j, m2 in enumerate(mz2):
            if abs(m1 - m2) <= tol:
                matched1.append(int1[i])
                matched2.append(int2[j])
                break
    if not matched1:
        return 0.0
    s1 = np.array(matched1); s2 = np.array(matched2)
    if np.linalg.norm(s1) == 0 or np.linalg.norm(s2) == 0:
        return 0.0
    s1 /= np.linalg.norm(s1); s2 /= np.linalg.norm(s2)
    return float(np.dot(s1, s2))

def process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01):
    required = ['peak_id','cluster_id','file_id','feature_id','msms_id',
                'ms2_rt','charge_state','ms2_peaks','cent_mz','cent_intensity']
    assert all(c in df.columns for c in required), "Missing required columns"

    results = []
    for peak_id, group in df.groupby('peak_id'):
        if len(group) < 2:
            row = group.iloc[0]
            results.append({
                'cluster_id': row['cluster_id'],
                'feature_id': row['feature_id'],
                'peak_id': peak_id,
                'avg_ms2_retention_time': row['ms2_rt'],
                'num_msms': 1,
                'charge_state': row['charge_state'],
                'avg_ms2_cent_peaks': list(zip(row['cent_mz'], row['cent_intensity'])),
                'raw_ms2_sample_peaks': row['ms2_peaks']
            })
            continue

        msms_list = []
        for (_, r1), (_, r2) in combinations(group.iterrows(), 2):
            if r1['cluster_id'] != r1['cluster_id']:
                continue
            else:
                dot_product = dot_product_with_tolerance(r1['cent_mz'], r1['cent_intensity'],
                                            r2['cent_mz'], r2['cent_intensity'],
                                            tol=mz_tolerance)
                try:
                    entropy_similarity = me.calculate_entropy_similarity(r1['ms2_peaks'], r2['ms2_peaks'])
                except Exception:
                    entropy_similarity = None
                msms_list.append({
                    'cluster_id': r1['cluster_id'],
                    'feature_id': r1['feature_id'],
                    'dot_product': dot_product,
                    'entropy_similarity': entropy_similarity,
                    'ms2_rt': r1['ms2_rt'],
                    'cent_pairs': list(zip(r1['cent_mz'], r1['cent_intensity']))
                    })

        # filter out missing entropies
        msms_list = [m for m in msms_list if m['entropy_similarity'] is not None]
        if not msms_list:
            continue

        msms_list.sort(key=lambda x: x['entropy_similarity'], reverse=True)
        top_msms = msms_list[:top_n]

        all_cent_peaks = np.concatenate([np.array(m['cent_pairs']) for m in top_msms], axis=0)
        sorted_all_cent_peaks = all_cent_peaks[all_cent_peaks[:,0].argsort()]

        groups_current = []
        current = [sorted_all_cent_peaks[0]]
        for mz_i, intensity_i in sorted_all_cent_peaks[1:]:
            if abs(mz_i - current[-1][0]) <= mz_merge_thresh:
                current.append([mz_i, intensity_i])
            else:
                groups_current.append(np.array(current))
                current = [[mz_i, intensity_i]]
        groups_current.append(np.array(current))

        avg_cent_peaks = [[g[:,0].mean(), g[:,1].mean()] for g in groups_current]

        results.append({
            'cluster_id': [m['cluster_id'] for m in top_msms],
            'feature_id': [m['feature_id'] for m in top_msms],
            'peak_id': peak_id,
            'avg_retention_time': np.mean([m['ms2_rt'] for m in top_msms]),
            'total_num_msms': len(msms_list), # changed from (top_msms)
            'dot_product_list': [m['dot_product'] for m in top_msms],
            'avg_dot_product': np.mean([m['dot_product'] for m in top_msms]),
            'entropy_similarity_list': [m['entropy_similarity'] for m in top_msms],
            'avg_entropy_similarity': np.mean([m['entropy_similarity'] for m in top_msms]),
            'avg_ms2_cent_peaks': avg_cent_peaks
        })

    return pd.DataFrame(results)


In [13]:
df_results = process_spectral_similarity(df, top_n=3, mz_tolerance=0.02, mz_merge_thresh=0.01)
print(df_results.head())

  cluster_id feature_id  peak_id  avg_retention_time  total_num_msms  \
0  [0, 0, 0]  [0, 0, 0]        0          418.381333           276.0   
1  [1, 1, 1]  [6, 6, 4]        1          121.158667           300.0   
2  [3, 2, 3]  [1, 1, 1]        2          530.039667           406.0   
3  [3, 3, 3]  [1, 1, 1]        3          514.468667           253.0   
4  [4, 4, 4]  [7, 7, 7]        4          325.467333           435.0   

                                    dot_product_list  avg_dot_product  \
0  [0.999981201933493, 0.9999569268073769, 0.9999...         0.999970   
1  [0.9999890733667712, 0.9999642718845316, 0.999...         0.999980   
2  [0.9998662751866915, 0.999898686106641, 0.9999...         0.999904   
3  [0.999883362650618, 0.9999969005307231, 0.9999...         0.999955   
4  [0.9999772905900919, 0.999882199491073, 0.9998...         0.999904   

                             entropy_similarity_list  avg_entropy_similarity  \
0  [0.9999998807907104, 0.9999997019767761, 0.99

In [14]:
from pathlib import Path  
filepath = Path('CID_metadata/CID_CORRECT/spectral_similarity-FIXED2.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_results.to_csv(Path('CID_metadata/CID_CORRECT/spectral_similarity-FIXED2.csv', index=False))

In [1]:
import numpy as np

In [2]:
cent_int1 = [10899.0, 15120.0, 531285.0, 15893.0]
norm_cent_int1 = cent_int1 / np.linalg.norm(cent_int1)
cent_mz1 = [184.07272338867188, 758.5419921875, 758.5665283203125, 758.5889892578125]

cent_int2 = [155793.0, 633537.0, 57330.0, 336513.0, 96078.0, 5272051.0]
norm_cent_int2 = cent_int2 / np.linalg.norm(cent_int2)
cent_mz2 = [60.08081817626953, 86.09672546386719, 104.10718536376953, 124.99991607666016, 166.06260681152344, 184.083251953125]



In [3]:
query = list(zip(norm_cent_int2, cent_mz2))
ref = list(zip(norm_cent_int1, cent_mz1))

In [4]:
import ms_entropy as me
entropy_sim = me.calculate_entropy_similarity(ref, query, clean_spectra = False)
print(entropy_sim)

0.37100595235824585
