In [1]:
from __future__ import annotations

import copy
import json
import os
import pickle
import re
from re import Pattern
from typing import Any, ClassVar, Literal

import dask
import dask.bag as db
import numpy as np
import pandas as pd
import polars as pl
import pyopenms as oms
import rtree
from MetaMSTools.ms_tools import (
    AdductDetector,
    AdductDetectorConfig,
    FeatureFinder,
    FeatureFinderConfig,
    FeatureLinker,
    OpenMSDataWrapper,
    RTAligner,
    TICSmoother,
)
from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy import create_engine, text


def get_data_wrapper():
    qc_datas = OpenMSDataWrapper(
        file_paths=[
            "../data/raw_files/QC1.mzML",
            "../data/raw_files/QC2.mzML"
        ]
    )
    qc_datas.init_exps()
    qc_datas = TICSmoother()(qc_datas)
    feature_config = FeatureFinderConfig()
    feature_config.feature_finding_metabo.charge_upper_bound = 1
    feature_config.feature_finding_metabo.charge_lower_bound = 1
    qc_datas = FeatureFinder(config=feature_config)(qc_datas)
    qc_datas.infer_ref_feature_for_align()
    qc_datas = RTAligner()(qc_datas)
    qc_datas = FeatureLinker()(qc_datas)
    adduct_config = AdductDetectorConfig(
        charge_min=1,
        charge_max=1,
    )
    qc_datas = AdductDetector(
        config=adduct_config
    )(qc_datas)
    return qc_datas

In [2]:
datas = get_data_wrapper()

Progress of 'mass trace detection':
-- done [took 4.40 s (CPU), 0.28 s (Wall)] -- 
Progress of 'mass trace detection':
-- done [took 0.21 s (CPU), 0.21 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 3.10 s (CPU), 0.17 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 6.25 s (CPU), 0.22 s (Wall)] -- 
Progress of 'assembling mass traces to features':
Loading metabolite isotope model with 5% RMS error
-- done [took 2.92 s (CPU), 0.12 s (Wall)] -- 
Progress of 'assembling mass traces to features':
-- done [took 3.41 s (CPU), 0.11 s (Wall)] -- 
Progress of 'Linking features':
-- done [took 0.57 s (CPU), 0.02 s (Wall)] -- 
Adding neutral: ---------- Adduct -----------------
Charge: 0
Amount: 1
MassSingle: 13.9793
Formula: H-2O1
log P: -2.30259

MassExplainer table size: 92
Generating Masses with threshold: -6.90776 ...
<Loading metabolite isotope model with 5% RMS error> occurred 2 times
done
0 of 13 valid net charge compomer results did not pass the featur



In [81]:
class BaseMap(BaseModel):

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    exp_name: str = Field(
        ...,
        data_type="metadata",
        save_mode="json",
        description="实验名称"
    )
    metadata: dict = Field(
        default={},
        data_type="metadata",
        save_mode="json",
        description="数据的metadata信息。"
    )

    def __getstate__(self):
        state = copy.deepcopy(super().__getstate__())
        for k,v in state['__dict__'].items():
            if isinstance(v, pl.DataFrame):
                state['__dict__'][k] = v.to_pandas()
        return state

    def __setstate__(self, state):
        init_func = []
        for k,f in self.model_fields.items():
            if f.annotation == pl.DataFrame or f.annotation == pl.DataFrame | None:
                if isinstance(state['__dict__'][k], pd.DataFrame):
                    state['__dict__'][k] = pl.from_pandas(state['__dict__'][k])
            elif f.annotation == rtree.index.Index | None:
                if isinstance(state['__dict__'][k], rtree.index.Index):
                    if "init_func" in f.json_schema_extra:
                        init_func.append(f.json_schema_extra["init_func"])
            elif f.annotation == rtree.index.Index:
                if "init_func" in f.json_schema_extra:
                    init_func.append(f.json_schema_extra["init_func"])
        super().__setstate__(state)
        for func_name in init_func:
            getattr(self, func_name)()

    def save(self, save_dir_path: str):

        if not os.path.exists(save_dir_path):
            os.makedirs(save_dir_path)

        metadata_path = os.path.join(save_dir_path, "metadata.json")
        index_dir_path = os.path.join(save_dir_path, "index")
        if not os.path.exists(index_dir_path):
            os.makedirs(index_dir_path)
        data_dir_path = os.path.join(save_dir_path, "data")
        if not os.path.exists(data_dir_path):
            os.makedirs(data_dir_path)
        sqlite_db_path = os.path.join(data_dir_path, "data.sqlite")
        engine = create_engine(f"sqlite:///{sqlite_db_path}")

        metadata_to_save = {"module_type": self.__class__.__name__}
        for k,f in self.model_fields.items():
            if f.json_schema_extra['data_type'] == 'metadata':
                metadata_to_save[k] = getattr(self, k)
            elif f.json_schema_extra['data_type'] == 'index':
                if isinstance(getattr(self, k), rtree.index.Index):
                    rtree_save_path = os.path.join(index_dir_path, k)
                    if os.path.exists(rtree_save_path + ".dat"):
                        os.remove(rtree_save_path + ".dat")
                    if os.path.exists(rtree_save_path + ".idx"):
                        os.remove(rtree_save_path + ".idx")
                    tree:rtree.index.Index = getattr(self, f.json_schema_extra['build_func'])(rtree_save_path)
                    tree.close()
                elif isinstance(getattr(self, k), pd.Index):
                    index_save_path = os.path.join(index_dir_path, k+".csv")
                    pd.Series(getattr(self, k)).to_csv(index_save_path, header=False)
                else:
                    other_index_save_path = os.path.join(index_dir_path, k+".pkl")
                    with open(other_index_save_path, 'wb') as f:
                        pickle.dump(getattr(self, k), f)
            elif f.json_schema_extra['data_type'] == 'data':
                if f.json_schema_extra['save_mode'] == 'sqlite':
                    data = getattr(self, k)
                    if isinstance(data, pl.DataFrame):
                        with engine.connect() as conn:
                            data.write_database(table_name=k, connection=conn, if_table_exists="replace")
                    elif isinstance(data, pd.DataFrame):
                        with engine.connect() as conn:
                            data.to_sql(k, conn, if_exists="replace")
                    else:
                        raise ValueError(f"Unsupported data type to save as sqlite: {type(data)}")
                else:
                    other_data_save_path = os.path.join(data_dir_path, k+".pkl")
                    with open(other_data_save_path, 'wb') as f:
                        pickle.dump(getattr(self, k), f)

        with open(metadata_path, 'w') as f:
            json.dump(metadata_to_save, f)

        engine.dispose()

    @classmethod
    def _base_load(cls, save_dir_path: str) -> dict[str, Any]:

        data_dict = {}

        metadata_path = os.path.join(save_dir_path, "metadata.json")
        index_dir_path = os.path.join(save_dir_path, "index")
        data_dir_path = os.path.join(save_dir_path, "data")

        if not os.path.exists(metadata_path):
            raise ValueError(f"Metadata file not found in {save_dir_path}")
        with open(metadata_path) as f:
            metadata:dict = json.load(f)
            exp_name = metadata.pop('exp_name')
            metadata.pop('module_type')
        data_dict['exp_name'] = exp_name
        data_dict['metadata'] = metadata

        if os.path.exists(index_dir_path):
            for k,f in cls.model_fields.items():
                if f.json_schema_extra['data_type'] == 'index':
                    if f.json_schema_extra['save_mode'] == 'rtree':
                        rtree_save_path = os.path.join(index_dir_path, k)
                        if os.path.exists(rtree_save_path + ".dat") and os.path.exists(rtree_save_path + ".idx"):
                            data_dict[k] = rtree.index.Index(rtree_save_path)
                    elif f.annotation == pd.Index or f.annotation == pd.Index | None:
                        index_save_path = os.path.join(index_dir_path, k+".csv")
                        if os.path.exists(index_save_path):
                            data_dict[k] = pd.Index(pd.read_csv(index_save_path, header=None, index_col=0).iloc[:,0])
                    else:
                        other_index_save_path = os.path.join(index_dir_path, k+".pkl")
                        if os.path.exists(other_index_save_path):
                            with open(other_index_save_path, 'rb') as f:
                                data_dict[k] = pickle.load(f)

        if os.path.exists(data_dir_path):
            sqlite_db_path = os.path.join(data_dir_path, "data.sqlite")
            if os.path.exists(sqlite_db_path):
                engine = create_engine(f"sqlite:///{sqlite_db_path}")
                for k,f in cls.model_fields.items():
                    if f.json_schema_extra['data_type'] == 'data':
                        if f.json_schema_extra['save_mode'] == 'sqlite':
                            with engine.connect() as conn:
                                if conn.execute(text(
                                    f"SELECT name \
                                        FROM sqlite_master \
                                        WHERE type='table' \
                                        AND name='{k}'"
                                )).fetchone() is not None:
                                    if f.annotation == pl.DataFrame or f.annotation == pl.DataFrame | None:
                                        data_dict[k] = pl.read_database(query=f"SELECT * FROM {k}", connection=conn)
                                    elif f.annotation == pd.DataFrame or f.annotation == pd.DataFrame | None:
                                        data_dict[k] = pd.read_sql_query(f"SELECT * FROM {k}", conn)
                        else:
                            other_data_save_path = os.path.join(data_dir_path, k+".pkl")
                            if os.path.exists(other_data_save_path):
                                with open(other_data_save_path, 'rb') as f:
                                    data_dict[k] = pickle.load(f)
                engine.dispose()

        return data_dict

    @classmethod
    def load(cls, save_dir_path: str):

        return cls(**cls._base_load(save_dir_path))


In [None]:
class SpectrumMap(BaseMap):

    scan_id_matcher: ClassVar[Pattern] = re.compile(r'scan=(\d+)')

    ms1_index: pd.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="csv",
        description="MS1谱图的索引"
    )
    ms1_df: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="MS1谱图的DataFrame，基于polars"
    )
    ms2_index: pd.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="csv",
        description="MS2谱图的索引"
    )
    ms2_df: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="MS2谱图的DataFrame，基于polars"
    )
    ms2_rtree_index: rtree.index.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="rtree",
        build_func='build_ms2_rtree_index',
        init_func='init_ms2_rtree_index',
        description="基于R-Tree的MS2索引，可以基于mz和rt范围快速索引MS2数据。\
                    可以通过`init_ms2_rtree_index`方法从ms2_df中初始化。",
    )

    @staticmethod
    def get_exp_meta(exp: oms.MSExperiment) -> dict[str, str]:
        spec: oms.MSSpectrum = exp[0]
        meta_info_string = spec.getMetaValue("filter string")
        meta_info_list = meta_info_string.split(" ")
        ms_type = meta_info_list[0]
        ion_mode = meta_info_list[1]
        ion_source = meta_info_list[3]
        return {
            "ms_type": ms_type,
            "ion_mode": ion_mode,
            "ion_source": ion_source,
        }

    @staticmethod
    def get_scan_index(spec: oms.MSSpectrum) -> int:
        scan_id_match = SpectrumMap.scan_id_matcher.search(spec.getNativeID())
        if scan_id_match:
            return int(scan_id_match.group(1))
        else:
            raise ValueError(
                f"Cannot extract scan index from \
                spectrum native ID: {spec.getNativeID()}"
            )

    @staticmethod
    def ms2spec2dfdict(spec: oms.MSSpectrum) -> dict[
        Literal[
            "spec_id",
            "rt",
            "precursor_mz",
            "base_peak_mz",
            "base_peak_intensity",
            "mz_array",
            "intensity_array",
        ],
        int | float | np.ndarray
    ]:
        spec_id = SpectrumMap.get_scan_index(spec)
        rt = spec.getRT()
        precursor_mz = spec.getPrecursors()[0].getMZ()
        base_peak_mz = spec.getMetaValue("base peak m/z")
        base_peak_intensity = spec.getMetaValue("base peak intensity")
        mz_array, intensity_array = spec.get_peaks()
        return {
            "spec_id": spec_id,
            "rt": rt,
            "precursor_mz": precursor_mz,
            "base_peak_mz": base_peak_mz,
            "base_peak_intensity": base_peak_intensity,
            "mz_array": mz_array.astype(np.float32),
            "intensity_array": intensity_array.astype(np.float32),
        }

    @staticmethod
    def ms1spec2dfdict(spec: oms.MSSpectrum) -> dict[
        Literal[
            "spec_id",
            "rt",
            "mz_array",
            "intensity_array",
        ],
        int | float | np.ndarray
    ]:
        spec_id = SpectrumMap.get_scan_index(spec)
        rt = spec.getRT()
        mz_array, intensity_array = spec.get_peaks()
        return {
            "spec_id": spec_id,
            "rt": rt,
            "mz_array": mz_array.astype(np.float32),
            "intensity_array": intensity_array.astype(np.float32),
        }

    def insert_ms1_id_to_ms2(self) -> None:
        '''
        如果MS2谱图没有对应的MS1谱图ID，则插入null
        '''
        if self.ms1_df is None or self.ms2_df is None:
            raise ValueError(
                "MS1 and MS2 dataframes must be loaded \
                    before inserting MS1 IDs to MS2 dataframe"
            )
        ms1_df_mapping = self.ms1_df.with_columns(
            pl.col('spec_id').alias('ms1_id')
        ).select(['spec_id','ms1_id'])
        self.ms2_df = self.ms2_df.join_asof(
            ms1_df_mapping,
            left_on='spec_id',
            right_on='spec_id',
            strategy='backward'
        )

    def convert_scan_to_spec_id(self) -> None:
        if self.ms1_df.schema['spec_id'] in (
            pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64
        ):
            self.ms1_df = self.ms1_df.with_columns(
                (f"{self.exp_name}::ms1::" + self.ms1_df['spec_id'].cast(str)).alias('spec_id')
            )
            self.ms1_index = pd.Index(self.ms1_df['spec_id'].to_list())
        if self.ms2_df.schema['spec_id'] in (
            pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64
        ):
            self.ms2_df = self.ms2_df.with_columns(
                (f"{self.exp_name}::ms2::" + self.ms2_df['spec_id'].cast(str)).alias('spec_id')
            )
            self.ms2_index = pd.Index(self.ms2_df['spec_id'].to_list())
        if self.ms2_df.schema['ms1_id'] in (
            pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64
        ):
            self.ms2_df = self.ms2_df.with_columns(
                (f"{self.exp_name}::ms1::" + self.ms2_df['ms1_id'].cast(str)).alias('ms1_id')
            )

    def modify_ms2_rt(self) -> None:
        ms1_rt_df = self.ms1_df.select(['spec_id', 'rt']).rename({'rt': 'ms1_rt','spec_id':'ms1_id'})
        joined_df = self.ms2_df.join(ms1_rt_df, on='ms1_id', how='left')
        self.ms2_df = joined_df.with_columns(
            pl.when(pl.col('ms1_id').is_not_null())
            .then(pl.col('ms1_rt'))
            .otherwise(pl.col('rt'))
            .alias('rt')
        ).drop('ms1_rt')

    def build_ms2_rtree_index(self, path : str | None = None) -> rtree.index.Index:
        if self.ms2_df is None:
            raise ValueError(
                "MS2 dataframe must be loaded before initializing R-tree index"
            )
        ms2_rtree_index = rtree.index.Index(path)
        for i,(rt,precursor_mz) in enumerate(
            zip(
                self.ms2_df['rt'],
                self.ms2_df['precursor_mz'],
            )
        ):
            ms2_rtree_index.insert(
                id=i,
                coordinates=(precursor_mz, rt, precursor_mz, rt),
                obj=i
            )
        return ms2_rtree_index

    def init_ms2_rtree_index(self):
        self.ms2_rtree_index = self.build_ms2_rtree_index()

    def search_ms2_by_range(
        self,
        coordinates: tuple[
            float, # min_mz
            float, # min_rt
            float, # max_mz
            float, # max_rt
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        if self.ms2_rtree_index is None:
            self.init_ms2_rtree_index()
        index = list(self.ms2_rtree_index.intersection(coordinates, objects="raw"))
        if return_type == "id":
            return self.ms2_index[index].tolist()
        elif return_type == "df":
            return self.ms2_df[index]
        else:
            return index

    @classmethod
    def from_oms(
        cls,
        exp: oms.MSExperiment,
        exp_name: str,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> SpectrumMap:
        spec_bag = db.from_sequence(exp,npartitions=num_workers)
        ms1_bag = spec_bag.filter(lambda x: x.getMSLevel() == 1)
        ms2_bag = spec_bag.filter(lambda x: x.getMSLevel() == 2)
        ms1_bag = ms1_bag.map(cls.ms1spec2dfdict)
        ms2_bag = ms2_bag.map(cls.ms2spec2dfdict)
        ms1,ms2 = dask.compute(ms1_bag,ms2_bag,scheduler=worker_type,num_workers=num_workers)
        ms1_df = pl.DataFrame(ms1)
        ms1_df = ms1_df.with_columns(
            (pl.col('rt') / 60.0).cast(pl.Float32),
        )
        ms1_index = pd.Index(ms1_df['spec_id'])
        ms2_df = pl.DataFrame(ms2)
        ms2_df = ms2_df.with_columns(
            (pl.col('rt') / 60.0).cast(pl.Float32),
            pl.col('precursor_mz').cast(pl.Float32),
            pl.col('base_peak_mz').cast(pl.Float32),
            pl.col('base_peak_intensity').cast(pl.Float32),
        )
        ms2_index = pd.Index(ms2_df['spec_id'])
        metadata = cls.get_exp_meta(exp)
        spectrum_map = cls(
            exp_name=exp_name,
            metadata=metadata,
            ms1_index=ms1_index,
            ms1_df=ms1_df,
            ms2_index=ms2_index,
            ms2_df=ms2_df,
        )
        spectrum_map.insert_ms1_id_to_ms2()
        spectrum_map.convert_scan_to_spec_id()
        spectrum_map.modify_ms2_rt()
        return spectrum_map

    def save(self, save_dir_path: str):

        self_to_save = copy.copy(self)
        self_to_save.ms1_df = self.ms1_df.with_columns(
            pl.col("mz_array").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("intensity_array").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
        )
        self_to_save.ms2_df = self.ms2_df.with_columns(
            pl.col("mz_array").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("intensity_array").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
        )

        super(SpectrumMap, self_to_save).save(save_dir_path)

    @classmethod
    def load(cls, save_dir_path: str):

        data_dict = cls._base_load(save_dir_path)

        if 'ms1_df' in data_dict:
            if isinstance(data_dict['ms1_df'], pl.DataFrame):
                data_dict['ms1_df'] = data_dict['ms1_df'].with_columns(
                    pl.col("mz_array").map_elements(
                        lambda x: np.array(json.loads(x)),
                        return_dtype=pl.Object,
                    ),
                    pl.col("intensity_array").map_elements(
                        lambda x: np.array(json.loads(x)),
                        return_dtype=pl.Object,
                    ),
                )
        if 'ms2_df' in data_dict:
            if isinstance(data_dict['ms2_df'], pl.DataFrame):
                data_dict['ms2_df'] = data_dict['ms2_df'].with_columns(
                    pl.col("mz_array").map_elements(
                        lambda x: np.array(json.loads(x)),
                        return_dtype=pl.Object,
                    ),
                    pl.col("intensity_array").map_elements(
                        lambda x: np.array(json.loads(x)),
                        return_dtype=pl.Object,
                    ),
                )

        return cls(**data_dict)

In [59]:
spectrum_map = SpectrumMap.from_oms(datas.exps[0], datas.exp_names[0])
spectrum_map.init_ms2_rtree_index()

In [60]:
spectrum_map.save("../cache/test_spectrum_map")

In [39]:
reload_spectrum_map = SpectrumMap.load("../cache/test_spectrum_map")

In [40]:
reload_spectrum_map.ms2_rtree_index

rtree.index.Index(bounds=[113.96365356445312, 3.3347432613372803, 925.5966796875, 9.965903282165527], size=718)

In [35]:
reload_spectrum_map.ms1_df.to_pandas()

Unnamed: 0,spec_id,rt,mz_array,intensity_array
0,QC1.mzML::ms1::797,3.343986,"[200.09165954589844, 200.1280975341797, 200.17...","[4236.865234375, 2695.840576171875, 6285.76562..."
1,QC1.mzML::ms1::802,3.363965,"[200.09190368652344, 200.12815856933594, 200.1...","[1270.613525390625, 7574.3134765625, 12525.169..."
2,QC1.mzML::ms1::807,3.384004,"[200.00518798828125, 200.0704345703125, 200.09...","[2496.521240234375, 759.7963256835938, 1507.59..."
3,QC1.mzML::ms1::812,3.404740,"[200.09190368652344, 200.12811279296875, 200.1...","[5781.83984375, 971.9057006835938, 4665.149414..."
4,QC1.mzML::ms1::817,3.425502,"[200.0050811767578, 200.06124877929688, 200.07...","[0.0, 6932.18603515625, 6468.7822265625, 2833...."
...,...,...,...,...
329,QC1.mzML::ms1::2423,9.905934,"[200.1282196044922, 200.18399047851562, 200.20...","[0.0, 13750.63671875, 19217.314453125, 17962.5..."
330,QC1.mzML::ms1::2428,9.926629,"[200.0918426513672, 200.1132049560547, 200.128...","[3245.28173828125, 0.0, 2677.9501953125, 11551..."
331,QC1.mzML::ms1::2433,9.945886,"[200.1279754638672, 200.20091247558594, 200.97...","[556.13037109375, 23764.205078125, 25525.15234..."
332,QC1.mzML::ms1::2438,9.965903,"[200.09169006347656, 200.12802124023438, 200.2...","[0.0, 10572.2529296875, 16608.46875, 17686.082..."


In [36]:
reload_spectrum_map.ms2_df.to_pandas()

Unnamed: 0,spec_id,rt,precursor_mz,base_peak_mz,base_peak_intensity,mz_array,intensity_array,ms1_id
0,QC1.mzML::ms2::795,3.334743,173.128479,173.128403,20729.001953,[200.68319702148438],[2170.664794921875],
1,QC1.mzML::ms2::796,3.339176,233.128433,174.091309,106759.648438,"[212.95030212402344, 216.10177612304688, 233.1...","[3391.868408203125, 8426.791015625, 4821.99658...",
2,QC1.mzML::ms2::801,3.343986,207.985245,184.969223,129272.453125,"[202.97972106933594, 207.98536682128906, 208.1...","[2566.66259765625, 7957.56396484375, 19053.580...",QC1.mzML::ms1::797
3,QC1.mzML::ms2::804,3.363965,224.128128,165.054581,57772.613281,"[203.77976989746094, 224.1281280517578, 229.21...","[2704.05126953125, 19318.791015625, 2518.68774...",QC1.mzML::ms1::802
4,QC1.mzML::ms2::808,3.384004,131.117920,90.947571,41532.687500,[207.17068481445312],[2081.906982421875],QC1.mzML::ms1::807
...,...,...,...,...,...,...,...,...
713,QC1.mzML::ms2::2434,9.945886,279.093353,219.056870,33164.667969,"[201.0457000732422, 219.05686950683594, 252.44...","[2686.182373046875, 33164.66796875, 2422.13159...",QC1.mzML::ms1::2433
714,QC1.mzML::ms2::2436,9.945886,371.315826,147.065063,56023.519531,"[241.1798553466797, 259.189697265625, 260.7955...","[5598.5498046875, 2642.4189453125, 2600.861083...",QC1.mzML::ms1::2433
715,QC1.mzML::ms2::2440,9.965903,224.128128,155.974701,34200.113281,[224.12831115722656],[11006.4296875],QC1.mzML::ms1::2438
716,QC1.mzML::ms2::2441,9.965903,233.128494,174.091278,65971.070312,"[216.10186767578125, 233.127685546875]","[5839.0439453125, 4289.2373046875]",QC1.mzML::ms1::2438


In [41]:
reload_spectrum_map.search_ms2_by_range(
    (250,3,260,4)
)

['QC1.mzML::ms2::923',
 'QC1.mzML::ms2::946',
 'QC1.mzML::ms2::894',
 'QC1.mzML::ms2::940',
 'QC1.mzML::ms2::815',
 'QC1.mzML::ms2::834',
 'QC1.mzML::ms2::856',
 'QC1.mzML::ms2::879',
 'QC1.mzML::ms2::900',
 'QC1.mzML::ms2::921']

In [89]:
class FeatureMap(BaseMap):

    feature_index: pd.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="csv",
        description="Feature信息表的索引"
    )
    feature_info: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="Feature信息表"
    )
    feature_rtree_index: rtree.index.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="rtree",
        build_func="build_feature_rtree_index",
        init_func="init_feature_rtree_index",
        description="Feature的R-tree索引"
    )
    hull_index: pd.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="csv",
        description="Hull信息表的索引"
    )
    hull_info: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="Hull信息表"
    )
    hull_rtree_index: rtree.index.Index | None = Field(
        default=None,
        data_type="index",
        save_mode="rtree",
        build_func="build_hull_rtree_index",
        init_func="init_hull_rtree_index",
        description="Hull的R-tree索引"
    )

    @staticmethod
    def get_feature_metadata(feature: oms.Feature) -> dict[
        Literal[
            'hull_num',"hull_mz","hull_rt","hull_intensity",
            "isotope_pattern",
            "adduct_type","adduct_mass",
        ],
        str | float | int | np.ndarray
    ]:
        all_keys = []
        feature.getKeys(all_keys)
        all_keys = set(all_keys)
        metadata = {
            "hull_num": feature.getMetaValue("num_of_masstraces"),
            "hull_mz": np.array(feature.getMetaValue("masstrace_centroid_mz"),dtype=np.float32),
            "hull_rt": np.array(feature.getMetaValue("masstrace_centroid_rt"),dtype=np.float32),
            "hull_intensity": np.array(feature.getMetaValue("masstrace_intensity"),dtype=np.float32),
            "isotope_pattern": np.cumsum(feature.getMetaValue("isotope_distances"),dtype=np.float32),
        }
        if "dc_charge_adducts" in all_keys:
            metadata["adduct_type"] = feature.getMetaValue("dc_charge_adducts")
            metadata["adduct_mass"] = feature.getMetaValue("dc_charge_adduct_mass")
        return metadata

    @staticmethod
    def get_feature_info(
        feature_map: oms.FeatureMap,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> pl.DataFrame:
        feature_info = feature_map.get_df()[
            ["RT","mz","intensity","MZstart","RTstart","MZend","RTend"]
        ]
        feature_info.index.name = "feature_id"
        feature_info = pl.from_pandas(feature_info,include_index=True)
        feature_info = feature_info.with_columns(
            pl.col("RT").cast(pl.Float32),
            pl.col("mz").cast(pl.Float32),
            pl.col("intensity").cast(pl.Float32),
            pl.col("MZstart").cast(pl.Float32),
            pl.col("RTstart").cast(pl.Float32),
            pl.col("MZend").cast(pl.Float32),
            pl.col("RTend").cast(pl.Float32),
        )
        feature_bag = db.from_sequence(feature_map, npartitions=num_workers)
        feature_metadata_bag = feature_bag.map(FeatureMap.get_feature_metadata)
        feature_metadata_list = dask.compute(
            feature_metadata_bag, scheduler=worker_type, num_workers=num_workers
        )[0]
        feature_metadata_df = pl.DataFrame(
            feature_metadata_list,
        )
        feature_metadata_df = feature_metadata_df.with_columns(
            pl.col("hull_num").cast(pl.Int32),
        )
        feature_info = pl.concat([feature_info, feature_metadata_df], how="horizontal")
        return feature_info

    @staticmethod
    def get_hull_range(
        hull: oms.ConvexHull2D,
    ) -> dict[
        Literal["MZstart","RTstart","MZend","RTend"],
        float
    ]:
        rt_points = hull.getHullPoints()[:,0]
        mz_points = hull.getHullPoints()[:,1]
        return {
            "MZstart": np.min(mz_points),
            "RTstart": np.min(rt_points),
            "MZend": np.max(mz_points),
            "RTend": np.max(rt_points),
        }

    @staticmethod
    def get_hulls(
        feature_map: oms.FeatureMap,
        feature_xic: dict[dict[oms.MSChromatogram]],
    ) -> pl.DataFrame:
        rt_hulls = {}
        for feature_rt_hulls in feature_xic:
            for rt_hull in feature_rt_hulls:
                rt_hulls[rt_hull.getNativeID().replace("_","::")] = rt_hull
        mz_hulls = {}
        for feature in feature_map:
            for i,mz_hull in enumerate(feature.getConvexHulls()):
                mz_hulls[f"{feature.getUniqueId()}::{i}"] = mz_hull
        hulls = []
        hulls_id = list(mz_hulls.keys())
        for hull_id in hulls_id:
            hull = {}
            hull['hull_id'] = hull_id
            hull.update(FeatureMap.get_hull_range(mz_hulls[hull_id]))
            rt_points, intens_points = rt_hulls[hull_id].get_peaks()
            mz_points = mz_hulls[hull_id].getHullPoints()[:,1][:len(rt_points)]
            hull['rt_points'] = rt_points.astype(np.float32)
            hull['mz_points'] = mz_points.astype(np.float32)
            hull['intens_points'] = intens_points.astype(np.float32)
            hulls.append(hull)
        hulls = pl.DataFrame(hulls)
        return hulls

    @classmethod
    def from_oms(
        cls,
        feature_map: oms.FeatureMap,
        feature_xic: list[list[oms.MSChromatogram]],
        exp_name: str,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> FeatureMap:
        feature_info = cls.get_feature_info(feature_map, worker_type, num_workers)
        hull_info = cls.get_hulls(feature_map, feature_xic)
        feature_info = feature_info.with_columns(
            (f"{exp_name}::" + pl.col("feature_id").cast(str)).alias("feature_id"),
        )
        feature_index = pd.Index(feature_info["feature_id"].to_list())
        hull_info = hull_info.with_columns(
            (f"{exp_name}::" + pl.col("hull_id").cast(str)).alias("hull_id"),
        )
        hull_index = pd.Index(hull_info["hull_id"].to_list())
        return cls(
            exp_name=exp_name,
            feature_info=feature_info,
            hull_info=hull_info,
            feature_index=feature_index,
            hull_index=hull_index,
        )

    def get_oms_feature_map(self) -> oms.FeatureMap:
        feature_map = oms.FeatureMap()
        for feature_id, feature_row in self.feature_info.iterrows():
            feature = oms.Feature()
            feature.setUniqueId(int(feature_id.split("::")[1]))
            feature.setMZ(feature_row["mz"])
            feature.setRT(feature_row["RT"])
            feature.setIntensity(feature_row["intensity"])
            feature_map.push_back(feature)
        return feature_map

    def build_feature_rtree_index(self, path : str | None = None) -> rtree.index.Index:
        if self.feature_info is None:
            raise ValueError(
                "Feature info dataframe must be loaded before initializing R-tree index"
            )
        feature_rtree_index = rtree.index.Index(path)
        for i,(mz_start,rt_start,mz_end,rt_end) in enumerate(
            zip(
                self.feature_info['MZstart'],
                self.feature_info['RTstart'],
                self.feature_info['MZend'],
                self.feature_info['RTend'],
            )
        ):
            feature_rtree_index.insert(
                id=i,
                coordinates=(mz_start,rt_start,mz_end,rt_end),
                obj=i
            )
        return feature_rtree_index

    def init_feature_rtree_index(self, path : str | None = None) -> None:
        self.feature_rtree_index = self.build_feature_rtree_index(path)

    def search_feature_by_range(
        self,
        coordinates: tuple[
            float, # min_mz
            float, # min_rt
            float, # max_mz
            float, # max_rt
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        if self.feature_rtree_index is None:
            self.init_feature_rtree_index()
        index = list(self.feature_rtree_index.intersection(coordinates))
        if return_type == "id":
            return self.feature_index[index].tolist()
        elif return_type == "df":
            return self.feature_info[index]
        else:
            return index

    def build_hull_rtree_index(self, path : str | None = None) -> rtree.index.Index:
        if self.hull_info is None:
            raise ValueError(
                "Hulls dataframe must be loaded before initializing R-tree index"
            )
        hull_rtree_index = rtree.index.Index(path)
        for i,(mz_start,rt_start,mz_end,rt_end) in enumerate(
            zip(
                self.hull_info['MZstart'],
                self.hull_info['RTstart'],
                self.hull_info['MZend'],
                self.hull_info['RTend'],
            )
        ):
            hull_rtree_index.insert(
                id=i,
                coordinates=(mz_start,rt_start,mz_end,rt_end),
                obj=i
            )
        return hull_rtree_index

    def init_hull_rtree_index(self, path : str | None = None) -> None:
        self.hull_rtree_index = self.build_hull_rtree_index(path)

    def search_hull_by_range(
        self,
        coordinates: tuple[
            float, # min_mz
            float, # min_rt
            float, # max_mz
            float, # max_rt
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        if self.hull_rtree_index is None:
            self.init_hull_rtree_index()
        index = list(self.hull_rtree_index.intersection(coordinates))
        if return_type == "id":
            return self.hull_index[index].tolist()
        elif return_type == "df":
            return self.hull_info[index]
        else:
            return index

    def save(self, save_dir_path: str):

        self_to_save = copy.copy(self)

        self_to_save.feature_info = self.feature_info.with_columns(
            pl.col("hull_mz").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("hull_rt").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("hull_intensity").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("isotope_pattern").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
        )

        self_to_save.hull_info = self.hull_info.with_columns(
            pl.col("rt_points").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("mz_points").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
            pl.col("intens_points").map_elements(
                lambda x: json.dumps(x.tolist()),
                return_dtype=pl.String,
            ),
        )

        super(FeatureMap, self_to_save).save(save_dir_path)

    @classmethod
    def load(cls, save_dir_path: str):

        data_dict = cls._base_load(save_dir_path)

        feature_info: pl.DataFrame | None = data_dict.pop("feature_info")

        if feature_info is not None:
            data_dict['feature_info'] = feature_info.with_columns(
                pl.col("hull_mz").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
                pl.col("hull_rt").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
                pl.col("hull_intensity").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
                pl.col("isotope_pattern").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
            )
        hull_info: pl.DataFrame | None = data_dict.pop("hull_info")
        if hull_info is not None:
            data_dict['hull_info'] = hull_info.with_columns(
                pl.col("rt_points").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
                pl.col("mz_points").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
                pl.col("intens_points").map_elements(
                    lambda x: np.array(json.loads(x)),
                    return_dtype=pl.Object,
                ),
            )

        return cls(**data_dict)

In [90]:
feature_map = FeatureMap.from_oms(datas.features[0],datas.chromatogram_peaks[0],datas.exp_names[0])
feature_map.init_feature_rtree_index()
feature_map.init_hull_rtree_index()

In [91]:
feature_map.save("../cache/test_feature_map")

In [92]:
reload_feature_map = FeatureMap.load("../cache/test_feature_map")

In [93]:
reload_feature_map.feature_info.to_pandas()

Unnamed: 0,feature_id,RT,mz,intensity,MZstart,RTstart,MZend,RTend,hull_num,hull_mz,hull_rt,hull_intensity,isotope_pattern
0,QC1.mzML::3140128202761656841,215.297836,445.902374,1.043826e+05,445.902130,200.639130,446.902557,219.024033,2,"[445.9023742675781, 446.902099609375]","[215.29783630371094, 201.83787536621094]","[104382.5703125, 26270.7109375]",[0.9997183680534363]
1,QC1.mzML::17186567986998007591,231.165848,252.181900,1.539104e+06,252.181808,228.764023,253.185608,244.602783,2,"[252.18190002441406, 253.1851806640625]","[231.1658477783203, 231.1658477783203]","[1539104.25, 321779.25]",[1.0032715797424316]
2,QC1.mzML::15006895844463803923,233.607956,344.228027,1.974188e+05,344.226501,221.427612,345.231903,247.019714,2,"[344.22802734375, 345.23150634765625]","[233.6079559326172, 237.21585083007812]","[197418.84375, 47032.88671875]",[1.0034769773483276]
3,QC1.mzML::6603054536402124583,243.397186,300.203064,1.728374e+06,300.202606,240.991180,301.206665,256.907562,2,"[300.20306396484375, 301.20599365234375]","[243.39718627929688, 243.39718627929688]","[1728374.25, 125906.328125]",[1.0029122829437256]
4,QC1.mzML::13800840845817960591,244.602783,297.190918,2.255495e+05,297.190704,240.991180,298.194336,255.663635,2,"[297.19091796875, 298.19427490234375]","[244.602783203125, 243.39718627929688]","[225549.453125, 35371.0625]",[1.0033713579177856]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,QC1.mzML::820081768432578652,551.715149,579.064331,7.300085e+04,579.063843,546.777588,580.067810,567.546021,2,"[579.0643310546875, 580.0674438476562]","[551.7151489257812, 556.65380859375]","[73000.8515625, 34937.7734375]",[1.003105878829956]
71,QC1.mzML::7775401195095411494,565.102600,671.789124,1.122403e+05,671.787720,556.653809,672.792969,577.414490,2,"[671.7891235351562, 672.7918701171875]","[565.1026000976562, 561.4547119140625]","[112240.2890625, 42289.27734375]",[1.0027822256088257]
72,QC1.mzML::16848338621840768339,572.526733,598.411926,5.571515e+04,598.411560,565.102600,599.415588,578.617126,2,"[598.4119262695312, 599.4152221679688]","[572.5267333984375, 574.966796875]","[55715.15234375, 25916.025390625]",[1.003268837928772]
73,QC1.mzML::1103000384613136512,576.165039,705.811768,5.492179e+04,705.810608,573.770325,706.818054,585.897400,2,"[705.811767578125, 706.814697265625]","[576.1650390625, 577.4144897460938]","[54921.79296875, 30861.908203125]",[1.002939224243164]


In [94]:
reload_feature_map.hull_info.to_pandas()

Unnamed: 0,hull_id,MZstart,RTstart,MZend,RTend,rt_points,mz_points,intens_points
0,QC1.mzML::3140128202761656841::0,445.902130,200.639130,445.902527,219.024033,"[200.63912963867188, 201.83787536621094, 203.0...","[445.90234375, 445.90240478515625, 445.9021301...","[5323.93310546875, 5364.64990234375, 7542.6015..."
1,QC1.mzML::3140128202761656841::1,446.901550,200.639130,446.902557,212.854370,"[200.63912963867188, 201.83787536621094, 203.0...","[446.9025573730469, 446.9021301269531, 446.902...","[2132.606689453125, 2963.083251953125, 1872.60..."
2,QC1.mzML::17186567986998007591::0,252.181808,228.764023,252.181946,244.602783,"[228.76402282714844, 229.9199676513672, 231.16...","[252.1818084716797, 252.18191528320312, 252.18...","[6225.435546875, 203993.890625, 562776.4375, 4..."
3,QC1.mzML::17186567986998007591::1,253.185089,228.764023,253.185608,239.703873,"[228.76402282714844, 229.9199676513672, 231.16...","[253.1852264404297, 253.18521118164062, 253.18...","[3768.0859375, 27921.669921875, 152985.078125,..."
4,QC1.mzML::15006895844463803923::0,344.226501,221.427612,344.228577,247.019714,"[221.4276123046875, 222.67210388183594, 226.36...","[344.22650146484375, 344.2268981933594, 344.22...","[2444.303955078125, 2652.00244140625, 2465.136..."
...,...,...,...,...,...,...,...,...
150,QC1.mzML::16848338621840768339::1,599.414551,570.033875,599.415588,576.165039,"[570.0338745117188, 574.966796875, 576.1650390...","[599.4155883789062, 599.415283203125, 599.4145...","[3903.8193359375, 4733.27001953125, 2966.22875..."
151,QC1.mzML::1103000384613136512::0,705.810608,573.770325,705.812317,585.897400,"[573.7703247070312, 574.966796875, 576.1650390...","[705.81201171875, 705.8115844726562, 705.81219...","[6845.88525390625, 6800.7763671875, 6981.02148..."
152,QC1.mzML::1103000384613136512::1,706.811401,573.770325,706.818054,585.897400,"[573.7703247070312, 574.966796875, 576.1650390...","[706.8153076171875, 706.8145141601562, 706.815...","[3681.091796875, 1325.0985107421875, 5481.8691..."
153,QC1.mzML::7163667108964009145::0,209.189941,579.857483,209.190094,585.897400,"[579.8574829101562, 581.053466796875, 582.2518...","[209.19000244140625, 209.18997192382812, 209.1...","[33363.90234375, 53031.91796875, 35343.7890625..."


In [95]:
reload_feature_map.feature_rtree_index

rtree.index.Index(bounds=[201.97244262695312, 200.63912963867188, 789.0614624023438, 599.1535034179688], size=75)

In [96]:
pickle.loads(pickle.dumps(reload_feature_map)).feature_rtree_index

rtree.index.Index(bounds=[201.97244262695312, 200.63912963867188, 789.0614624023438, 599.1535034179688], size=75)

In [11]:
rebuild_datas = OpenMSDataWrapper(features=[
    FeatureMap.from_oms(f,x,en).get_oms_feature_map() \
    for f,x,en in zip(datas.features,datas.chromatogram_peaks,datas.exp_names)
])
rebuild_datas.exp_names = datas.exp_names
rebuild_datas = FeatureLinker()(rebuild_datas)

Progress of 'Linking features':
-- done [took 0.01 s (CPU), 0.01 s (Wall)] -- 


In [12]:
rebuild_datas.consensus_map.get_df()

Unnamed: 0_level_0,sequence,charge,RT,mz,quality,QC2.mzML,QC1.mzML
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,,0,404.847657,695.425855,0.557743,4.537626e+06,1.026271e+05
0,,0,576.286209,209.189986,0.952328,1.552605e+05,1.354698e+05
0,,0,360.167820,207.159064,0.935903,2.195314e+06,5.035236e+06
0,,0,349.334946,533.324174,0.840550,7.844999e+04,1.233872e+05
0,,0,507.315594,293.109054,0.990750,1.042219e+05,9.651988e+04
...,...,...,...,...,...,...,...
0,,0,446.029355,725.372814,0.000000,2.473700e+05,0.000000e+00
0,,0,446.029355,709.356876,0.000000,6.133417e+05,0.000000e+00
0,,0,446.029355,741.365603,0.000000,2.869066e+05,0.000000e+00
0,,0,446.029355,618.372397,0.000000,5.882982e+06,0.000000e+00


In [13]:
def link_ms2_to_feature(feature_hulls: pd.DataFrame,spectrum_map: SpectrumMap) -> list[str]:
    spectrum_id_list = []
    for mz_start,rt_start,mz_end,rt_end in zip(
        feature_hulls['MZstart'],
        feature_hulls['RTstart'],
        feature_hulls['MZend'],
        feature_hulls['RTend'],
    ):
        spectrum_id_list += spectrum_map.search_ms2_by_range(
            (mz_start,rt_start,mz_end,rt_end)
        )
    return spectrum_id_list

def link_ms2_and_feature_map(
    feature_map: FeatureMap,
    spectrum_map: SpectrumMap,
    key_id: Literal["feature","spectrum"] = "feature",
) -> pd.Series[str | list[str]]:
    feature_id_bag = db.from_sequence(
        zip(feature_map.feature_info.index, feature_map.feature_info['hull_num'])
    )
    feature_hulls_id_bag = feature_id_bag.map(
        lambda x: [x[0]+f"::{i}" for i in range(x[1])]
    )
    feature_hulls_bag = feature_hulls_id_bag.map(
        lambda x: feature_map.hulls.loc[x]
    )
    spectrum_id_bag = feature_hulls_bag.map(
        lambda x: link_ms2_to_feature(x,spectrum_map)
    )
    spectrum_id_list = dask.compute(
        spectrum_id_bag, scheduler="threads"
    )[0]
    if key_id == "feature":
        mapping_series = pd.Series(spectrum_id_list, index=feature_map.feature_info.index)
        mapping_series.index.name = "feature_id"
        mapping_series.name = "spectrum_id"
        return mapping_series
    else:
        mapping_series = pd.Series()
        mapping_series.index.name = "spectrum_id"
        mapping_series.name = "feature_id"
        for spectrum_ids, feature_id in zip(spectrum_id_list, feature_map.feature_info.index):
            for spectrum_id in spectrum_ids:
                mapping_series[spectrum_id] = feature_id
        return mapping_series

In [14]:
link_ms2_and_feature_map(feature_map, spectrum_map, "feature")

feature_id
QC1.mzML::5261480272464681702                                           []
QC1.mzML::109232923244398657      [QC1.mzML::ms2::923, QC1.mzML::ms2::946]
QC1.mzML::17277684442908073595                        [QC1.mzML::ms2::936]
QC1.mzML::17310728823578640817    [QC1.mzML::ms2::973, QC1.mzML::ms2::995]
QC1.mzML::4758972346404056975                         [QC1.mzML::ms2::976]
                                                    ...                   
QC1.mzML::12179905224677114345                                          []
QC1.mzML::12181800865879437076                                          []
QC1.mzML::3655475155587324847                                           []
QC1.mzML::14201693722899224880                                          []
QC1.mzML::5198541186238866932                        [QC1.mzML::ms2::2370]
Name: spectrum_id, Length: 75, dtype: object

In [15]:
link_ms2_and_feature_map(feature_map, spectrum_map, "spectrum")

spectrum_id
QC1.mzML::ms2::923       QC1.mzML::109232923244398657
QC1.mzML::ms2::946       QC1.mzML::109232923244398657
QC1.mzML::ms2::936     QC1.mzML::17277684442908073595
QC1.mzML::ms2::916     QC1.mzML::17310728823578640817
QC1.mzML::ms2::973     QC1.mzML::17310728823578640817
QC1.mzML::ms2::995     QC1.mzML::17310728823578640817
QC1.mzML::ms2::976      QC1.mzML::4758972346404056975
QC1.mzML::ms2::1035    QC1.mzML::11346374657489545787
QC1.mzML::ms2::1054    QC1.mzML::11346374657489545787
QC1.mzML::ms2::1124     QC1.mzML::9410620042631636663
QC1.mzML::ms2::1183    QC1.mzML::14098708295893166351
QC1.mzML::ms2::1306    QC1.mzML::12673403173576306906
QC1.mzML::ms2::1304     QC1.mzML::8864025362939724457
QC1.mzML::ms2::1289    QC1.mzML::10970745381786157009
QC1.mzML::ms2::1349    QC1.mzML::10970745381786157009
QC1.mzML::ms2::1377     QC1.mzML::8746726905846740337
QC1.mzML::ms2::1391    QC1.mzML::15121964842776754730
QC1.mzML::ms2::1392     QC1.mzML::1679423022259619783
QC1.mzML::ms2::1

In [16]:
class XICMap(BaseModel):

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    ion_index: rtree.index.Index
    ion_df: pd.DataFrame

    @classmethod
    def from_oms(
        cls,
        exp: oms.MSExperiment,
    ) -> XICMap:
        ion_df = exp.get_massql_df()[0][["mz","rt","i"]]
        ion_df['rt'] = ion_df['rt'] * 60
        ion_index = rtree.index.Index()
        for i,(ion_id,ion_mz,ion_rt) in enumerate(
            zip(
                ion_df.index,
                ion_df['mz'],
                ion_df['rt'],
            )
        ):
            ion_index.insert(
                id=i,
                coordinates=(ion_mz, ion_rt),
                obj=ion_id,
            )
        return cls(ion_index=ion_index, ion_df=ion_df)

    @classmethod
    def from_ms1(
        cls,
        ms1: pd.DataFrame,
    ) -> XICMap:
        ion_df = {
            "mz": [],
            "rt": [],
            "i": [],
        }
        ion_index = rtree.index.Index()
        ion_id = 0
        for _,row in ms1.iterrows():
            rt = row['rt']
            for mz,intensity in zip(row['mz_array'],row['intensity_array']):
                ion_df['mz'].append(mz)
                ion_df['rt'].append(rt)
                ion_df['i'].append(intensity)
                ion_index.insert(
                    id=ion_id,
                    coordinates=(mz, rt),
                    obj=ion_id,
                )
                ion_id += 1
        ion_df = pd.DataFrame(ion_df)
        return cls(ion_index=ion_index, ion_df=ion_df)

    def search_ion_by_range(
        self,
        coordinates: tuple[
            float, # min_mz
            float, # min_rt
            float, # max_mz
            float, # max_rt
        ]
    ) -> list[int]:
        return list(self.ion_index.intersection(coordinates, objects="raw"))

In [17]:
xic_map = XICMap.from_oms(datas.exps[0])

In [18]:
xic_map.ion_df.loc[xic_map.search_ion_by_range(
    (200,200,201,202)
)]

Unnamed: 0,mz,rt,i
669,200.091904,201.837875,1270.613525
670,200.128159,201.837875,7574.313477
671,200.164642,201.837875,12525.169922
672,200.200958,201.837875,15436.75
673,200.972382,201.837875,16018.74707
0,200.09166,200.63913,4236.865234
1,200.128098,200.63913,2695.840576
2,200.172775,200.63913,6285.765625
3,200.183792,200.63913,11543.618164
4,200.200943,200.63913,15958.017578


In [19]:
xic_map = XICMap.from_ms1(spectrum_map.ms1_df)

In [20]:
xic_map.ion_df.loc[xic_map.search_ion_by_range(
    (200,200,201,202)
)]

Unnamed: 0,mz,rt,i
669,200.091904,201.837872,1270.613525
670,200.128159,201.837872,7574.313477
671,200.164642,201.837872,12525.169922
672,200.200958,201.837872,15436.75
673,200.972382,201.837872,16018.74707
0,200.09166,200.639135,4236.865234
1,200.128098,200.639135,2695.840576
2,200.172775,200.639135,6285.765625
3,200.183792,200.639135,11543.618164
4,200.200943,200.639135,15958.017578


In [21]:
class ConsensusMap(BaseModel):

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    consensus_df: pd.DataFrame
    consensus_feature_mapping: pd.Series
    feature_consensus_mapping: pd.Series

    @classmethod
    def from_oms(cls, consensus_map: oms.ConsensusMap) -> ConsensusMap:
        raw_consensus_df = consensus_map.get_df()
        exp_names = raw_consensus_df.columns[5:][::-1]
        consensus_bag = db.from_sequence(consensus_map)
        feature_id_bag = consensus_bag.map(
            lambda x: \
                [
                    f.getUniqueId() if f.getUniqueId() is str \
                    else f"{exp_names[f.getMapIndex()]}::{f.getUniqueId()}" \
                    for f in x.getFeatureList()
                ]
        )
        consensus_df = consensus_map.get_df().iloc[:,2:].reset_index(drop=True)
        consensus_df.index.name = "consensus_id"
        consensus_feature_mapping = pd.Series(
            feature_id_bag.compute(scheduler="threads"),
            index=consensus_df.index
        )
        consensus_feature_mapping.name = "feature_ids"
        consensus_feature_mapping.index.name = "consensus_id"
        feature_consensus_mapping = {}
        for cid,fids in consensus_feature_mapping.items():
            for fid in fids:
                feature_consensus_mapping[fid] = cid
        feature_consensus_mapping = pd.Series(feature_consensus_mapping)
        feature_consensus_mapping.name = "consensus_id"
        feature_consensus_mapping.index.name = "feature_id"
        return cls(
            consensus_df=consensus_df,
            consensus_feature_mapping=consensus_feature_mapping,
            feature_consensus_mapping=feature_consensus_mapping
        )

    def as_oms_feature_map(self) -> oms.FeatureMap:
        feature_map = oms.FeatureMap()
        for i,row in self.consensus_df.iterrows():
            feature = oms.Feature()
            feature.setUniqueId(i)
            feature.setMZ(row["mz"])
            feature.setRT(row["RT"])
            feature.setIntensity(row.iloc[3:].max())
            feature_map.push_back(feature)
        return feature_map

In [22]:
consensus_map = ConsensusMap.from_oms(datas.consensus_map)

In [23]:
consensus_map.consensus_df

Unnamed: 0_level_0,RT,mz,quality,QC2.mzML,QC1.mzML
consensus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,404.847657,695.425855,0.557743,4.537626e+06,1.026271e+05
1,576.286209,209.189986,0.952328,1.552605e+05,1.354698e+05
2,360.167820,207.159064,0.935903,2.195314e+06,5.035236e+06
3,349.334946,533.324174,0.840550,7.844999e+04,1.233872e+05
4,507.315594,293.109054,0.990750,1.042219e+05,9.651988e+04
...,...,...,...,...,...
126,446.029355,725.372814,0.000000,2.473700e+05,0.000000e+00
127,446.029355,709.356876,0.000000,6.133417e+05,0.000000e+00
128,446.029355,741.365603,0.000000,2.869066e+05,0.000000e+00
129,446.029355,618.372397,0.000000,5.882982e+06,0.000000e+00


In [24]:
consensus_map.consensus_feature_mapping

consensus_id
0      [QC1.mzML::1058058667861826189, QC2.mzML::5829...
1      [QC1.mzML::5198541186238866932, QC2.mzML::9748...
2      [QC1.mzML::6356764604209267192, QC2.mzML::1215...
3      [QC1.mzML::8102842054760010345, QC2.mzML::1834...
4      [QC1.mzML::10946711822686169604, QC2.mzML::526...
                             ...                        
126                     [QC2.mzML::15378851355750419626]
127                     [QC2.mzML::17722064519600937218]
128                     [QC2.mzML::18049518165761848085]
129                     [QC2.mzML::18163286508655570769]
130                     [QC2.mzML::18444431267099520080]
Name: feature_ids, Length: 131, dtype: object

In [25]:
consensus_map.feature_consensus_mapping

feature_id
QC1.mzML::1058058667861826189       0
QC2.mzML::5829911669225560868       0
QC1.mzML::5198541186238866932       1
QC2.mzML::974875078852663642        1
QC1.mzML::6356764604209267192       2
                                 ... 
QC2.mzML::15378851355750419626    126
QC2.mzML::17722064519600937218    127
QC2.mzML::18049518165761848085    128
QC2.mzML::18163286508655570769    129
QC2.mzML::18444431267099520080    130
Name: consensus_id, Length: 139, dtype: int64

In [26]:
consensus_map.get_oms_feature_map().get_df()

Unnamed: 0_level_0,peptide_sequence,peptide_score,ID_filename,ID_native_id,charge,RT,mz,RTstart,RTend,MZstart,MZend,quality,intensity
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,,,,,0,404.847657,695.425855,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,4.537626e+06
1,,,,,0,576.286209,209.189986,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.552605e+05
2,,,,,0,360.167820,207.159064,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,5.035236e+06
3,,,,,0,349.334946,533.324174,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.233872e+05
4,,,,,0,507.315594,293.109054,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.042219e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,,,,,0,446.029355,725.372814,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,2.473700e+05
127,,,,,0,446.029355,709.356876,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,6.133417e+05
128,,,,,0,446.029355,741.365603,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,2.869066e+05
129,,,,,0,446.029355,618.372397,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,5.882982e+06
