In [None]:
from __future__ import annotations

import copy
import json
import os
import pickle
import re
from re import Pattern
from typing import Any, ClassVar, Literal

import dask
import dask.bag as db
import geopandas as gpd
import numpy as np
import pandas as pd
import polars as pl
import pyopenms as oms
import rtree
from MetaMSTools.ms_tools import (
    AdductDetector,
    AdductDetectorConfig,
    FeatureFinder,
    FeatureFinderConfig,
    FeatureLinker,
    OpenMSDataWrapper,
    RTAligner,
    TICSmoother,
)
from pydantic import BaseModel, ConfigDict, Field
from shapely.geometry import box
from sqlalchemy import create_engine, text


def get_data_wrapper(file_paths: list[str] | None = None):
    """Run the preprocessing pipeline on a set of mzML files.

    The steps are applied in this order: experiment initialisation, TIC
    smoothing, feature finding (charge bounds fixed to 1), reference
    inference for alignment, RT alignment, feature linking and adduct
    detection (charge range fixed to 1..1).

    Args:
        file_paths: mzML files to process. When omitted, the two QC files
            under ``../data/raw_files`` are used (the previous hard-coded
            behaviour).

    Returns:
        The ``OpenMSDataWrapper`` returned by the final ``AdductDetector``
        step.
    """
    if file_paths is None:
        file_paths = [
            "../data/raw_files/QC1.mzML",
            "../data/raw_files/QC2.mzML",
        ]
    # Copy so the wrapper never holds a reference to the caller's list.
    qc_datas = OpenMSDataWrapper(file_paths=list(file_paths))
    qc_datas.init_exps()
    qc_datas = TICSmoother()(qc_datas)

    # Restrict feature finding to singly charged features.
    feature_config = FeatureFinderConfig()
    feature_config.feature_finding_metabo.charge_upper_bound = 1
    feature_config.feature_finding_metabo.charge_lower_bound = 1
    qc_datas = FeatureFinder(config=feature_config)(qc_datas)

    # A reference must be inferred before the aligner is run.
    qc_datas.infer_ref_feature_for_align()
    qc_datas = RTAligner()(qc_datas)
    qc_datas = FeatureLinker()(qc_datas)

    adduct_config = AdductDetectorConfig(
        charge_min=1,
        charge_max=1,
    )
    qc_datas = AdductDetector(
        config=adduct_config
    )(qc_datas)
    return qc_datas

In [2]:
# Run the whole preprocessing pipeline once; later cells reuse `datas`.
datas = get_data_wrapper()

Progress of 'mass trace detection':
-- done [took 3.23 s (CPU), 0.24 s (Wall)] -- 
Progress of 'mass trace detection':
-- done [took 0.20 s (CPU), 0.20 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 2.01 s (CPU), 0.07 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 6.82 s (CPU), 0.24 s (Wall)] -- 
Progress of 'assembling mass traces to features':
Loading metabolite isotope model with 5% RMS error
-- done [took 2.69 s (CPU), 0.11 s (Wall)] -- 
Progress of 'assembling mass traces to features':
-- done [took 3.85 s (CPU), 0.12 s (Wall)] -- 
Progress of 'Linking features':
-- done [took 0.56 s (CPU), 0.02 s (Wall)] -- 
Adding neutral: ---------- Adduct -----------------
Charge: 0
Amount: 1
MassSingle: 13.9793
Formula: H-2O1
log P: -2.30259

MassExplainer table size: 92
Generating Masses with threshold: -6.90776 ...
<Loading metabolite isotope model with 5% RMS error> occurred 2 times
done
0 of 13 valid net charge compomer results did not pass the featur



In [97]:
class BaseMap(BaseModel):
    """Base model whose fields describe how they are persisted.

    Every field is declared with two extra ``Field`` keywords that are read
    back through ``json_schema_extra``:

    * ``data_type`` -- ``"metadata"``, ``"index"`` or ``"data"``; selects the
      branch taken by :meth:`save` and :meth:`_base_load`.
    * ``save_mode`` -- storage hint (``"json"``, ``"rtree"``, ``"sqlite"``;
      any other value falls back to the type-based / pickle branches).

    On-disk layout written by :meth:`save`::

        <save_dir>/metadata.json      metadata fields + "module_type"
        <save_dir>/index/<field>.*    index fields
        <save_dir>/data/data.sqlite   data fields with save_mode "sqlite"
        <save_dir>/data/<field>.pkl   every other data field
    """

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    # Name of the experiment.
    exp_name: str = Field(
        ...,
        data_type="metadata",
        save_mode="json",
        description="实验名称"
    )
    # Free-form metadata attached to the data.
    metadata: dict = Field(
        default={},
        data_type="metadata",
        save_mode="json",
        description="数据的metadata信息。"
    )

    def __getstate__(self):
        """Return the pickle state.

        The state is deep-copied so the live instance is not modified, and
        every polars frame that contains a ``pl.Object`` column is replaced
        by its pandas conversion.
        """
        state = copy.deepcopy(super().__getstate__())
        for k, v in state['__dict__'].items():
            if isinstance(v, pl.DataFrame) and any(tp == pl.Object for tp in v.dtypes):
                state['__dict__'][k] = v.to_pandas()
        return state

    def __setstate__(self, state):
        """Restore the pickle state produced by :meth:`__getstate__`.

        pandas frames stored in polars-annotated fields are converted back to
        polars. For rtree-annotated fields, the method named by the field's
        ``init_func`` extra (if any) is called after the state is restored.
        """
        init_func = []
        for k, field in type(self).model_fields.items():
            annotation = field.annotation
            if annotation == pl.DataFrame or annotation == pl.DataFrame | None:
                if isinstance(state['__dict__'][k], pd.DataFrame):
                    state['__dict__'][k] = pl.from_pandas(state['__dict__'][k])
            elif annotation == rtree.index.Index | None:
                # Optional index: only rebuild when one was actually present.
                if isinstance(state['__dict__'][k], rtree.index.Index):
                    if "init_func" in field.json_schema_extra:
                        init_func.append(field.json_schema_extra["init_func"])
            elif annotation == rtree.index.Index:
                if "init_func" in field.json_schema_extra:
                    init_func.append(field.json_schema_extra["init_func"])
        super().__setstate__(state)
        # The init functions need a fully restored instance, so run them last.
        for func_name in init_func:
            getattr(self, func_name)()

    def save(self, save_dir_path: str):
        """Write every field to ``save_dir_path`` according to its extras.

        Args:
            save_dir_path: Target directory; created if missing.

        Raises:
            ValueError: If a ``save_mode="sqlite"`` data field holds something
                other than a polars or pandas DataFrame (including ``None``).
        """
        metadata_path = os.path.join(save_dir_path, "metadata.json")
        index_dir_path = os.path.join(save_dir_path, "index")
        data_dir_path = os.path.join(save_dir_path, "data")
        # exist_ok replaces the racy "check, then create" sequence.
        for dir_path in (save_dir_path, index_dir_path, data_dir_path):
            os.makedirs(dir_path, exist_ok=True)
        sqlite_db_path = os.path.join(data_dir_path, "data.sqlite")
        engine = create_engine(f"sqlite:///{sqlite_db_path}")

        metadata_to_save = {"module_type": self.__class__.__name__}
        try:
            for name, field in type(self).model_fields.items():
                extra = field.json_schema_extra
                value = getattr(self, name)
                if extra['data_type'] == 'metadata':
                    metadata_to_save[name] = value
                elif extra['data_type'] == 'index':
                    if isinstance(value, rtree.index.Index):
                        rtree_save_path = os.path.join(index_dir_path, name)
                        # Remove stale index files before rebuilding at this path.
                        for suffix in (".dat", ".idx"):
                            if os.path.exists(rtree_save_path + suffix):
                                os.remove(rtree_save_path + suffix)
                        # The index is rebuilt on disk by the method named in
                        # the field's "build_func" extra, then closed.
                        tree: rtree.index.Index = getattr(self, extra['build_func'])(rtree_save_path)
                        tree.close()
                    elif isinstance(value, pd.Index):
                        index_save_path = os.path.join(index_dir_path, name + ".csv")
                        pd.Series(value).to_csv(index_save_path, header=False)
                    elif isinstance(value, gpd.GeoDataFrame):
                        value.to_parquet(os.path.join(index_dir_path, name + ".parquet"))
                    else:
                        other_index_save_path = os.path.join(index_dir_path, name + ".pkl")
                        with open(other_index_save_path, 'wb') as fp:
                            pickle.dump(value, fp)
                elif extra['data_type'] == 'data':
                    if extra['save_mode'] == 'sqlite':
                        if isinstance(value, pl.DataFrame):
                            with engine.connect() as conn:
                                value.write_database(table_name=name, connection=conn, if_table_exists="replace")
                        elif isinstance(value, pd.DataFrame):
                            with engine.connect() as conn:
                                value.to_sql(name, conn, if_exists="replace")
                        else:
                            raise ValueError(f"Unsupported data type to save as sqlite: {type(value)}")
                    else:
                        other_data_save_path = os.path.join(data_dir_path, name + ".pkl")
                        with open(other_data_save_path, 'wb') as fp:
                            pickle.dump(value, fp)
        finally:
            # Release the SQLite handle even when a field fails to save.
            engine.dispose()

        with open(metadata_path, 'w') as fp:
            json.dump(metadata_to_save, fp)

    @classmethod
    def _base_load(cls, save_dir_path: str) -> dict[str, Any]:
        """Read a directory written by :meth:`save` into constructor kwargs.

        Fields whose file or table is missing are left out of the result, so
        the model defaults apply.

        Args:
            save_dir_path: Directory previously passed to :meth:`save`.

        Returns:
            Mapping of field name to loaded value.

        Raises:
            ValueError: If ``metadata.json`` does not exist.
            KeyError: If ``metadata.json`` has no ``exp_name`` entry.
        """
        data_dict = {}

        metadata_path = os.path.join(save_dir_path, "metadata.json")
        index_dir_path = os.path.join(save_dir_path, "index")
        data_dir_path = os.path.join(save_dir_path, "data")

        if not os.path.exists(metadata_path):
            raise ValueError(f"Metadata file not found in {save_dir_path}")
        with open(metadata_path) as fp:
            saved_metadata: dict = json.load(fp)
        data_dict['exp_name'] = saved_metadata['exp_name']
        # save() stores each metadata field under its own key, so read them
        # back by name. Returning the whole JSON dict as ``metadata`` would
        # wrap it as {"metadata": {...}} and nest deeper on every round trip.
        for name, field in cls.model_fields.items():
            if field.json_schema_extra['data_type'] == 'metadata' and name in saved_metadata:
                data_dict[name] = saved_metadata[name]

        if os.path.exists(index_dir_path):
            for name, field in cls.model_fields.items():
                extra = field.json_schema_extra
                if extra['data_type'] != 'index':
                    continue
                annotation = field.annotation
                if extra['save_mode'] == 'rtree':
                    rtree_save_path = os.path.join(index_dir_path, name)
                    if os.path.exists(rtree_save_path + ".dat") and os.path.exists(rtree_save_path + ".idx"):
                        data_dict[name] = rtree.index.Index(rtree_save_path)
                elif annotation == pd.Index or annotation == pd.Index | None:
                    index_save_path = os.path.join(index_dir_path, name + ".csv")
                    if os.path.exists(index_save_path):
                        data_dict[name] = pd.Index(
                            pd.read_csv(index_save_path, header=None, index_col=0).iloc[:, 0]
                        )
                elif annotation == gpd.GeoDataFrame or annotation == gpd.GeoDataFrame | None:
                    index_save_path = os.path.join(index_dir_path, name + ".parquet")
                    if os.path.exists(index_save_path):
                        data_dict[name] = gpd.read_parquet(index_save_path)
                else:
                    other_index_save_path = os.path.join(index_dir_path, name + ".pkl")
                    if os.path.exists(other_index_save_path):
                        # NOTE: pickle executes arbitrary code on load; only
                        # open directories that come from a trusted source.
                        with open(other_index_save_path, 'rb') as fp:
                            data_dict[name] = pickle.load(fp)

        if os.path.exists(data_dir_path):
            sqlite_db_path = os.path.join(data_dir_path, "data.sqlite")
            # Pickled data fields must load even when no SQLite file exists,
            # so the engine is optional rather than gating the whole loop.
            engine = (
                create_engine(f"sqlite:///{sqlite_db_path}")
                if os.path.exists(sqlite_db_path)
                else None
            )
            try:
                for name, field in cls.model_fields.items():
                    extra = field.json_schema_extra
                    if extra['data_type'] != 'data':
                        continue
                    annotation = field.annotation
                    if extra['save_mode'] == 'sqlite':
                        if engine is None:
                            continue
                        with engine.connect() as conn:
                            table_exists = conn.execute(
                                text(
                                    "SELECT name FROM sqlite_master "
                                    "WHERE type='table' AND name=:name"
                                ),
                                {"name": name},
                            ).fetchone() is not None
                            if not table_exists:
                                continue
                            if annotation == pl.DataFrame or annotation == pl.DataFrame | None:
                                data_dict[name] = pl.read_database(query=f"SELECT * FROM {name}", connection=conn)
                            elif annotation == pd.DataFrame or annotation == pd.DataFrame | None:
                                data_dict[name] = pd.read_sql_query(f"SELECT * FROM {name}", conn)
                    else:
                        other_data_save_path = os.path.join(data_dir_path, name + ".pkl")
                        if os.path.exists(other_data_save_path):
                            # NOTE: see the pickle warning above.
                            with open(other_data_save_path, 'rb') as fp:
                                data_dict[name] = pickle.load(fp)
            finally:
                if engine is not None:
                    engine.dispose()

        return data_dict

    @classmethod
    def load(cls, save_dir_path: str):
        """Build an instance from a directory written by :meth:`save`."""
        return cls(**cls._base_load(save_dir_path))


In [None]:
class SpectrumMap(BaseMap):
    """MS1/MS2 spectra of one experiment, as polars tables plus geopandas indexes.

    ``ms1_df`` / ``ms2_df`` hold one row per spectrum. ``ms1_index`` /
    ``ms2_index`` hold one point per spectrum: ``(rt, 0)`` for MS1 and
    ``(rt, precursor_mz)`` for MS2, with an ``iloc`` column giving the row
    position in the matching table.
    """

    # Extracts the scan number from a native ID containing "scan=<n>".
    scan_id_matcher: ClassVar[Pattern] = re.compile(r'scan=(\d+)')

    # Spatial index table of the MS1 spectra (geopandas).
    ms1_index: gpd.GeoDataFrame | None = Field(
        default=None,
        data_type="index",
        save_mode="parquet",
        description="MS1谱图的空间索引表，基于geopandas"
    )
    # MS1 spectra table (polars).
    ms1_df: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="MS1谱图的DataFrame，基于polars"
    )
    # Spatial index table of the MS2 spectra (geopandas).
    ms2_index: gpd.GeoDataFrame | None = Field(
        default=None,
        data_type="index",
        save_mode="parquet",
        description="MS2谱图的空间索引表，基于geopandas"
    )
    # MS2 spectra table (polars).
    ms2_df: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="MS2谱图的DataFrame，基于polars"
    )

    @staticmethod
    def get_exp_meta(exp: oms.MSExperiment) -> dict[str, str]:
        """Read experiment-level metadata from the first spectrum.

        The "filter string" meta value of ``exp[0]`` is split on single
        spaces; tokens 0, 1 and 3 are returned as ``ms_type``, ``ion_mode``
        and ``ion_source``.
        NOTE(review): this assumes a specific filter-string layout -- confirm
        it holds for every instrument/vendor that is fed in.
        """
        spec: oms.MSSpectrum = exp[0]
        meta_info_string = spec.getMetaValue("filter string")
        meta_info_list = meta_info_string.split(" ")
        ms_type = meta_info_list[0]
        ion_mode = meta_info_list[1]
        ion_source = meta_info_list[3]
        return {
            "ms_type": ms_type,
            "ion_mode": ion_mode,
            "ion_source": ion_source,
        }

    @staticmethod
    def get_scan_index(spec: oms.MSSpectrum) -> int:
        """Return the integer after ``scan=`` in the spectrum's native ID.

        Raises:
            ValueError: If the native ID contains no ``scan=<digits>`` part.
        """
        native_id = spec.getNativeID()
        scan_id_match = SpectrumMap.scan_id_matcher.search(native_id)
        if scan_id_match is None:
            # Built from adjacent literals: a backslash continuation inside
            # the string would embed the source indentation in the message.
            raise ValueError(
                "Cannot extract scan index from "
                f"spectrum native ID: {native_id}"
            )
        return int(scan_id_match.group(1))

    @staticmethod
    def ms2spec2dfdict(spec: oms.MSSpectrum) -> dict[
        Literal[
            "spec_id",
            "rt",
            "precursor_mz",
            "base_peak_mz",
            "base_peak_intensity",
            "mz_array",
            "intensity_array",
        ],
        int | float | list[float]
    ]:
        """Convert one MS2 spectrum into a row dict for ``ms2_df``.

        The precursor m/z is taken from the first precursor; peak arrays are
        returned as plain lists.
        """
        spec_id = SpectrumMap.get_scan_index(spec)
        rt = spec.getRT()
        precursor_mz = spec.getPrecursors()[0].getMZ()
        base_peak_mz = spec.getMetaValue("base peak m/z")
        base_peak_intensity = spec.getMetaValue("base peak intensity")
        mz_array, intensity_array = spec.get_peaks()
        return {
            "spec_id": spec_id,
            "rt": rt,
            "precursor_mz": precursor_mz,
            "base_peak_mz": base_peak_mz,
            "base_peak_intensity": base_peak_intensity,
            "mz_array": mz_array.tolist(),
            "intensity_array": intensity_array.tolist(),
        }

    @staticmethod
    def ms1spec2dfdict(spec: oms.MSSpectrum) -> dict[
        Literal[
            "spec_id",
            "rt",
            "mz_array",
            "intensity_array",
        ],
        int | float | list[float]
    ]:
        """Convert one MS1 spectrum into a row dict for ``ms1_df``."""
        spec_id = SpectrumMap.get_scan_index(spec)
        rt = spec.getRT()
        mz_array, intensity_array = spec.get_peaks()
        return {
            "spec_id": spec_id,
            "rt": rt,
            "mz_array": mz_array.tolist(),
            "intensity_array": intensity_array.tolist(),
        }

    @staticmethod
    def _is_integer_dtype(dtype) -> bool:
        """Tell whether ``dtype`` is one of the polars fixed-width integer types."""
        return dtype in (
            pl.Int8, pl.Int16, pl.Int32, pl.Int64,
            pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        )

    @staticmethod
    def _encode_array_columns(df: pl.DataFrame) -> pl.DataFrame:
        """Render ``mz_array`` / ``intensity_array`` as ``"[a,b,...]"`` strings."""
        return df.with_columns(
            [
                ("[" + pl.col(name).cast(pl.List(pl.String)).list.join(",") + "]")
                .alias(name)
                for name in ("mz_array", "intensity_array")
            ]
        )

    @staticmethod
    def _decode_array_columns(df: pl.DataFrame) -> pl.DataFrame:
        """Inverse of :meth:`_encode_array_columns`: parse back to ``List(Float32)``."""
        return df.with_columns(
            [
                pl.col(name)
                .str.strip_chars_start("[")
                .str.strip_chars_end("]")
                .str.split(",")
                .cast(pl.List(pl.Float32))
                for name in ("mz_array", "intensity_array")
            ]
        )

    def insert_ms1_id_to_ms2(self) -> None:
        """Add an ``ms1_id`` column to ``ms2_df``.

        Each MS2 row receives the ``spec_id`` of the closest MS1 row whose
        ``spec_id`` is not greater than its own (backward as-of join). MS2
        spectra without such an MS1 spectrum get null.

        Raises:
            ValueError: If either table has not been loaded.
        """
        if self.ms1_df is None or self.ms2_df is None:
            raise ValueError(
                "MS1 and MS2 dataframes must be loaded "
                "before inserting MS1 IDs to MS2 dataframe"
            )
        ms1_df_mapping = self.ms1_df.with_columns(
            pl.col('spec_id').alias('ms1_id')
        ).select(['spec_id', 'ms1_id'])
        self.ms2_df = self.ms2_df.join_asof(
            ms1_df_mapping,
            left_on='spec_id',
            right_on='spec_id',
            strategy='backward'
        )

    def convert_scan_to_spec_id(self) -> None:
        """Turn integer scan numbers into ``<exp_name>::ms<level>::<scan>`` strings.

        Applied to ``ms1_df.spec_id``, ``ms2_df.spec_id`` and
        ``ms2_df.ms1_id``. A column that is no longer integer-typed is left
        alone, so calling this twice is harmless. The geopandas indexes are
        relabelled to match the new ids.
        """
        if SpectrumMap._is_integer_dtype(self.ms1_df.schema['spec_id']):
            self.ms1_df = self.ms1_df.with_columns(
                (f"{self.exp_name}::ms1::" + self.ms1_df['spec_id'].cast(str)).alias('spec_id')
            )
            self.ms1_index.index = pd.Index(self.ms1_df['spec_id'].to_list())
        if SpectrumMap._is_integer_dtype(self.ms2_df.schema['spec_id']):
            self.ms2_df = self.ms2_df.with_columns(
                (f"{self.exp_name}::ms2::" + self.ms2_df['spec_id'].cast(str)).alias('spec_id')
            )
            self.ms2_index.index = pd.Index(self.ms2_df['spec_id'].to_list())
        if SpectrumMap._is_integer_dtype(self.ms2_df.schema['ms1_id']):
            self.ms2_df = self.ms2_df.with_columns(
                (f"{self.exp_name}::ms1::" + self.ms2_df['ms1_id'].cast(str)).alias('ms1_id')
            )

    def modify_ms2_rt(self) -> None:
        """Replace each MS2 ``rt`` with the ``rt`` of its linked MS1 spectrum.

        Rows whose ``ms1_id`` is null keep their own ``rt``. Only ``ms2_df``
        is touched; ``ms2_index`` keeps the geometry it was built with.
        """
        ms1_rt_df = self.ms1_df.select(['spec_id', 'rt']).rename({'rt': 'ms1_rt', 'spec_id': 'ms1_id'})
        joined_df = self.ms2_df.join(ms1_rt_df, on='ms1_id', how='left')
        self.ms2_df = joined_df.with_columns(
            pl.when(pl.col('ms1_id').is_not_null())
            .then(pl.col('ms1_rt'))
            .otherwise(pl.col('rt'))
            .alias('rt')
        ).drop('ms1_rt')

    def search_ms2_by_range(
        self,
        coordinates: tuple[
            float, # min_rt
            float, # min_mz
            float, # max_rt
            float, # max_mz
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        """Find MS2 spectra whose index geometry intersects a box.

        Args:
            coordinates: ``(min_rt, min_mz, max_rt, max_mz)``.
            return_type: ``"id"`` returns the index labels, ``"df"`` the
                matching rows of ``ms2_df``; any other value returns the
                positional indices.
        """
        iloc = list(self.ms2_index.sindex.intersection(coordinates))
        if return_type == "id":
            return self.ms2_index.index[iloc].tolist()
        elif return_type == "df":
            return self.ms2_df[iloc]
        else:
            return iloc

    @classmethod
    def from_oms(
        cls,
        exp: oms.MSExperiment,
        exp_name: str,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> SpectrumMap:
        """Build a map from a pyopenms experiment.

        Spectra are split by MS level and converted to row dicts through a
        dask bag. ``rt`` is divided by 60 (presumably seconds to minutes --
        confirm against the source data). MS2 rows are then linked to MS1
        rows, scan numbers become string ids, and MS2 ``rt`` is replaced by
        the linked MS1 ``rt``.

        Args:
            exp: Source experiment.
            exp_name: Name stored on the map and used as id prefix.
            worker_type: dask scheduler name.
            num_workers: Used both as bag partition count and worker count.
        """
        spec_bag = db.from_sequence(exp, npartitions=num_workers)
        ms1_bag = spec_bag.filter(lambda x: x.getMSLevel() == 1)
        ms2_bag = spec_bag.filter(lambda x: x.getMSLevel() == 2)
        ms1_bag = ms1_bag.map(cls.ms1spec2dfdict)
        ms2_bag = ms2_bag.map(cls.ms2spec2dfdict)
        ms1, ms2 = dask.compute(ms1_bag, ms2_bag, scheduler=worker_type, num_workers=num_workers)
        ms1_df = pl.DataFrame(ms1, schema={
            "spec_id": pl.Int32,
            "rt": pl.Float32,
            "mz_array": pl.List(pl.Float32),
            "intensity_array": pl.List(pl.Float32),
        })
        ms1_df = ms1_df.with_columns(
            (pl.col('rt') / 60.0),
        )
        ms1_index = gpd.GeoDataFrame(
            {"iloc": range(len(ms1_df))},
            index=ms1_df['spec_id'],
            geometry=gpd.points_from_xy(
                x=ms1_df['rt'],
                y=[0] * len(ms1_df),
            )
        )
        ms2_df = pl.DataFrame(ms2, schema={
            "spec_id": pl.Int32,
            "rt": pl.Float32,
            "precursor_mz": pl.Float32,
            "base_peak_mz": pl.Float32,
            "base_peak_intensity": pl.Float32,
            "mz_array": pl.List(pl.Float32),
            "intensity_array": pl.List(pl.Float32),
        })
        ms2_df = ms2_df.with_columns(
            (pl.col('rt') / 60.0),
        )
        ms2_index = gpd.GeoDataFrame(
            {"iloc": range(len(ms2_df))},
            index=ms2_df['spec_id'],
            geometry=gpd.points_from_xy(
                x=ms2_df['rt'],
                y=ms2_df['precursor_mz'],
            )
        )
        metadata = cls.get_exp_meta(exp)
        spectrum_map = cls(
            exp_name=exp_name,
            metadata=metadata,
            ms1_index=ms1_index,
            ms1_df=ms1_df,
            ms2_index=ms2_index,
            ms2_df=ms2_df,
        )
        # Order matters: ids are linked while still integers, then turned
        # into strings, then the rt join uses the string ids.
        spectrum_map.insert_ms1_id_to_ms2()
        spectrum_map.convert_scan_to_spec_id()
        spectrum_map.modify_ms2_rt()
        return spectrum_map

    def save(self, save_dir_path: str):
        """Save the map; list columns are stored as ``"[a,b,...]"`` text.

        A shallow copy carries the encoded tables, so the tables on ``self``
        stay as list columns. A table that is ``None`` is passed through
        unchanged and rejected by ``BaseMap.save`` with its ``ValueError``.
        """
        self_to_save = copy.copy(self)
        if self.ms1_df is not None:
            self_to_save.ms1_df = SpectrumMap._encode_array_columns(self.ms1_df)
        if self.ms2_df is not None:
            self_to_save.ms2_df = SpectrumMap._encode_array_columns(self.ms2_df)

        super(SpectrumMap, self_to_save).save(save_dir_path)

    @classmethod
    def load(cls, save_dir_path: str):
        """Load a map written by :meth:`save`, parsing the text arrays back to lists."""
        data_dict = cls._base_load(save_dir_path)

        for key in ('ms1_df', 'ms2_df'):
            frame = data_dict.get(key)
            if isinstance(frame, pl.DataFrame):
                data_dict[key] = SpectrumMap._decode_array_columns(frame)

        return cls(**data_dict)

In [None]:
# Build a SpectrumMap from the first experiment held by the wrapper.
spectrum_map = SpectrumMap.from_oms(datas.exps[0], datas.exp_names[0])

In [110]:
# Persist the map to disk (metadata JSON, parquet indexes, SQLite tables).
spectrum_map.save("../cache/test_spectrum_map")

In [111]:
# Reload the map from the directory written by the previous cell.
reload_spectrum_map = SpectrumMap.load("../cache/test_spectrum_map")

In [None]:
# Round-trip through pickle to exercise BaseMap.__getstate__ / __setstate__.
reload_spectrum_map: SpectrumMap = pickle.loads(pickle.dumps(reload_spectrum_map))

In [114]:
# Inspect the MS1 index after the save/load and pickle round trips.
reload_spectrum_map.ms1_index

Unnamed: 0,iloc,geometry
QC1.mzML::ms1::797,0,POINT (3.34399 0)
QC1.mzML::ms1::802,1,POINT (3.36396 0)
QC1.mzML::ms1::807,2,POINT (3.384 0)
QC1.mzML::ms1::812,3,POINT (3.40474 0)
QC1.mzML::ms1::817,4,POINT (3.4255 0)
...,...,...
QC1.mzML::ms1::2423,329,POINT (9.90593 0)
QC1.mzML::ms1::2428,330,POINT (9.92663 0)
QC1.mzML::ms1::2433,331,POINT (9.94589 0)
QC1.mzML::ms1::2438,332,POINT (9.9659 0)


In [115]:
# Inspect the MS2 index after the save/load and pickle round trips.
reload_spectrum_map.ms2_index

Unnamed: 0,iloc,geometry
QC1.mzML::ms2::795,0,POINT (3.335 173.128)
QC1.mzML::ms2::796,1,POINT (3.339 233.128)
QC1.mzML::ms2::801,2,POINT (3.359 207.985)
QC1.mzML::ms2::804,3,POINT (3.37 224.128)
QC1.mzML::ms2::808,4,POINT (3.386 131.118)
...,...,...
QC1.mzML::ms2::2434,713,POINT (9.947 279.093)
QC1.mzML::ms2::2436,714,POINT (9.957 371.316)
QC1.mzML::ms2::2440,715,POINT (9.972 224.128)
QC1.mzML::ms2::2441,716,POINT (9.977 233.128)


In [116]:
# Show the MS1 table via pandas for the notebook's rich display.
reload_spectrum_map.ms1_df.to_pandas()

Unnamed: 0,spec_id,rt,mz_array,intensity_array
0,QC1.mzML::ms1::797,3.343986,"[200.09166, 200.1281, 200.17278, 200.18379, 20...","[4236.865, 2695.8406, 6285.7656, 11543.618, 15..."
1,QC1.mzML::ms1::802,3.363965,"[200.0919, 200.12816, 200.16464, 200.20096, 20...","[1270.6135, 7574.3135, 12525.17, 15436.75, 160..."
2,QC1.mzML::ms1::807,3.384004,"[200.00519, 200.07043, 200.09167, 200.12814, 2...","[2496.5212, 759.7963, 1507.5992, 3919.854, 717..."
3,QC1.mzML::ms1::812,3.404740,"[200.0919, 200.12811, 200.16447, 200.17265, 20...","[5781.84, 971.9057, 4665.1494, 11266.042, 1676..."
4,QC1.mzML::ms1::817,3.425502,"[200.00508, 200.06125, 200.07079, 200.0918, 20...","[0.0, 6932.186, 6468.782, 2833.361, 292.0821, ..."
...,...,...,...,...
329,QC1.mzML::ms1::2423,9.905934,"[200.12822, 200.18399, 200.2009, 200.97238, 20...","[0.0, 13750.637, 19217.314, 17962.502, 13282.3..."
330,QC1.mzML::ms1::2428,9.926630,"[200.09184, 200.1132, 200.12827, 200.2009, 200...","[3245.2817, 0.0, 2677.9502, 11551.505, 18565.9..."
331,QC1.mzML::ms1::2433,9.945886,"[200.12798, 200.20091, 200.9724, 201.00827, 20...","[556.1304, 23764.205, 25525.152, 16996.648, 64..."
332,QC1.mzML::ms1::2438,9.965904,"[200.09169, 200.12802, 200.20093, 200.97238, 2...","[0.0, 10572.253, 16608.469, 17686.082, 15538.6..."


In [117]:
# Show the MS2 table via pandas for the notebook's rich display.
reload_spectrum_map.ms2_df.to_pandas()

Unnamed: 0,spec_id,rt,precursor_mz,base_peak_mz,base_peak_intensity,mz_array,intensity_array,ms1_id
0,QC1.mzML::ms2::795,3.334743,173.128479,173.128403,20729.001953,[200.6832],[2170.6648],
1,QC1.mzML::ms2::796,3.339177,233.128433,174.091309,106759.648438,"[212.9503, 216.10178, 233.12871]","[3391.8684, 8426.791, 4821.9966]",
2,QC1.mzML::ms2::801,3.343986,207.985245,184.969223,129272.453125,"[202.97972, 207.98537, 208.13326]","[2566.6626, 7957.564, 19053.58]",QC1.mzML::ms1::797
3,QC1.mzML::ms2::804,3.363965,224.128128,165.054581,57772.613281,"[203.77977, 224.12813, 229.21915]","[2704.0513, 19318.791, 2518.6877]",QC1.mzML::ms1::802
4,QC1.mzML::ms2::808,3.384004,131.117920,90.947571,41532.687500,[207.17068],[2081.907],QC1.mzML::ms1::807
...,...,...,...,...,...,...,...,...
713,QC1.mzML::ms2::2434,9.945886,279.093353,219.056870,33164.667969,"[201.0457, 219.05687, 252.44502]","[2686.1824, 33164.668, 2422.1316]",QC1.mzML::ms1::2433
714,QC1.mzML::ms2::2436,9.945886,371.315826,147.065063,56023.519531,"[241.17986, 259.1897, 260.79556, 269.67587, 29...","[5598.55, 2642.419, 2600.861, 2115.9075, 2110....",QC1.mzML::ms1::2433
715,QC1.mzML::ms2::2440,9.965904,224.128128,155.974701,34200.113281,[224.12831],[11006.43],QC1.mzML::ms1::2438
716,QC1.mzML::ms2::2441,9.965904,233.128494,174.091278,65971.070312,"[216.10187, 233.12769]","[5839.044, 4289.2373]",QC1.mzML::ms1::2438


In [118]:
# Query MS2 spectra inside the box (min_rt, min_mz, max_rt, max_mz);
# the default return_type="id" yields the index labels.
reload_spectrum_map.search_ms2_by_range(
    (3,200,4,250)
)

['QC1.mzML::ms2::941',
 'QC1.mzML::ms2::851',
 'QC1.mzML::ms2::801',
 'QC1.mzML::ms2::915',
 'QC1.mzML::ms2::895',
 'QC1.mzML::ms2::864',
 'QC1.mzML::ms2::844',
 'QC1.mzML::ms2::925',
 'QC1.mzML::ms2::804',
 'QC1.mzML::ms2::884',
 'QC1.mzML::ms2::824',
 'QC1.mzML::ms2::904',
 'QC1.mzML::ms2::944',
 'QC1.mzML::ms2::796',
 'QC1.mzML::ms2::821',
 'QC1.mzML::ms2::910',
 'QC1.mzML::ms2::880',
 'QC1.mzML::ms2::841']

In [153]:
class FeatureMap(BaseMap):
    """Features and their hulls as polars tables plus geopandas indexes.

    Each ``*_info`` table is paired with a ``*_index`` GeoDataFrame; the
    ``data_type`` / ``save_mode`` extras on every field are read by the
    ``BaseMap`` persistence code.
    """

    # Spatial index table of the features (geopandas).
    feature_index: gpd.GeoDataFrame | None = Field(
        default=None,
        data_type="index",
        save_mode="parquet",
        description="Feature的空间索引表，基于geopandas"
    )
    # Feature information table (polars).
    feature_info: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="Feature信息表，基于polars"
    )
    # Spatial index table of the hulls (geopandas).
    hull_index: gpd.GeoDataFrame | None = Field(
        default=None,
        data_type="index",
        save_mode="parquet",
        description="Hull的空间索引表，基于geopandas"
    )
    # Hull information table (polars).
    hull_info: pl.DataFrame | None = Field(
        default=None,
        data_type="data",
        save_mode="sqlite",
        description="Hull信息表，基于polars"
    )

    @staticmethod
    def get_feature_metadata(feature: oms.Feature) -> dict[
        Literal[
            'hull_num',"hull_mz","hull_rt","hull_intensity",
            "isotope_pattern",
            "adduct_type","adduct_mass",
        ],
        str | float | int | list[float]
    ]:
        """Collect selected meta values of one feature into a dict.

        The mass-trace and isotope values are always read. ``adduct_type``
        and ``adduct_mass`` are added only when the feature carries the
        ``dc_charge_adducts`` meta key.

        Args:
            feature: Feature whose meta values are read.

        Returns:
            Dict with the keys listed in the return annotation; the two
            adduct keys may be absent.
        """
        raw_keys = []
        feature.getKeys(raw_keys)
        # pyopenms may fill the list with bytes rather than str; normalise so
        # the membership test below works in both cases.
        all_keys = {
            key.decode() if isinstance(key, bytes) else key
            for key in raw_keys
        }
        metadata = {
            "hull_num": feature.getMetaValue("num_of_masstraces"),
            "hull_mz": feature.getMetaValue("masstrace_centroid_mz"),
            "hull_rt": feature.getMetaValue("masstrace_centroid_rt"),
            "hull_intensity": feature.getMetaValue("masstrace_intensity"),
            "isotope_pattern": feature.getMetaValue("isotope_distances"),
        }
        if "dc_charge_adducts" in all_keys:
            metadata["adduct_type"] = feature.getMetaValue("dc_charge_adducts")
            metadata["adduct_mass"] = feature.getMetaValue("dc_charge_adduct_mass")
        return metadata

    @staticmethod
    def get_feature_info(
        feature_map: oms.FeatureMap,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> pl.DataFrame:
        """Build the per-feature table from a pyopenms feature map.

        The position/extent columns come from ``feature_map.get_df()``; the
        meta values are gathered per feature through a dask bag and appended
        column-wise. The two parts are aligned purely by row order.

        Args:
            feature_map: Source feature map.
            worker_type: dask scheduler name.
            num_workers: Used both as bag partition count and worker count.

        Returns:
            One row per feature: ``feature_id``, the seven numeric columns,
            the hull/isotope list columns and, when present, the adduct
            columns.
        """
        feature_table = feature_map.get_df()[
            ["RT","mz","intensity","MZstart","RTstart","MZend","RTend"]
        ]
        # The index is exported as the "feature_id" column below.
        feature_table.index.name = "feature_id"
        feature_info = pl.from_pandas(
            feature_table,
            schema_overrides = {
                "RT": pl.Float32,
                "mz": pl.Float32,
                "intensity": pl.Float32,
                "MZstart": pl.Float32,
                "RTstart": pl.Float32,
                "MZend": pl.Float32,
                "RTend": pl.Float32,
            },
            include_index=True
        )
        feature_bag = db.from_sequence(feature_map, npartitions=num_workers)
        feature_metadata_bag = feature_bag.map(FeatureMap.get_feature_metadata)
        feature_metadata_list = dask.compute(
            feature_metadata_bag, scheduler=worker_type, num_workers=num_workers
        )[0]
        feature_metadata_df = pl.DataFrame(
            feature_metadata_list,
            schema_overrides={
                "hull_num": pl.Int8,
                "hull_mz": pl.List(pl.Float32),
                "hull_rt": pl.List(pl.Float32),
                "hull_intensity": pl.List(pl.Float32),
                "isotope_pattern": pl.List(pl.Float32),
            },
            # The adduct keys are optional per feature. Scan every row so a
            # key that first appears late in the list still becomes a column
            # (the default inspects only the first 100 rows).
            infer_schema_length=None,
        )
        # Turn the isotope distances into running sums.
        feature_metadata_df = feature_metadata_df.with_columns(
            pl.col("isotope_pattern").list.eval(pl.element().cum_sum()),
        )
        if "adduct_mass" in feature_metadata_df.columns:
            feature_metadata_df = feature_metadata_df.with_columns(
                pl.col("adduct_mass").cast(pl.Float32),
            )
        feature_info = pl.concat([feature_info, feature_metadata_df], how="horizontal")
        return feature_info

    @staticmethod
    def get_hulls(
        feature_map: oms.FeatureMap,
        feature_xic: list[list[oms.MSChromatogram]],
    ) -> pl.DataFrame:
        """Build the per-hull table from convex hulls and their chromatograms.

        Hulls are keyed ``"<feature unique id>::<hull position>"``. The
        chromatogram for a hull is looked up by its native ID with ``_``
        replaced by ``::``.

        Args:
            feature_map: Features providing the convex hulls.
            feature_xic: Chromatograms grouped per feature; iterated as a
                sequence of sequences.

        Returns:
            Columns ``hull_id``, ``RTstart``, ``RTend``, ``MZstart``,
            ``MZend``, ``rt_points``, ``mz_points``, ``intens_points``.
            An empty feature map yields an empty table with these columns.

        Raises:
            KeyError: If a hull has no chromatogram with the matching ID.
        """
        rt_hulls = {}
        for feature_rt_hulls in feature_xic:
            for rt_hull in feature_rt_hulls:
                rt_hulls[rt_hull.getNativeID().replace("_","::")] = rt_hull
        mz_hulls = {}
        for feature in feature_map:
            for i, mz_hull in enumerate(feature.getConvexHulls()):
                mz_hulls[f"{feature.getUniqueId()}::{i}"] = mz_hull
        hulls = []
        for hull_id, mz_hull in mz_hulls.items():
            rt_points, intens_points = rt_hulls[hull_id].get_peaks()
            # Column 1 of the hull points is used as m/z and cut to the
            # number of chromatogram points so the three lists line up.
            mz_points = mz_hull.getHullPoints()[:,1][:len(rt_points)]
            hulls.append({
                'hull_id': hull_id,
                'rt_points': rt_points.tolist(),
                'mz_points': mz_points.tolist(),
                'intens_points': intens_points.tolist(),
            })
        # A full schema (rather than overrides) keeps the columns defined
        # even when there are no hulls, so the expressions below still work.
        hull_info = pl.DataFrame(
            hulls,
            schema={
                'hull_id': pl.String,
                'rt_points': pl.List(pl.Float32),
                'mz_points': pl.List(pl.Float32),
                'intens_points': pl.List(pl.Float32),
            }
        )
        hull_info = hull_info.with_columns(
            pl.col("rt_points").list.min().alias("RTstart"),
            pl.col("rt_points").list.max().alias("RTend"),
            pl.col("mz_points").list.min().alias("MZstart"),
            pl.col("mz_points").list.max().alias("MZend"),
        )
        hull_info = hull_info.select(
            "hull_id",
            "RTstart",
            "RTend",
            "MZstart",
            "MZend",
            "rt_points",
            "mz_points",
            "intens_points",
        )
        return hull_info

    @classmethod
    def from_oms(
        cls,
        feature_map: oms.FeatureMap,
        feature_xic: list[list[oms.MSChromatogram]],
        exp_name: str,
        worker_type: Literal["threads", "processes", "synchronous"] = "threads",
        num_workers: int | None = None,
    ) -> FeatureMap:
        """Create a ``FeatureMap`` from pyopenms features and chromatograms.

        Feature and hull ids are prefixed with ``"<exp_name>::"``. Each index
        holds one rectangle per row spanning
        ``(RTstart, MZstart, RTend, MZend)``, labelled by the prefixed id and
        carrying the row position in an ``iloc`` column.
        """
        id_prefix = f"{exp_name}::"

        def _bbox_index(info: pl.DataFrame, id_column: str) -> gpd.GeoDataFrame:
            # One rectangle per row, in row order.
            rectangles = [
                box(*bounds)
                for bounds in zip(
                    info["RTstart"],
                    info["MZstart"],
                    info["RTend"],
                    info["MZend"],
                )
            ]
            return gpd.GeoDataFrame(
                {"iloc": range(len(info))},
                index=info[id_column].to_list(),
                geometry=rectangles,
            )

        raw_feature_info = cls.get_feature_info(feature_map, worker_type, num_workers)
        raw_hull_info = cls.get_hulls(feature_map, feature_xic)

        feature_info = raw_feature_info.with_columns(
            (id_prefix + pl.col("feature_id").cast(str)).alias("feature_id"),
        )
        feature_index = _bbox_index(feature_info, "feature_id")

        hull_info = raw_hull_info.with_columns(
            (id_prefix + pl.col("hull_id").cast(str)).alias("hull_id"),
        )
        hull_index = _bbox_index(hull_info, "hull_id")

        return cls(
            exp_name=exp_name,
            feature_info=feature_info,
            hull_info=hull_info,
            feature_index=feature_index,
            hull_index=hull_index,
        )

    def get_oms_feature_map(self) -> oms.FeatureMap:
        """Rebuild a minimal OpenMS feature map from ``self.feature_info``.

        Only the unique id, m/z, RT and intensity of each feature are carried
        over. The unique id is the second ``"::"``-separated field of the
        stored ``feature_id`` string, converted back to an integer.
        """
        numeric_id = (
            pl.col("feature_id")
            .str.split("::")
            .list.get(1)
            .cast(pl.Int128)
            .alias("feature_id")
        )
        records = self.feature_info.select(
            "feature_id", "mz", "RT", "intensity",
        ).with_columns(numeric_id)

        oms_map = oms.FeatureMap()
        # Rows come back in the column order selected above.
        for unique_id, mz, rt, intensity in records.iter_rows():
            oms_feature = oms.Feature()
            oms_feature.setUniqueId(unique_id)
            oms_feature.setMZ(mz)
            oms_feature.setRT(rt)
            oms_feature.setIntensity(intensity)
            oms_map.push_back(oms_feature)
        return oms_map

    def search_feature_by_range(
        self,
        coordinates: tuple[
            float, # min_rt
            float, # min_mz
            float, # max_rt
            float, # max_mz
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        """Find features whose bounding box intersects an RT/m/z window.

        Args:
            coordinates: Query window as ``(min_rt, min_mz, max_rt, max_mz)``.
            return_type: ``"id"`` returns the matching feature ids, ``"df"``
                returns the matching rows of ``self.feature_info``, and any
                other value returns the positional row numbers.

        Returns:
            Feature ids, a slice of ``self.feature_info``, or row positions,
            depending on ``return_type``.
        """
        # The spatial index reports positional row numbers; normalise them to
        # plain ints so they can be reused for both table lookups below.
        iloc = [int(pos) for pos in self.feature_index.sindex.intersection(coordinates)]
        if return_type == "id":
            # Positions must go through ``.iloc``: plain ``frame[list]`` selects
            # columns, which raises KeyError for hits and returns every id for
            # an empty hit list.
            return self.feature_index.iloc[iloc].index.tolist()
        elif return_type == "df":
            return self.feature_info[iloc]
        else:
            return iloc

    def search_hull_by_range(
        self,
        coordinates: tuple[
            float, # min_rt
            float, # min_mz
            float, # max_rt
            float, # max_mz
        ],
        return_type: Literal["id", "indices", "df"] = "id",
    ) -> list[int] | list[str] | pl.DataFrame:
        """Find hulls whose bounding box intersects an RT/m/z window.

        Args:
            coordinates: Query window as ``(min_rt, min_mz, max_rt, max_mz)``.
            return_type: ``"id"`` returns the matching hull ids, ``"df"``
                returns the matching rows of ``self.hull_info``, and any other
                value returns the positional row numbers.

        Returns:
            Hull ids, a slice of ``self.hull_info``, or row positions,
            depending on ``return_type``.
        """
        # The spatial index reports positional row numbers; normalise them to
        # plain ints so they can be reused for both table lookups below.
        iloc = [int(pos) for pos in self.hull_index.sindex.intersection(coordinates)]
        if return_type == "id":
            # Positions must go through ``.iloc``: plain ``frame[list]`` selects
            # columns, which raises KeyError for hits and returns every id for
            # an empty hit list.
            return self.hull_index.iloc[iloc].index.tolist()
        elif return_type == "df":
            return self.hull_info[iloc]
        else:
            return iloc

    def save(self, save_dir_path: str):
        """Save this map to ``save_dir_path`` with list columns flattened to text.

        Every list-valued column of ``feature_info`` and ``hull_info`` is
        rewritten as a ``"[v1,v2,...]"`` string on a shallow copy of this
        object, and the copy is handed to the parent class' ``save``. The
        tables on ``self`` are left untouched.
        """

        def _as_text(column: str) -> pl.Expr:
            # Stringify each element, join with commas and wrap in brackets.
            joined = pl.col(column).cast(pl.List(pl.String)).list.join(",")
            return ("[" + joined + "]").alias(column)

        feature_list_columns = ("hull_mz", "hull_rt", "hull_intensity", "isotope_pattern")
        hull_list_columns = ("rt_points", "mz_points", "intens_points")

        snapshot = copy.copy(self)
        snapshot.feature_info = self.feature_info.with_columns(
            [_as_text(name) for name in feature_list_columns]
        )
        snapshot.hull_info = self.hull_info.with_columns(
            [_as_text(name) for name in hull_list_columns]
        )

        super(FeatureMap, snapshot).save(save_dir_path)

    @classmethod
    def load(cls, save_dir_path: str):
        """Load a map from ``save_dir_path`` and restore its list columns.

        The raw fields come from ``cls._base_load``. For ``feature_info`` and
        ``hull_info``, each ``"[v1,v2,...]"`` text column is parsed back into a
        list of ``Float32``. A table that loads as ``None`` is left out of the
        constructor arguments.
        """

        def _as_floats(column: str) -> pl.Expr:
            # Strip the surrounding brackets, split on commas, cast to floats.
            unwrapped = (
                pl.col(column)
                .str.strip_chars_start("[")
                .str.strip_chars_end("]")
            )
            return unwrapped.str.split(",").cast(pl.List(pl.Float32))

        text_encoded_columns = {
            "feature_info": ("hull_mz", "hull_rt", "hull_intensity", "isotope_pattern"),
            "hull_info": ("rt_points", "mz_points", "intens_points"),
        }

        data_dict = cls._base_load(save_dir_path)

        for table_name, column_names in text_encoded_columns.items():
            table: pl.DataFrame | None = data_dict.pop(table_name)
            if table is None:
                continue
            data_dict[table_name] = table.with_columns(
                [_as_floats(name) for name in column_names]
            )

        return cls(**data_dict)

In [154]:
# Convert the first experiment's OpenMS features and chromatogram peaks into a FeatureMap.
feature_map = FeatureMap.from_oms(datas.features[0],datas.chromatogram_peaks[0],datas.exp_names[0])

In [139]:
# Write the feature map to the cache directory.
feature_map.save("../cache/test_feature_map")

In [140]:
# Read the feature map back from the same cache directory.
reload_feature_map = FeatureMap.load("../cache/test_feature_map")

In [141]:
# Round-trip the reloaded map through pickle; the cells below inspect the result.
reload_feature_map:FeatureMap = pickle.loads(pickle.dumps(reload_feature_map))

In [142]:
reload_feature_map.feature_index

Unnamed: 0,iloc,geometry
QC1.mzML::13005910100464212996,0,"POLYGON ((219.024 445.902, 219.024 446.903, 20..."
QC1.mzML::14115125470287761880,1,"POLYGON ((244.603 252.182, 244.603 253.186, 22..."
QC1.mzML::10693648182463709255,2,"POLYGON ((247.02 344.227, 247.02 345.232, 221...."
QC1.mzML::8958118472990820462,3,"POLYGON ((256.908 300.203, 256.908 301.207, 24..."
QC1.mzML::15745383017486200183,4,"POLYGON ((255.664 297.191, 255.664 298.194, 24..."
...,...,...
QC1.mzML::14833318846184236006,70,"POLYGON ((567.546 579.064, 567.546 580.068, 54..."
QC1.mzML::17235000915360007629,71,"POLYGON ((577.414 671.788, 577.414 672.793, 55..."
QC1.mzML::4346243706513282941,72,"POLYGON ((578.617 598.412, 578.617 599.416, 56..."
QC1.mzML::10497263946829528981,73,"POLYGON ((585.897 705.811, 585.897 706.818, 57..."


In [143]:
reload_feature_map.hull_index

Unnamed: 0,iloc,geometry
QC1.mzML::13005910100464212996::0,0,"POLYGON ((219.024 445.902, 219.024 445.903, 20..."
QC1.mzML::13005910100464212996::1,1,"POLYGON ((212.854 446.902, 212.854 446.903, 20..."
QC1.mzML::14115125470287761880::0,2,"POLYGON ((244.603 252.182, 244.603 252.182, 22..."
QC1.mzML::14115125470287761880::1,3,"POLYGON ((239.704 253.185, 239.704 253.186, 22..."
QC1.mzML::10693648182463709255::0,4,"POLYGON ((247.02 344.227, 247.02 344.229, 221...."
...,...,...
QC1.mzML::4346243706513282941::1,150,"POLYGON ((576.165 599.415, 576.165 599.416, 57..."
QC1.mzML::10497263946829528981::0,151,"POLYGON ((585.897 705.811, 585.897 705.812, 57..."
QC1.mzML::10497263946829528981::1,152,"POLYGON ((585.897 706.811, 585.897 706.818, 57..."
QC1.mzML::14614210394166893117::0,153,"POLYGON ((585.897 209.19, 585.897 209.19, 579...."


In [144]:
reload_feature_map.feature_info.to_pandas()

Unnamed: 0,feature_id,RT,mz,intensity,MZstart,RTstart,MZend,RTend,hull_num,hull_mz,hull_rt,hull_intensity,isotope_pattern
0,QC1.mzML::13005910100464212996,215.297836,445.902374,1.043826e+05,445.902130,200.639130,446.902557,219.024033,2,"[445.90237, 446.9021]","[215.29784, 201.83788]","[104382.57, 26270.71]",[0.99971837]
1,QC1.mzML::14115125470287761880,231.165848,252.181900,1.539104e+06,252.181808,228.764023,253.185608,244.602783,2,"[252.1819, 253.18518]","[231.16585, 231.16585]","[1539104.2, 321779.25]",[1.0032716]
2,QC1.mzML::10693648182463709255,233.607956,344.228027,1.974188e+05,344.226501,221.427612,345.231903,247.019714,2,"[344.22803, 345.2315]","[233.60796, 237.21585]","[197418.84, 47032.887]",[1.003477]
3,QC1.mzML::8958118472990820462,243.397186,300.203064,1.728374e+06,300.202606,240.991180,301.206665,256.907562,2,"[300.20306, 301.206]","[243.39719, 243.39719]","[1728374.2, 125906.33]",[1.0029123]
4,QC1.mzML::15745383017486200183,244.602783,297.190918,2.255495e+05,297.190704,240.991180,298.194336,255.663635,2,"[297.19092, 298.19427]","[244.60278, 243.39719]","[225549.45, 35371.062]",[1.0033714]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,QC1.mzML::14833318846184236006,551.715149,579.064331,7.300085e+04,579.063843,546.777588,580.067810,567.546021,2,"[579.06433, 580.06744]","[551.71515, 556.6538]","[73000.85, 34937.773]",[1.0031059]
71,QC1.mzML::17235000915360007629,565.102600,671.789124,1.122403e+05,671.787720,556.653809,672.792969,577.414490,2,"[671.7891, 672.7919]","[565.1026, 561.4547]","[112240.29, 42289.277]",[1.0027822]
72,QC1.mzML::4346243706513282941,572.526733,598.411926,5.571515e+04,598.411560,565.102600,599.415588,578.617126,2,"[598.4119, 599.4152]","[572.52673, 574.9668]","[55715.152, 25916.025]",[1.0032688]
73,QC1.mzML::10497263946829528981,576.165039,705.811768,5.492179e+04,705.810608,573.770325,706.818054,585.897400,2,"[705.81177, 706.8147]","[576.16504, 577.4145]","[54921.793, 30861.908]",[1.0029392]


In [145]:
reload_feature_map.hull_info.to_pandas()

Unnamed: 0,hull_id,RTstart,RTend,MZstart,MZend,rt_points,mz_points,intens_points
0,QC1.mzML::13005910100464212996::0,200.639130,219.024033,445.902130,445.902527,"[200.63913, 201.83788, 203.04022, 204.28438, 2...","[445.90234, 445.9024, 445.90213, 445.9025, 445...","[5323.933, 5364.65, 7542.6016, 5576.3594, 6017..."
1,QC1.mzML::13005910100464212996::1,200.639130,212.854370,446.901550,446.902557,"[200.63913, 201.83788, 203.04022, 206.77208, 2...","[446.90256, 446.90213, 446.90216, 446.9019, 44...","[2132.6067, 2963.0833, 1872.6068, 2504.21, 221..."
2,QC1.mzML::14115125470287761880::0,228.764023,244.602783,252.181808,252.181946,"[228.76402, 229.91997, 231.16585, 232.36807, 2...","[252.18181, 252.18192, 252.1819, 252.18192, 25...","[6225.4355, 203993.89, 562776.44, 403518.47, 1..."
3,QC1.mzML::14115125470287761880::1,228.764023,239.703873,253.185089,253.185608,"[228.76402, 229.91997, 231.16585, 232.36807, 2...","[253.18523, 253.18521, 253.18515, 253.1852, 25...","[3768.086, 27921.67, 152985.08, 54198.633, 258..."
4,QC1.mzML::10693648182463709255::0,221.427612,247.019714,344.226501,344.228577,"[221.42761, 222.6721, 226.36406, 227.56404, 22...","[344.2265, 344.2269, 344.22858, 344.2279, 344....","[2444.304, 2652.0024, 2465.137, 3281.785, 3131..."
...,...,...,...,...,...,...,...,...
150,QC1.mzML::4346243706513282941::1,570.033875,576.165039,599.414551,599.415588,"[570.0339, 574.9668, 576.16504]","[599.4156, 599.4153, 599.41455]","[3903.8193, 4733.27, 2966.2288]"
151,QC1.mzML::10497263946829528981::0,573.770325,585.897400,705.810608,705.812317,"[573.7703, 574.9668, 576.16504, 577.4145, 578....","[705.812, 705.8116, 705.8122, 705.8114, 705.81...","[6845.8853, 6800.7764, 6981.0215, 7517.435, 54..."
152,QC1.mzML::10497263946829528981::1,573.770325,585.897400,706.811401,706.818054,"[573.7703, 574.9668, 576.16504, 577.4145, 578....","[706.8153, 706.8145, 706.8155, 706.8137, 706.8...","[3681.0918, 1325.0985, 5481.869, 2787.5876, 23..."
153,QC1.mzML::14614210394166893117::0,579.857483,585.897400,209.189941,209.190094,"[579.8575, 581.05347, 582.25183, 583.45605, 58...","[209.19, 209.18997, 209.19, 209.18999, 209.189...","[33363.902, 53031.918, 35343.79, 15898.026, 10..."


In [155]:
# Convert every experiment to a FeatureMap and back to an OpenMS feature map,
# then run the feature linker on the rebuilt maps.
rebuild_datas = OpenMSDataWrapper(features=[
    FeatureMap.from_oms(f,x,en).get_oms_feature_map() \
    for f,x,en in zip(datas.features,datas.chromatogram_peaks,datas.exp_names)
])
rebuild_datas.exp_names = datas.exp_names
rebuild_datas = FeatureLinker()(rebuild_datas)

Progress of 'Linking features':
-- done [took 0.01 s (CPU), 0.01 s (Wall)] -- 


In [156]:
rebuild_datas.consensus_map.get_df()

Unnamed: 0_level_0,sequence,charge,RT,mz,quality,QC1.mzML,QC2.mzML
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,,0,517.366455,225.148521,0.998806,5.911864e+05,6.568321e+05
0,,0,360.167831,207.159065,0.935903,5.035236e+06,2.195314e+06
0,,0,359.011963,207.985222,0.934963,9.938316e+05,1.047103e+06
0,,0,507.315598,293.109055,0.990750,9.651988e+04,1.042219e+05
0,,0,404.847656,695.425873,0.557731,1.026271e+05,4.537626e+06
...,...,...,...,...,...,...,...
0,,0,293.642609,225.131454,0.000000,0.000000e+00,2.846717e+05
0,,0,401.348358,755.347107,0.000000,0.000000e+00,1.737777e+05
0,,0,446.029358,618.372375,0.000000,0.000000e+00,5.882982e+06
0,,0,496.929535,670.389038,0.000000,0.000000e+00,4.167085e+05


In [13]:
def link_ms2_to_feature(feature_hulls: pd.DataFrame,spectrum_map: SpectrumMap) -> list[str]:
    """Collect the MS2 spectrum ids found inside each hull's bounding box.

    For every row of ``feature_hulls`` the window
    ``(MZstart, RTstart, MZend, RTend)`` is passed to
    ``spectrum_map.search_ms2_by_range`` and the results are concatenated in
    row order. Ids are not de-duplicated.
    """
    window_columns = ('MZstart', 'RTstart', 'MZend', 'RTend')
    matched_ids = []
    for window in zip(*[feature_hulls[name] for name in window_columns]):
        matched_ids.extend(spectrum_map.search_ms2_by_range(window))
    return matched_ids

def link_ms2_and_feature_map(
    feature_map: FeatureMap,
    spectrum_map: SpectrumMap,
    key_id: Literal["feature","spectrum"] = "feature",
) -> pd.Series[str | list[str]]:
    """Map features to the MS2 spectra inside their hulls, or the reverse.

    Reads the polars tables of ``FeatureMap``: feature ids and hull counts from
    ``feature_info`` and hull bounding boxes from ``hull_info``. The ids of a
    feature's hulls are ``"<feature_id>::<i>"`` for ``i`` in
    ``range(hull_num)``.

    Args:
        feature_map: Source of the ``feature_info`` and ``hull_info`` tables.
        spectrum_map: Object queried through ``link_ms2_to_feature``.
        key_id: ``"feature"`` returns one list of spectrum ids per feature;
            any other value returns one feature id per spectrum id.

    Returns:
        With ``key_id="feature"``, a Series named ``spectrum_id`` indexed by
        ``feature_id``. Otherwise a Series named ``feature_id`` indexed by
        ``spectrum_id``; a spectrum matched by several features keeps the last
        feature that matched it.
    """
    feature_ids = feature_map.feature_info["feature_id"].to_list()
    hull_counts = feature_map.feature_info["hull_num"].to_list()
    hull_info = feature_map.hull_info

    def _spectra_for_feature(entry: tuple[str, int]) -> list[str]:
        # Look up this feature's hull rows and query the spectrum map for each.
        feature_id, hull_num = entry
        hull_ids = [f"{feature_id}::{i}" for i in range(hull_num)]
        feature_hulls = hull_info.filter(pl.col("hull_id").is_in(hull_ids))
        return link_ms2_to_feature(feature_hulls, spectrum_map)

    spectrum_id_list = dask.compute(
        db.from_sequence(zip(feature_ids, hull_counts)).map(_spectra_for_feature),
        scheduler="threads",
    )[0]

    if key_id == "feature":
        mapping_series = pd.Series(spectrum_id_list, index=feature_ids, dtype=object)
        mapping_series.index.name = "feature_id"
        mapping_series.name = "spectrum_id"
        return mapping_series
    else:
        # Collect into a dict first: growing a Series one label at a time
        # reallocates on every insert. Assignment order matches the loop, so a
        # later feature still overrides an earlier one.
        spectrum_to_feature = {}
        for spectrum_ids, feature_id in zip(spectrum_id_list, feature_ids):
            for spectrum_id in spectrum_ids:
                spectrum_to_feature[spectrum_id] = feature_id
        mapping_series = pd.Series(spectrum_to_feature, dtype=object)
        mapping_series.index.name = "spectrum_id"
        mapping_series.name = "feature_id"
        return mapping_series

In [14]:
link_ms2_and_feature_map(feature_map, spectrum_map, "feature")

feature_id
QC1.mzML::5261480272464681702                                           []
QC1.mzML::109232923244398657      [QC1.mzML::ms2::923, QC1.mzML::ms2::946]
QC1.mzML::17277684442908073595                        [QC1.mzML::ms2::936]
QC1.mzML::17310728823578640817    [QC1.mzML::ms2::973, QC1.mzML::ms2::995]
QC1.mzML::4758972346404056975                         [QC1.mzML::ms2::976]
                                                    ...                   
QC1.mzML::12179905224677114345                                          []
QC1.mzML::12181800865879437076                                          []
QC1.mzML::3655475155587324847                                           []
QC1.mzML::14201693722899224880                                          []
QC1.mzML::5198541186238866932                        [QC1.mzML::ms2::2370]
Name: spectrum_id, Length: 75, dtype: object

In [15]:
link_ms2_and_feature_map(feature_map, spectrum_map, "spectrum")

spectrum_id
QC1.mzML::ms2::923       QC1.mzML::109232923244398657
QC1.mzML::ms2::946       QC1.mzML::109232923244398657
QC1.mzML::ms2::936     QC1.mzML::17277684442908073595
QC1.mzML::ms2::916     QC1.mzML::17310728823578640817
QC1.mzML::ms2::973     QC1.mzML::17310728823578640817
QC1.mzML::ms2::995     QC1.mzML::17310728823578640817
QC1.mzML::ms2::976      QC1.mzML::4758972346404056975
QC1.mzML::ms2::1035    QC1.mzML::11346374657489545787
QC1.mzML::ms2::1054    QC1.mzML::11346374657489545787
QC1.mzML::ms2::1124     QC1.mzML::9410620042631636663
QC1.mzML::ms2::1183    QC1.mzML::14098708295893166351
QC1.mzML::ms2::1306    QC1.mzML::12673403173576306906
QC1.mzML::ms2::1304     QC1.mzML::8864025362939724457
QC1.mzML::ms2::1289    QC1.mzML::10970745381786157009
QC1.mzML::ms2::1349    QC1.mzML::10970745381786157009
QC1.mzML::ms2::1377     QC1.mzML::8746726905846740337
QC1.mzML::ms2::1391    QC1.mzML::15121964842776754730
QC1.mzML::ms2::1392     QC1.mzML::1679423022259619783
QC1.mzML::ms2::1

In [16]:
class XICMap(BaseModel):
    """Table of individual ions with a spatial index over (mz, rt) for range queries."""

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    # Point index over (mz, rt); each entry stores the ion's row label as its object.
    ion_index: rtree.index.Index
    # One row per ion, with columns "mz", "rt" and "i".
    ion_df: pd.DataFrame

    @classmethod
    def from_oms(
        cls,
        exp: oms.MSExperiment,
    ) -> XICMap:
        """Build the ion table and index from an OpenMS experiment.

        Uses the first frame returned by ``exp.get_massql_df()``, keeping only
        the ``mz``, ``rt`` and ``i`` columns.
        """
        # Work on an explicit copy: assigning into a column-subset of the
        # source frame triggers pandas' SettingWithCopyWarning.
        ion_df = exp.get_massql_df()[0][["mz","rt","i"]].copy()
        # Scale rt by 60 (presumably minutes -> seconds; confirm against
        # get_massql_df's units).
        ion_df['rt'] = ion_df['rt'] * 60
        ion_index = rtree.index.Index()
        for i,(ion_id,ion_mz,ion_rt) in enumerate(
            zip(
                ion_df.index,
                ion_df['mz'],
                ion_df['rt'],
            )
        ):
            ion_index.insert(
                id=i,
                coordinates=(ion_mz, ion_rt),
                obj=ion_id,
            )
        return cls(ion_index=ion_index, ion_df=ion_df)

    @classmethod
    def from_ms1(
        cls,
        ms1: pd.DataFrame,
    ) -> XICMap:
        """Build the ion table and index from a per-spectrum MS1 table.

        Each row of ``ms1`` must provide ``rt``, ``mz_array`` and
        ``intensity_array``; every (mz, intensity) pair becomes one ion row
        with the spectrum's ``rt``, which is used as-is.
        """
        ion_df = {
            "mz": [],
            "rt": [],
            "i": [],
        }
        ion_index = rtree.index.Index()
        # Running counter: the ion's position in the final table and its index id.
        ion_id = 0
        for _,row in ms1.iterrows():
            rt = row['rt']
            for mz,intensity in zip(row['mz_array'],row['intensity_array']):
                ion_df['mz'].append(mz)
                ion_df['rt'].append(rt)
                ion_df['i'].append(intensity)
                ion_index.insert(
                    id=ion_id,
                    coordinates=(mz, rt),
                    obj=ion_id,
                )
                ion_id += 1
        ion_df = pd.DataFrame(ion_df)
        return cls(ion_index=ion_index, ion_df=ion_df)

    def search_ion_by_range(
        self,
        coordinates: tuple[
            float, # min_mz
            float, # min_rt
            float, # max_mz
            float, # max_rt
        ]
    ) -> list[int]:
        """Return the stored row labels of ions inside the given mz/rt window.

        ``coordinates`` is ``(min_mz, min_rt, max_mz, max_rt)``. The result can
        be passed to ``self.ion_df.loc[...]``.
        """
        return list(self.ion_index.intersection(coordinates, objects="raw"))

In [17]:
# Build the ion index straight from the first OpenMS experiment.
xic_map = XICMap.from_oms(datas.exps[0])

In [18]:
xic_map.ion_df.loc[xic_map.search_ion_by_range(
    (200,200,201,202)
)]

Unnamed: 0,mz,rt,i
669,200.091904,201.837875,1270.613525
670,200.128159,201.837875,7574.313477
671,200.164642,201.837875,12525.169922
672,200.200958,201.837875,15436.75
673,200.972382,201.837875,16018.74707
0,200.09166,200.63913,4236.865234
1,200.128098,200.63913,2695.840576
2,200.172775,200.63913,6285.765625
3,200.183792,200.63913,11543.618164
4,200.200943,200.63913,15958.017578


In [19]:
# Rebuild the ion index from the SpectrumMap's MS1 table (rebinds `xic_map`).
xic_map = XICMap.from_ms1(spectrum_map.ms1_df)

In [20]:
xic_map.ion_df.loc[xic_map.search_ion_by_range(
    (200,200,201,202)
)]

Unnamed: 0,mz,rt,i
669,200.091904,201.837872,1270.613525
670,200.128159,201.837872,7574.313477
671,200.164642,201.837872,12525.169922
672,200.200958,201.837872,15436.75
673,200.972382,201.837872,16018.74707
0,200.09166,200.639135,4236.865234
1,200.128098,200.639135,2695.840576
2,200.172775,200.639135,6285.765625
3,200.183792,200.639135,11543.618164
4,200.200943,200.639135,15958.017578


In [21]:
class ConsensusMap(BaseModel):
    """Tabular view of an OpenMS consensus map with feature/consensus lookups."""

    model_config = ConfigDict({"arbitrary_types_allowed": True})

    # One row per consensus feature; ``get_df()`` minus its first two columns.
    consensus_df: pd.DataFrame
    # consensus_id -> list of member feature ids ("<exp_name>::<unique id>").
    consensus_feature_mapping: pd.Series
    # member feature id -> consensus_id.
    feature_consensus_mapping: pd.Series

    @classmethod
    def from_oms(cls, consensus_map: oms.ConsensusMap) -> ConsensusMap:
        """Convert an OpenMS consensus map into pandas tables and id mappings.

        Consensus ids are the 0-based positions of the consensus features in
        ``consensus_map``. Member feature ids are built as
        ``"<exp_name>::<unique id>"``; a unique id that is already a string is
        kept unchanged.
        """
        raw_consensus_df = consensus_map.get_df()
        # Resolve experiment names by map index from the column headers. The
        # column order of get_df() is not tied to the map index, so deriving
        # names from it can attach features to the wrong experiment.
        exp_names = {
            map_index: header.filename
            for map_index, header in consensus_map.getColumnHeaders().items()
        }

        def _feature_id(handle) -> str:
            unique_id = handle.getUniqueId()
            if isinstance(unique_id, str):
                return unique_id
            return f"{exp_names[handle.getMapIndex()]}::{unique_id}"

        feature_ids = [
            [_feature_id(handle) for handle in consensus_feature.getFeatureList()]
            for consensus_feature in consensus_map
        ]

        consensus_df = raw_consensus_df.iloc[:,2:].reset_index(drop=True)
        consensus_df.index.name = "consensus_id"
        consensus_feature_mapping = pd.Series(
            feature_ids,
            index=consensus_df.index
        )
        consensus_feature_mapping.name = "feature_ids"
        consensus_feature_mapping.index.name = "consensus_id"
        # Invert the mapping: each member feature points at its consensus id.
        feature_consensus_mapping = {}
        for cid,fids in consensus_feature_mapping.items():
            for fid in fids:
                feature_consensus_mapping[fid] = cid
        feature_consensus_mapping = pd.Series(feature_consensus_mapping)
        feature_consensus_mapping.name = "consensus_id"
        feature_consensus_mapping.index.name = "feature_id"
        return cls(
            consensus_df=consensus_df,
            consensus_feature_mapping=consensus_feature_mapping,
            feature_consensus_mapping=feature_consensus_mapping
        )

    def as_oms_feature_map(self) -> oms.FeatureMap:
        """Represent every consensus feature as a single OpenMS feature.

        The consensus id becomes the unique id, ``mz`` and ``RT`` are copied,
        and the intensity is the maximum over the columns from the fourth
        onwards (the per-experiment intensities in the table built by
        ``from_oms``).
        """
        feature_map = oms.FeatureMap()
        for i,row in self.consensus_df.iterrows():
            feature = oms.Feature()
            feature.setUniqueId(i)
            feature.setMZ(row["mz"])
            feature.setRT(row["RT"])
            feature.setIntensity(row.iloc[3:].max())
            feature_map.push_back(feature)
        return feature_map

In [22]:
# Convert the linked OpenMS consensus map into the pandas-backed ConsensusMap.
consensus_map = ConsensusMap.from_oms(datas.consensus_map)

In [23]:
consensus_map.consensus_df

Unnamed: 0_level_0,RT,mz,quality,QC2.mzML,QC1.mzML
consensus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,404.847657,695.425855,0.557743,4.537626e+06,1.026271e+05
1,576.286209,209.189986,0.952328,1.552605e+05,1.354698e+05
2,360.167820,207.159064,0.935903,2.195314e+06,5.035236e+06
3,349.334946,533.324174,0.840550,7.844999e+04,1.233872e+05
4,507.315594,293.109054,0.990750,1.042219e+05,9.651988e+04
...,...,...,...,...,...
126,446.029355,725.372814,0.000000,2.473700e+05,0.000000e+00
127,446.029355,709.356876,0.000000,6.133417e+05,0.000000e+00
128,446.029355,741.365603,0.000000,2.869066e+05,0.000000e+00
129,446.029355,618.372397,0.000000,5.882982e+06,0.000000e+00


In [24]:
consensus_map.consensus_feature_mapping

consensus_id
0      [QC1.mzML::1058058667861826189, QC2.mzML::5829...
1      [QC1.mzML::5198541186238866932, QC2.mzML::9748...
2      [QC1.mzML::6356764604209267192, QC2.mzML::1215...
3      [QC1.mzML::8102842054760010345, QC2.mzML::1834...
4      [QC1.mzML::10946711822686169604, QC2.mzML::526...
                             ...                        
126                     [QC2.mzML::15378851355750419626]
127                     [QC2.mzML::17722064519600937218]
128                     [QC2.mzML::18049518165761848085]
129                     [QC2.mzML::18163286508655570769]
130                     [QC2.mzML::18444431267099520080]
Name: feature_ids, Length: 131, dtype: object

In [25]:
consensus_map.feature_consensus_mapping

feature_id
QC1.mzML::1058058667861826189       0
QC2.mzML::5829911669225560868       0
QC1.mzML::5198541186238866932       1
QC2.mzML::974875078852663642        1
QC1.mzML::6356764604209267192       2
                                 ... 
QC2.mzML::15378851355750419626    126
QC2.mzML::17722064519600937218    127
QC2.mzML::18049518165761848085    128
QC2.mzML::18163286508655570769    129
QC2.mzML::18444431267099520080    130
Name: consensus_id, Length: 139, dtype: int64

In [26]:
# ConsensusMap exposes this conversion as `as_oms_feature_map`.
consensus_map.as_oms_feature_map().get_df()

Unnamed: 0_level_0,peptide_sequence,peptide_score,ID_filename,ID_native_id,charge,RT,mz,RTstart,RTend,MZstart,MZend,quality,intensity
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,,,,,0,404.847657,695.425855,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,4.537626e+06
1,,,,,0,576.286209,209.189986,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.552605e+05
2,,,,,0,360.167820,207.159064,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,5.035236e+06
3,,,,,0,349.334946,533.324174,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.233872e+05
4,,,,,0,507.315594,293.109054,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,1.042219e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,,,,,0,446.029355,725.372814,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,2.473700e+05
127,,,,,0,446.029355,709.356876,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,6.133417e+05
128,,,,,0,446.029355,741.365603,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,2.869066e+05
129,,,,,0,446.029355,618.372397,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.0,5.882982e+06
