In [22]:
import numpy as np
import pandas as pd
from alpharaw import register_all_readers
from alpharaw.ms_data_base import ms_reader_provider

# Registration has to happen before the provider lookup below.
# NOTE(review): presumably this makes alpharaw's reader classes known to
# `ms_reader_provider` — inferred from the name, confirm against alpharaw docs.
register_all_readers()
# Fetch the reader that the provider exposes under the "mzml" key.
mzml_reader = ms_reader_provider.get_reader("mzml")
# The path is relative, so it resolves against the kernel's working directory.
mzml_reader.load_raw("../data/raw/SYS026_RA957/20200317_QE_HFX2_LC3_DIA_RA957_R01.mzML")

In [23]:
# Bind the reader's spectrum table to a short name. No copy is made here, so
# (assuming `spectrum_df` is a plain attribute — TODO confirm) later in-place
# edits of `spec_df` also change the table held by `mzml_reader`.
spec_df = mzml_reader.spectrum_df
# Bare trailing expression: renders the table as the cell output.
spec_df

Unnamed: 0,spec_idx,peak_start_idx,peak_stop_idx,rt,precursor_mz,precursor_charge,isolation_lower_mz,isolation_upper_mz,ms_level
0,0,0,3283,0.002374,-1.0,0,-1.0,-1.0,1
1,1,3283,4654,0.006591,368.5,0,350.0,387.0,2
2,2,4654,5532,0.007747,401.0,0,386.0,416.0,2
3,3,5532,6317,0.008868,427.0,0,415.0,439.0,2
4,4,6317,7198,0.009991,450.0,0,438.0,462.0,2
...,...,...,...,...,...,...,...,...,...
89076,89076,137080673,137082379,119.995220,919.0,0,883.0,955.0,2
89077,89077,137082379,137084652,119.996410,1005.5,0,954.0,1057.0,2
89078,89078,137084652,137086933,119.998780,1353.0,0,1056.0,1650.0,2
89079,89079,137086933,137092701,120.000600,-1.0,0,-1.0,-1.0,1


In [24]:
def get_cycle_time(spectrum_df):
    """Return the retention-time gaps between consecutive MS1 rows.

    Parameters
    ----------
    spectrum_df : pandas.DataFrame
        Spectrum table with an ``ms_level`` column and an ``rt`` column.

    Returns
    -------
    numpy.ndarray
        First differences of ``rt`` over the rows whose ``ms_level`` equals 1,
        taken in table order and expressed in the units of ``rt``. An empty
        array is returned when fewer than two such rows exist.
    """
    is_ms1_row = spectrum_df["ms_level"] == 1
    survey_rts = spectrum_df["rt"][is_ms1_row].values

    # Two MS1 scans are the minimum needed to measure one cycle.
    if len(survey_rts) >= 2:
        return np.diff(survey_rts)
    return np.array([])


def get_scan_times(spectrum_df):
    """Return the retention-time gaps between consecutive rows of the table.

    Parameters
    ----------
    spectrum_df : pandas.DataFrame
        Spectrum table with an ``rt`` column.

    Returns
    -------
    numpy.ndarray
        First differences of ``rt`` across all rows, in table order and in the
        units of ``rt``; one element shorter than the table.
    """
    return np.diff(spectrum_df["rt"].values)


# Both helpers only read `spec_df`, so the two series can be derived in any order.
scan_times = get_scan_times(spec_df)    # gap between every pair of adjacent spectra
cycle_times = get_cycle_time(spec_df)   # gap between adjacent MS1 spectra only

In [25]:
# Summary of the MS1-to-MS1 rt gaps as (mean, std, min, max).
np.mean(cycle_times), np.std(cycle_times), np.min(cycle_times), np.max(cycle_times)

(np.float64(0.030983275613916863),
 np.float64(0.0002627860506663159),
 np.float64(0.030585000000002083),
 np.float64(0.03148969999999984))

In [26]:
# Summary of the spectrum-to-spectrum rt gaps as (mean, std, min, max).
np.mean(scan_times), np.std(scan_times), np.min(scan_times), np.max(scan_times)

(np.float64(0.0013471311905332284),
 np.float64(0.0006642162628914645),
 np.float64(0.0011060000000071568),
 np.float64(0.004239999999995803))

In [27]:
# Read the first-pass result table and keep only the rows of the run whose
# name equals the stem of the mzML file loaded above.
df_diann = pd.read_parquet(
    "../data/raw/SYS026_RA957/DDA_SYSMHC_bynam/lib-base-result-first-pass.parquet"
).loc[lambda report: report["Run"] == "20200317_QE_HFX2_LC3_DIA_RA957_R01"]

In [28]:
# Compare the floating-point precision of the two retention-time columns.
spec_df["rt"].dtype, df_diann["RT"].dtype

(dtype('float64'), dtype('float32'))

In [29]:
# Downcast the spectrum rt column to single precision so that it has the same
# dtype as df_diann["RT"]. The column is replaced in place on `spec_df`.
spec_df["rt"] = spec_df["rt"].astype("float32")

# Spot check: look up the spectrum whose rt is exactly equal to the first
# reported RT.
target_rt = df_diann["RT"].iloc[0]
spec_df.loc[spec_df["rt"] == target_rt]

Unnamed: 0,spec_idx,peak_start_idx,peak_stop_idx,rt,precursor_mz,precursor_charge,isolation_lower_mz,isolation_upper_mz,ms_level
67077,67077,99193934,99195777,90.442459,557.5,0,547.0,568.0,2


In [30]:
# For every RT in df_diann, check whether spec_df holds a spectrum with exactly
# the same rt. For each RT without an exact match, record the closest spectrum
# rt and its absolute distance (kept as Python lists, in df_diann row order).
#
# The lookup is vectorised: the spectrum rts are sorted once and each query RT
# is located by binary search, instead of comparing the whole table (and
# argsorting it for every miss) once per row. The nearest value is always one
# of the two sorted neighbours around the insertion point.

spec_rts_sorted = np.sort(spec_df["rt"].to_numpy())
query_rts = df_diann["RT"].to_numpy()
last_pos = len(spec_rts_sorted) - 1

# insert_pos is the index of the first spectrum rt that is >= the query RT.
insert_pos = np.searchsorted(spec_rts_sorted, query_rts, side="left")
# Clip so that queries before the first / after the last spectrum still get a
# valid neighbour on both sides (the same edge element is used twice).
left_rts = spec_rts_sorted[np.clip(insert_pos - 1, 0, last_pos)]
right_rts = spec_rts_sorted[np.clip(insert_pos, 0, last_pos)]

# On an exact tie the lower neighbour is chosen; the distance is unaffected.
nearest_rts = np.where(
    np.abs(query_rts - left_rts) <= np.abs(right_rts - query_rts),
    left_rts,
    right_rts,
)
is_exact_match = nearest_rts == query_rts

matched_count = int(is_exact_match.sum())
closest_rts = list(nearest_rts[~is_exact_match])
rt_differences = list(np.abs(nearest_rts - query_rts)[~is_exact_match])

# (number matched, number of reported RTs, matched fraction)
matched_count, len(df_diann), matched_count / len(df_diann)

(159314, 169226, 0.941427440227861)

In [31]:
# Summary of the nearest-spectrum distances for the RTs that had no exact
# match, as (min, max, mean, std). The builtin min()/max() raise ValueError if
# `rt_differences` is empty, i.e. when every RT matched exactly.
min(rt_differences), max(rt_differences), np.mean(rt_differences), np.std(rt_differences)

(np.float32(5.9604645e-08),
 np.float32(7.6293945e-06),
 np.float32(4.9382015e-06),
 np.float32(2.3214732e-06))