In [1]:
import pandas as pd
from scipy.signal import find_peaks  # 判断峰

# 赋值列名
df = pd.read_csv("61minSIMddMS2_Pos_20210913_Run14.csv", header=None, names=['ms', 'scan', 'rt', 'mz', 'intensity', 'rel_intensity'])

# 包含ms的行
ms1_data = df[df['ms'].str.contains(' ms ')]
ms1_data.to_csv("ms1_data.csv", index=False)

# 包含ms2的行
ms2_data = df[df['ms'].str.contains(' ms2 ')]
ms2_data.to_csv("ms2_data.csv", index=False)


In [36]:
# 读取ms2_data
df = pd.read_csv("ms2_data.csv")

# 按照label列名删除列
df = df.drop(['rel_intensity'], axis=1)

# 根据ms列 提取ms1母离子
def get_ms1_mz(mz_str):
    # mz_str = "FTMS + p ESI d Full ms2 331.1268@hcd30.00 [50.0000-357.9534]"
    mz_str = mz_str.split('@')[0].split(' ')[-1]
    ms1_mz = int(float(mz_str))
    return ms1_mz

# 根据ms列 提取ms1扫描范围
def get_ms1_scan_range(mz_str):
    # mz_str = "FTMS + p ESI d Full ms2 331.1268@hcd30.00 [50.0000-357.9534]"
    mz_str = mz_str.split(' ')[-1].replace("[","").replace("]","").split("-")
    scan_end = float(mz_str[1])
    return scan_end

# 插入ms1_mz列到第一列
col_name = df.columns.tolist()
col_name.insert(0, 'ms1_mz')
df = df.reindex(columns = col_name)

# 将提取的mz添加一列 
df['ms1_mz'] = df["ms"].apply(lambda x: get_ms1_mz(x)) 

# 将提取的scan_end添加一列 
df['scan_end'] = df["ms"].apply(lambda x: get_ms1_scan_range(x)) 

# 筛选ms2数据: 若scan_end - ms1_mz > 100 则删除对应的行 
df['minus'] = df["scan_end"] - df["ms1_mz"]
df = df[df["minus"] < 100] 
df = df.drop(['ms','scan_end','minus'], axis=1)

df.to_csv("deal_ms2_data.csv", index=False)


### 判断MS1数据

#### x

MS1数据的其中一个母离子的mz 找一个20ppm的窗口, 找窗口内的几个高峰,看其是否出现了母离子mz的峰, 如果有则看旁边是否有其它干扰峰 若没有找到ms1_mz对应的峰,则终止
x = 331.128
y1 = 0.99998 * x
y2 = 1.00002 * x



In [220]:
# 从533母离子中找到所有scan的最高峰
ms1_df = pd.read_csv("ms1_data.csv")

# 根据ms列 提取ms1_scan_mz
def get_ms1_scan_mz(mz_str):
    # mz_str = "FTMS + p ESI SIM ms [330.9280-331.3280]"
    mz_str = mz_str.split(' ')[-1].replace("[","").replace("]","").split("-")
    return (float(mz_str[0])+0.2)

# 插入ms1_scan_mz列到第一列
col_name = ms1_df.columns.tolist()
col_name.insert(0, 'ms1_scan_mz')
ms1_df = ms1_df.reindex(columns = col_name)

# 将提取的mz添加一列 
ms1_df['ms1_scan_mz'] = ms1_df["ms"].apply(lambda x: get_ms1_scan_mz(x))

ms1_df = ms1_df.drop(['ms','rel_intensity'], axis=1)

ms1_df.to_csv("deal_ms1_data.csv", index=False)

### 找到每个一级scan的峰


In [55]:
# 根据峰高找到峰的索引
def find_peaks_index(time_point_height, height_value):
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html#scipy.signal.find_peaks
    get_peaks_index, _ = find_peaks(time_point_height, prominence=1, height=height_value)  # 找出满足条件的峰的索引
    return get_peaks_index

In [241]:
ms1_df = pd.read_csv("deal_ms1_data.csv")

# ms1_mz = 620.26
# ms1_mz = 303.231
# ms1_mz = 331.128
# ms1_mz = 355.226
# ms1_mz = 333.144
# ms1_mz = 798.564
ms1_mz_list = list(set(list(round(ms1_df['ms1_scan_mz'], 3))))
for ms1_mz in ms1_mz_list:
# if 1:
#     ms1_mz = 520.507
#     ms1_mz = 303.231
    print("ms1_mz----------:", ms1_mz)
    ms1_df = ms1_df[ms1_df["ms1_scan_mz"] == ms1_mz]
    print(ms1_df)
    # 定义满足什么样的高度才是峰
    peak_index = list(find_peaks_index(ms1_df["intensity"], ms1_df['intensity'].max() / 20))
    ms1_peak_df = ms1_df.iloc[peak_index, ]

    # 添加两者之差的绝对值为一列数据, 并保留两位小数
    df_merge = ms1_peak_df.copy()
    df_merge['minus_abs'] = round(abs(df_merge['ms1_scan_mz'] - df_merge['mz']), 2)
    # print(df_merge)

    # 判断是否存在约为ms1_mz的峰 (在其左右附近0.02)
    df_ms1_mz = df_merge[df_merge['minus_abs'] <= 0.02]
#     print(df_ms1_mz)

    df_merge['max_intensity'] = df_merge.groupby('scan')['intensity'].transform('max')
    # print(df_merge)

    # 如果存在 继续
    if(df_ms1_mz.shape[0]):
        # 找这些scans
        df_scans = df_merge[df_merge["scan"].isin(df_ms1_mz["scan"])]
#         print(df_scans)

        # 取同一个scan的最大峰
        df_scans = df_scans[(df_scans["intensity"] == df_scans["max_intensity"]) & (df_scans["minus_abs"] <= 0.02)]
#         print(df_scans)

        # 如果df_scans不为空
        if not df_scans.empty:
            # 提取intensity列最大值所在行的rt
            max_rt = df_scans[df_scans['intensity'] == df_scans['intensity'].max()]["rt"].values[0]
            print("一级数据对应的最好的rt: ", max_rt)

            # ------------------二级数据处理-----------------------
            ms2_df = pd.read_csv("deal_ms2_data.csv")

            # 从533母离子中找到所有scan
            all_scan_df = ms2_df[ms2_df["ms1_mz"] == int(ms1_mz)]

            # 找出所有在二级图中存在母离子的scan
            have_ms1_mz_df = all_scan_df[(all_scan_df["mz"]>=(ms1_mz - 0.05)) & (all_scan_df["mz"]<=(ms1_mz + 0.05))]
            select_scan_list = list(have_ms1_mz_df["scan"])
            filter_scans = sorted(list(set(select_scan_list)))
            # filter_scans

            # 这些filter_scans对应的rt 与一级出现的rt之差
            tmp_df = have_ms1_mz_df.copy()
            tmp_df["max_rt"] = max_rt
            tmp_df['minus_abs'] = round(abs(tmp_df['max_rt'] - tmp_df['rt']), 2)
            tmp_df = tmp_df[tmp_df['minus_abs'] <= 0.5]
            tmp_df = tmp_df.sort_values(by='minus_abs')
            # tmp_df.to_csv("tttttt.csv", index=False)
            # print(tmp_df)

            # 按照某一列进行去重复值
            final_scans = list(tmp_df.drop_duplicates(['scan'])['scan'])
            # 如果找到的个数大于6 则只保留前6个
            if(len(final_scans) >= 6):
                final_scans = final_scans[:6]
            print(ms1_mz, final_scans)
        else:
            print("%s 的df_scans为空 没有二级" % ms1_mz)
    else:
        print("%s 没有二级" % ms1_mz)

ms1_mz----------: 646.407
        ms1_scan_mz  scan        rt         mz  intensity
70891       646.407  3032  25.92405  639.77441          0
70892       646.407  3032  25.92405  639.77683          0
70893       646.407  3032  25.92405  639.77926          0
70894       646.407  3032  25.92405  639.78169          0
70895       646.407  3032  25.92405  646.17352          0
...             ...   ...       ...        ...        ...
112801      646.407  4144  29.88830  646.60654          0
112802      646.407  4144  29.88830  653.11118          0
112803      646.407  4144  29.88830  653.11368          0
112804      646.407  4144  29.88830  653.11619          0
112805      646.407  4144  29.88830  653.11869          0

[18859 rows x 5 columns]
646.407 的df_scans为空 没有二级
ms1_mz----------: 520.507
Empty DataFrame
Columns: [ms1_scan_mz, scan, rt, mz, intensity]
Index: []
520.507 没有二级
ms1_mz----------: 651.459
Empty DataFrame
Columns: [ms1_scan_mz, scan, rt, mz, intensity]
Index: []
651.459 没有二级
m

In [242]:
ms1_mz_list

[646.407,
 520.507,
 651.459,
 798.564,
 674.437,
 686.438,
 303.231,
 321.255,
 331.128,
 333.144,
 334.128,
 720.274,
 726.158,
 355.226,
 357.241,
 620.26,
 624.52,
 370.237,
 633.255]