In [1]:
import pandas as pd
import numpy as np

from ionmob.data.chemistry import reduced_mobility_to_ccs
from ionmob.preprocess.helpers import preprocess_peaks_sequence
from ionmob.preprocess.helpers import get_ccs_shift, apply_shift_per_charge

In [2]:
# Load the PEAKS PSM export; only the columns selected below are used further on.
peaks_data = pd.read_csv('resources/2022-152_Phospho_HeLa_F220819_UD_DG_PEAKS_14/DB search psm.csv')

# Keep the needed columns under the shorter names used in the rest of the notebook.
peaks_data = peaks_data[['m/z', 'Z', 'Peptide', '1/k0 Range', 'RT', 'Area']].rename(
    columns={'m/z': 'mz', 'Z': 'charge', 'RT': 'rt', 'Area': 'intensity'}
)

# Restrict to charge states below 5. The explicit copy makes the column
# assignments below act on an independent frame instead of a filtered
# selection, which avoids pandas' SettingWithCopyWarning.
peaks_data = peaks_data[peaks_data.charge < 5].copy()

# '1/k0 Range' is parsed as a "<low>-<high>" string; '1/k0' is the midpoint of
# the two bounds. Done column-wise instead of with a per-row apply. A missing
# range entry propagates as NaN here rather than raising.
k0_bounds = peaks_data['1/k0 Range'].str.split('-')
peaks_data['1/k0'] = (k0_bounds.str[0].astype(float) + k0_bounds.str[1].astype(float)) / 2

peaks_data['ccs'] = reduced_mobility_to_ccs(peaks_data['1/k0'], peaks_data['mz'], peaks_data['charge'])

# The tokenizer only needs the peptide string, so apply it to that column directly.
peaks_data['sequence-tokenized'] = peaks_data['Peptide'].apply(preprocess_peaks_sequence)

# Experiment label: stored per row and reused later when building the Experiment.
ex_name = "descriptive-name"
peaks_data['name'] = ex_name

In [3]:
# Reference data set; passed to get_ccs_shift together with peaks_data.
data_meier = pd.read_parquet(path='../data/Meier.parquet')

In [4]:
# Shift returned by get_ccs_shift for this data set against the Meier reference;
# it is added to every measured CCS value and stored next to the original column.
mean = get_ccs_shift(peaks_data, data_meier)
ccs_measured = peaks_data['ccs']
peaks_data['ccs_shifted'] = ccs_measured + mean

In [5]:
# Drop rows that repeat the same (Peptide, charge, ccs) combination, then add a
# constant offset to every intensity.
# NOTE(review): the reason for the 1.0000001 offset is not visible in this
# notebook — TODO document it.
# NOTE(review): this cell reads and overwrites peaks_data, so running it a
# second time adds the offset again.
peaks_data = (
    peaks_data
    .drop_duplicates(subset=['Peptide', 'charge', 'ccs'])
    .assign(intensity=lambda frame: frame['intensity'] + 1.0000001)
)

In [6]:
from ionmob.alignment import experiment as exp
from ionmob.alignment import alignment as alig

shifted_all = peaks_data

# Per-row columns handed to the Experiment, pulled out as plain arrays.
seq = shifted_all["Peptide"].values
charge = shifted_all["charge"].values
ccs = shifted_all["ccs_shifted"].values
mz = shifted_all["mz"].values
raw_file = shifted_all["name"].values
# NOTE(review): every intensity is set to 1 here; the frame's 'intensity'
# column is not passed on — confirm this is intended.
intensity = np.ones_like(mz)

# NOTE(review): rt is supplied for three consecutive positional parameters and
# mz is supplied a second time as the last one — confirm against the
# Experiment signature.
ex1 = exp.Experiment(
    ex_name,
    seq,
    charge,
    ccs,
    intensity,
    mz,
    raw_file,
    np.arange(len(shifted_all)),  # running row number 0..n-1
    shifted_all["rt"].values,
    shifted_all["rt"].values,
    shifted_all["rt"].values,
    shifted_all["mz"].values,
)

ex2 = ex1.assign_modalities()

In [7]:
# Keep only the rows whose modality label is 'main' or 'unimodal'.
grouped = ex2.data[ex2.data.modality.isin(['main', 'unimodal'])]

In [8]:
data_shifted = peaks_data

# Columns of the modality-filtered rows that are needed for the join.
merge_data = grouped[['sequence', 'charge', 'ccs']]

# Join on peptide string and charge. Both frames carry a 'ccs' column, so the
# merge suffixes them: 'ccs_x' comes from data_shifted, 'ccs_y' from merge_data.
both = pd.merge(left=data_shifted, right=merge_data, left_on=['Peptide', 'charge'],
                right_on=['sequence', 'charge'])

# Keep the first row per (Peptide, charge) and use the CCS value of the
# right-hand frame as the final 'ccs'.
singletons = (
    both[['mz', 'charge', 'Peptide', 'sequence-tokenized', 'rt', 'name', 'ccs_y']]
    .drop_duplicates(['Peptide', 'charge'])
    .rename(columns={'ccs_y': 'ccs'})
)

# The former intermediate 'sequence' column (a string rendering of the token
# list) was created and dropped again immediately, so it is no longer computed;
# s holds exactly the same columns and values as before.
s = singletons[['mz', 'charge', 'sequence-tokenized', 'rt', 'name', 'ccs']]

# Final column order of the processed table.
processed_data = s[['mz', 'charge', 'sequence-tokenized', 'rt', 'ccs', 'name']]

In [9]:
# Show the processed table as the cell output.
# NOTE(review): the saved output below lists a different 'name' value than the
# one assigned in the loading cell, so it appears to come from an earlier run —
# re-run the notebook to refresh it.
processed_data

Unnamed: 0,mz,charge,sequence-tokenized,rt,ccs,name
0,1509.5143,3,"[<START>, A, A, A, A, A, P, A, S, E, D, E, D, ...",18.53,676.733691,Tenzer-phospho-experiment-name
16,1005.7466,3,"[<START>, E, A, L, S, N, L, T, A, L, T, S, D, ...",27.08,594.788838,Tenzer-phospho-experiment-name
33,1508.1051,2,"[<START>, E, A, L, S, N, L, T, A, L, T, S, D, ...",27.07,526.198039,Tenzer-phospho-experiment-name
41,847.6588,3,"[<START>, D, S-<PH>, H, S-<PH>, S, E, E, D, E,...",18.69,563.044805,Tenzer-phospho-experiment-name
66,1270.9846,2,"[<START>, D, S-<PH>, H, S-<PH>, S, E, E, D, E,...",18.44,496.576500,Tenzer-phospho-experiment-name
...,...,...,...,...,...,...
238294,535.7287,2,"[<START>, T, I, T, S-<PH>, S, Y, Y, R, <END>]",15.68,346.360873,Tenzer-phospho-experiment-name
238295,583.2572,2,"[<START>, G, G, S-<PH>, G, G, T, R, G, P, P, S...",2.96,345.167007,Tenzer-phospho-experiment-name
238296,549.2449,3,"[<START>, D, Q, Q, L, E, P, K, K, S, T-<PH>, S...",7.53,478.407200,Tenzer-phospho-experiment-name
238297,697.8365,2,"[<START>, S, K, S, M, D, L, G, I, A, D, E, T, ...",13.32,409.114630,Tenzer-phospho-experiment-name
