# Pre-process DecryptM Dataset

**Publication**: Jana Zecha et al., "Decrypting drug actions and protein modifications by dose- and time-resolved proteomics," *Science* (2023).

In [5]:
import pandas as pd
import os

## Rituximab Time-Dependent

In [6]:
# Relative location of the tab-separated rituximab time-dependent table.
data_file_path = os.path.join("..", "data", "rituximab_td.txt")

# Parse the tab-delimited text file into a DataFrame.
rituximab_td = pd.read_csv(
    data_file_path,
    delimiter="\t",
)

In [7]:
# Filter data
# Arbitrary cutoffs (variable names kept as-is; note the direction each is applied in)
max_score_cutoff = 60  # Rows are kept when 'Max Score' is >= this value
min_pep_cutoff = 0.05  # Rows are kept when 'Min PEP' is <= this value

rows_before = rituximab_td.shape[0]
print(f"Number of rows before filtering: {rows_before}")

# A row is kept only if it passes both cutoffs, has a 'Phospho (STY)' value of
# at least 1, and is flagged True in the 'Phosphoproteome' column.
keep_row = (
    (rituximab_td['Max Score'] >= max_score_cutoff) &
    (rituximab_td['Min PEP'] <= min_pep_cutoff) &
    (rituximab_td['Phospho (STY)'] >= 1) &
    (rituximab_td['Phosphoproteome'] == True)
)

# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
rituximab_td = rituximab_td[keep_row].copy()

rows_after = rituximab_td.shape[0]
print(f"Number of rows after filtering: {rows_after}")

# Break down experiment column: split on '_' and take the fields at
# positions 1, 2 and 3 as cell line, drug and dose respectively.
split_experiment_col = rituximab_td['Experiment'].str.split('_', expand=True)
rituximab_td['Cell Line'] = split_experiment_col[1]
rituximab_td['Drug'] = split_experiment_col[2]
# Literal (non-regex) removal of the 'ng' text; 'Dose' stays a string column.
rituximab_td['Dose'] = split_experiment_col[3].str.replace('ng', '', regex=False)

Number of rows before filtering: 149787
Number of rows before filtering: 130909


In [8]:
# Identifier columns that are carried through unchanged.
id_columns = [
    'Cell Line',
    'Drug',
    'Dose',
    'Modified sequence',
    'Gene names',
]

# TMT channel ratio columns mapped to their short output names.
# NOTE(review): the suffixes (1, 2, 5, 10, 60, ...) are not the channel
# numbers — presumably time points; confirm the mapping and its units.
ratio_column_names = {
    'TMT Channel Ratio 1': 'Ratio_1',
    'TMT Channel Ratio 2': 'Ratio_2',
    'TMT Channel Ratio 3': 'Ratio_5',
    'TMT Channel Ratio 4': 'Ratio_10',
    'TMT Channel Ratio 5': 'Ratio_60',
    'TMT Channel Ratio 6': 'Ratio_120',
    'TMT Channel Ratio 7': 'Ratio_360',
    'TMT Channel Ratio 8': 'Ratio_1440',
}

# Keep the identifier columns followed by the eight ratio columns,
# then give the ratio columns their short names.
selected_columns = id_columns + list(ratio_column_names)
rituximab_td = rituximab_td[selected_columns].rename(columns=ratio_column_names)

rituximab_td.head(10)

Unnamed: 0,Cell Line,Drug,Dose,Modified sequence,Gene names,Ratio_1,Ratio_2,Ratio_5,Ratio_10,Ratio_60,Ratio_120,Ratio_360,Ratio_1440
0,Ramos,Rituximab,0,(ac)AAAAAAAAAAGAAGGRGS(ph)GPGR,PABPN1,1.0,1.482082,1.304435,1.200087,1.711429,1.036134,0.939321,2.293267
1,SU-DHL-4,Rituximab,30000,(ac)AAAAAAAAAAGAAGGRGS(ph)GPGR,PABPN1,1.0,0.936966,0.895313,0.806706,1.030389,1.129011,0.949684,1.032427
2,ARH-77,Rituximab,0,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,0.894783,0.882104,0.860725,0.996797,0.947909,0.774303,1.274237
3,ARH-77,Rituximab,1000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,0.925674,0.996374,1.035341,0.854041,0.649231,0.801136,1.313595
4,ARH-77,Rituximab,30000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,,,,,,,,
5,Ramos,Rituximab,1000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,0.916323,1.344232,1.12465,1.409876,1.464369,1.204183,0.909416
6,Ramos,Rituximab,30000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,1.126545,0.926199,0.988858,1.031604,0.717043,0.922084,1.318389
7,SU-DHL-4,Rituximab,0,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,0.840954,0.925568,1.176106,0.850982,0.765193,0.984658,0.934958
8,SU-DHL-4,Rituximab,1000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,1.0,0.651056,1.080928,0.76111,0.580987,0.492847,0.519884,0.780907
9,SU-DHL-4,Rituximab,30000,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVRK,EIF3J,1.0,1.003644,1.032999,1.202614,0.978781,1.083113,0.823669,0.93989
