# Pre-process DecryptM Dataset

**Publication**: Jana Zecha et al. Decrypting drug actions and protein modifications by dose- and time-resolved proteomics.

In [4]:
import pandas as pd
import os
import toml

## Dose-Dependent Drugs

In [6]:
def search_files(directory):
    """
    Collects the names of all .txt and .toml files below a directory.
    The tree rooted at `directory` is walked recursively with os.walk. Only the
    bare file names are recorded; the folder a file was found in is not kept.
    Args:
        directory (str): Root folder of the search.
    Returns:
        tuple: A pair of lists, in the order os.walk yields the files:
            - names of the files ending in '.txt'
            - names of the files ending in '.toml'
    """
    # One bucket per recognised suffix; a file name lands in at most one bucket.
    found = {'.txt': [], '.toml': []}

    for _root, _dirs, names in os.walk(directory):
        for name in names:
            for suffix, bucket in found.items():
                if name.endswith(suffix):
                    bucket.append(name)
                    break

    return found['.txt'], found['.toml']


def _time_point_to_minutes(label):
    """Turns a label such as '2h' or '30min' into minutes; other labels are returned unchanged."""
    if label.endswith('h'):
        return int(label[:-1]) * 60
    if label.endswith('min'):
        return int(label[:-3])
    return label


def _split_experiment_name(name):
    """Splits one 'Experiment_CellLine_Drug[_...]_Time_Replicate' name into its five fields."""
    if not isinstance(name, str):
        # Missing value: nothing can be parsed.
        return ('NA',) * 5

    parts = name.split('_')
    if len(parts) < 4:
        # Too few parts to carry drug, time point and replicate: keep what is
        # unambiguous and mark the rest as 'NA'.
        cell_line = parts[1] if len(parts) > 1 else 'NA'
        return parts[0], cell_line, 'NA', 'NA', 'NA'

    # The drug name may itself contain underscores (e.g. 'BTZ_CFZ'), so the
    # fixed fields are taken from both ends and the middle is re-joined.
    drug = '_'.join(parts[2:-2])
    return parts[0], parts[1], drug, _time_point_to_minutes(parts[-2]), parts[-1]


def process_experiment_column(df):
    """
    Processes the 'Experiment' column in the given DataFrame by splitting it into multiple new columns.
    Every name is parsed on its own: the first two underscore-separated parts are
    the experiment and the cell line, the last two are the time point and the
    replicate, and everything in between is the drug. Names with a different
    number of parts can therefore be mixed in one DataFrame. Time points ending
    in 'h' or 'min' are converted to an integer number of minutes.
    The input DataFrame is not modified.
    Args:
        df (pd.DataFrame): The input DataFrame containing an 'Experiment' column to be processed.
    Returns:
        pd.DataFrame: A new DataFrame whose first columns are
            - 'Experiment'
            - 'Cell Line'
            - 'Drug'
            - 'Time point'
            - 'Replicate'
            followed by the remaining input columns (duplicated column names
            are dropped, keeping the first occurrence).
    """
    new_columns = ["Experiment", "Cell Line", "Drug", "Time point", "Replicate"]

    # Work on a de-duplicated copy so the caller's frame is left untouched.
    result = df.loc[:, ~df.columns.duplicated()].copy()

    parsed = pd.DataFrame(
        [_split_experiment_name(name) for name in result['Experiment']],
        columns=new_columns,
    )
    for col in new_columns:
        # Assign by position; the parsed rows are in the same order as `result`.
        result[col] = parsed[col].to_numpy()

    other_columns = [col for col in result.columns if col not in new_columns]
    return result[new_columns + other_columns]


def update_dataframe_with_toml(df, toml_file):
    """
    Renames the TMT ratio columns of a DataFrame to the doses listed in a TOML file.
    The TOML file must provide a [TMT] table with the parallel lists 'doses' and
    'channels'. Each column named "TMT Channel Ratio {channel}" that exists in the
    DataFrame becomes "Dose {dose}", where the dose is the entry at the same list
    position as the channel. Channels without a matching dose entry become
    "Dose NA". The DataFrame is renamed in place and also returned.
    Args:
        df (pandas.DataFrame): The DataFrame whose columns are to be renamed.
        toml_file (str): The path to the TOML file containing dose and channel information.
    Returns:
        pandas.DataFrame: The same DataFrame object with updated column names.
    """
    with open(toml_file, 'r') as handle:
        tmt_section = toml.load(handle)['TMT']
    doses = tmt_section['doses']
    channels = tmt_section['channels']

    # Collect every rename first, then apply them in one pass.
    renames = {}
    for position, channel in enumerate(channels):
        ratio_column = f"TMT Channel Ratio {channel}"
        # Skip channels absent from the frame; a repeated channel keeps its first dose.
        if ratio_column not in df.columns or ratio_column in renames:
            continue
        dose_label = doses[position] if position < len(doses) else "NA"
        renames[ratio_column] = f"Dose {dose_label}"

    df.rename(columns=renames, inplace=True)
    return df


def filter_data(df, min_score_cutoff = 60, max_pep_cutoff = 0.05, verbose = False):
    """
    Filters the DataFrame based on the given cutoffs.
    A row is kept only if all of the following hold:
        - 'Max Score' is at least `min_score_cutoff`
        - 'Min PEP' is at most `max_pep_cutoff`
        - 'Phospho (STY)' is at least 1
        - 'Phosphoproteome' equals True

    Args:
        df (pd.DataFrame): The input DataFrame to be filtered. It must contain the
            columns 'Max Score', 'Min PEP', 'Phospho (STY)' and 'Phosphoproteome'.
        min_score_cutoff (float): Lowest accepted 'Max Score' (inclusive).
        max_pep_cutoff (float): Highest accepted 'Min PEP' (inclusive).
        verbose (bool): If truthy, print the row counts before and after filtering.

    Returns:
        pd.DataFrame: The filtered DataFrame.
    """
    rows_before = df.shape[0]

    keep = (
        (df['Max Score'] >= min_score_cutoff)
        & (df['Min PEP'] <= max_pep_cutoff)
        & (df['Phospho (STY)'] >= 1)
        # Explicit comparison on purpose: it yields a boolean mask even if the
        # column is not stored with a boolean dtype.
        & (df['Phosphoproteome'] == True)  # noqa: E712
    )
    filtered_df = df[keep]

    if verbose:
        print(f"Number of rows before filtering: {rows_before}")
        print(f"Number of rows after filtering: {filtered_df.shape[0]}")

    return filtered_df


def remove_columns(df):
    """
    Drops the columns that are not needed after filtering.
    Removed are a fixed list of identification, curve-fit and bookkeeping columns
    (those that are absent are silently skipped) and every column whose name
    starts with 'Reporter intensity corrected' or 'TMT Channel Normal'.
    The input DataFrame is not modified.
    Args:
        df (pd.DataFrame): The input DataFrame.
    Returns:
        pd.DataFrame: A new DataFrame without the removed columns.
    """
    columns_to_remove = [
        'Experiment', 'Replicate', 'N duplicates', 'Sequence', 'Length',
        'Missed cleavages', 'Proteins', 'Leading proteins', 'Protein names',
        'Phospho (STY)', 'All Phospho (STY) Probabilities', 'Max Score', 'Min PEP',
        'Intensity', 'Phosphoproteome', 'Fullproteome', 'Curve signal', 'Log EC50',
        'Curve slope', 'Curve top', 'Curve bottom', 'R2', 'Curve RMSE', 'Log EC50 error',
        'Curve slope error', 'Curve top error', 'Curve bottom error', 'EC50', 'pEC50',
        'Curve effect size', 'Regulation', 'Acetyl (K)', 'All Acetyl (K) Probabilities', 'Acetylproteome'
    ]
    prefixes_to_remove = ('Reporter intensity corrected', 'TMT Channel Normal')

    # Non-inplace drop: the caller's frame (often a filtered slice) stays intact.
    trimmed = df.drop(columns=columns_to_remove, errors='ignore')

    # str() guards against non-string column labels.
    keep = [not str(col).startswith(prefixes_to_remove) for col in trimmed.columns]
    return trimmed.loc[:, keep]


def pivot_long(df):
    """
    Converts a DataFrame from wide format to long format.
    Every column whose name starts with 'Dose' is treated as one dose measurement.
    These columns are melted so that each row of the result holds a single
    measurement; all other columns are repeated as identifiers.
    The dose is parsed from the column name ('Dose 10.0' -> 10.0). A name without
    a numeric dose, such as 'Dose NA', yields NaN instead of raising.
    Args:
        df (pd.DataFrame): The input DataFrame in wide format.
    Returns:
        pd.DataFrame: The transformed DataFrame in long format with the float
            column 'Dose' and the measurement column 'Value'.
    """
    dose_columns = [col for col in df.columns if col.startswith('Dose')]
    id_columns = [col for col in df.columns if col not in dose_columns]

    df_long = pd.melt(
        df,
        id_vars=id_columns,
        value_vars=dose_columns,
        var_name='Dose',
        value_name='Value'
    )

    # Literal (non-regex) removal of the column-name prefix.
    dose_text = df_long['Dose'].str.replace('Dose ', '', regex=False)
    # 'coerce' maps unparseable labels ('NA') to NaN; astype keeps the column
    # float even when every label looks like an integer.
    df_long['Dose'] = pd.to_numeric(dose_text, errors='coerce').astype(float)

    return df_long

In [7]:
def main():
    """
    Builds the combined long-format table of all dose-dependent experiments.
    Every .txt file found under '../data' (except the time-dependent Rituximab
    table) is read as a tab-separated file, its 'Experiment' column is split,
    its TMT ratio columns are renamed to doses using the matching
    'pipeline_<name>.toml' file, and it is then filtered, trimmed and pivoted
    to long format. A missing TOML file is reported and the file is still
    processed.
    Returns:
        pd.DataFrame: The concatenation of all processed files with a fresh
            integer index; an empty DataFrame if no file was found.
    """
    directory = '../data'
    txt_files, toml_files = search_files(directory)
    known_toml_files = set(toml_files)

    # NOTE: search_files returns bare file names, so files are resolved
    # directly inside `directory` here.

    # Remove time-dependent file. The comparison ignores case because the file
    # is referred to both as 'Rituximab_td.txt' and 'rituximab_td.txt'.
    txt_files = [name for name in txt_files if name.lower() != 'rituximab_td.txt']

    min_score_cutoff = 60
    max_pep_cutoff = 0.05

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on every iteration.
    frames = []
    for txt_file in txt_files:
        txt_file_path = os.path.normpath(os.path.join(directory, txt_file))
        df = pd.read_csv(txt_file_path, delimiter='\t', header=0)
        df = process_experiment_column(df)

        toml_file_name = f"pipeline_{txt_file.replace('_dd.txt', '.toml')}"
        if toml_file_name in known_toml_files:
            toml_file_path = os.path.normpath(os.path.join(directory, toml_file_name))
            df = update_dataframe_with_toml(df, toml_file_path)
        else:
            print(f"toml file: {toml_file_name} does not exist!")

        df = filter_data(df, min_score_cutoff, max_pep_cutoff, False)
        df = remove_columns(df)
        df = pivot_long(df)

        frames.append(df)

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

if __name__ == "__main__":
    # Build the combined table and render it in the notebook.
    combined_df = main()
    display(combined_df)

    # Check unique values of the four annotation columns in one pass.
    unique_cell_lines, unique_drugs, unique_time_points, unique_doses = (
        combined_df[column].unique()
        for column in ('Cell Line', 'Drug', 'Time point', 'Dose')
    )

    report = (
        ("Unique Cell Lines:", unique_cell_lines),
        ("Unique Drugs:", unique_drugs),
        ("Unique Time Points:", unique_time_points),
        ("Unique Doses:", unique_doses),
    )
    for label, values in report:
        print(label, values)

Unnamed: 0,Cell Line,Drug,Time point,Modified sequence,Gene names,Dose,Value
0,A549,PD325901,60,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVR,EIF3J,Dose 10000.0,
1,A549,PD325901,60,(ac)AAAAAAAGDS(ph)DSWDADAFSVEDPVRK,EIF3J,Dose 10000.0,1.489595
2,A549,PD325901,60,(ac)AAAADS(ph)FSGGPAGVR,RMI2,Dose 10000.0,1.307801
3,A549,MK2206,60,(ac)AAAAPDSRVS(ph)EEENLK,RRP15,Dose 10000.0,0.789177
4,A549,PD325901,60,(ac)AAAAPDSRVS(ph)EEENLK,RRP15,Dose 10000.0,1.065007
...,...,...,...,...,...,...,...
9998965,HeLa,SAHA,240,YYS(ph)DSDDELTVEQR,BOD1L1,Dose 0.0,1.000000
9998966,HeLa,SAHA,240,YYS(ph)IDDNQNK,NCOA7,Dose 0.0,1.000000
9998967,HeLa,SAHA,240,YYS(ph)PCEEHPAETNQNEGSESGTIR,ARHGEF5,Dose 0.0,1.000000
9998968,HeLa,SAHA,240,YYSDS(ph)DDELTVEQR,BOD1L1,Dose 0.0,1.000000


Unique Cell Lines: ['A549' 'PC-9' 'RPMI8226' 'K562' 'A431' 'BT-474' 'HeLa' 'MDA-MB-175'
 'A459' 'KYSE-520' 'SK-BR-3']
Unique Drugs: ['PD325901' 'MK2206' 'AZD8055' 'Dactolisib' 'Dasatinib' 'Nintedanib'
 'Tideglusib' 'Pictilisib' 'AZD4547' 'LapatinibAZD4547' 'Lapatinib'
 'BTZ_CFZ' 'Imatinib' 'Pertuzumab' 'Trastuzumab' 'CUDC101' 'Curcumin'
 'Cytarabine' 'Afatinib' 'Gefitinib' 'GeftinibAZD4547-1to80' 'A486' 'A485'
 'Refametinib' 'Staursporin' 'Methotrexat' 'Paclitaxel' 'Romidepsin'
 'SelumetinibMK2206-1to2' 'SelumetinibMK2206-3to1' 'Selumetinib' 'SHP099'
 'SAHA']
Unique Time Points: [ 60  30 960 120 240 480]
Unique Doses: ['Dose 10000.0' 'Dose 3000.0' 'Dose 1000.0' 'Dose 300.0' 'Dose 100.0'
 'Dose 30.0' 'Dose 10.0' 'Dose 3.0' 'Dose 1.0' 'Dose 0.0' 'Dose 0.3'
 'Dose 0.1' 'Dose 0.03' 'Dose 0.01' 'Dose 0.003' 'Dose 100000.0'
 'Dose 30000.0' 'Dose 0.001' 'Dose 0.0003' 'Dose 0.0001']


## Rituximab Time-Dependent

In [None]:
# Path of the time-dependent Rituximab table, relative to the working directory.
data_file_path = os.path.join(os.pardir, "data", "rituximab_td.txt")

# The table is read as a tab-separated file.
rituximab_td = pd.read_csv(data_file_path, delimiter="\t")

In [None]:
# Filter data
# Arbitrary cutoffs.
# NOTE: despite their names, `max_score_cutoff` is applied as a lower bound on
# the 'Max Score' column and `min_pep_cutoff` as an upper bound on the
# 'Min PEP' column (the prefixes mirror the column names, not the direction).
max_score_cutoff = 60  # Rows need 'Max Score' >= this value
min_pep_cutoff = 0.05  # Rows need 'Min PEP' <= this value

rows_before = rituximab_td.shape[0]
print(f"Number of rows before filtering: {rows_before}")

# .copy() makes the filtered table independent of the original, so the columns
# added below do not trigger pandas' SettingWithCopyWarning.
rituximab_td = rituximab_td[
    (rituximab_td['Max Score'] >= max_score_cutoff) &
    (rituximab_td['Min PEP'] <= min_pep_cutoff) &
    (rituximab_td['Phospho (STY)'] >= 1) &
    (rituximab_td['Phosphoproteome'] == True)
].copy()

rows_after = rituximab_td.shape[0]
print(f"Number of rows after filtering: {rows_after}")

# Break down experiment column: parts 1, 2 and 3 of the underscore-separated
# name are used as cell line, drug and dose.
split_experiment_col = rituximab_td['Experiment'].str.split('_', expand=True)
rituximab_td['Cell Line'] = split_experiment_col[1]
rituximab_td['Drug'] = split_experiment_col[2]
# Strip the literal 'ng' unit suffix; the dose stays a string.
rituximab_td['Dose'] = split_experiment_col[3].str.replace('ng', '', regex=False)

In [None]:
# Suffix given to each of the eight TMT ratio channels, in channel order
# (presumably the time points of the time course -- confirm the unit).
ratio_suffixes = (1, 2, 5, 10, 60, 120, 360, 1440)
ratio_renames = {
    f'TMT Channel Ratio {channel}': f'Ratio_{suffix}'
    for channel, suffix in enumerate(ratio_suffixes, start=1)
}

# Annotation columns that are kept in front of the ratio columns.
annotation_columns = ['Cell Line', 'Drug', 'Dose', 'Modified sequence', 'Gene names']

# Keep only the annotation and ratio columns, then rename the ratio columns.
selected_columns = annotation_columns + list(ratio_renames)
rituximab_td = rituximab_td[selected_columns].rename(columns=ratio_renames)

rituximab_td.head(10)