In [183]:
# Jupyter notebook sample
import numpy as np
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import re
import pathlib

# --- Input table -------------------------------------------------------------
# CSV file that the notebook reads with pd.read_csv.
# Alternative input used earlier: '/Users/cgu3/Documents/diann/data/PROJ0093.csv'
PTM_DATA = '/Users/cgu3/Documents/diann/data/PROJ0093_qvalue005.csv'

# --- Modification of interest ------------------------------------------------
# Pattern naming the PTM(s) to analyse. It is handed to pandas string methods
# as a regular expression, so "|" separates alternatives.
# Values tried earlier (kept for reference):
#   '(UniMod:37)', '(UniMod:7)', 'Hypusine', 'Deoxyhypusine'
TARGET_PTM = r'acetaldehyde|malondialdehydeacetaldehyde|malondialdehyde'

In [184]:
# Load the input table named by PTM_DATA into a DataFrame.
dat = pd.read_csv(filepath_or_buffer=PTM_DATA)

In [185]:
import re

# Annotate every row of `dat` with three PTM-related columns.
# TARGET_PTM is interpreted as a regular expression by all three calls.

# True when 'Modified.Sequence' contains the target PTM (case-insensitive);
# missing sequences are treated as "no PTM".
dat['has_target_PTM'] = dat['Modified.Sequence'].str.contains(TARGET_PTM, case=False, na=False)

# Number of target-PTM occurrences in 'Modified.Sequence'. The match is
# case-insensitive and missing sequences count as 0, so this column always
# agrees with has_target_PTM (flag is True exactly when num_PTM >= 1).
dat['num_PTM'] = (
    dat['Modified.Sequence']
    .str.count(TARGET_PTM, flags=re.IGNORECASE)
    .fillna(0)
    .astype(int)
)

# PTM label taken from the 'source' column: the first TARGET_PTM alternative
# found there, or NaN when 'source' names none of them.
dat['PTM'] = dat['source'].str.extract(f'({TARGET_PTM})', expand=False)

In [186]:
# Keep only the rows flagged as carrying the target PTM, restricted to the
# columns written to the report.
peptide_list_PTM = dat.loc[
    dat['has_target_PTM'],
    [
        'PTM',
        'Run',
        'Protein.Group',
        'Protein.Ids',
        'Protein.Names',
        'Genes',
        'Stripped.Sequence',
        'Modified.Sequence',
        'Precursor.Quantity',
    ],
]

# Write the report to output/<input file stem>.csv. to_csv does not create
# missing directories, so make sure the target directory exists first.
output_path = pathlib.Path('output') / f'{pathlib.Path(PTM_DATA).stem}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
peptide_list_PTM.to_csv(output_path, index=False)

In [187]:
# Collapse rows to one record per (PTM, Run, stripped peptide sequence):
# the flag is True if any row in the group carries the PTM, and num_PTM is
# the largest per-row count seen in the group.
peptide_level_aggregation = (
    dat
    .groupby(['PTM', 'Run', 'Stripped.Sequence'])
    .agg(
        has_target_PTM=('has_target_PTM', 'any'),
        num_PTM=('num_PTM', 'max'),
    )
    .reset_index()
)

# Same summary, but one record per (PTM, Run, protein group).
protein_level_aggregation = (
    dat
    .groupby(['PTM', 'Run', 'Protein.Group'])
    .agg(
        has_target_PTM=('has_target_PTM', 'any'),
        num_PTM=('num_PTM', 'max'),
    )
    .reset_index()
)

In [156]:
# Summarise the peptide-level flags per (PTM, Run):
#   has_target_PTM_mean - fraction of peptide records flagged
#   has_target_PTM_sum  - number of peptide records flagged
#   total_count         - number of peptide records in the group
# Named aggregation yields these flat column names directly.
peptide_level = (
    peptide_level_aggregation
    .groupby(['PTM', 'Run'])['has_target_PTM']
    .agg(
        has_target_PTM_mean='mean',
        has_target_PTM_sum='sum',
        total_count='count',
    )
    .reset_index()
)

peptide_level

Unnamed: 0,PTM,Run,has_target_PTM_mean,has_target_PTM_sum,total_count
0,acetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.000266,9,33867
1,acetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.000514,10,19447
2,malondialdehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.00015,5,33400
3,malondialdehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.00021,4,19008
4,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.00027,9,33393
5,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.000211,4,18948


In [157]:
# Summarise the protein-level flags per (PTM, Run):
#   has_target_PTM_mean - fraction of protein-group records flagged
#   has_target_PTM_sum  - number of protein-group records flagged
#   total_count         - number of protein-group records in the group
# Named aggregation yields these flat column names directly.
protein_level = (
    protein_level_aggregation
    .groupby(['PTM', 'Run'])['has_target_PTM']
    .agg(
        has_target_PTM_mean='mean',
        has_target_PTM_sum='sum',
        total_count='count',
    )
    .reset_index()
)

# Show the resulting table
protein_level

Unnamed: 0,PTM,Run,has_target_PTM_mean,has_target_PTM_sum,total_count
0,acetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.001702,8,4701
1,acetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.00282,9,3192
2,malondialdehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.001073,5,4660
3,malondialdehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.001282,4,3119
4,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.001935,9,4650
5,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.001278,4,3129


In [158]:
# horizontal combine two dataframe
combined = pd.merge(peptide_level, protein_level, on=['PTM', 'Run'], suffixes=('_peptide', '_protein'))
combined

Unnamed: 0,PTM,Run,has_target_PTM_mean_peptide,has_target_PTM_sum_peptide,total_count_peptide,has_target_PTM_mean_protein,has_target_PTM_sum_protein,total_count_protein
0,acetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.000266,9,33867,0.001702,8,4701
1,acetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.000514,10,19447,0.00282,9,3192
2,malondialdehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.00015,5,33400,0.001073,5,4660
3,malondialdehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.00021,4,19008,0.001282,4,3119
4,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_Azi_R1_S1-A8_1_11479,0.00027,9,33393,0.001935,9,4650
5,malondialdehydeacetaldehyde,PROJ0093_MDAMB231_DMSO_R1_S1-A7_1_11478,0.000211,4,18948,0.001278,4,3129
