In [30]:
import pandas as pd
import numpy as np

In [31]:
# Load the pre-filtered peptide-variant table. Per the original note, its rows are the
# peptide variants with at least 20 non-NaN values in the control columns.
peptide_variants_df = pd.read_csv('filtered_data.csv', low_memory=False)
print(f"Number of rows peptide variants in our filtered data: {peptide_variants_df.shape[0]}")
print(f"Columns: {peptide_variants_df.columns}")
peptide_variants_df.head(5)

Number of rows peptide variants in our filtered data: 4168
Columns: Index(['rowid', 'ccms_row_id', 'Variant', 'Variant ID', 'Unmod variant',
       'Total', 'Total- Unmodified sequence', 'Variants- Unmodified sequence',
       'Proteins', 'Mass',
       ...
       'baricitib_1000nm', 'baricitib_100nm', 'baricitib_10nm',
       'baricitib_30000nm', 'baricitib_3000nm', 'baricitib_300nm',
       'baricitib_30nm', 'baricitib_3nm', 'baricitib_dmso', 'baricitib_pdpd'],
      dtype='object', length=463)


Unnamed: 0,rowid,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,...,baricitib_1000nm,baricitib_100nm,baricitib_10nm,baricitib_30000nm,baricitib_3000nm,baricitib_300nm,baricitib_30nm,baricitib_3nm,baricitib_dmso,baricitib_pdpd
0,5,5,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,...,11120000.0,15061000.0,18264000.0,13380000.0,14280000.0,19220000.0,11216000.0,12721000.0,12835000.0,8137600.0
1,7,7,.IQDKEGIPPDQQR.,39596,.IQDKEGIPPDQQR.,6836,6882,7,sp|P0CG47|UBB_HUMAN;sp|P0CG48|UBC_HUMAN;sp|P62...,1523.8,...,1740400.0,9648200.0,16877000.0,2505000.0,3093800.0,4726100.0,7930900.0,8424800.0,11643000.0,5830600.0
2,11,11,.IFTSIGEDYDER.,36599,.IFTSIGEDYDER.,5284,5412,7,sp|P35232-2|PHB_HUMAN;sp|P35232|PHB_HUMAN;tr|C...,1444.6,...,31864000.0,123890000.0,130640000.0,46563000.0,37421000.0,54811000.0,96800000.0,78860000.0,75034000.0,70372000.0
3,14,14,.TAVC+57.021DIPPR.,87369,.TAVCDIPPR.,4837,4837,1,sp|A6NNZ2|TBB8B_HUMAN;sp|P04350|TBB4A_HUMAN;sp...,1085.5,...,1097300.0,2733200.0,6343300.0,987920.0,1095500.0,2920500.0,4860600.0,7768400.0,2558000.0,3113400.0
4,15,15,.IITHPNFNGNTLDNDIMLIK.,37659,.IITHPNFNGNTLDNDIMLIK.,4830,20735,81,TRYP_PIG,2283.2,...,,,,,,,,,,


In [32]:
# Distinct unmodified peptide sequences behind the variant rows (first-appearance order)
unmodified_peptides = pd.unique(peptide_variants_df['Unmod variant'])
print(f"Number of unique unmodified peptides: {unmodified_peptides.size}")
print(f"Unmodified peptides: {unmodified_peptides}")

Number of unique unmodified peptides: 4162
Unmodified peptides: ['.ESESTAGSFSLSVR.' '.IQDKEGIPPDQQR.' '.IFTSIGEDYDER.' ...
 '.KQQSIAGSADSKPIDVSR.' '.LNLYELK.' '.AGAGPGGPPQKPAPSSQR.']


In [33]:
# Get the mean, median, and standard deviation of the control abundance values for each peptide variant
start_col = 'aew541_1000nm'  # First drug column; every column from here on is treated as abundance data
start_idx = peptide_variants_df.columns.get_loc(start_col)

# Summary columns appended by this cell. They are added to the right of the drug columns,
# so a positional slice taken on a re-run of this cell would otherwise pick them up and
# every downstream cell would treat them as drug treatments.
control_stat_columns = ['Control_Mean_Abundance', 'Control_Median_Abundance', 'Control_StdDev_Abundance']

# Get the columns with drug treatment peptide abundance data (order preserved)
trailing_columns = peptide_variants_df.columns[start_idx:]
drug_columns = trailing_columns[~trailing_columns.isin(control_stat_columns)]
filtered_drug_columns = [col for col in drug_columns if not col.endswith('_pdpd')]  # Drop PDPD columns
dmso_columns = [col for col in drug_columns if col.endswith('_dmso')]  # Get control columns

# Add control means, medians, and standard deviations to the dataframe.
# NaN abundances are skipped; .std() uses the pandas default (sample std, ddof=1).
control_abundances = peptide_variants_df[dmso_columns]
peptide_variants_df['Control_Mean_Abundance'] = control_abundances.mean(axis=1, skipna=True)
peptide_variants_df['Control_Median_Abundance'] = control_abundances.median(axis=1, skipna=True)
peptide_variants_df['Control_StdDev_Abundance'] = control_abundances.std(axis=1, skipna=True)

peptide_variants_df[['Variant', 'Control_Mean_Abundance', 'Control_Median_Abundance', 'Control_StdDev_Abundance', 'aew541_1000nm']].head()

Unnamed: 0,Variant,Control_Mean_Abundance,Control_Median_Abundance,Control_StdDev_Abundance,aew541_1000nm
0,.ESESTAGSFSLSVR.,7538684.0,5292500.0,6148641.0,15921000.0
1,.IQDKEGIPPDQQR.,8102660.0,4881150.0,12192510.0,4450700.0
2,.IFTSIGEDYDER.,25249810.0,12670000.0,34426720.0,8153300.0
3,.TAVC+57.021DIPPR.,5945529.0,2138200.0,11785310.0,4442300.0
4,.IITHPNFNGNTLDNDIMLIK.,131311900.0,52608000.0,198586500.0,134780000.0


In [34]:
# Sanity check: recompute the control statistics for one peptide variant directly
# from its row and compare them with the columns stored on the dataframe.

# DMSO (control) abundances of the first peptide variant
peptide_variant_dmso_row = peptide_variants_df.iloc[0].loc[dmso_columns]
mean_val = peptide_variant_dmso_row.mean()      # NaN values are skipped by default
median_val = peptide_variant_dmso_row.median()
std_val = peptide_variant_dmso_row.std()

print(f"Mean: {mean_val}")
print(f"Median: {median_val}")
print(f"Standard Deviation: {std_val}")

# The first row of the stored statistics should show the same three numbers
peptide_variants_df.loc[
    :, ['Variant', 'Control_Mean_Abundance', 'Control_Median_Abundance', 'Control_StdDev_Abundance']
].iloc[:1]


Mean: 7538683.902439024
Median: 5292500.0
Standard Deviation: 6148640.6690364815


Unnamed: 0,Variant,Control_Mean_Abundance,Control_Median_Abundance,Control_StdDev_Abundance
0,.ESESTAGSFSLSVR.,7538684.0,5292500.0,6148641.0


In [35]:
# Compute ratios and log ratios for each drug treatment column.
# ratio     = abundance / Control_Median_Abundance (NaN when the control median is not > 0)
# log ratio = natural log of that ratio
control_median = peptide_variants_df['Control_Median_Abundance']
# Mask unusable denominators up front so no division by a zero/negative/NaN median is evaluated
usable_control_median = control_median.where(control_median > 0)

# All treatment columns at once; rows are aligned on the shared index
treatment_ratios = peptide_variants_df[filtered_drug_columns].div(usable_control_median, axis=0)

peptide_variants_ratios = peptide_variants_df.copy()
peptide_variants_ratios[filtered_drug_columns] = treatment_ratios

# A zero abundance would give log(0) = -inf. A single -inf in a control column turns that
# peptide's log-ratio standard deviation into NaN, which blanks every z-score for the row.
# Treat non-positive ratios as missing (NaN) instead, so only the affected cell is lost.
peptide_variants_log_ratios = peptide_variants_df.copy()
peptide_variants_log_ratios[filtered_drug_columns] = np.log(treatment_ratios.where(treatment_ratios > 0))

# Raw abundances, shown for comparison with the ratio tables displayed next
peptide_variants_df[['Variant', 'Control_Median_Abundance'] + filtered_drug_columns[:9]].head()

Unnamed: 0,Variant,Control_Median_Abundance,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,5292500.0,15921000.0,35107000.0,23510000.0,7003800.0,11974000.0,28975000.0,23871000.0,25788000.0,24221000.0
1,.IQDKEGIPPDQQR.,4881150.0,4450700.0,7027600.0,4492100.0,3121300.0,3327800.0,3826700.0,1551300.0,4841800.0,4596300.0
2,.IFTSIGEDYDER.,12670000.0,8153300.0,13776000.0,9365300.0,6006600.0,8228400.0,13621000.0,5029400.0,10542000.0,9534400.0
3,.TAVC+57.021DIPPR.,2138200.0,4442300.0,7561200.0,6468900.0,3167000.0,3713800.0,6473200.0,3499900.0,8563200.0,6993100.0
4,.IITHPNFNGNTLDNDIMLIK.,52608000.0,134780000.0,300930000.0,167050000.0,107730000.0,155910000.0,219370000.0,80393000.0,144490000.0,130720000.0


In [36]:
# Preview: abundance / control-median ratios for the first nine treatment columns
peptide_variants_ratios.loc[:, ['Variant', 'Control_Median_Abundance'] + filtered_drug_columns[:9]].head(5)

Unnamed: 0,Variant,Control_Median_Abundance,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,5292500.0,3.008219,6.633349,4.442135,1.323344,2.262447,5.474728,4.510345,4.872556,4.576476
1,.IQDKEGIPPDQQR.,4881150.0,0.911814,1.439743,0.920295,0.63946,0.681766,0.783975,0.317814,0.991938,0.941643
2,.IFTSIGEDYDER.,12670000.0,0.643512,1.087293,0.739171,0.474081,0.64944,1.075059,0.396953,0.832044,0.752518
3,.TAVC+57.021DIPPR.,2138200.0,2.077589,3.536245,3.025395,1.481152,1.736881,3.027406,1.636844,4.004864,3.270555
4,.IITHPNFNGNTLDNDIMLIK.,52608000.0,2.561968,5.720233,3.175373,2.047787,2.963618,4.169898,1.528152,2.74654,2.484793


In [37]:
# Preview: natural-log ratios for the first nine treatment columns
peptide_variants_log_ratios.loc[:, ['Variant', 'Control_Median_Abundance'] + filtered_drug_columns[:9]].head(5)

Unnamed: 0,Variant,Control_Median_Abundance,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,5292500.0,1.101348,1.89211,1.491135,0.280162,0.816447,1.700143,1.506374,1.583619,1.520929
1,.IQDKEGIPPDQQR.,4881150.0,-0.092319,0.364464,-0.083061,-0.447131,-0.383069,-0.243378,-1.146288,-0.008094,-0.060129
2,.IFTSIGEDYDER.,12670000.0,-0.440814,0.083691,-0.302226,-0.746378,-0.431645,0.072376,-0.923936,-0.18387,-0.284331
3,.TAVC+57.021DIPPR.,2138200.0,0.731208,1.263066,1.107042,0.39282,0.552091,1.107706,0.49277,1.38751,1.18496
4,.IITHPNFNGNTLDNDIMLIK.,52608000.0,0.940776,1.744009,1.155425,0.71676,1.086411,1.427892,0.424059,1.010342,0.910189


In [38]:
# Per peptide variant: median and standard deviation of the log ratios across the
# DMSO (control) columns. NaN entries are skipped.

control_log_ratios = peptide_variants_log_ratios[dmso_columns]
peptide_variants_log_ratios['Control_Log_Ratio_Median'] = control_log_ratios.median(axis='columns', skipna=True)
peptide_variants_log_ratios['Control_Log_Ratio_StdDev'] = control_log_ratios.std(axis='columns', skipna=True)

peptide_variants_log_ratios.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head()

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,0.0,0.856921,1.101348,1.89211,1.491135,0.280162,0.816447,1.700143,1.506374,1.583619,1.520929
1,.IQDKEGIPPDQQR.,-0.001706,0.905393,-0.092319,0.364464,-0.083061,-0.447131,-0.383069,-0.243378,-1.146288,-0.008094,-0.060129
2,.IFTSIGEDYDER.,0.0,1.088943,-0.440814,0.083691,-0.302226,-0.746378,-0.431645,0.072376,-0.923936,-0.18387,-0.284331
3,.TAVC+57.021DIPPR.,-6e-06,1.20013,0.731208,1.263066,1.107042,0.39282,0.552091,1.107706,0.49277,1.38751,1.18496
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.940776,1.744009,1.155425,0.71676,1.086411,1.427892,0.424059,1.010342,0.910189


In [39]:
# Z-score every log ratio against the peptide's own control distribution:
#   z = (log_ratio - Control_Log_Ratio_Median) / Control_Log_Ratio_StdDev
# Rows whose control standard deviation is not > 0 get NaN.

peptide_variants_z_scores = peptide_variants_log_ratios.copy()

control_center = peptide_variants_z_scores['Control_Log_Ratio_Median']
control_spread = peptide_variants_z_scores['Control_Log_Ratio_StdDev']
usable_spread = control_spread.where(control_spread > 0)  # NaN where the spread is unusable

peptide_variants_z_scores[filtered_drug_columns] = (
    peptide_variants_z_scores[filtered_drug_columns]
    .sub(control_center, axis=0)
    .div(usable_spread, axis=0)
)

peptide_variants_z_scores.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head()

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,0.0,0.856921,1.285239,2.208032,1.740108,0.32694,0.952768,1.984013,1.75789,1.848033,1.774876
1,.IQDKEGIPPDQQR.,-0.001706,0.905393,-0.100082,0.404432,-0.089856,-0.49197,-0.421214,-0.266925,-1.264183,-0.007056,-0.064528
2,.IFTSIGEDYDER.,0.0,1.088943,-0.404809,0.076855,-0.27754,-0.685415,-0.396389,0.066464,-0.848471,-0.168852,-0.261107
3,.TAVC+57.021DIPPR.,-6e-06,1.20013,0.609279,1.052446,0.92244,0.32732,0.460031,0.922994,0.410602,1.156138,0.987365
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.560243,1.038554,0.688063,0.426846,0.646966,0.850312,0.252548,0.601669,0.54203


In [40]:
# One z-score vector per peptide variant (indexed by 'Variant'), for clustering
peptide_z_score_vectors = peptide_variants_z_scores.set_index('Variant')[filtered_drug_columns]
peptide_z_score_vectors.head(5)

Unnamed: 0_level_0,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso,amg208_1000nm,...,barasertibhqpa_dmso,baricitib_1000nm,baricitib_100nm,baricitib_10nm,baricitib_30000nm,baricitib_3000nm,baricitib_300nm,baricitib_30nm,baricitib_3nm,baricitib_dmso
Variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.ESESTAGSFSLSVR.,1.285239,2.208032,1.740108,0.32694,0.952768,1.984013,1.75789,1.848033,1.774876,-0.014666,...,-0.005082,0.866421,1.220436,1.445455,1.082329,1.158297,1.504993,0.876452,1.023389,1.0338
.IQDKEGIPPDQQR.,-0.100082,0.404432,-0.089856,-0.49197,-0.421214,-0.266925,-1.264183,-0.007056,-0.064528,-0.219221,...,-0.357942,-1.137142,0.754475,1.372086,-0.734915,-0.501744,-0.03377,0.537989,0.604715,0.962047
.IFTSIGEDYDER.,-0.404809,0.076855,-0.27754,-0.685415,-0.396389,0.066464,-0.848471,-0.168852,-0.261107,-3.046919,...,0.694054,0.846913,2.093918,2.142637,1.19526,0.994538,1.345024,1.867325,1.679094,1.633423
.TAVC+57.021DIPPR.,0.609279,1.052446,0.92244,0.32732,0.460031,0.922994,0.410602,1.156138,0.987365,,...,-0.481868,-0.555861,0.204574,0.906103,-0.643357,-0.557229,0.259802,0.684262,1.074972,0.149373
.IITHPNFNGNTLDNDIMLIK.,0.560243,1.038554,0.688063,0.426846,0.646966,0.850312,0.252548,0.601669,0.54203,-0.468655,...,1.503147,,,,,,,,,


In [41]:
# Write the z-score vectors to disk; the 'Variant' index is written as the first column
peptide_z_score_vectors.to_csv(path_or_buf='peptide_z_score_vectors.csv', index=True)

In [42]:
# Convert z-scores to two-tailed p-values under a standard normal: p = 2 * P(Z >= |z|)
import scipy.stats

peptide_variants_p_values = peptide_variants_z_scores.copy()

treatment_z_scores = peptide_variants_z_scores[filtered_drug_columns]
two_tailed = pd.DataFrame(
    scipy.stats.norm.sf(treatment_z_scores.abs()) * 2,
    index=treatment_z_scores.index,
    columns=treatment_z_scores.columns,
)
# Missing z-scores stay missing
peptide_variants_p_values[filtered_drug_columns] = two_tailed.where(treatment_z_scores.notna())

peptide_variants_p_values.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head()

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
0,.ESESTAGSFSLSVR.,0.0,0.856921,0.198709,0.027242,0.08184,0.743713,0.340708,0.047254,0.078766,0.064598,0.075918
1,.IQDKEGIPPDQQR.,-0.001706,0.905393,0.920279,0.685895,0.928402,0.622741,0.673599,0.789527,0.206164,0.99437,0.948549
2,.IFTSIGEDYDER.,0.0,1.088943,0.685618,0.938739,0.781365,0.493082,0.691818,0.947008,0.396176,0.865913,0.79401
3,.TAVC+57.021DIPPR.,-6e-06,1.20013,0.542339,0.292595,0.356299,0.743426,0.645494,0.35601,0.681364,0.247625,0.323464
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.575314,0.299012,0.491413,0.669491,0.517654,0.395152,0.800618,0.547395,0.587798


In [43]:
# Count the rows that contain at least one null value in any column
has_any_null = peptide_variants_p_values.isna().any(axis='columns')
rows_with_nulls = peptide_variants_p_values.loc[has_any_null]
print(rows_with_nulls.shape[0])

4168


In [44]:
# Get significantly perturbed peptides for each drug treatment.
# Result: {treatment column -> list of {'Variant', 'z_score', 'p_value', 'log_ratio'} records},
# one record per peptide variant whose p-value is at or below the threshold.
significance_threshold = 0.05  # Filter for p-values <= 0.05

significantly_perturbed_peptides = {}
for col in filtered_drug_columns:
    # Boolean mask instead of iterating every row with iterrows() for every column.
    # NaN p-values compare False and are therefore never selected.
    is_significant = peptide_variants_p_values[col] <= significance_threshold

    # The z-score, p-value and log-ratio frames are copies of one another and share an
    # index, so the same mask selects the same peptide variants in each of them.
    significantly_perturbed_peptides[col] = pd.DataFrame({
        'Variant': peptide_variants_p_values.loc[is_significant, 'Variant'],
        'z_score': peptide_variants_z_scores.loc[is_significant, col],
        'p_value': peptide_variants_p_values.loc[is_significant, col],
        'log_ratio': peptide_variants_log_ratios.loc[is_significant, col],
    }).to_dict(orient='records')
    print(f"Number of significantly perturbed peptides for {col}: {len(significantly_perturbed_peptides[col])}")

Number of significantly perturbed peptides for aew541_1000nm: 29
Number of significantly perturbed peptides for aew541_100nm: 162
Number of significantly perturbed peptides for aew541_10nm: 39
Number of significantly perturbed peptides for aew541_30000nm: 48
Number of significantly perturbed peptides for aew541_3000nm: 30
Number of significantly perturbed peptides for aew541_300nm: 175
Number of significantly perturbed peptides for aew541_30nm: 66
Number of significantly perturbed peptides for aew541_3nm: 154
Number of significantly perturbed peptides for aew541_dmso: 110
Number of significantly perturbed peptides for amg208_1000nm: 37
Number of significantly perturbed peptides for amg208_100nm: 33
Number of significantly perturbed peptides for amg208_10nm: 19
Number of significantly perturbed peptides for amg208_30000nm: 37
Number of significantly perturbed peptides for amg208_3000nm: 74
Number of significantly perturbed peptides for amg208_300nm: 39
Number of significantly perturbed 

In [45]:
# Raw list of hit records (Variant, z_score, p_value, log_ratio) for the aew541_1000nm treatment
print(significantly_perturbed_peptides['aew541_1000nm'])

[{'Variant': '.FTDDYQLFEELGKGAFSVVR.', 'z_score': np.float64(-2.071412264744531), 'p_value': 0.038320285708396566, 'log_ratio': np.float64(-2.489210305765021)}, {'Variant': '.DLKPSNILYVDESGNPEC+57.021LR.', 'z_score': np.float64(-2.3243848808137715), 'p_value': 0.020104875161458393, 'log_ratio': np.float64(-2.169456425015465)}, {'Variant': '.YLATLNFVHR.', 'z_score': np.float64(2.0362019210682907), 'p_value': 0.04173008344587532, 'log_ratio': np.float64(2.0317899368971073)}, {'Variant': '.ESTNYPGDYTLC+57.021VSC+57.021DGKVEHYR.', 'z_score': np.float64(2.203495811401803), 'p_value': 0.027559821870007134, 'log_ratio': np.float64(2.8359365838602573)}, {'Variant': '.LFQQILSGVDYC+57.021HR.', 'z_score': np.float64(1.9649198153131924), 'p_value': 0.049423518899439634, 'log_ratio': np.float64(1.815651856294981)}, {'Variant': '.QETVDC+57.021LKK.', 'z_score': np.float64(-3.9025635386761106), 'p_value': 9.517924173849378e-05, 'log_ratio': np.float64(-4.111339881059187)}, {'Variant': '.EVVEEAENGR.', 

In [46]:
# Tabulate the hit records of one treatment: one row per significantly perturbed peptide

perturbed_df = pd.DataFrame(data=significantly_perturbed_peptides['aew541_1000nm'])
perturbed_df.iloc[:29]  # first 29 rows

Unnamed: 0,Variant,z_score,p_value,log_ratio
0,.FTDDYQLFEELGKGAFSVVR.,-2.071412,0.03832029,-2.48921
1,.DLKPSNILYVDESGNPEC+57.021LR.,-2.324385,0.02010488,-2.169456
2,.YLATLNFVHR.,2.036202,0.04173008,2.03179
3,.ESTNYPGDYTLC+57.021VSC+57.021DGKVEHYR.,2.203496,0.02755982,2.835937
4,.LFQQILSGVDYC+57.021HR.,1.96492,0.04942352,1.815652
5,.QETVDC+57.021LKK.,-3.902564,9.517924e-05,-4.11134
6,.EVVEEAENGR.,-2.217947,0.02655842,-2.331555
7,.VLENAEGAR.,-3.20383,0.001356128,-2.837755
8,.GSQITQQSTNQSR.,-2.950428,0.003173339,-3.95388
9,.VSYDVTSAR.,-2.338774,0.01934713,-2.074077


In [47]:
# Protein-Level Analysis

# Get total number of peptide rows that uniquely match to a single protein.
# A ';' in 'Proteins' separates multiple protein matches. It is matched as a literal
# character (regex=False), and rows with a missing 'Proteins' value are treated as not
# unique (na=True), which is also what the previous `== False` comparison selected.
is_single_protein = ~peptide_variants_df['Proteins'].str.contains(';', regex=False, na=True)
unique_protein_peptide_rows = peptide_variants_df[is_single_protein].copy()
print(f"Number of peptide rows that uniquely match to a single protein: {len(unique_protein_peptide_rows)}")

# Get the unique proteins from that dataframe
unique_proteins = unique_protein_peptide_rows['Proteins'].unique()
print(f"Number of unique proteins: {len(unique_proteins)}")
print(f"Unique proteins: {unique_proteins}")

unique_protein_peptide_rows.head()

Number of peptide rows that uniquely match to a single protein: 645
Number of unique proteins: 227
Unique proteins: ['TRYP_PIG' 'sp|P41240|CSK_HUMAN' 'sp|P14174|MIF_HUMAN'
 'sp|P19338|NUCL_HUMAN' 'sp|Q8TD19|NEK9_HUMAN' 'sp|Q00535|CDK5_HUMAN'
 'sp|P21796|VDAC1_HUMAN' 'sp|Q9H773|DCTP1_HUMAN' 'sp|P10809|CH60_HUMAN'
 'sp|P06576|ATPB_HUMAN' 'sp|Q9Y478|AAKB1_HUMAN' 'sp|P28482|MK01_HUMAN'
 'sp|Q16832|DDR2_HUMAN' 'sp|P38646|GRP75_HUMAN' 'sp|P11021|BIP_HUMAN'
 'sp|P04264|K2C1_HUMAN' 'sp|O96013|PAK4_HUMAN' 'sp|P06241|FYN_HUMAN'
 'sp|P41743|KPCI_HUMAN' 'sp|O43353|RIPK2_HUMAN' 'sp|P13639|EF2_HUMAN'
 'sp|Q9Y4K4|M4K5_HUMAN' 'sp|Q9H479|FN3K_HUMAN' 'sp|P17252|KPCA_HUMAN'
 'sp|P08238|HS90B_HUMAN' 'sp|P61981|1433G_HUMAN' 'sp|O14965|AURKA_HUMAN'
 'sp|P68371|TBB4B_HUMAN' 'sp|P62269|RS18_HUMAN' 'sp|Q9Y2U5|M3K2_HUMAN'
 'sp|P62857|RS28_HUMAN' 'sp|P05387|RLA2_HUMAN' 'sp|P62906|RL10A_HUMAN'
 'sp|Q16539-2|MK14_HUMAN' 'sp|P30050|RL12_HUMAN' 'sp|Q01650|LAT1_HUMAN'
 'sp|P04843|RPN1_HUMAN' 'sp|Q9Y5S2|MRCKB_HUMAN' '

Unnamed: 0,rowid,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,...,baricitib_30000nm,baricitib_3000nm,baricitib_300nm,baricitib_30nm,baricitib_3nm,baricitib_dmso,baricitib_pdpd,Control_Mean_Abundance,Control_Median_Abundance,Control_StdDev_Abundance
4,15,15,.IITHPNFNGNTLDNDIMLIK.,37659,.IITHPNFNGNTLDNDIMLIK.,4830,20735,81,TRYP_PIG,2283.2,...,,,,,,,,131311900.0,52608000.0,198586500.0
15,44,44,.VATVSLPR.,93900,.VATVSLPR.,3321,4190,31,TRYP_PIG,842.5,...,72081000.0,74186000.0,80264000.0,109580000.0,102580000.0,108150000.0,,198953600.0,127130000.0,169089500.0
28,70,70,.HSNLVQLLGVIVEEK.,34569,.HSNLVQLLGVIVEEK.,2868,3656,28,sp|P41240|CSK_HUMAN,1677.9,...,17026000.0,16907000.0,17734000.0,37630000.0,27651000.0,23940000.0,13555000.0,79080850.0,24598000.0,168503400.0
48,116,116,.PMFIVNTNVPR.,69186,.PMFIVNTNVPR.,2361,10053,12,sp|P14174|MIF_HUMAN,1287.7,...,,,,,,,,49907860.0,29243500.0,50232160.0
49,117,117,.KFGYVDFESAEDLEK.,43172,.KFGYVDFESAEDLEK.,2353,2644,10,sp|P19338|NUCL_HUMAN,1776.8,...,2148800.0,,1679900.0,6958900.0,5593000.0,4346100.0,2534600.0,7465717.0,4322900.0,12003810.0


In [48]:
# Get ratios, log ratios, z-scores, and p-values for unique protein rows

# Build the single-protein mask once and reuse it for all four tables. The ratio,
# log-ratio, z-score and p-value frames are copies derived from peptide_variants_df
# and share its index, so one mask selects the same rows in each.
# ';' is matched literally; rows with a missing 'Proteins' value are excluded.
single_protein_mask = ~peptide_variants_df['Proteins'].str.contains(';', regex=False, na=True)

unique_protein_peptide_ratios = peptide_variants_ratios[single_protein_mask].copy()
unique_protein_peptide_log_ratios = peptide_variants_log_ratios[single_protein_mask].copy()
unique_protein_peptide_z_scores = peptide_variants_z_scores[single_protein_mask].copy()
unique_protein_peptide_p_values = peptide_variants_p_values[single_protein_mask].copy()

unique_protein_peptide_ratios[['Variant', 'Control_Median_Abundance'] + filtered_drug_columns[:9]].head()

Unnamed: 0,Variant,Control_Median_Abundance,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
4,.IITHPNFNGNTLDNDIMLIK.,52608000.0,2.561968,5.720233,3.175373,2.047787,2.963618,4.169898,1.528152,2.74654,2.484793
15,.VATVSLPR.,127130000.0,,,,,,0.011723,,,
28,.HSNLVQLLGVIVEEK.,24598000.0,0.493373,1.413733,0.717741,0.415522,0.387279,1.698512,3.347671,1.048093,1.139524
48,.PMFIVNTNVPR.,29243500.0,1.389437,2.509891,1.572691,0.931865,2.297776,2.783354,1.127396,1.661532,1.834972
49,.KFGYVDFESAEDLEK.,4322900.0,0.738925,0.730736,0.584908,,0.861297,,,0.763561,


In [49]:
# Preview: log ratios of the single-protein peptide rows
unique_protein_peptide_log_ratios.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head(5)

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.940776,1.744009,1.155425,0.71676,1.086411,1.427892,0.424059,1.010342,0.910189
15,.VATVSLPR.,-0.000394,0.936525,,,,,,-4.446166,,,
28,.HSNLVQLLGVIVEEK.,-0.000358,1.347874,-0.706489,0.346234,-0.331646,-0.878221,-0.948609,0.529753,1.208265,0.046973,0.13061
48,.PMFIVNTNVPR.,-4.3e-05,1.052155,0.328899,0.920239,0.452788,-0.070567,0.831942,1.023657,0.11991,0.50774,0.607029
49,.KFGYVDFESAEDLEK.,0.0,0.95148,-0.302559,-0.313703,-0.5363,,-0.149316,,,-0.269762,


In [50]:
# Preview: z-scores of the single-protein peptide rows
unique_protein_peptide_z_scores.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head(5)

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.560243,1.038554,0.688063,0.426846,0.646966,0.850312,0.252548,0.601669,0.54203
15,.VATVSLPR.,-0.000394,0.936525,,,,,,-4.747093,,,
28,.HSNLVQLLGVIVEEK.,-0.000358,1.347874,-0.523885,0.257139,-0.245786,-0.651294,-0.703516,0.393294,0.896688,0.035115,0.097166
48,.PMFIVNTNVPR.,-4.3e-05,1.052155,0.312636,0.874665,0.430385,-0.067028,0.790744,0.972956,0.114007,0.482612,0.57698
49,.KFGYVDFESAEDLEK.,0.0,0.95148,-0.317987,-0.3297,-0.563648,,-0.15693,,,-0.283518,


In [51]:
# Preview: two-tailed p-values of the single-protein peptide rows
unique_protein_peptide_p_values.loc[
    :, ['Variant', 'Control_Log_Ratio_Median', 'Control_Log_Ratio_StdDev'] + filtered_drug_columns[:9]
].head(5)

Unnamed: 0,Variant,Control_Log_Ratio_Median,Control_Log_Ratio_StdDev,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,aew541_dmso
4,.IITHPNFNGNTLDNDIMLIK.,-4.8e-05,1.679313,0.575314,0.299012,0.491413,0.669491,0.517654,0.395152,0.800618,0.547395,0.587798
15,.VATVSLPR.,-0.000394,0.936525,,,,,,2e-06,,,
28,.HSNLVQLLGVIVEEK.,-0.000358,1.347874,0.600359,0.797071,0.805848,0.514857,0.481734,0.694103,0.369885,0.971988,0.922594
48,.PMFIVNTNVPR.,-4.3e-05,1.052155,0.754557,0.381756,0.666916,0.946559,0.429094,0.330575,0.909232,0.629371,0.563953
49,.KFGYVDFESAEDLEK.,0.0,0.95148,0.750495,0.741627,0.572993,,0.8753,,,0.77678,


In [None]:
# Make a dictionary of unique protein peptide statistics and their corresponding perturbations.
#
# For every entry of `unique_proteins` the dictionary holds:
#   * the per-peptide slices of the abundance, ratio, log-ratio, z-score and p-value tables,
#   * the number of peptide variants and their names,
#   * per-column medians / standard deviations across the protein's peptides,
#   * Mann-Whitney U (Wilcoxon rank-sum) statistics comparing each treatment column's
#     z-scores against the matching '<drug>_dmso' column, with p-values and the
#     rank-biserial correlation as effect size.
from scipy.stats import mannwhitneyu
import warnings

# Minimum number of non-NaN z-scores required in BOTH samples before an effect
# size is reported (the U statistic and p-value are stored regardless).
MIN_VARIANTS_FOR_EFFECT_SIZE = 4

# Per-peptide tables that get sliced by protein, keyed by the name each slice is stored under.
unique_protein_tables = {
    'Variant Abundances': unique_protein_peptide_rows,
    'Ratios': unique_protein_peptide_ratios,
    'Log Ratios': unique_protein_peptide_log_ratios,
    'Z-Scores': unique_protein_peptide_z_scores,
    'P-Values': unique_protein_peptide_p_values,
}

# Pair every treatment column with its '<drug>_dmso' column once, instead of
# re-deriving the name for every protein. The DMSO columns themselves are skipped.
unique_treatment_control_pairs = [
    (col, col.split('_')[0] + '_dmso')
    for col in filtered_drug_columns
    if col != col.split('_')[0] + '_dmso'
]

unique_protein_perturbations = {}

# Silence warnings only while this cell computes; a bare
# warnings.filterwarnings('ignore') would hide every warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for protein in unique_proteins:
        # Independent copies of each table restricted to this protein's peptides.
        protein_stats = {
            name: table[table['Proteins'] == protein].copy()
            for name, table in unique_protein_tables.items()
        }
        unique_protein_perturbations[protein] = protein_stats

        protein_stats['Num Variants'] = len(protein_stats['Variant Abundances'])
        protein_stats['Variants List'] = protein_stats['Variant Abundances']['Variant'].tolist()

        # Column-wise summaries over the protein's peptides (NaNs ignored).
        drug_ratios = protein_stats['Ratios'][filtered_drug_columns]
        drug_log_ratios = protein_stats['Log Ratios'][filtered_drug_columns]
        drug_z_scores = protein_stats['Z-Scores'][filtered_drug_columns]
        protein_stats['Median Peptide Ratios'] = drug_ratios.median(axis=0, skipna=True)
        protein_stats['Median Peptide Log Ratios'] = drug_log_ratios.median(axis=0, skipna=True)
        protein_stats['Median Peptide Z-Scores'] = drug_z_scores.median(axis=0, skipna=True)
        protein_stats['Standard Deviation of Peptide Log Ratios'] = drug_log_ratios.std(axis=0, skipna=True)
        protein_stats['Standard Deviation of Peptide Z-Scores'] = drug_z_scores.std(axis=0, skipna=True)

        # Get Wilcoxon Rank-Sum U Statistics, Effect Sizes, and P-Values for each drug treatment
        u_statistics = protein_stats['Wilcoxon U Statistics'] = {}
        wilcoxon_p_values = protein_stats['Wilcoxon P-Values'] = {}
        effect_sizes = protein_stats['Wilcoxon Effect Size (Rank-Biserial Correlation)'] = {}

        protein_z_scores = protein_stats['Z-Scores']
        for col, dmso_col in unique_treatment_control_pairs:
            x = protein_z_scores[col]
            y = protein_z_scores[dmso_col]
            n1 = x.notna().sum()  # non-NaN observations in the treatment sample
            n2 = y.notna().sum()  # non-NaN observations in the DMSO sample
            u_stat, p_value = mannwhitneyu(x, y, nan_policy='omit', alternative='two-sided')

            u_is_valid = u_stat is not None and not np.isnan(u_stat)
            enough_data = n1 >= MIN_VARIANTS_FOR_EFFECT_SIZE and n2 >= MIN_VARIANTS_FOR_EFFECT_SIZE

            # Rank-biserial correlation: r = 2U / (n1 * n2) - 1, with U the statistic of x.
            r = ((2 * u_stat) / (n1 * n2)) - 1 if (enough_data and u_is_valid) else None

            # Missing / undefined results are stored as None rather than NaN.
            u_statistics[col] = float(u_stat) if u_is_valid else None
            wilcoxon_p_values[col] = float(p_value) if p_value is not None and not np.isnan(p_value) else None
            effect_sizes[col] = float(r) if r is not None and not np.isnan(r) else None

print(unique_protein_perturbations['sp|P41240|CSK_HUMAN']['Wilcoxon U Statistics'])
print(unique_protein_perturbations['sp|P41240|CSK_HUMAN']['Wilcoxon P-Values'])
print(unique_protein_perturbations['sp|P41240|CSK_HUMAN']['Wilcoxon Effect Size (Rank-Biserial Correlation)'])


{'aew541_1000nm': 38.0, 'aew541_100nm': 97.0, 'aew541_10nm': 56.0, 'aew541_30000nm': 28.0, 'aew541_3000nm': 31.0, 'aew541_300nm': 74.0, 'aew541_30nm': 44.0, 'aew541_3nm': 91.0, 'amg208_1000nm': 26.0, 'amg208_100nm': 28.0, 'amg208_10nm': 34.0, 'amg208_30000nm': 35.0, 'amg208_3000nm': 20.0, 'amg208_300nm': 27.0, 'amg208_30nm': 23.0, 'amg208_3nm': 67.0, 'amg900_1000nm': 0.0, 'amg900_100nm': 35.0, 'amg900_10nm': 33.0, 'amg900_30000nm': 47.0, 'amg900_3000nm': 41.0, 'amg900_300nm': 47.0, 'amg900_30nm': 35.0, 'amg900_3nm': 18.0, 'arry380_1000nm': 51.0, 'arry380_100nm': 52.0, 'arry380_10nm': 50.0, 'arry380_30000nm': 129.0, 'arry380_3000nm': 31.0, 'arry380_300nm': 25.0, 'arry380_30nm': 137.0, 'arry380_3nm': 136.0, 'asp3026_1000nm': 17.0, 'asp3026_100nm': 60.0, 'asp3026_10nm': 59.0, 'asp3026_30000nm': 27.0, 'asp3026_3000nm': 64.0, 'asp3026_300nm': 55.0, 'asp3026_30nm': 85.0, 'asp3026_3nm': 78.0, 'at13148_1000nm': 36.0, 'at13148_100nm': 30.0, 'at13148_10nm': 40.0, 'at13148_30000nm': 40.0, 'at1314

In [53]:
# Collect the rank-biserial effect sizes into one table:
# one row per unique protein, one column per treatment column.
unique_protein_effect_size = pd.DataFrame.from_dict(
    {
        protein_id: unique_protein_perturbations[protein_id]['Wilcoxon Effect Size (Rank-Biserial Correlation)']
        for protein_id in unique_proteins
    },
    orient='index',
).rename_axis('Protein')
unique_protein_effect_size.head()

Unnamed: 0_level_0,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,amg208_1000nm,amg208_100nm,...,barasertibhqpa_30nm,barasertibhqpa_3nm,baricitib_1000nm,baricitib_100nm,baricitib_10nm,baricitib_30000nm,baricitib_3000nm,baricitib_300nm,baricitib_30nm,baricitib_3nm
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TRYP_PIG,1.0,1.0,1.0,-1.0,1.0,0.0,-1.0,1.0,-1.0,-1.0,...,-0.5,0.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
sp|P41240|CSK_HUMAN,-0.512821,0.077778,-0.333333,-0.666667,-0.602564,-0.051282,-0.388889,0.083333,-0.566667,-0.533333,...,-0.209524,0.161905,-1.0,0.5,0.36,-0.76,-0.84,-0.52,0.12,0.04
sp|P14174|MIF_HUMAN,-0.5,0.5,-0.5,-0.5,0.5,0.5,-0.5,-0.5,-1.0,-1.0,...,0.0,-0.5,-1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0
sp|P19338|NUCL_HUMAN,0.333333,0.333333,0.166667,-1.0,-0.066667,1.0,-0.666667,0.466667,-0.533333,-0.533333,...,-0.6,-0.25,-1.0,0.611111,0.5,-0.722222,-1.0,-1.0,0.133333,-0.133333
sp|Q8TD19|NEK9_HUMAN,-0.578231,0.047619,-0.405896,-0.659864,-0.641723,-0.021645,-0.655329,-0.080952,-0.321053,-0.352381,...,-0.225108,-0.076605,-0.960938,0.601562,0.691667,-1.0,-0.973214,-0.328125,0.225,0.141667


In [None]:
# Write the unique-protein effect sizes to disk as CSV
unique_protein_effect_size.to_csv(path_or_buf='unique_protein_effect_size_threshold_4.csv')

In [55]:
# Repeat the analysis above, grouping peptides by 'Top canonical protein'.
# Work on independent copies of every per-peptide table.
(
    canonical_protein_peptide_rows,
    canonical_protein_peptide_ratios,
    canonical_protein_peptide_log_ratios,
    canonical_protein_peptide_z_scores,
    canonical_protein_peptide_p_values,
) = (
    table.copy()
    for table in (
        peptide_variants_df,
        peptide_variants_ratios,
        peptide_variants_log_ratios,
        peptide_variants_z_scores,
        peptide_variants_p_values,
    )
)

# Distinct top canonical proteins present in the filtered data
canonical_proteins = peptide_variants_df['Top canonical protein'].unique()
print(f"Number of top canonical proteins: {len(canonical_proteins)}")
print(f"Top canonical proteins: {canonical_proteins[0]}")

Number of top canonical proteins: 828
Top canonical proteins: sp|P06239|LCK_HUMAN


In [None]:
# Make a dictionary of top canonical protein peptide statistics and their corresponding perturbations.
#
# For every entry of `canonical_proteins` the dictionary holds:
#   * the per-peptide slices of the abundance, ratio, log-ratio, z-score and p-value tables,
#   * the number of peptide variants and their names,
#   * per-column medians / standard deviations across the protein's peptides,
#   * Mann-Whitney U (Wilcoxon rank-sum) statistics comparing each treatment column's
#     z-scores against the matching '<drug>_dmso' column, with p-values and the
#     rank-biserial correlation as effect size.
# Relies on `warnings` and `mannwhitneyu` having been imported by an earlier cell.

# Minimum number of non-NaN z-scores required in BOTH samples before an effect
# size is reported (the U statistic and p-value are stored regardless).
MIN_VARIANTS_FOR_EFFECT_SIZE = 4

# Per-peptide tables that get sliced by protein, keyed by the name each slice is stored under.
canonical_protein_tables = {
    'Variant Abundances': canonical_protein_peptide_rows,
    'Ratios': canonical_protein_peptide_ratios,
    'Log Ratios': canonical_protein_peptide_log_ratios,
    'Z-Scores': canonical_protein_peptide_z_scores,
    'P-Values': canonical_protein_peptide_p_values,
}

# Pair every treatment column with its '<drug>_dmso' column once, instead of
# re-deriving the name for every protein. The DMSO columns themselves are skipped.
canonical_treatment_control_pairs = [
    (col, col.split('_')[0] + '_dmso')
    for col in filtered_drug_columns
    if col != col.split('_')[0] + '_dmso'
]

canonical_protein_perturbations = {}

# Silence warnings only while this cell computes; a bare
# warnings.filterwarnings('ignore') would hide every warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for protein in canonical_proteins:
        # Independent copies of each table restricted to this protein's peptides.
        protein_stats = {
            name: table[table['Top canonical protein'] == protein].copy()
            for name, table in canonical_protein_tables.items()
        }
        canonical_protein_perturbations[protein] = protein_stats

        protein_stats['Num Variants'] = len(protein_stats['Variant Abundances'])
        protein_stats['Variants List'] = protein_stats['Variant Abundances']['Variant'].tolist()

        # Column-wise summaries over the protein's peptides (NaNs ignored).
        drug_ratios = protein_stats['Ratios'][filtered_drug_columns]
        drug_log_ratios = protein_stats['Log Ratios'][filtered_drug_columns]
        drug_z_scores = protein_stats['Z-Scores'][filtered_drug_columns]
        protein_stats['Median Peptide Ratios'] = drug_ratios.median(axis=0, skipna=True)
        protein_stats['Median Peptide Log Ratios'] = drug_log_ratios.median(axis=0, skipna=True)
        protein_stats['Median Peptide Z-Scores'] = drug_z_scores.median(axis=0, skipna=True)
        protein_stats['Standard Deviation of Peptide Log Ratios'] = drug_log_ratios.std(axis=0, skipna=True)
        protein_stats['Standard Deviation of Peptide Z-Scores'] = drug_z_scores.std(axis=0, skipna=True)

        # Get Wilcoxon Rank-Sum U Statistics, Effect Sizes, and P-Values for each drug treatment
        u_statistics = protein_stats['Wilcoxon U Statistics'] = {}
        wilcoxon_p_values = protein_stats['Wilcoxon P-Values'] = {}
        effect_sizes = protein_stats['Wilcoxon Effect Size (Rank-Biserial Correlation)'] = {}

        protein_z_scores = protein_stats['Z-Scores']
        for col, dmso_col in canonical_treatment_control_pairs:
            x = protein_z_scores[col]
            y = protein_z_scores[dmso_col]
            n1 = x.notna().sum()  # non-NaN observations in the treatment sample
            n2 = y.notna().sum()  # non-NaN observations in the DMSO sample
            u_stat, p_value = mannwhitneyu(x, y, nan_policy='omit', alternative='two-sided')

            u_is_valid = u_stat is not None and not np.isnan(u_stat)
            enough_data = n1 >= MIN_VARIANTS_FOR_EFFECT_SIZE and n2 >= MIN_VARIANTS_FOR_EFFECT_SIZE

            # Rank-biserial correlation: r = 2U / (n1 * n2) - 1, with U the statistic of x.
            r = ((2 * u_stat) / (n1 * n2)) - 1 if (enough_data and u_is_valid) else None

            # Missing / undefined results are stored as None rather than NaN.
            u_statistics[col] = float(u_stat) if u_is_valid else None
            wilcoxon_p_values[col] = float(p_value) if p_value is not None and not np.isnan(p_value) else None
            effect_sizes[col] = float(r) if r is not None and not np.isnan(r) else None

print(canonical_protein_perturbations['sp|P06239|LCK_HUMAN']['Wilcoxon U Statistics'])
print(canonical_protein_perturbations['sp|P06239|LCK_HUMAN']['Wilcoxon P-Values'])
print(canonical_protein_perturbations['sp|P06239|LCK_HUMAN']['Wilcoxon Effect Size (Rank-Biserial Correlation)'])


Number of variants for sp|P06239|LCK_HUMAN: 17
Number of variants for sp|P62979|RS27A_HUMAN: 9
Number of variants for sp|P35232|PHB_HUMAN: 16
Number of variants for sp|P07437|TBB5_HUMAN: 23
Number of variants for TRYP_PIG: 2
Number of variants for sp|Q13555|KCC2G_HUMAN: 14
Number of variants for sp|P28482|MK01_HUMAN: 24
Number of variants for sp|O14976|GAK_HUMAN: 29
Number of variants for sp|Q13557|KCC2D_HUMAN: 26
Number of variants for sp|O00764|PDXK_HUMAN: 6
Number of variants for sp|P19338|NUCL_HUMAN: 22
Number of variants for sp|Q00535|CDK5_HUMAN: 20
Number of variants for sp|P41240|CSK_HUMAN: 28
Number of variants for sp|P68104|EF1A1_HUMAN: 9
Number of variants for sp|Q99623|PHB2_HUMAN: 16
Number of variants for sp|Q9UHD2|TBK1_HUMAN: 26
Number of variants for sp|Q13131|AAPK1_HUMAN: 20
Number of variants for sp|P43405|KSYK_HUMAN: 15
Number of variants for sp|P07948|LYN_HUMAN: 24
Number of variants for sp|P49841|GSK3B_HUMAN: 16
Number of variants for sp|P50613|CDK7_HUMAN: 13
Number 

In [None]:
# Collect the rank-biserial effect sizes into one table:
# one row per top canonical protein, one column per treatment column.
canonical_protein_effect_size = pd.DataFrame.from_dict(
    {
        protein_id: canonical_protein_perturbations[protein_id]['Wilcoxon Effect Size (Rank-Biserial Correlation)']
        for protein_id in canonical_proteins
    },
    orient='index',
).rename_axis('Protein')
canonical_protein_effect_size.head()


Unnamed: 0_level_0,aew541_1000nm,aew541_100nm,aew541_10nm,aew541_30000nm,aew541_3000nm,aew541_300nm,aew541_30nm,aew541_3nm,amg208_1000nm,amg208_100nm,...,barasertibhqpa_30nm,barasertibhqpa_3nm,baricitib_1000nm,baricitib_100nm,baricitib_10nm,baricitib_30000nm,baricitib_3000nm,baricitib_300nm,baricitib_30nm,baricitib_3nm
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sp|P06239|LCK_HUMAN,-0.662222,0.141667,-0.422222,-0.87619,-0.84,-0.084444,-0.697778,0.12,-0.704082,-0.582418,...,-0.283333,0.066667,-0.504132,0.353535,0.472727,-0.454545,-0.468531,-0.409091,0.123967,0.07438
sp|P62979|RS27A_HUMAN,-0.285714,0.5,0.061224,-0.571429,-0.285714,0.035714,-0.52381,0.071429,-0.47619,-0.380952,...,-0.432099,-0.388889,-0.916667,0.5,1.0,-0.666667,-0.666667,-0.5,0.0,-0.333333
sp|P35232|PHB_HUMAN,-0.296703,0.266272,0.043956,-0.472527,-0.417582,0.164835,-0.401709,0.269231,-0.318681,-0.450549,...,-0.191667,-0.078125,-0.233333,0.322222,0.487179,-0.148718,-0.166667,-0.138462,0.266667,-0.2
sp|P07437|TBB5_HUMAN,-0.291866,0.354978,-0.116883,-0.406699,-0.4329,0.2,-0.340909,0.116883,-0.368421,-0.245,...,-0.410431,-0.124717,-0.974026,0.045455,0.218182,-0.818182,-0.854545,-0.545455,-0.036364,-0.190083
TRYP_PIG,1.0,1.0,1.0,-1.0,1.0,0.0,-1.0,1.0,-1.0,-1.0,...,-0.5,0.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0


In [None]:
# Write the canonical-protein effect sizes to disk as CSV
canonical_protein_effect_size.to_csv(path_or_buf='canonical_protein_effect_size_threshold_4.csv')