# Proteome evaluation and data visualization

This code guides you through the evaluation of DDA data aquired as excel sheet from Scaffold. The excel sheet should contain total spectral counts. 

### Import Libraries

In [21]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
import re 
import plotly.graph_objects as go
import math
import dash_bio

### File Preparation
The files downloaded from scaffold are '.xls' files and contain a header and two lines at the end of the file which need to be removed prior to uploading the data. After removing the header, save the files as .txt file and upload them to your desired folder. The file should have following format: 

In [22]:
TEST=pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Proteome_TSC_240216_Fadime_rpom_ProMM.txt', delimiter='\t')
#print(TEST)

### Load Data
Next we can upload the .txt files as pandas dataframe and print to make sure they are there. 

In [23]:
# DSS3
DSS3_ProMM = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Annotated_Proteome_TSC_240216_Fadime_rpom_ProMM.txt', delimiter='\t')
#print(DSS_3_ProMM)
DSS3_ACD = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Annotated_240321_Rpom_ACD.txt', delimiter='\t')
#print(DSS_3_ACD)
DSS3stat_ACD = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Annotated_20240514_DSS3_ACD_stationary.txt', delimiter='\t')
#print(DSS3stat_ACD)
DSS3stat_ProMM = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Annotated_240613_DSS3_ProMM_stationary.txt', delimiter='\t')
#print(DSS3stat_ProMM)
DSS3_Secretome = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/DSS-3/Annotated_240618_Secretome_DSS3.txt', delimiter='\t')
#print(DSS3_Secretome)

# MIT1002
MIT1002_ACD = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/MIT1002/Proteome_TSC_240201_Fadime_MIT1002.txt', delimiter='\t')
#print(MIT1002_ACD)
MIT1002_ProMM = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/MIT1002/240430_Fadime_MIT1002_ProMM_TSC.txt', delimiter='\t')
#print(MIT1002_ProMM)
MIT1002stat_ACD = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/MIT1002/20240514_MIT1002_ACD_stationary.txt', delimiter='\t')
#print(MIT1002stat_ACD)
MIT1002stat_ProMM = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/MIT1002/240522_Fadime_MIT1002_ProMM_stationary.txt', delimiter='\t')
#print(MIT1002stat_ProMM)
MIT1002_Secretome = pd.read_csv('/vortexfs1/home/fadime.stemmer/0124_rpom_proteolysis/data/proteomes/MIT1002/240617_Secretome_MIT1002.txt', delimiter='\t')
#print(MIT1002_Secretome)

## Statistics
### Average Triplicates
Next we can average the triplicate analyses and include statistics in our plots. 

In [24]:
# Selecting the relevant columns 
DSS3_ProMM123 = DSS3_ProMM[['240216_Fadime_astral_1ug_Rpom_1', '240216_Fadime_astral_1ug_Rpom_2', '240216_Fadime_astral_1ug_Rpom_3']] 
DSS3_ACD123 = DSS3_ACD[['240312_2ug_Fadime_1', '240312_2ug_Fadime_2', '240312_2ug_Fadime_3']] 
DSS3stat_ACD123 = DSS3stat_ACD[['20240430_RpomDSS3_ACD_stationary1', '20240430_RpomDSS3_ACD_stationary2', '20240430_RpomDSS3_ACD_stationary3']] 
DSS3stat_ProMM123 = DSS3stat_ProMM[['240514_Fadime_Rpom_DSS3_1', '240514_Fadime_Rpom_DSS3_2', '240514_Fadime_Rpom_DSS3_3']] 
DSS3_sec_ACD123 = DSS3_Secretome[['240613_Fadime_astral_1ug_DSS3_ACD_Sec_1', '240613_Fadime_astral_1ug_DSS3_ACD_Sec_2', '240613_Fadime_astral_1ug_DSS3_ACD_Sec_3']] 
DSS3_sec_ProMM123 = DSS3_Secretome[['240613_Fadime_astral_1ug_DSS3_ProMM_Sec_1', '240613_Fadime_astral_1ug_DSS3_ProMM_Sec_2', '240613_Fadime_astral_1ug_DSS3_ProMM_Sec_3']] 

MIT1002_ACD123 = MIT1002_ACD[['240201_Fadime_1b', '240201_Fadime_2b', '240201_Fadime_3b']] 
MIT1002_ProMM123 = MIT1002_ProMM[['240424_Fadime_astral_1ug_Amac_1', '240424_Fadime_astral_1ug_Amac_2', '240424_Fadime_astral_1ug_Amac_3']] 
MIT1002stat_ACD123 = MIT1002stat_ACD[['20240430_AmacMIT1002_ACD_stationary1', '20240430_AmacMIT1002_ACD_stationary2', '20240430_AmacMIT1002_ACD_stationary3']] 
MIT1002stat_ProMM123 = MIT1002stat_ProMM[['240514_Fadime_Alteromonas_MIT1002_1', '240514_Fadime_Alteromonas_MIT1002_2', '240514_Fadime_Alteromonas_MIT1002_3']] 
MIT1002_sec_ACD123 = MIT1002_Secretome[['240613_Fadime_astral_1ug_MIT1002_ACD_Sec_1', '240613_Fadime_astral_1ug_MIT1002_ACD_Sec_2', '240613_Fadime_astral_1ug_MIT1002_ACD_Sec_3']] 
MIT1002_sec_ProMM123 = MIT1002_Secretome[['240613_Fadime_astral_1ug_MIT1002_ProMM_Sec_1', '240613_Fadime_astral_1ug_MIT1002_ProMM_Sec_2', '240613_Fadime_astral_1ug_MIT1002_ProMM_Sec_3']] 

In [25]:
# Computing the row-wise average 
def statistics(df1, df2):
    row_wise_average = df1.mean(axis=1)
    standard_deviation = df1.std(axis=1)
    df2['Row_Average']= row_wise_average
    df2['STD'] = standard_deviation
    
def statistics_Secretome(df1, df2, df3):
    row_wise_average_A = df1.mean(axis=1)
    standard_deviation_A = df1.std(axis=1)
    row_wise_average_P = df2.mean(axis=1)
    standard_deviation_P = df2.std(axis=1)
    df3['Row_Average_A']= row_wise_average_A
    df3['STD_A'] = standard_deviation_A
    df3['Row_Average_P']= row_wise_average_P
    df3['STD_P'] = standard_deviation_P

In [26]:
statistics(DSS3_ProMM123, DSS3_ProMM)
statistics(DSS3_ACD123, DSS3_ACD)
statistics(DSS3stat_ProMM123, DSS3stat_ProMM)
statistics(DSS3stat_ACD123, DSS3stat_ACD)
statistics_Secretome(DSS3_sec_ACD123, DSS3_sec_ProMM123, DSS3_Secretome)

statistics(MIT1002_ProMM123, MIT1002_ProMM)
statistics(MIT1002_ACD123, MIT1002_ACD)
statistics(MIT1002stat_ProMM123, MIT1002stat_ProMM)
statistics(MIT1002stat_ACD123, MIT1002stat_ACD)
statistics_Secretome(MIT1002_sec_ACD123, MIT1002_sec_ProMM123, MIT1002_Secretome)

#print(MIT1002_Secretome)

### Match Annotations of different files

Each of the experiments will have different proteins showing up at different numbers. In order to being able to compare treatments from different samples, we need to match the protein annotations, therefore the length of the arrays and fill in 0 for when the protein appears in one treatment but not the other. 


    Sorts the given DataFrame by 'Accession Number' and retains only specific columns.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame to be sorted and filtered.
    
    Returns:
    pd.DataFrame: The sorted and filtered DataFrame.


In [27]:
# Rename Identified Proteins column for MIT1002
MIT1002_ACD.rename(columns={'Identified Proteins (1608)': 'Annotation'}, inplace=True)
MIT1002stat_ACD.rename(columns={'Identified Proteins (2559)': 'Annotation'}, inplace=True)
MIT1002_ProMM.rename(columns={'Identified Proteins (2893)': 'Annotation'}, inplace=True)
MIT1002stat_ProMM.rename(columns={'Identified Proteins (2592)': 'Annotation'}, inplace=True)
MIT1002_Secretome.rename(columns={'Identified Proteins (1005)':'Annotation'}, inplace=True)
#print(MIT1002_Secretome)

In [28]:
def sort_by_accession_no(df):
    # Sort the DataFrame by 'Accession Number'
    sorted_df = df.sort_values(by='Accession Number')
    
    # Filter the DataFrame to keep only the required columns
    filtered_df = sorted_df[['Accession Number', 'Annotation', 'Row_Average', 'STD']]
    
    return filtered_df

def sort_by_accession_no_sec(df):
    # Sort the DataFrame by 'Accession Number'
    sorted_df = df.sort_values(by='Accession Number')
    
    # Filter the DataFrame to keep only the required columns
    filtered_df = sorted_df[['Accession Number', 'Annotation', 'Row_Average_A', 'Row_Average_P', 'STD_A', 'STD_P']]
    
    return filtered_df

In [29]:
# Sort and select DSS3
Sorted_DSS3_ACD = sort_by_accession_no(DSS3_ACD)
Sorted_DSS3_ProMM = sort_by_accession_no(DSS3_ProMM)
Sorted_DSS3_ACD_stat = sort_by_accession_no(DSS3stat_ACD)
Sorted_DSS3_ProMM_stat = sort_by_accession_no(DSS3stat_ProMM)
Sorted_DSS3_Secretome = sort_by_accession_no_sec(DSS3_Secretome)

# Sort and select MIT1002
Sorted_MIT1002_ACD = sort_by_accession_no(MIT1002_ACD)
Sorted_MIT1002_ProMM = sort_by_accession_no(MIT1002_ProMM)
Sorted_MIT1002_ACD_stat = sort_by_accession_no(MIT1002stat_ACD)
Sorted_MIT1002_ProMM_stat = sort_by_accession_no(MIT1002stat_ProMM)
Sorted_MIT1002_Secretome = sort_by_accession_no_sec(MIT1002_Secretome)

print(Sorted_DSS3_ACD)
print(Sorted_DSS3_Secretome)

     Accession Number                                         Annotation  \
599           SPO0001               glucose inhibited division protein A   
2044          SPO0002               glucose-inhibited division protein B   
1441          SPO0003               chromosome partitioning protein ParA   
814           SPO0004               chromosome partitioning protein parB   
1887          SPO0006  oxygen-independent coproporphyrinogen III oxid...   
...               ...                                                ...   
1355         SPOA0444                                            Unknown   
2738         SPOA0445                                            Unknown   
1872         SPOA0449                                            Unknown   
2786         SPOA0451                                            Unknown   
1686         SPOA0452                                            Unknown   

      Row_Average       STD  
599     56.666667  6.506407  
2044     8.666667  2.081666

Now that we sorted the values, we can write a function that allows us to combine the dataframes of the individual treatments into one, sorted by the annotations. 

In [30]:
def merge_and_format(df1, df2, suf_1='', suf_2=''):
    merged_df = pd.merge(df1, df2, on="Accession Number", how="outer", suffixes=(suf_1, suf_2))
    fillna = merged_df.fillna(0, inplace=True)
    annotations_A = merged_df['Annotation{}'.format(suf_1)].replace(0, "Unknown")
    annotations_P = merged_df['Annotation{}'.format(suf_2)].replace(0, "Unknown")
    return merged_df

def format_sec(df):
    fillna = df.fillna(0, inplace=True)
    annotations = df['Annotation'].replace(0, "Unknown")
    return df

In [31]:
DSS3_MidLog = merge_and_format(Sorted_DSS3_ACD, Sorted_DSS3_ProMM, '_ACD', '_ProMM')
DSS3_Stat = merge_and_format(Sorted_DSS3_ACD_stat, Sorted_DSS3_ProMM_stat, '_ACD', '_ProMM')
DSS3_ACD_MLS = merge_and_format(Sorted_DSS3_ACD, Sorted_DSS3_ACD_stat, '_ML', '_Stat')
DSS3_ProMM_MLS = merge_and_format(Sorted_DSS3_ProMM, Sorted_DSS3_ProMM_stat, '_ML', '_Stat')
DSS3_Secretome = format_sec(Sorted_DSS3_Secretome)

MIT1002_MidLog = merge_and_format(Sorted_MIT1002_ACD, Sorted_MIT1002_ProMM, '_ACD', '_ProMM')
MIT1002_Stat = merge_and_format(Sorted_MIT1002_ACD_stat, Sorted_MIT1002_ProMM_stat, '_ACD', '_ProMM')
MIT1002_ACD_MLS = merge_and_format(Sorted_MIT1002_ACD, Sorted_MIT1002_ACD_stat, '_ML', '_Stat')
MIT1002_ProMM_MLS = merge_and_format(Sorted_MIT1002_ProMM, Sorted_MIT1002_ProMM_stat, '_ML', '_Stat')
MIT1002_Secretome = format_sec(Sorted_MIT1002_Secretome)

print(MIT1002_Secretome)

    Accession Number                                         Annotation  \
436       VTP50002.1  hypothetical protein MIT1002_00005 [Alteromona...   
803       VTP50004.1  glycyl-tRNA synthetase alpha chain [Alteromona...   
212       VTP50009.1            Cytochrome c553 [Alteromonas macleodii]   
813       VTP50016.1  methionyl-tRNA formyltransferase [Alteromonas ...   
555       VTP50017.1        peptide deformylase [Alteromonas macleodii]   
..               ...                                                ...   
587       VTP58108.1  hypothetical protein MIT1002_04201 [Alteromona...   
7         VTP58112.1  protein of unknown function (DUF1259) [Alterom...   
778       VTP58121.1  Signal transduction histidine kinase [Alteromo...   
976       VTP58127.1  Methyl-accepting chemotaxis protein (MCP) sign...   
896       VTP58133.1  hypothetical protein MIT1002_04226 [Alteromona...   

     Row_Average_A  Row_Average_P      STD_A      STD_P  
436       2.333333       7.333333   1.527

## Plot Treatments and Statistics
In order to simplify the data evaluation, we first define a function, which allows us to plot each treatment against each other and calculate statistics such as R2 values for each condition. 

In [32]:
def plot_and_regression(df1, df2, df3, df4, df5, title='', filename=''):
    x = df1.values
    y = df2.values
    z = df3.values
    yerr = df4.values
    xerr = df5.values
    
    # Perform linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    
    # Calculate predicted values
    predicted_values = slope * x + intercept
    
    # Calculate R squared
    r_squared = r_value ** 2

    #PLOT
    plt.figure(figsize=(8, 6))
    
    # Filter data points based on annotation
    protease_indices = df1.index[df3.str.contains('protease', case=False)]
    peptidase_indices = df1.index[df3.str.contains('peptidase', case=False)]
    metalloprotease_indices = df1.index[df3.str.contains('metalloprotease', case=False)]
    metallopeptidase_indices = df1.index[df3.str.contains('metallopeptidase', case=False)]
    non_protease_indices = df1.index[~df3.str.contains('protease', case=False)]
    
    # Plot non-protease data points in black
    plt.errorbar(x[non_protease_indices], y[non_protease_indices], yerr=yerr[non_protease_indices], xerr=xerr[non_protease_indices], fmt='o', label='Data (Non-Protease)', color='black')
    
    # Plot selected interesting protein functions in different colors
    plt.errorbar(x[protease_indices], y[protease_indices], yerr=yerr[protease_indices], xerr=xerr[protease_indices], fmt='o', label='Data (Protease)', color='blue')
    plt.errorbar(x[peptidase_indices], y[peptidase_indices], yerr=yerr[peptidase_indices], xerr=xerr[peptidase_indices], fmt='o', label='Data (Peptidase)', color='red')
    plt.errorbar(x[metallopeptidase_indices], y[metallopeptidase_indices], yerr=yerr[metallopeptidase_indices], xerr=xerr[metallopeptidase_indices], fmt='o', label='Data (Metallopeptidase)', color='pink')
    plt.errorbar(x[metalloprotease_indices], y[metalloprotease_indices], yerr=yerr[metalloprotease_indices], xerr=xerr[metalloprotease_indices], fmt='o', label='Data (Metalloprotease)', color='lightblue')

    # Generate plot/figure
    plt.plot(x, predicted_values, color='red', label='Linear Regression')
    plt.title(f'{title} (R² = {r_squared:.2f})')
    plt.xlabel(df1.name)
    plt.ylabel(df2.name)
    plt.legend()
    #Comment out if you dont want to download a file
    #plt.savefig(filename, format='pdf')
    plt.show()
    
    return slope, intercept, std_err, df1, df2, df3

### Extract confident proteins

In addition to plotting with error bars we need to figure out which of the proteins are significantly deviating from the linear regression line. Therefore we first need to identify the 5% confidence interval. Once that is done, all the datapoints outside of the confidence interval should be printed, including their annotation and their mean spectral count. 

In [33]:
def extract_outliers(slope, intercept, std_err, df1, df2, df3, filename, confidence_interval=0.95):
    x = df1.values
    y = df2.values
    z = df3.values
    
    # Calculate predicted values
    predicted_values = slope * x + intercept
    
    # Calculate residuals
    residuals = y - predicted_values
    
    # Calculate standard deviation of residuals
    residuals_std = np.std(residuals)
    
    # Calculate the confidence interval for the residuals
    conf_interval = residuals_std * stats.t.ppf((1 + confidence_interval) / 2., len(x)-1)
    
    # Identify points outside the confidence interval
    outlier_indices = np.where(np.abs(residuals) > conf_interval)[0]
    
    # Extract the corresponding rows from the DataFrame
    outliers = df1.iloc[outlier_indices]
    outlier_annotations = df3.iloc[outlier_indices]
    
    # Create a DataFrame to save the outliers and their annotations
    outlier_df = pd.DataFrame({
        'Value': outliers,
        'Annotation': outlier_annotations,
        'Predicted': predicted_values[outlier_indices],
        'Residual': residuals[outlier_indices]
    })
    
    # Write the DataFrame to a file. Comment out if you dont want to download a file
    #outlier_df.to_csv(filename, sep='\t', index=False)
    
    return outlier_df

### Plotting x/y plots
Lets use the plot_and_regression function to plot our x/y plots and extract significant outliers from the 95% confidence interval using the extract_outliers function

In [34]:
#Plot DSS3 in Midlog phase grown in AC Difco vs. grown in ProMM minimal medium
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(DSS3_MidLog["Row_Average_ACD"], DSS3_MidLog["Row_Average_ProMM"], DSS3_MidLog["Annotation_ACD"], DSS3_MidLog["STD_ProMM"], DSS3_MidLog["STD_ACD"], title='DSS-3 (MidLog) ACD vs. ProMM', filename ="DSS3_ML_ACDvsProMM.pdf")
# Then extract the outliers
extract_outliers(slope, intercept, std_err, DSS3_MidLog["Row_Average_ACD"], DSS3_MidLog["Row_Average_ProMM"], DSS3_MidLog["Annotation_ACD"], filename='DSS3_ML_ACDvsProMM_outliers.csv')


ValueError: Cannot mask with non-boolean array containing NA / NaN values

<Figure size 800x600 with 0 Axes>

In [None]:
#Plot MIT1002 in Midlog phase grown in AC Difco vs. grown in ProMM minimal medium
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(MIT1002_MidLog["Row_Average_ACD"], MIT1002_MidLog["Row_Average_ProMM"], MIT1002_MidLog["Annotation_ACD"], MIT1002_MidLog["STD_ProMM"], MIT1002_MidLog["STD_ACD"], title='MIT1002 (MidLog) ACD vs. ProMM', filename ="MIT1002_ML_ACDvsProMM.pdf")
extract_outliers(slope, intercept, std_err, MIT1002_MidLog["Row_Average_ACD"], MIT1002_MidLog["Row_Average_ProMM"], MIT1002_MidLog["Annotation_ACD"], filename='MIT1002_ML_ACDvsProMM_outliers.csv')

In [None]:
#Plot DSS3 in Stationary phase grown in AC Difco vs. grown in ProMM minimal medium
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(DSS3_Stat["Row_Average_ACD"], DSS3_Stat["Row_Average_ProMM"], DSS3_Stat["Annotation_ACD"], DSS3_Stat["STD_ProMM"], DSS3_Stat["STD_ACD"], title='DSS-3 (Stationary) ACD vs. ProMM', filename ="DSS3_S_ACDvsProMM.pdf")
extract_outliers(slope, intercept, std_err, DSS3_Stat["Row_Average_ACD"], DSS3_Stat["Row_Average_ProMM"], DSS3_Stat["Annotation_ACD"], filename='DSS3_S_ACDvsProMM_outliers.csv')

In [None]:
#Plot MIT1002 in Stationary phase grown in AC Difco vs. grown in ProMM minimal medium
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(MIT1002_Stat["Row_Average_ACD"], MIT1002_Stat["Row_Average_ProMM"], MIT1002_Stat["Annotation_ACD"], MIT1002_Stat["STD_ProMM"], MIT1002_Stat["STD_ACD"], title='MIT1002 (Stationary) ACD vs. ProMM', filename ="MIT1002_S_ACDvsProMM.pdf")
extract_outliers(slope, intercept, std_err, MIT1002_Stat["Row_Average_ACD"], MIT1002_Stat["Row_Average_ProMM"], MIT1002_Stat["Annotation_ACD"], filename='MIT1002_S_ACDvsProMM_outliers.csv')

In [None]:
#Plot DSS3 grown in AC Difco in Midlog phase vs. Stationary phase 
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(DSS3_ACD_MLS["Row_Average_ML"], DSS3_ACD_MLS["Row_Average_S"], DSS3_ACD_MLS["Annotation_ML"], DSS3_ACD_MLS["STD_S"], DSS3_ACD_MLS["STD_ML"], title='DSS3 in ACD Midlog vs. Stationary', filename="DSS3_ACD_MLS.pdf")
extract_outliers(slope, intercept, std_err, DSS3_ACD_MLS["Row_Average_ML"], DSS3_ACD_MLS["Row_Average_S"], DSS3_ACD_MLS["Annotation_ML"], filename='DSS3_ACD_MLS_outliers.csv')

In [None]:
#Plot MIT1002 grown in AC Difco in Midlog phase vs. Stationary phase 
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(MIT1002_ACD_MLS["Row_Average_ML"], MIT1002_ACD_MLS["Row_Average_S"], MIT1002_ACD_MLS["Annotation_ML"], MIT1002_ACD_MLS["STD_S"],MIT1002_ACD_MLS["STD_ML"], title='MIT1002 in ACD Midlog vs. Stationary', filename="MIT1002_ACD_MLS.pdf")
extract_outliers(slope, intercept, std_err, MIT1002_ACD_MLS["Row_Average_ML"], MIT1002_ACD_MLS["Row_Average_S"], MIT1002_ACD_MLS["Annotation_ML"], filename='MIT1002_ACD_MLS_outliers.csv')

In [None]:
#Plot DSS3 grown in ProMM in Midlog phase vs. Stationary phase 
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(DSS3_ProMM_MLS["Row_Average_ML"], DSS3_ProMM_MLS["Row_Average_S"], DSS3_ProMM_MLS["Annotation_ML"], DSS3_ProMM_MLS["STD_S"], DSS3_ProMM_MLS["STD_ML"], title='DSS3 in ProMM Midlog vs. Stationary', filename="DSS3_ProMM_MLS.pdf")
extract_outliers(slope, intercept, std_err, DSS3_ProMM_MLS["Row_Average_ML"], DSS3_ProMM_MLS["Row_Average_S"], DSS3_ProMM_MLS["Annotation_ML"], filename='DSS3_ProMM_MLS_outliers.csv')

In [None]:
#Plot MIT1002 grown in ProMM in Midlog phase vs. Stationary phase 
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(MIT1002_ProMM_MLS["Row_Average_ML"], MIT1002_ProMM_MLS["Row_Average_S"], MIT1002_ProMM_MLS["Annotation_ML"], MIT1002_ProMM_MLS["STD_S"],MIT1002_ProMM_MLS["STD_ML"], title='MIT1002 in ProMM Midlog vs. Stationary', filename="MIT1002_ProMM_MLS.pdf")
extract_outliers(slope, intercept, std_err, MIT1002_ProMM_MLS["Row_Average_ML"], MIT1002_ProMM_MLS["Row_Average_S"], MIT1002_ProMM_MLS["Annotation_ML"], filename='MIT1002_ProMM_MLS_outliers.csv')

In [None]:
#Plot DSS3 Secretome from cells grown in ACD vs. ProMM
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(Sorted_DSS3_Secretome["Row_Average_A"], Sorted_DSS3_Secretome["Row_Average_P"], Sorted_DSS3_Secretome["Annotation"], Sorted_DSS3_Secretome["STD_A"], Sorted_DSS3_Secretome["STD_P"], title='DSS3 Secretome ACD vs. ProMM', filename="DSS3_Secretome_AP.pdf")
extract_outliers(slope, intercept, std_err, Sorted_DSS3_Secretome["Row_Average_A"], Sorted_DSS3_Secretome["Row_Average_P"], Sorted_DSS3_Secretome["Annotation"], filename='DSS3_Secretome_AP_outliers.csv')

In [None]:
#Plot MIT1002 Secretome from cells grown in ACD vs. ProMM
slope, intercept, std_err, df1, df2, df3 = plot_and_regression(Sorted_MIT1002_Secretome["Row_Average_A"], Sorted_MIT1002_Secretome["Row_Average_P"], Sorted_MIT1002_Secretome["Annotation"], Sorted_MIT1002_Secretome["STD_A"],Sorted_MIT1002_Secretome["STD_P"], title='MIT1002 Secretome ACD vs. ProMM', filename="MIT1002_Secretome_AP.pdf")
extract_outliers(slope, intercept, std_err, Sorted_MIT1002_Secretome["Row_Average_A"], Sorted_MIT1002_Secretome["Row_Average_P"], Sorted_MIT1002_Secretome["Annotation"], filename='MIT1002_Secretome_AP_outliers.csv')