# Testing if AMT data is comparable to MaxQuant output

In [2]:
import os
import sys
# Add the parent directory of the working directory to sys.path so modules
# stored one level up can be imported (presumably where
# MaxQuant_Postprocessing_Functions lives -- confirm). The membership check
# keeps the entry from being appended again when the cell is re-run.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import MaxQuant_Postprocessing_Functions as mq
import pandas as pd

In [4]:
#########################
#
# Load Q_Rollup output into dataframe
#
#########################

# Hard-coded local Windows path of the Q_Rollup protein export that load_df
# reads below; edit this to point at your own copy of the file.
file = r"D:\Q_Rollup_Exports\amt_proteins.txt"

def load_df(filename):
    """Read a tab-separated export whose lines end in a carriage return.

    The first line of the file is skipped and the last parsed column is
    discarded. Only the carriage return is treated as the line terminator, so
    a line feed left over from CR+LF line endings stays attached to the start
    of the following line. Those line feeds are replaced by a single space in
    the cell values, and the leading one is removed from the
    'Majority protein IDs' header.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. Values that began with a line feed now begin with
        a space.
    """
    return (
        pd.read_csv(filename, sep='\t', lineterminator='\r', skiprows=1)
        .iloc[:, :-1]  # discard the last column
        .replace(r'\n', ' ', regex=True)
        .rename(columns={'\nMajority protein IDs': 'Majority protein IDs'})
    )

# Load the export into amt_df and print a plain-text preview of its first rows.
amt_df = load_df(file)
print(amt_df.head())

  Majority protein IDs  Adult_07_Brain  Adult_08_Brain  Adult_09_Brain  \
0          1433B_MOUSE    3.092920e+11    3.750930e+11    3.238540e+11   
1          1433E_MOUSE    2.357930e+11    2.635480e+11    2.031850e+11   
2          1433F_MOUSE    2.761070e+11    2.954760e+11    2.991710e+11   
3          1433G_MOUSE    4.054070e+11    4.334430e+11    3.880220e+11   
4          1433S_MOUSE    9.680410e+11    1.057940e+12    1.110580e+12   

   Adult_10_Brain  Adult_11_Brain  Adult_12_Brain  
0    3.921020e+11    3.038450e+11    3.434120e+11  
1    2.959540e+11    2.423830e+11    2.622920e+11  
2    3.840410e+11    2.769120e+11    3.130650e+11  
3    4.832490e+11    3.953070e+11    4.445920e+11  
4    1.577300e+12    9.625920e+11    9.747800e+11  


In [5]:
### Divide all abundance values by 1000 so their scale more closely resembles the MaxQuant (MQ) output

def scale_down_values(df, divisor=1000):
    """Divide every column except the first by ``divisor``, modifying ``df`` in place.

    The first column is left untouched; in this notebook it holds the
    protein IDs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from position 1 onward are numeric.
    divisor : int or float, default 1000
        Value to divide by. The default keeps the original behaviour.

    Returns
    -------
    None
        ``df`` is updated in place. Calling the function twice divides twice.

    Raises
    ------
    ValueError
        If ``divisor`` is zero.
    """
    if divisor == 0:
        raise ValueError("divisor must be non-zero")

    if df.shape[1] <= 1:
        # Nothing beyond the first column to scale.
        return

    scaled = df.iloc[:, 1:].divide(divisor)
    if df.columns.is_unique:
        # Replace the columns by label instead of writing through .iloc:
        # an .iloc write tries to keep each column's existing dtype, which
        # recent pandas versions warn about (or reject) when float results
        # are written into integer columns.
        df[list(df.columns[1:])] = scaled
    else:
        # Label-based replacement is ambiguous with duplicate column names,
        # so fall back to the original positional write.
        df.iloc[:, 1:] = scaled
    
# scale_down_values changes amt_df in place and returns nothing, so running
# this cell a second time divides the values by 1000 again. Reload amt_df
# before re-running.
scale_down_values(amt_df)
amt_df.head()

Unnamed: 0,Majority protein IDs,Adult_07_Brain,Adult_08_Brain,Adult_09_Brain,Adult_10_Brain,Adult_11_Brain,Adult_12_Brain
0,1433B_MOUSE,309292000.0,375093000.0,323854000.0,392102000.0,303845000.0,343412000.0
1,1433E_MOUSE,235793000.0,263548000.0,203185000.0,295954000.0,242383000.0,262292000.0
2,1433F_MOUSE,276107000.0,295476000.0,299171000.0,384041000.0,276912000.0,313065000.0
3,1433G_MOUSE,405407000.0,433443000.0,388022000.0,483249000.0,395307000.0,444592000.0
4,1433S_MOUSE,968041000.0,1057940000.0,1110580000.0,1577300000.0,962592000.0,974780000.0


In [6]:
"""
mq.log2_normalize(amt_df)
mq.median_normalize(amt_df)
df.head()
"""

'\nmq.log2_normalize(amt_df)\nmq.median_normalize(amt_df)\ndf.head()\n'

In [9]:
#########################
#
# Load MaxQuant data
#
#########################

# Hard-coded local Windows path of the MaxQuant protein groups table.
mq_file = r'D:\proteinGroupsCleaned.txt'
# Note: this is the loader from MaxQuant_Postprocessing_Functions, not the
# load_df defined earlier in this notebook.
mq_df = mq.load_df(mq_file)

# Cleaning helpers from the mq module. Judging by their names they drop weakly
# identified proteins and duplicate protein IDs -- confirm against the module.
mq_df = mq.clean_weakly_identified(mq_df)
mq_df = mq.remove_dup_proteinIDs(mq_df)
        
# Judging by the preview this cell displays, the result keeps the
# 'Majority protein IDs' column plus the columns whose names start with 'iBAQ '.
mq_df = mq.slice_by_column(mq_df, 'protein', 'iBAQ ') 

mq_df.head()

Unnamed: 0,Majority protein IDs,iBAQ 04_Liver,iBAQ 05_Liver,iBAQ 06_Liver,iBAQ 07_Brain,iBAQ 07_Heart,iBAQ 07_Kidney,iBAQ 07_Liver,iBAQ 07_Lung,iBAQ 08_Brain,...,iBAQ 10_Kidney,iBAQ 10_Lung,iBAQ 11_Brain,iBAQ 11_Heart,iBAQ 11_Kidney,iBAQ 11_Lung,iBAQ 12_Brain,iBAQ 12_Heart,iBAQ 12_Kidney,iBAQ 12_Lung
0,1433B_MOUSE,98046000.0,108870000.0,156960000.0,999960000.0,54600000.0,201800000.0,71541000.0,332420000.0,968170000.0,...,111510000.0,279310000.0,948820000.0,48104000.0,199000000.0,293140000.0,879450000.0,52335000.0,138180000.0,195620000.0
1,1433E_MOUSE,252490000.0,276160000.0,265140000.0,937830000.0,141360000.0,288800000.0,292600000.0,380270000.0,1455600000.0,...,170950000.0,399690000.0,1130800000.0,121010000.0,383140000.0,417400000.0,1319100000.0,143800000.0,377830000.0,297060000.0
2,1433F_MOUSE,28535000.0,46298000.0,35928000.0,714340000.0,25147000.0,42225000.0,29384000.0,103620000.0,597570000.0,...,30286000.0,83130000.0,692300000.0,27136000.0,48996000.0,101160000.0,710370000.0,26118000.0,47127000.0,79035000.0
3,1433G_MOUSE,178830000.0,183670000.0,191430000.0,1682100000.0,104430000.0,129570000.0,175590000.0,143240000.0,1396700000.0,...,55629000.0,148700000.0,1287100000.0,99296000.0,97703000.0,145570000.0,1376100000.0,109800000.0,64801000.0,103700000.0
4,1433S_MOUSE,53834000.0,61420000.0,92700000.0,507220000.0,42272000.0,76248000.0,68639000.0,149120000.0,534180000.0,...,76782000.0,144840000.0,450380000.0,39513000.0,106010000.0,151510000.0,503080000.0,48631000.0,95807000.0,131840000.0


In [10]:
#########################
#
# Combine Dataframes
#
#########################

# Index both tables by protein ID so that concat(axis=1) lines rows up by
# protein rather than by position.
# NOTE: set_index(..., inplace=True) consumes the ID column, so this cell
# cannot be re-run without first reloading amt_df and mq_df.
amt_df.set_index(['Majority protein IDs'], inplace=True)
amt_df.index = amt_df.index.str.strip() # strip leading spaces
mq_df.set_index(['Majority protein IDs'], inplace=True)

print(amt_df.shape)
print(mq_df.shape)

# Outer-join the sample columns of both tables on protein ID.
df = pd.concat([amt_df, mq_df], axis=1)

# Drop the empty row by its label instead of by position. Removing "the first
# row" only works when concat sorts the combined index so that the blank label
# comes first; pandas versions that do not sort by default would lose a real
# protein and keep the blank row.
# Assumes the empty row is the one whose protein ID is blank after stripping
# -- TODO confirm against the export.
df = df.loc[df.index != '']

print(df.head())
df.shape

(3098, 6)
(5530, 30)
             Adult_07_Brain  Adult_08_Brain  Adult_09_Brain  Adult_10_Brain  \
1433B_MOUSE     309292000.0    3.750930e+08    3.238540e+08    3.921020e+08   
1433E_MOUSE     235793000.0    2.635480e+08    2.031850e+08    2.959540e+08   
1433F_MOUSE     276107000.0    2.954760e+08    2.991710e+08    3.840410e+08   
1433G_MOUSE     405407000.0    4.334430e+08    3.880220e+08    4.832490e+08   
1433S_MOUSE     968041000.0    1.057940e+09    1.110580e+09    1.577300e+09   

             Adult_11_Brain  Adult_12_Brain  iBAQ 04_Liver  iBAQ 05_Liver  \
1433B_MOUSE     303845000.0     343412000.0     98046000.0    108870000.0   
1433E_MOUSE     242383000.0     262292000.0    252490000.0    276160000.0   
1433F_MOUSE     276912000.0     313065000.0     28535000.0     46298000.0   
1433G_MOUSE     395307000.0     444592000.0    178830000.0    183670000.0   
1433S_MOUSE     962592000.0     974780000.0     53834000.0     61420000.0   

             iBAQ 06_Liver  iBAQ 07_Brain

(5853, 36)

In [11]:
# Output directory handed to the mq plotting helpers below. A raw string
# cannot end in a single backslash, so this literal ends in two backslashes.
image_dir = r'D:\Images\AMT\\'
    
# Both dicts start empty and are passed to mq.filter_low_observed, which must
# fill them in place: organ_columns is indexed by organ name further down.
organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {} # keyed by organ name, e.g. 'Liver'; values presumably per-organ counts -- confirm in mq
groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
    
df = mq.filter_low_observed(df, groups, organ_columns, organ_counts)
    
### Normalize and produce box plots
# The return values of log2_normalize and median_normalize are not used, so
# the helpers presumably modify df in place (and may draw the box plots) --
# confirm in MaxQuant_Postprocessing_Functions.
mq.log2_normalize(df)
color_dict = mq.map_colors(groups, organ_columns)
mq.median_normalize(df)
# Keep only the columns recorded per organ, ordered Brain, Heart, Kidney, Liver, Lung.
df = df[organ_columns['Brain'] + organ_columns['Heart'] + organ_columns['Kidney'] + organ_columns['Liver'] + organ_columns['Lung']]

### Adult_07_Brain is a huge outlier
# NOTE(review): df is a column selection of the previous frame here, so an
# in-place drop may raise SettingWithCopyWarning; `df = df.drop(...)` would avoid it.
df.drop(['Adult_07_Brain'], axis=1, inplace=True)
    
### PCA
# Imputation runs on a copy, so df itself keeps its missing values.
imputed_df = mq.impute_missing(df.copy())
pca, pca_data = mq.do_pca(imputed_df)
    
per_var, labels = mq.make_scree_plot(pca, image_dir) 
column_names = imputed_df.columns.values.tolist()

# Both helpers receive image_dir; presumably they save their figures there -- confirm in mq.
mq.draw_pca_graph(column_names, pca_data, image_dir, color_dict, per_var, labels)
mq.make_pearson_matrix(imputed_df, image_dir)

  return lib.map_infer(x.asobject, func)
