# Creating boxplots to show variation in peptide expression between individual samples of each tissue type 

In [692]:
import Classification_Utils as cu
import matplotlib.pyplot as plt
import MaxQuant_Postprocessing_Functions as mq
import numpy as np
from os import listdir
import pandas as pd
import seaborn as sns
from sklearn.externals import joblib

## Load and combine data from all tissues

In [693]:
# Directory holding the per-tissue CSV files. Every backslash is escaped
# explicitly: the previous literal relied on '\H' being an unrecognised escape
# that Python passes through, which newer interpreters warn about. The
# resulting string is unchanged.
files_dir = 'F:\\High_Quality_All\\'
file_paths = listdir(files_dir)  # entry names only; files_dir is passed alongside them below

# Combine the listed files into a single DataFrame (logic lives in Classification_Utils).
df = cu.combine_csvs(files_dir, file_paths)

In [694]:
# Drop peptide rows that have no value in any column.
df.dropna(axis='index', how='all', inplace=True) # drop any rows where all values are missing
# Remove the row whose index label is a bare newline character
# (presumably an artifact of combining the CSVs -- TODO confirm).
df = df.drop(['\n'])
# Same all-missing row filter as the first line (axis=0 is the 'index' axis).
df.dropna(axis=0, how='all', inplace=True)

print(df.shape)

(154075, 253)


## Clean data
* Log2 transform
* Impute missing values
* Mean/Median normalize

In [695]:
# Called for its effect on df: the return value is discarded, so the helper
# presumably transforms df in place -- TODO confirm in MaxQuant_Postprocessing_Functions.
mq.log2_normalize(df)

# Smallest value in the whole table (minimum over columns, then over those minima).
df_min = df.min().min()
# Missing entries are filled with half of that global minimum.
impute_val = df_min/2
df = df.fillna(impute_val)

# Return value discarded here as well; presumably normalizes df in place -- TODO confirm.
# NOTE(review): later cells recognise imputed entries by exact equality with
# impute_val, which only works if this call leaves those entries unchanged -- confirm.
mq.median_normalize(df)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [696]:
# Tissue names; sample columns are matched to a tissue by column-name prefix.
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

# Tissue -> columns lookup built by the Classification_Utils helper.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [697]:
### Filter out peptides where fewer than [threshold] samples per tissue have non-imputed values

# Minimum number of non-imputed samples required in EVERY tissue.
# Defined once up front; it does not change between tissues.
threshold = 5

df_cols = df.columns.values.tolist()
organ_counts = {}

for tissue in tissues:
    cols = [col for col in df_cols if col.startswith(tissue)] # columns whose name starts with this tissue
    # Per peptide (row): number of this tissue's samples whose value differs from impute_val
    organ_counts[tissue] = (df[cols] != impute_val).sum(1)

# One boolean Series per tissue; a peptide is kept only if all of them are True.
conditions = [organ_counts[t] >= threshold for t in tissues]
filtered_df = df[np.logical_and.reduce(conditions)]

### Filter out peptides not meeting condition (currently disabled):
# At least X tissues must express peptide in Y samples
# NOTE: min_tissues is not applied anywhere yet -- the conditions below still
# require every tissue to reach min_samples.
min_tissues = 5
min_samples = 5

# The per-tissue counts are the same quantity computed above, so they are
# reused instead of being recomputed from df.
tissue_counts = dict(organ_counts)

conditions = [tissue_counts[t] >= min_samples for t in tissues]
#df = df[np.logical_and.reduce(conditions)]

In [698]:
# Compare the table size before and after the per-tissue filter.
print(df.shape)
print(filtered_df.shape)

(154075, 253)
(9, 253)


In [699]:
# Column names of the full and the filtered table, as plain Python lists.
column_names = df.columns.tolist()
filtered_column_names = filtered_df.columns.tolist()

# Labels for each list of columns, resolved through the tissue -> columns mapping.
labels = cu.get_labels(column_names, tissues_to_columns)
filtered_labels = cu.get_labels(filtered_column_names, tissues_to_columns)

In [682]:
# Preview the first rows of df as left by the cleaning cell.
df.head()

Unnamed: 0_level_0,Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13,Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25,Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW006_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW007_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW008_8Apr16_Arwen_16-01-03,...,Temporal_Lobe_Alz_FX1P159_Guan_1_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_2_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_3_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_1_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_2_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_3_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX2P57_IMAC_153_9May11_Hawk_11-04-02p,Temporal_Lobe_Alz_FX2P57_IMAC_161_20Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_187_26Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_212_11May11_Hawk_11-04-02p
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
\n-.ASTKGPSVFPLAPSSK.S,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
\n-.DIQM*TQSPSTLSASVGDR.V,26.731951,22.187643,3.022208,29.328345,29.916272,3.022208,27.586706,30.458361,29.00413,31.023004,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
\n-.DIQM*TQSPSTLSASVGDRVTITCR.A,3.022208,3.022208,3.022208,30.633308,30.815586,3.022208,29.483431,32.564995,30.319263,32.368436,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
\n-.DIQMTQSPS.T,26.764332,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
\n-.DIQMTQSPSTLSASVGDR.V,26.387537,28.015792,3.022208,3.022208,31.403752,3.022208,27.695976,29.779972,29.747784,30.255299,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208


## Identify and isolate the most and least variable peptides between tissues

In [711]:
### TODO: try with and without normalization

# Single top-ranked peptide from the unfiltered table and from the filtered table.
liver_only_peptide_df = cu.keep_k_best_features(df, labels, 1)
high_variance_peptide_df = cu.keep_k_best_features(filtered_df, filtered_labels, 1)

# Keep all but two peptides of the filtered table; the two that are left out
# are treated as the least variable ones.
all_but_least_variable_peptide = cu.keep_k_best_features(filtered_df,
                                                         filtered_labels,
                                                         filtered_df.shape[0] - 2).index.values.tolist()

# Collect the left-out peptides in filtered_df's own index order. A plain set
# difference has no stable order for strings (hashing is randomized per
# interpreter run), which made the "first" low-variance peptide change from
# run to run.
kept_peptides = set(all_but_least_variable_peptide)
least_variable_peptides = [
    peptide
    for peptide in dict.fromkeys(filtered_df.index.values.tolist())  # de-duplicated, order preserved
    if peptide not in kept_peptides
]
least_variable_peptides_df = filtered_df.loc[least_variable_peptides, :]

# Only one low-variance peptide is carried forward: keep the first row by position.
low_variance_peptide_df = least_variable_peptides_df.iloc[[0]]

'\ntop_variable_peptides_df = cu.keep_k_best_features(df, labels, 2)\nall_but_least_variable_peptides = cu.keep_k_best_features(df, labels, df.shape[0] - 2).index.values.tolist()\nleast_variable_peptides = list(set(df.index.values.tolist()) - set(all_but_least_variable_peptides))\nleast_variable_peptides_df = df.loc[least_variable_peptides, :]\n'

In [712]:
# One single-peptide frame per plot, in the same order as the `peptides` name
# list. The one-row low_variance_peptide_df is used (rather than the two-row
# least_variable_peptides_df) so the frame being summarized and the peptide
# name attached to it are explicitly the same row; get_summarized_df only
# reads the first row of each frame.
peptide_dfs = [liver_only_peptide_df,
               low_variance_peptide_df,
               high_variance_peptide_df]

In [713]:
# Peptide names (index labels) of the three selected frames, with any
# leading/trailing newline characters removed.
peptides = [
    name.strip('\n')
    for selected_df in (liver_only_peptide_df, low_variance_peptide_df, high_variance_peptide_df)
    for name in selected_df.index.values.tolist()
]
peptides

['K.VLILGSGGLSIGQAGEFDYSGSQAVK.A',
 'G.DQTVSDNELQEMSNQGSK.Y',
 'K.TYFPHFDLSHGSAQVK.G']

## For each peptide, gather abundances per sample/tissue

In [714]:
def get_summarized_df(df, tissue_names=None):
    """Gather the first peptide's abundances into one column per tissue.

    Only the FIRST row of ``df`` is read. For each tissue, the values of that
    row in every column whose name starts with the tissue name are collected.
    Tissues have different numbers of samples, so shorter columns are padded
    with NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are peptides, columns are samples named ``<tissue>...``.
        (The parameter shadows the notebook-level ``df`` inside this function.)
    tissue_names : iterable of str, optional
        Tissue prefixes to summarize. Defaults to the notebook-level
        ``tissues`` list, which was the only behavior before this parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        One column per tissue, one row per sample position within a tissue.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if tissue_names is None:
        tissue_names = tissues  # notebook-level default

    if df.shape[0] == 0:
        raise ValueError('get_summarized_df needs a frame with at least one peptide row')

    data = {}

    for tissue in tissue_names:
        # Boolean mask rather than a label list, so duplicated column names
        # are not selected more than once.
        in_tissue = [col.startswith(tissue) for col in df.columns]
        data[tissue] = df.loc[:, in_tissue].iloc[0].tolist()

    # Wrapping each list in a Series lets pandas align lists of unequal length.
    combined_df = pd.DataFrame({name: pd.Series(values) for name, values in data.items()})
    return combined_df

In [715]:
# One summarized (per-tissue) table for each selected peptide frame.
summarized_dfs = [get_summarized_df(peptide_df) for peptide_df in peptide_dfs]

In [716]:
# Transposed view of the third summarized table (index 2): one row per tissue,
# one column per sample position.
summarized_dfs[2].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Blood_Plasma,3.022208,3.022208,29.094614,31.73832,31.104496,30.177597,35.697191,28.50673,39.592581,30.501899,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,
Blood_Serum,3.022208,3.022208,3.022208,3.022208,25.22964,3.022208,3.022208,3.022208,20.327618,3.022208,...,24.273483,3.022208,3.022208,32.23235,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
CSF,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,27.821285,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,30.879132,29.274332,3.022208,3.022208
Liver,31.793896,32.130719,32.953694,3.022208,31.953,30.112348,31.223833,32.996059,31.402736,27.629092,...,35.088718,35.368345,34.019031,34.529796,34.583181,34.676035,34.636121,35.080482,35.472363,34.420287
Monocyte,33.621457,34.366066,34.073436,39.31301,32.97035,38.297208,32.080363,32.814193,27.659013,33.233636,...,,,,,,,,,,
Ovary,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Pancreas,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Substantia_Nigra,36.710493,36.14511,36.684733,37.252528,36.360141,34.724867,35.832152,35.537482,36.346501,37.476065,...,35.049252,34.757372,34.483517,34.346386,35.132391,35.278013,34.9574,33.579701,35.512475,35.126248
Temporal_Lobe,30.803474,30.639618,31.929313,29.340769,29.52004,28.807471,29.368925,28.489568,27.852993,28.621427,...,31.904728,3.022208,31.396012,32.328361,32.118364,29.551843,28.516531,30.431016,28.731044,29.483277


## Make Boxplots

In [717]:
# Fixed colour per tissue column so every boxplot uses the same mapping.
tissue_columns = summarized_dfs[0].columns.values

# Palette size follows the number of columns. It used to be hard-coded to 9,
# which would overrun the palette as soon as a tissue was added.
num_colors = len(tissue_columns)
colors = sns.color_palette('hls', num_colors)

color_dict = dict(zip(tissue_columns, colors)) # Column name : color

### Make individual boxplots for each peptide

In [722]:
# Output folder. Raw string: the value ends with two literal backslashes
# (a raw string cannot end in a single one); file names are appended directly.
image_dir = r'D:\images\Human_Tissues\\'
titles = ['Liver_Only', 'Low_Variance', 'High_Variance']

for summarized_df, title in zip(summarized_dfs, titles):

    summarized_df = summarized_df.replace(impute_val, np.nan) ### Exclude imputed values

    fig, ax = plt.subplots(figsize = (10, 6))
    sns.boxplot(data = summarized_df, palette = color_dict, ax = ax, linewidth=0.5)

    # Rotate the tissue labels AFTER plotting: before the boxplot is drawn the
    # axes only carries its default numeric ticks.
    ax.tick_params(axis='x', labelrotation=90)

    output_path = image_dir + title + '.pdf'
    fig.savefig(output_path, bbox_inches = "tight")

    # Close (not just clear) the figure so figures do not pile up across iterations.
    plt.close(fig)



### Make combined boxplot showing all 3 peptides

In [723]:
### Group by tissue

# Reshape every summarized table from wide (one column per tissue) to long
# (one row per value) and tag each row with the peptide it belongs to.
stacked_dfs = []

for summarized_df, peptide in zip(summarized_dfs, peptides):
    stacked_df = summarized_df.stack()
    stacked_df = stacked_df.reset_index().drop('level_0', axis=1)  # drop the sample-position level
    stacked_df = stacked_df.rename(columns={'level_1': 'Tissue', 0: 'Abundance'})
    stacked_df = stacked_df.assign(Peptide=peptide)

    stacked_dfs.append(stacked_df)

# ignore_index gives the combined table a fresh unique index; without it each
# peptide's rows restart at 0 and the index labels repeat.
combined_df = pd.concat(stacked_dfs, ignore_index=True)  # CONCATENATE
combined_df = combined_df.replace(impute_val, np.nan) # Exclude imputed values

plot_title = 'Variable Peptide Expression'

fig, ax = plt.subplots(figsize = (10, 6))
sns.boxplot(hue="Peptide",
            x='Tissue',
            y='Abundance',
            data=combined_df, palette='hls', ax=ax, linewidth=0.5)

# Rotate the tissue labels after plotting, once the categorical ticks exist.
ax.tick_params(axis='x', labelrotation=90)

output_path = image_dir + plot_title + '.pdf'
fig.savefig(output_path, bbox_inches = "tight")
plt.close(fig)  # release the figure instead of leaving a cleared one open



In [739]:
### Group by peptide

# Set the font before the figure is built so that tick labels, axis labels and
# the legend all pick it up; text created before the change may keep the
# previous family. This changes a global matplotlib setting.
plt.rcParams["font.family"] = "Arial"

fig, ax = plt.subplots(figsize = (10, 6))
flierprops = dict(markersize=2) # define outlier properties

sns.boxplot(hue="Tissue",
            x='Peptide',
            y='Abundance',
            data=combined_df, palette='hls', ax=ax, linewidth=0.5, flierprops=flierprops)

ax.legend(loc=0)  # loc=0 lets matplotlib choose the placement

output_path = image_dir + 'Grouped_By_Peptide' + '.pdf'
fig.savefig(output_path, bbox_inches = "tight")
plt.close(fig)  # release the figure instead of leaving a cleared one open

