In [1]:
import Classification_Utils as cu
import numpy as np
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib

In [2]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes
from matplotlib.lines import Line2D

## Load and clean data
* One data frame for all training and test data
* One data frame for healthy vs diseased comparison

In [3]:
# Both quantification tables are tab-separated and indexed by their 'Peptide' column.
quant_read_options = dict(sep='\t', index_col='Peptide')

# Table holding all training and test data.
train_test_df = pd.read_csv('FullPeptideQuant.txt', **quant_read_options)
print(train_test_df.shape)

# Table used for the healthy vs diseased comparison.
healthy_diseased_df = pd.read_csv('HealthyDiseasedQuant.txt', **quant_read_options)
print(healthy_diseased_df.shape)

(55676, 253)
(24932, 149)


## Map each column to a corresponding label

In [4]:
def remove_prefix(col_names):
    """Strip a leading 'Diseased_' or 'Healthy_' tag from each column name.

    Only the prefix at the very start of a name is removed; any later
    occurrence of the same text inside the name is left intact. Names that
    start with neither tag are returned unchanged. At most one prefix is
    removed per name.

    Args:
        col_names: iterable of column-name strings.

    Returns:
        A new list of names, in the same order as ``col_names``.
    """
    prefixes = ('Diseased_', 'Healthy_')
    new_names = []

    for name in col_names:
        for prefix in prefixes:
            if name.startswith(prefix):
                # Slice instead of str.replace so that only the leading tag goes.
                name = name[len(prefix):]
                break
        new_names.append(name)

    return new_names

In [5]:
# Keep the prefixed names around: the plotting code further down checks for
# the "Diseased" prefix to decide how each sample is drawn.
# NOTE: this cell relabels healthy_diseased_df in place, so re-running it
# without reloading the frame would capture the already-stripped names here.
original_healthy_diseased_col_names = healthy_diseased_df.columns.tolist()

# Relabel the frame with the "Healthy_" / "Diseased_" prefixes removed.
stripped_col_names = remove_prefix(original_healthy_diseased_col_names)
healthy_diseased_df.columns = stripped_col_names

In [6]:
# Tissue labels present in each data set.
train_test_tissues = [
    'Blood_Plasma', 'Blood_Serum', 'CSF', 'Liver', 'Monocyte',
    'Ovary', 'Pancreas', 'Substantia_Nigra', 'Temporal_Lobe',
]
healthy_diseased_tissues = ['Blood_Plasma', 'Liver', 'Pancreas', 'Substantia_Nigra', 'CSF']

# Tissue -> columns lookup for each frame (later code indexes the result by
# tissue name and treats each value as a list of column names).
train_test_tissues_to_columns = cu.map_tissues_to_columns(
    train_test_df, train_test_tissues)
healthy_diseased_tissues_to_columns = cu.map_tissues_to_columns(
    healthy_diseased_df, healthy_diseased_tissues)

In [7]:
# Plain Python lists of the column names of each frame.
train_test_column_names = train_test_df.columns.tolist()
healthy_diseased_column_names = healthy_diseased_df.columns.tolist()

# Labels derived from the column names and the tissue -> columns maps.
train_test_labels = cu.get_labels(
    train_test_column_names, train_test_tissues_to_columns)
healthy_diseased_labels = cu.get_labels(
    healthy_diseased_column_names, healthy_diseased_tissues_to_columns)

## PCA

### Map columns to colors, and prepare PCA data frames

In [8]:
import seaborn as sns

def map_colors(tissues, tissues_to_columns, num_colors=6, palette='hls'):
    """Give every column the palette color assigned to its tissue.

    Tissues receive colors in the order they are listed; when there are more
    tissues than palette entries the palette is reused cyclically.

    Args:
        tissues: ordered iterable of tissue names.
        tissues_to_columns: mapping of tissue name -> list of column names.
        num_colors: number of colors requested from the seaborn palette.
        palette: seaborn palette name.

    Returns:
        dict mapping column name -> color taken from the palette.
    """
    palette_colors = sns.color_palette(palette, num_colors)

    return {
        column: palette_colors[position % len(palette_colors)]
        for position, tissue in enumerate(tissues)
        for column in tissues_to_columns[tissue]
    }

In [9]:
# Output folders for saved figures. File names are appended directly to these
# strings, so each one ends with backslashes.
train_test_dir = r'D:\Images\Human_Tissues\\'
healthy_diseased_dir = r'D:\Images\Healthy_vs_Diseased\\'

# Column -> color lookups, requesting one palette color per tissue in each set.
train_test_column_to_color = map_colors(
    train_test_tissues, train_test_tissues_to_columns, num_colors=9)
healthy_diseased_column_to_color = map_colors(
    healthy_diseased_tissues, healthy_diseased_tissues_to_columns, num_colors=5)

In [10]:
# Transpose so that every original column becomes one row of the PCA input.
train_test_T = train_test_df.T

# Unrestricted PCA; its first two components feed the 2-D plots.
pca = PCA()
pca_data = pca.fit(train_test_T).transform(train_test_T)

# Separate fit limited to three components for the 3-D plot.
pca_3 = PCA(n_components=3)
pca_data_3 = pca_3.fit(train_test_T).transform(train_test_T)

# Explained variance per component as a percentage (one decimal), plus
# matching 'PC1', 'PC2', ... names.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
pca_labels = ['PC{}'.format(i) for i in range(1, len(per_var) + 1)]

per_var_3 = np.round(pca_3.explained_variance_ratio_ * 100, decimals=1)
pca_labels_3 = ['PC{}'.format(i) for i in range(1, len(per_var_3) + 1)]

In [11]:
# Transpose so that every original column becomes one row of the PCA input.
healthy_diseased_T = healthy_diseased_df.T

# Unrestricted PCA of the healthy/diseased samples; the 2-D plot uses PC1/PC2.
healthy_diseased_pca = PCA()
healthy_diseased_pca_data = healthy_diseased_pca.fit(healthy_diseased_T).transform(healthy_diseased_T)

# Explained variance per component as a percentage (one decimal), plus
# matching 'PC1', 'PC2', ... names.
healthy_diseased_per_var = np.round(healthy_diseased_pca.explained_variance_ratio_ * 100, decimals=1)
healthy_diseased_pca_labels = ['PC{}'.format(i) for i in range(1, len(healthy_diseased_per_var) + 1)]

### 3D PCA of Train and Test

In [14]:
from mpl_toolkits.mplot3d import Axes3D

def draw_3d_pca(column_names, pca_data, base_dir, color_dict, per_var, labels, all_organs, organs_to_columns, title='PCA Plot'):
    """Save a 3-D scatter plot of PC1/PC2/PC3 as a PDF file.

    Args:
        column_names: names of the plotted samples, one per row of
            ``pca_data``; also used as keys into ``color_dict``.
        pca_data: 2-D array of PCA coordinates whose columns are named by
            ``labels``.
        base_dir: string placed directly in front of the file name, so it
            must already end with a path separator.
        color_dict: mapping of column name -> color.
        per_var: percentage of variance explained per component. When it
            has an entry for an axis, that axis is labelled
            'PCn - <value>%'; otherwise (or when None) just 'PCn'.
        labels: component names for the columns of ``pca_data``; must
            contain 'PC1', 'PC2' and 'PC3'.
        all_organs: organ names, in the order they appear in the legend.
        organs_to_columns: mapping of organ -> list of column names; the
            color of the first column is used for the organ's legend patch.
        title: stem of the output file name.

    Returns:
        None. The figure is written to ``base_dir + title + '.pdf'``.
    """
    pca_df = pd.DataFrame(pca_data, index=column_names, columns=labels)

    # Create exactly one figure; an extra bare plt.figure() here would leave
    # an empty figure behind on every call.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    for column in column_names:
        ax.scatter(pca_df.PC1.loc[column], pca_df.PC2.loc[column], pca_df.PC3.loc[column],
                   color=color_dict[column])

    # Axis labels follow the 'PCn - x%' format of the 2-D plots whenever the
    # explained-variance value for that component is available.
    for axis_number, set_label in enumerate((ax.set_xlabel, ax.set_ylabel, ax.set_zlabel)):
        component = 'PC{0}'.format(axis_number + 1)
        if per_var is not None and len(per_var) > axis_number:
            set_label('{0} - {1}%'.format(component, per_var[axis_number]))
        else:
            set_label(component)

    # One legend patch per organ, colored like that organ's first column.
    new_handles = [
        mpatches.Patch(color=color_dict[organs_to_columns[organ][0]], label=organ)
        for organ in all_organs
    ]

    lgd = ax.legend(handles=new_handles, loc=2, bbox_to_anchor=(1.05, 1), ncol=1)
    output_path = base_dir + title + '.pdf'
    fig.savefig(output_path, bbox_inches="tight", bbox_extra_artists=(lgd,), dpi=500)

    # Close (not just clear) the figure so pyplot releases it.
    plt.close(fig)

In [15]:
# Save the 3-D PCA figure for the train/test samples.
draw_3d_pca(
    column_names=train_test_column_names,
    pca_data=pca_data_3,
    base_dir=train_test_dir,
    color_dict=train_test_column_to_color,
    per_var=per_var_3,
    labels=pca_labels_3,
    all_organs=train_test_tissues,
    organs_to_columns=train_test_tissues_to_columns,
    title='3D PCA',
)

### Plot train and test data, zoomed-in on tight clustering

In [16]:
# Per-column colors and the PCA coordinates, indexed by sample column name.
tt_color_dict = train_test_column_to_color
train_test_pca_df = pd.DataFrame(pca_data, index=train_test_column_names, columns=pca_labels)

fig = plt.figure(1)
ax = fig.add_subplot(111)

ax.set_xlabel('PC1 - {0}%'.format(per_var[0]))
ax.set_ylabel('PC2 - {0}%'.format(per_var[1]))

pc1_scores = train_test_pca_df.PC1
pc2_scores = train_test_pca_df.PC2

# Full view: one marker per sample, colored by its tissue.
for sample in train_test_pca_df.index:
    ax.scatter(pc1_scores.loc[sample], pc2_scores.loc[sample],
               color=tt_color_dict[sample])

# Draw the same points again inside a 6x zoomed inset.
axins = zoomed_inset_axes(ax, 6, loc='center right')  # parent axes, zoom factor, location
for sample in train_test_pca_df.index:
    axins.scatter(pc1_scores.loc[sample], pc2_scores.loc[sample],
                  color=tt_color_dict[sample])

# Region of PCA space shown by the inset.
zoom_xlim = (-610, -410)
zoom_ylim = (-230, 0)
axins.set_xlim(*zoom_xlim)
axins.set_ylim(*zoom_ylim)

# These pyplot calls act on the current axes.
# NOTE(review): presumably that is the inset, as the most recently created
# axes -- confirm.
plt.yticks(visible=False)
plt.xticks(visible=False)

# Outline the zoomed region on the main axes and connect it to the inset.
mark_inset(ax, axins, loc1=2, loc2=3, fc="none", ec="0.5")

# One legend patch per tissue, colored like that tissue's first column.
new_handles = [
    mpatches.Patch(color=tt_color_dict[train_test_tissues_to_columns[organ][0]], label=organ)
    for organ in train_test_tissues
]
lgd = ax.legend(handles=new_handles, loc=2, bbox_to_anchor=(1, 1), ncol=1)

output_path = train_test_dir + 'Zoomed PCA.pdf'
fig.savefig(output_path, bbox_inches="tight", bbox_extra_artists=(lgd,), dpi=500)
fig.clf()

### Plot Healthy vs Diseased
Empty circles represent diseased tissues; filled circles represent healthy tissues.

In [17]:
hd_color_dict = healthy_diseased_column_to_color

# Index the PCA coordinates by the prefixed column names so the "Diseased"
# prefix is still available when choosing each marker's fill.
hd_pca_df = pd.DataFrame(healthy_diseased_pca_data, index=original_healthy_diseased_col_names,
                         columns=healthy_diseased_pca_labels)

fig = plt.figure(1)
ax = fig.add_subplot(111)

# Label the axes with the variance explained by the healthy/diseased PCA
# (per_var belongs to the train/test PCA and would show the wrong numbers).
ax.set_xlabel('PC1 - {0}%'.format(healthy_diseased_per_var[0]))
ax.set_ylabel('PC2 - {0}%'.format(healthy_diseased_per_var[1]))


def scatter_healthy_diseased(target_ax):
    """Plot every sample on ``target_ax``: hollow if diseased, filled if healthy."""
    for column, stripped_col in zip(original_healthy_diseased_col_names, stripped_col_names):
        tissue_color = hd_color_dict[stripped_col]
        fill = 'none' if column.startswith('Diseased') else tissue_color
        target_ax.scatter(hd_pca_df.PC1.loc[column],
                          hd_pca_df.PC2.loc[column],
                          color=tissue_color, facecolors=fill)


scatter_healthy_diseased(ax)

output_path = healthy_diseased_dir + 'PCA.pdf'

# One legend patch per tissue, colored like that tissue's first column.
new_handles = [
    mpatches.Patch(color=hd_color_dict[healthy_diseased_tissues_to_columns[organ][0]], label=organ)
    for organ in healthy_diseased_tissues
]

### Append handles for open/closed circles (diseased/healthy)
# Unlabelled white marker between the tissue patches and the marker key.
blank_line = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="white")
closed_circle = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="black",
                       mec='black', label='Closed Circles: Healthy')
open_circle = Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="white",
                     mec='black', label='Open Circles: Diseased')

new_handles.append(blank_line)
new_handles.append(closed_circle)
new_handles.append(open_circle)

### Zoom in on tightly clustered section
axins = zoomed_inset_axes(ax, 6, loc='center right') # axes, zoom-factor, location
scatter_healthy_diseased(axins)

x1, x2, y1, y2 = -550, -400, 25, 175 # specify the axis limits
axins.set_xlim(x1, x2) # apply the x-limits
axins.set_ylim(y1, y2) # apply the y-limits

# These pyplot calls act on the current axes.
# NOTE(review): presumably that is the inset, as the most recently created
# axes -- confirm.
plt.yticks(visible=False)
plt.xticks(visible=False)

# Outline the zoomed region on the main axes and connect it to the inset.
mark_inset(ax, axins, loc1=2, loc2=3, fc="none", ec="0.5")

lgd = ax.legend(handles=new_handles, loc=2, bbox_to_anchor=(1, 1), ncol=1)

fig.savefig(output_path, bbox_inches="tight", bbox_extra_artists=(lgd,), dpi=500)
fig.clf()