In [1]:
import MaxQuant_Postprocessing_Functions as mq

In [2]:
#########################
#
# Load and clean data
#
#########################

# Raw string: "\p" is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a warning on newer Python versions.
file = r"D:\proteinGroups.txt"

# Load the MaxQuant proteinGroups table, then pass it through the
# weak-identification and duplicate-protein-ID cleaning helpers.
df = mq.load_df(file)
df = mq.clean_weakly_identified(df)
df = mq.remove_dup_proteinIDs(df)

# Slice the frame by the 'iBAQ ' column-name pattern (trailing space intended).
iBAQ_df = mq.slice_by_column(df, 'iBAQ ')

groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
# Both dicts start empty and are handed to filter_low_observed; later cells read
# organ_columns by group name, so the helper presumably fills them in place
# (TODO confirm against MaxQuant_Postprocessing_Functions).
organ_columns = {}  # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {}   # keyed by group name like organ_columns; value format set by the helper

iBAQ_df = mq.filter_low_observed(iBAQ_df, groups, organ_columns, organ_counts)

In [3]:
#########################
#
# Normalize data and impute missing values with data frame minimum/2
#
#########################

# The return values of both normalisation helpers are discarded, so they
# presumably modify iBAQ_df in place (TODO confirm in the mq module).
# Order matters: log2 first, then median normalisation.
mq.log2_normalize(iBAQ_df)
mq.median_normalize(iBAQ_df)

# Use the protein IDs as the row index (in place) before imputing.
# NOTE(review): this cell is not idempotent - re-running it would re-apply the
# normalisation and fail on set_index because the column is already the index.
iBAQ_df.set_index('Majority protein IDs', inplace = True)
iBAQ_df = mq.impute_missing(iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [4]:
#########################
#
# Map each column name to a corresponding label
#
#########################

def get_labels(df, column_groups=None):
    """Map each column of ``df`` to the label of the group that contains it.

    Input:
        df: dataframe whose column names should be labelled.
        column_groups: optional dict mapping a label to the list of column
            names belonging to it, e.g. {'Liver': ['iBAQ 04_Liver', ...]}.
            Defaults to the notebook-level ``organ_columns`` mapping.
    Output:
        List of strings representing the labels for each dataframe column,
        in column order.
    Raises:
        ValueError: if a column does not belong to any group.
    """
    if column_groups is None:
        column_groups = organ_columns

    labels = []
    # Use the frame that was passed in; the original read the global iBAQ_df
    # and silently ignored its argument.
    for column in df.columns.values.tolist():
        for group, members in column_groups.items():
            if column in members:
                labels.append(group)
                break
        else:
            # A bare next() would leak StopIteration here, which gives no hint
            # about which column was the problem.
            raise ValueError('Column {!r} does not belong to any group'.format(column))

    return labels

In [5]:
# Reorder the columns so that all samples of one organ sit next to each other,
# organs in the fixed order Brain, Heart, Kidney, Liver, Lung.
organ_order = ('Brain', 'Heart', 'Kidney', 'Liver', 'Lung')
ordered_columns = []
for organ in organ_order:
    ordered_columns = ordered_columns + organ_columns[organ]
iBAQ_df = iBAQ_df[ordered_columns]

# One class label per (reordered) column.
labels = get_labels(iBAQ_df)
print(iBAQ_df.columns.values.tolist())
print(labels)

['iBAQ Adult_07_Brain', 'iBAQ Adult_08_Brain', 'iBAQ Adult_09_Brain', 'iBAQ Adult_10_Brain', 'iBAQ Adult_11_Brain', 'iBAQ Adult_12_Brain', 'iBAQ Adult_07_Heart', 'iBAQ Adult_08_Heart', 'iBAQ Adult_09_Heart', 'iBAQ Adult_10_Heart', 'iBAQ Adult_11_Heart', 'iBAQ Adult_12_Heart', 'iBAQ Adult_07_Kidney', 'iBAQ Adult_08_Kidney', 'iBAQ Adult_09_Kidney', 'iBAQ Adult_10_Kidney', 'iBAQ Adult_11_Kidney', 'iBAQ Adult_12_Kidney', 'iBAQ Adult_04_Liver', 'iBAQ Adult_05_Liver', 'iBAQ Adult_06_Liver', 'iBAQ Adult_07_Liver', 'iBAQ Adult_08_Liver', 'iBAQ Adult_09_Liver', 'iBAQ Adult_07_Lung', 'iBAQ Adult_08_Lung', 'iBAQ Adult_09_Lung', 'iBAQ Adult_10_Lung', 'iBAQ Adult_11_Lung', 'iBAQ Adult_12_Lung']
['Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


In [6]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# model_selection is its replacement (only needed for the random split below).
from sklearn import model_selection
from sklearn import preprocessing

# Transpose so that proteins are columns (components)
# Scale data
scaled_data = preprocessing.scale(iBAQ_df.T)

#########################
#
# Split data and labels into test and train groups
#
#########################

### Randomly split:
#X_train, X_test, y_train, y_test = model_selection.train_test_split(iBAQ_df.T, labels, test_size=0.4, random_state=0)

# Deterministic split: within each group the first N_TRAIN_PER_GROUP samples
# (in column order) go to the training set and the rest to the test set.
# With 6 samples per organ this selects rows 0-3, 6-9, ... for training and
# 4-5, 10-11, ... for testing, without hard-coding row offsets (the previous
# slices also referenced rows 30+ that do not exist in a 30-sample frame).
N_TRAIN_PER_GROUP = 4

train_idx, test_idx = [], []
seen_per_group = {}
for row, label in enumerate(labels):
    seen_per_group[label] = seen_per_group.get(label, 0) + 1
    if seen_per_group[label] <= N_TRAIN_PER_GROUP:
        train_idx.append(row)
    else:
        test_idx.append(row)

X_train = scaled_data[train_idx, :]
X_test = scaled_data[test_idx, :]

y_train = [labels[row] for row in train_idx]
y_test = [labels[row] for row in test_idx]

print(X_train.shape)
print(X_test.shape)
print(len(y_train))
print(len(y_test))

(20, 4399)
(10, 4399)
20
10




In [7]:
# Project the scaled samples onto the first two principal components.
# random_state is fixed because, for a matrix this wide with only two
# components, svd_solver='auto' can select the randomized solver, whose
# output otherwise varies from run to run.
pca = PCA(n_components=2, random_state=0)
# Fit on the training samples only, then apply the same projection to both sets.
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)

print(X_t_train.shape)
print(X_t_test.shape)
print(y_train)

(20, 2)
(10, 2)
['Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung']


In [9]:
import pandas as pd
import matplotlib.pyplot as plt

# Output directory handed to the plotting helpers.
base_dir = 'D:\\Images\\Classifier\\'
color_mapping = mq.map_colors(groups, organ_columns)
columns = iBAQ_df.columns.values.tolist()
# Column names of the training samples: first four of each organ's six
# contiguous columns. The frame has 30 columns, so no slice beyond 24:28 is needed.
train_columns = columns[:4] + columns[6:10] + columns[12:16] + columns[18:22] + columns[24:28]

# Keep the scree-plot return value in its own name: rebinding `labels` here
# would overwrite the per-sample class labels that earlier cells rely on and
# break them when re-run.
per_var, pc_labels = mq.make_scree_plot(pca, base_dir)
mq.draw_pca_graph(train_columns, X_t_train, base_dir, color_mapping, per_var, pc_labels)

## SVC

In [161]:
#########################
#
# SVC Classification
#
#########################

from sklearn.metrics import accuracy_score

# Train a default-parameter support vector classifier on the 2-D PCA
# projection of the training samples and predict the held-out samples.
clf = SVC()
clf.fit(X_t_train, y_train)
y_pred = clf.predict(X_t_test)

# accuracy_score expects (y_true, y_pred); reuse y_pred instead of predicting twice.
print('score', accuracy_score(y_test, y_pred))
print('pred label', y_pred)
print('actual', y_test)

score 0.6
pred label ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Liver' 'Liver' 'Lung'
 'Lung']
actual ['Brain', 'Brain', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Lung', 'Lung']


## K Neighbors

In [162]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Train a default-parameter k-nearest-neighbours classifier on the 2-D PCA
# projection of the training samples and predict the held-out samples.
knn = KNeighborsClassifier()
knn.fit(X_t_train, y_train)
y_pred = knn.predict(X_t_test)

# accuracy_score expects (y_true, y_pred).
print('score', accuracy_score(y_test, y_pred))
# Print the predictions computed above; the previous `pred_y` was never
# defined in this notebook and raised NameError on a fresh kernel.
print('pred', y_pred)
print('actual', y_test)

score 1.0
pred ['Brain' 'Brain' 'Heart' 'Heart' 'Kidney' 'Kidney' 'Liver' 'Liver' 'Lung'
 'Lung']
actual ['Brain', 'Brain', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Lung', 'Lung']
