# Using unlabelled, unfractionated datasets obtained from QExact and VOrbi instruments
* Datasets were searched against H_sapiens_Uniprot_SPROT_2017-04-12, Tryp_Pig_Bov sequence files using MSGFPlus
* Combined results with MASIC results (q <= 0.01) to get quantitation data

In [218]:
import Classification_Utils as cu
import math
import MaxQuant_Postprocessing_Functions as mq
import numpy as np
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

## Load and combine data from all tissues

In [219]:
# Load the tab-separated peptide quantitation table, using the 'Peptide' column as the row index.
df = pd.read_csv('FullPeptideQuant.txt', sep='\t', index_col='Peptide')
print(df.shape)  # (rows, columns) of the loaded table

(55676, 253)


## Map each column to a corresponding label

In [220]:
# Tissue names. They are passed to keep_percentile_peptides, which matches them
# against the start of each sample name, so they must be prefixes of the column names.
tissues = ['Blood_Plasma', 'Blood_Serum', 'CSF', 'Liver', 'Monocyte', 'Ovary', 'Pancreas', 'Substantia_Nigra', 'Temporal_Lobe']
 
# Project helper; presumably maps each tissue to its columns in df -- confirm in Classification_Utils.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [221]:
# Build the label list from the column names of df.
# NOTE(review): presumably one label per column, in column order -- confirm in cu.get_labels.
column_names = df.columns.values.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

## Make train-test split

In [222]:
# Split into train and test sets. df is transposed first, so each row handed to
# train_test_split corresponds to one column of the loaded table.
train_df, test_df, train_labels, test_labels = train_test_split(
    df.T, labels, test_size=0.30,  # 30% of the data held out in test set
    random_state=0,    # Setting random_state ensures the same train/test split occurs each time this is run
    stratify=labels)   # Maintain relative ratio of samples from each tissue

## Transform Train Data by Reducing Peptides

In [223]:
def keep_percentile_peptides(df, labels, tissues, percentile, impute_val):
    """Keep, for each tissue, only its most abundant peptides.

    For every tissue, the samples whose name starts with the tissue name are
    selected. Peptides never observed in that tissue (every value equal to
    ``impute_val`` or NaN) are discarded, and the remaining peptides are
    ranked by mean abundance; the top ``percentile`` percent are kept. The
    per-tissue tables are then outer-joined, so a peptide survives if at
    least one tissue kept it, and any gaps are filled with ``impute_val``.

    Args:
        df: DataFrame with one row per sample and one column per peptide.
            Row labels must be strings that begin with their tissue name.
        labels: Unused; retained so existing callers keep working.
        tissues: Iterable of tissue-name prefixes.
        percentile: Percentage of each tissue's observed peptides to keep.
            The resulting count is clamped to the range [0, number observed],
            so values below 0 keep nothing and values above 100 keep all.
        impute_val: Value that marks "not observed" in ``df``.

    Returns:
        DataFrame with samples as rows and the surviving peptides as columns.
        Rows are grouped by tissue in the order given by ``tissues``; samples
        matching none of the tissues are not included.

    Side effects:
        Prints, for each tissue, the number of observed peptides and the
        number of peptides dropped.
    """
    peptides_by_sample = df.T
    tissue_dfs = []

    for tissue in tissues:
        cols_to_drop = [col for col in peptides_by_sample.columns.values
                        if not col.startswith(tissue)]
        tissue_df = peptides_by_sample.drop(cols_to_drop, axis=1)

        # Mask the imputed value so peptides never observed in this tissue
        # become all-NaN rows and can be dropped, then restore the fill value.
        tissue_df = tissue_df.replace(impute_val, np.nan)
        tissue_df = tissue_df.dropna(axis=0, how='all')
        tissue_df = tissue_df.replace(np.nan, impute_val)

        # Rank peptides by mean abundance, highest first. A stable sort keeps
        # the outcome deterministic when several peptides tie at the cut-off.
        ranked = tissue_df.mean(axis=1).sort_values(ascending=False, kind='mergesort')

        num_observed = len(ranked)
        num_peptides_to_keep = math.ceil(num_observed * percentile / 100)
        # Clamp: a negative count would otherwise be read as "all but the
        # last k" by the slice below and silently keep most peptides.
        num_peptides_to_keep = min(max(num_peptides_to_keep, 0), num_observed)
        peptides_to_drop = ranked.iloc[num_peptides_to_keep:].index.values

        print(tissue_df.shape[0], len(peptides_to_drop))

        tissue_dfs.append(tissue_df.drop(peptides_to_drop, axis=0))

    # Outer-join on the peptide index so every peptide kept by any tissue is
    # present; cells for tissues that did not keep it come out as NaN.
    combined_df = pd.DataFrame()
    for tissue_df in tissue_dfs:
        combined_df = combined_df.join(tissue_df, how='outer')

    combined_df = combined_df.replace(np.nan, impute_val)

    return combined_df.T

In [224]:
# The fill value for "not observed" is taken to be the most frequent value in
# the training matrix. The mode is computed over all values: DataFrame.mode()
# would derive a mode for every column only to read the first column's, and
# that single column gives the wrong answer if its peptide is mostly observed.
imputed_val = pd.Series(train_df.values.ravel()).mode().iloc[0]

percentile_to_keep = 90  # percent of each tissue's observed peptides to keep

print(train_df.shape)
train_df = keep_percentile_peptides(train_df, train_labels, tissues, percentile_to_keep, imputed_val)
print(train_df.shape)

# Peptides that survived filtering, in the column order of the reduced training matrix
features_to_keep = train_df.columns.values.tolist()

# keep_percentile_peptides returns the samples regrouped by tissue, so the
# labels are rebuilt from the new row order.
column_names = train_df.index.values.tolist()
train_labels = cu.get_labels(column_names, tissues_to_columns)

(177, 55676)
7456 745
4845 484
1857 185
22400 2240
17001 1700
18600 1860
11937 1193
26817 2681
8847 884
(177, 55482)


## Train various classifiers, using cross-validation to produce an accuracy score

In [225]:
NUM_SPLITS = 1 # number of train/test splits in cross validation; passed to every cu.*_crossval call in this notebook

In [226]:
# Preview the first rows of the reduced training matrix (samples as rows, peptides as columns).
train_df.head()

Peptide,-.DIQM*TQSPSTLSASVGDR.V,-.DIQM*TQSPSTLSASVGDRVTITCR.A,-.DIQMTQSPSTLSASVGDR.V,-.DIQMTQSPSTLSASVGDRVTITCR.A,-.EVQLVETGGGLIQPGGSLR.L,-.GLSDGEWQQVLNVWGKVEADIAGHGQEVLIR.L,-.LGEHNIDVLEGNEQFINAAR.I,-.LGEHNIDVLEGNEQFINAARII.T,-.LGEHNIDVLEGNEQFINAARIITHPN.F,-.LGEHNIDVLEGNEQFINAARIITHPNFN.G,...,Y.YGYGPGYDYSQGSTNYGK.S,Y.YGYTGAFR.C,Y.YLEVNQLEK.F,Y.YLEVNQLEKFDIK.S,Y.YNEATGGNYVPR.A,Y.YNPGNPHNVYMPTSQPPPPPYYPPEDKKTQ.-,Y.YTEFTPTEKDEYACR.V,Y.YTGEKGQNQDYR.G,Y.YVTIIDAPGHR.D,Y.YYIQQDTK.G
Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,29.328345,30.633308,3.022208,3.022208,3.022208,3.022208,29.380036,3.022208,32.159696,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Blood_Plasma_OMICS_EBV_HP_UW009_8Apr16_Arwen_16-01-03,29.875856,32.504316,29.30182,3.022208,27.142775,3.022208,32.691831,3.022208,31.291647,26.705301,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Blood_Plasma_OpPlasma_034_a_13Aug11_Jaguar_11-07-18,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Blood_Plasma_OpPlasma_049_a_13Aug11_Jaguar_11-07-16,29.131658,3.022208,29.981145,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
Blood_Plasma_OpPlasma_039_a_13Aug11_Jaguar_11-07-16,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208


### KNN

In [227]:
# KNN via the project helper. The returned object is used later for .predict/.score,
# so it is presumably a fitted estimator -- confirm in Classification_Utils.
knn = cu.knn_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.83 (+/- 0.00)


### Logistic Regression

In [228]:
# Logistic regression via the project helper. The returned object is used later for
# .predict/.score, so it is presumably a fitted estimator -- confirm in Classification_Utils.
lr = cu.logistic_regression_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.98 (+/- 0.00)


### Naive Bayes
* Gaussian
* Multinomial

In [229]:
# Gaussian naive Bayes via the project helper. The returned object is used later for
# .predict/.score, so it is presumably a fitted estimator -- confirm in Classification_Utils.
gnb = cu.bayes_gaussian_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.94 (+/- 0.00)


In [None]:
# Multinomial naive Bayes via the project helper. The returned object is used later for
# .predict/.score, so it is presumably a fitted estimator -- confirm in Classification_Utils.
mnb = cu.bayes_multinomial_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.81 (+/- 0.00)


### SVC variations

In [None]:
# SVC variants via the project helper. The result is indexed later (svc_models[0]),
# so it is presumably a sequence of fitted estimators -- confirm in Classification_Utils.
svc_models = cu.SVC_models_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.98 (+/- 0.00)


### Aggregations
* Random Forest
* Gradient Boosting

In [None]:
# Random forest via the project helper. The returned object is used later for
# .predict/.score, so it is presumably a fitted estimator -- confirm in Classification_Utils.
rf = cu.randomforest_model_crossval(train_df, train_labels, NUM_SPLITS)

In [None]:
# Gradient boosting via the project helper. The returned object is used later for
# .predict/.score, so it is presumably a fitted estimator -- confirm in Classification_Utils.
gbc = cu.gradient_boosting_crossval(train_df, train_labels, NUM_SPLITS)

## Classify Test Set

### Use models from notebook to predict new data

In [None]:
# The models were trained on the reduced peptide set, while test_df still has
# every peptide. Select the same columns, in the same order, before predicting
# so the feature matrices line up.
test_features = test_df[features_to_keep]

lr_pred = lr.predict(test_features)
lr_result = lr.score(test_features, test_labels)

mnb_pred = mnb.predict(test_features)
mnb_result = mnb.score(test_features, test_labels)

rf_pred = rf.predict(test_features)
rf_result = rf.score(test_features, test_labels)

# Only the first of the SVC variants is evaluated on the test set
svc_pred = svc_models[0].predict(test_features)
svc_result = svc_models[0].score(test_features, test_labels)

knn_pred = knn.predict(test_features)
knn_result = knn.score(test_features, test_labels)

gnb_pred = gnb.predict(test_features)
gnb_result = gnb.score(test_features, test_labels)

gbc_pred = gbc.predict(test_features)
gbc_result = gbc.score(test_features, test_labels)

In [None]:
# Print each model's test-set score, one per line, in this order: SVC, KNN,
# Gaussian NB, gradient boosting, multinomial NB, logistic regression, random forest.
for model_score in (svc_result, knn_result, gnb_result, gbc_result,
                    mnb_result, lr_result, rf_result):
    print(model_score)

In [None]:
# Every label that appears in either the KNN predictions or the true test labels.
# Sorted because set iteration order is not reproducible between runs, and the
# set union also works when test_labels is not a plain list.
cm_labels = sorted(set(knn_pred.tolist()) | set(test_labels))

cu.show_confusion_matrices(test_labels, knn_pred, cm_labels)