# Using unlabelled, unfractionated datasets obtained from QExact and VOrbi instruments
* Datasets were searched against H_sapiens_Uniprot_SPROT_2017-04-12, Tryp_Pig_Bov sequence files using MSGFPlus
* Combined results with MASIC to get quantitation data

In [1]:
import os
import sys
# Put the parent directory on the import path so that modules stored one level
# above this notebook can be imported by the following cells.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [28]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing

## Load and combine data from all tissues

In [70]:
# Folder holding the files to combine. The backslashes are written as explicit
# escapes: the previous literal 'F:\Data_for_classifier\\' only worked because
# '\D' is an unrecognised escape sequence, which Python tolerates with a
# DeprecationWarning/SyntaxWarning. The resulting string is unchanged.
files_dir = 'F:\\Data_for_classifier\\'
file_paths = listdir(files_dir)

# Combine every listed file into a single DataFrame.
# NOTE(review): listdir returns every directory entry, not only CSVs -- confirm
# that cu.combine_csvs tolerates or filters anything else in the folder.
df = cu.combine_csvs(files_dir, file_paths)

In [71]:
print(df.shape)
df.head()

(117884, 69)


Unnamed: 0_level_0,Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13,Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25,Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03,Blood_Plasma_RZHJ_012_16Jun10_Owl_10-02-04,Blood_Plasma_Trypsin_Digestion_Step5_Sample1_4Mar13_Lynx_13-02-11,Blood_Plasma_Trypsin_Digestion_Step5_Sample2_4Mar13_Lynx_13-02-11,...,Substantia_Nigra_Lewy_1Ct1_3Sep13_Pippin_13-06-18,Substantia_Nigra_Lewy_2Ct1_3Sep13_Pippin_13-06-18,Substantia_Nigra_Lewy_4Ct1_3Sep13_Pippin_13-06-18,Substantia_Nigra_Lewy_5Ct1_3Sep13_Pippin_13-06-18,Substantia_Nigra_Lewy_6Ct1_13Sep13_Pippin_13-06-18,Substantia_Nigra_Lewy2_1Ct1_2Nov13_Samwise_13-07-28,Substantia_Nigra_Lewy2_2Ct1_2Nov13_Samwise_13-07-28,Substantia_Nigra_Lewy2_4Ct1_2Nov13_Samwise_13-07-28,Substantia_Nigra_Lewy2_5Ct1_2Nov13_Samwise_13-07-28,Substantia_Nigra_Lewy2_6Ct1_2Nov13_Samwise_13-07-28
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
\n,,,,,,,,,,,...,,,,,,,,,,
\n-.DIQM*TQSPSTLSASVGDR.V,111460000.0,4776900.0,,674080000.0,1013200000.0,,201570000.0,,,,...,,,,,,,,,,
\n-.DIQM*TQSPSTLSASVGDRVTITCR.A,,,,1665500000.0,1889800000.0,,750580000.0,,,,...,,,,,,,,,,
\n-.DIQMTQSPS.T,113990000.0,,,,,,,,,,...,,,,,,,,,,
\n-.DIQMTQSPSTLSASVGDR.V,87789000.0,271390000.0,,,2841000000.0,,217430000.0,,12897000.0,29051000.0,...,,,,138110000.0,,,,,,


## Clean data
* Log2 transform
* Mean/Median normalize
* Impute missing values

In [72]:
# Log2-transform the intensities. The return value is discarded, so the helper
# is relied on to modify df in place (the source line echoed in this cell's
# output shows it assigning np.log2(...) back through df.iloc[:, :]).
mq.log2_normalize(df)

# Impute missing values with half of the smallest value in the whole table.
# NOTE(review): this runs after the log2 transform, so "min / 2" halves a
# log-scale value (a square root in linear space) instead of halving the raw
# intensity -- confirm this is the intended floor.
df_min = df.min().min()
impute_val = df_min/2
df = df.fillna(impute_val)

# mean normalize (alternative, currently disabled)
#df = (df - df.mean())/df.std()

# median normalize; return value discarded, so presumably in place as well
mq.median_normalize(df)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [73]:
# Tissue names used as class labels. The sample column names visible in
# df.head() start with such a name (e.g. 'Blood_Plasma_...', 'Substantia_Nigra_...').
tissues = ['Blood_Plasma', 'Blood_Serum', 'Liver', 'Monocyte', 'Ovary', 'Pancreas', 'Substantia_Nigra']
            
# Tissue -> columns lookup built by the project helper
# (presumably by matching the names above against df's column names -- confirm).
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)
# Uncomment to inspect the mapping.
#tissues_to_columns

In [74]:
# Tissue labels derived from the column names through tissues_to_columns
# (presumably one label per column, in column order -- confirm in
# cu.get_labels); used as the target for the classifiers further down.
column_names = df.columns.values.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

# TODO: sort columns by tissue type for visualization purposes (not implemented yet)

## Visualize data
* Normalized boxplots
* Scree plot
* PCA plot
* Pearson Matrix

In [75]:
# Folder passed to the plotting helpers below (presumably where figures are
# written). Written with explicit escapes: the previous raw literal
# r'D:\Images\Human_Tissues\\' kept both trailing backslashes, so the path
# ended in a doubled separator.
image_dir = 'D:\\Images\\Human_Tissues\\'

# Colour lookup used by the boxplot and PCA plots.
# NOTE(review): the literal 7 equals len(tissues); presumably the number of
# distinct colours requested -- confirm against mq.map_colors.
column_to_color = mq.map_colors(tissues, tissues_to_columns, 7)

In [78]:
mq.make_seaborn_boxplot(df, image_dir, 'Median_normalized_boxplots', column_to_color)

In [80]:
# Transpose so that rows are samples and columns are peptides, then standardize
# each peptide column (sklearn's scale works column-wise by default).
scaled_data = preprocessing.scale(df.T)
# Unscaled alternative, currently disabled.
#scaled_data = df.T

pca = PCA() # no n_components given, so scikit-learn's default is used
pca.fit(scaled_data) # learn the principal axes from the scaled samples
pca_data = pca.transform(scaled_data) # project every sample onto those axes

# The scree-plot helper returns two values that are passed straight on to the
# PCA scatter plot (presumably per-component variance percentages and axis
# labels -- confirm in mq.make_scree_plot).
per_var, pca_labels = mq.make_scree_plot(pca, image_dir)
mq.draw_pca_graph2(column_names, pca_data, image_dir, column_to_color, per_var, pca_labels, tissues, tissues_to_columns)

  "matplotlib is currently using a non-GUI backend, "


In [77]:
mq.make_pearson_matrix(df, image_dir)

## Test various classifiers using cross-validation

In [82]:
# Number of cross-validation folds passed to every *_crossval helper below.
NUM_FOLDS = 8
# Samples as rows, peptides as columns: the orientation handed to the classifiers.
transformed_df = df.T

### Decision Tree

In [84]:
dt = cu.decisiontree_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.85714286  0.69230769  0.85714286  0.85714286  0.85714286  0.71428571
  1.          1.        ]
Accuracy: 0.85 (+/- 0.21)


### KNN

In [85]:
knn = cu.knn_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.71428571  0.92307692  1.          1.          0.71428571  0.71428571
  0.71428571  1.        ]
Accuracy: 0.85 (+/- 0.27)


### Logistic Regression

In [86]:
lr = cu.logistic_regression_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.92857143  0.92307692  1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.96 (+/- 0.10)


### Naive Bayes
* Gaussian
* Multinomial

In [87]:
gnb = cu.bayes_gaussian_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 1.          0.92307692  1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.97 (+/- 0.10)


In [88]:
mnb = cu.bayes_multinomial_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 1.          1.          1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.98 (+/- 0.09)


### SVC variations

In [90]:
# Cross-validates several SVC variants in one call; the recorded output has
# four Scores/Accuracy pairs. The third sits at ~0.14, i.e. chance level for
# seven tissues (1/7).
# NOTE(review): check that variant's kernel / feature scaling.
svc = cu.SVC_models_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.92857143  1.          1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.97 (+/- 0.10)
Scores: [ 0.92857143  0.92307692  1.          1.          1.          0.85714286
  0.85714286  1.        ]
Accuracy: 0.95 (+/- 0.12)
Scores: [ 0.14285714  0.15384615  0.14285714  0.14285714  0.14285714  0.14285714
  0.14285714  0.14285714]
Accuracy: 0.14 (+/- 0.01)
Scores: [ 0.92857143  0.92307692  0.85714286  1.          0.85714286  0.85714286
  0.85714286  1.        ]
Accuracy: 0.91 (+/- 0.12)


### Aggregations
* Random Forest
* Gradient Boosting

In [92]:
rf = cu.randomforest_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.92857143  0.92307692  1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.96 (+/- 0.10)


In [93]:
gbc = cu.gradient_boosting_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.92857143  1.          1.          1.          1.          0.85714286
  1.          1.        ]
Accuracy: 0.97 (+/- 0.10)


## Tune parameters of best models (if applicable)
* Check accuracy score and F1 score (harmonic mean of precision and recall)

### Gradient Boosting grid search

### Random Forest grid search

In [None]:
# Random-forest grid search using NUM_FOLDS folds. This cell has no execution
# count ("In [None]"), so no recorded output exists for it.
# NOTE(review): the second argument (1) is not explained here -- presumably a
# parallelism or verbosity setting; confirm in cu.rf_grid_search.
rf_grid = cu.rf_grid_search(NUM_FOLDS, 1)

# Run the search on the samples-by-peptides matrix and the tissue labels.
rf_grid.fit(transformed_df, labels)

# best_params_ / best_score_ match the attribute names of scikit-learn's
# search objects (presumably rf_grid is a GridSearchCV -- confirm).
print('Best Random Forest parameters:\n', rf_grid.best_params_)
print('\nBest Cross-Validation score:\n', rf_grid.best_score_)

### SVC grid search

## PCA of data reduced according to best grid search reduction method

## Confusion matrices of best models

## Top expressed proteins/peptides per tissue

## Save model
* Save array/dataframe of features (via pickle?) along with final model
* Write a script to classify new data: load the saved features and fit the new data to them