# Using unlabelled, unfractionated datasets obtained from QExact and VOrbi instruments
* Datasets were searched against the H_sapiens_Uniprot_SPROT_2017-04-12 and Tryp_Pig_Bov sequence files using MSGFPlus
* Combined results with MASIC to get quantitation data

In [1]:
import os
import sys
# Put the parent directory on the import path (only once) -- presumably so
# the project-local modules imported in the next cell can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
from os import listdir
import pandas as pd

## Load (and combine?) data from all tissues

In [10]:
# Directory holding the input files. The backslashes are escaped explicitly:
# the previous literal relied on '\D' being an unrecognized escape sequence
# that Python passes through unchanged, which raises a DeprecationWarning /
# SyntaxWarning on newer Python versions. The resulting string is unchanged
# (F:\Data_for_classifier\).
files_dir = 'F:\\Data_for_classifier\\'
# Names of every entry in that directory (no filtering by extension).
file_paths = listdir(files_dir)

# Combine the listed files into a single table via the project helper
# (the preview below shows peptides as rows and one column per dataset).
df = cu.combine_csvs(files_dir, file_paths)

In [4]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0_level_0,Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13,Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25,Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03,Blood_Plasma_RZHJ_012_16Jun10_Owl_10-02-04,Blood_Plasma_Trypsin_Digestion_Step5_Sample1_4Mar13_Lynx_13-02-11,Blood_Plasma_Trypsin_Digestion_Step5_Sample2_4Mar13_Lynx_13-02-11
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
\n-.DIQM*TQSPSTLSASVGDR.V,111460000.0,4776900.0,,674080000.0,1013200000.0,,201570000.0,,,
\n-.DIQM*TQSPSTLSASVGDRVTITCR.A,,,,1665500000.0,1889800000.0,,750580000.0,,,
\n-.DIQMTQSPS.T,113990000.0,,,,,,,,,
\n-.DIQMTQSPSTLSASVGDR.V,87789000.0,271390000.0,,,2841000000.0,,217430000.0,,12897000.0,29051000.0
\n-.DIQMTQSPSTLSASVGDRVTITCR.A,,,,,6444900000.0,,,,,


## Clean data
* Log2 transform
* Mean/Median normalize
* Impute missing values

In [11]:
# Log2-transform the intensities. The return value is discarded, so the
# helper presumably modifies df in place -- TODO confirm in mq.
mq.log2_normalize(df)

# Standardize each column (z-score): subtract the column mean and divide by
# the column standard deviation. NaNs are skipped by mean()/std().
df = (df - df.mean())/df.std()

# median normalize
#mq.median_normalize(df)

# Impute missing values with one constant that lies below every observed
# value. After the standardization above the global minimum is negative, so
# the previous df_min/2 landed *above* the minimum (between the lowest
# observation and the column mean). Stepping half of |df_min| below the
# minimum gives the same value as df_min/2 whenever df_min is positive and
# stays below the observed range when it is negative.
df_min = df.min().min()
impute_val = df_min - abs(df_min)/2
df = df.fillna(impute_val)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [12]:
# Full tissue list, kept for reference; only Blood_Plasma is active in this run.
#tissues = ['Blood_Plasma', 'Blood_Serum', 'Liver', 'Monocyte', 'Ovary', 'Pancreas', 'Substantia_Nigra', 'Temporal_Lobe']
tissues = ['Blood_Plasma']
            
# Build a dict mapping each tissue name to a list of df column names
# (see the displayed output below), then display it.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)
tissues_to_columns

{'Blood_Plasma': ['Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13',
  'Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25',
  'Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03',
  'Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03',
  'Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03',
  'Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03',
  'Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03',
  'Blood_Plasma_RZHJ_012_16Jun10_Owl_10-02-04',
  'Blood_Plasma_Trypsin_Digestion_Step5_Sample1_4Mar13_Lynx_13-02-11',
  'Blood_Plasma_Trypsin_Digestion_Step5_Sample2_4Mar13_Lynx_13-02-11']}

In [15]:
# Derive labels from the tissue-to-column mapping (presumably one label per
# column -- verify against cu.get_labels).
# NOTE(review): the recorded output of this cell is a StopIteration error, so
# this call did not complete in the saved run; investigate cu.get_labels.
labels = cu.get_labels(df, tissues, tissues_to_columns)

# Sort columns by tissue type for visualization purposes
# TODO: not implemented -- no code follows this comment.

StopIteration: 

## Visualize data
* Normalized boxplots
* Scree plot
* PCA plot
* Pearson Matrix

In [13]:
# Output directory for figures. A raw string cannot end in a single
# backslash, so the previous r'...\\' literal actually ended in TWO literal
# backslashes; escaping the separators explicitly yields the intended path
# with a single trailing separator (D:\Images\Human_Tissues\).
image_dir = 'D:\\Images\\Human_Tissues\\'

# Assign a colour to every column; the displayed output shows each column
# name mapped to an RGB tuple.
column_to_color = mq.map_colors(tissues, tissues_to_columns)
column_to_color

{'Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03': (0.85999999999999999,
  0.37119999999999997,
  0.33999999999999997),
 'Blood_Plasma_RZHJ_012_16Jun10_Owl_10-02-04': (0.85999999999999999,
  0.37119999999999997,
  0.

In [14]:
# Draw the per-column boxplot; image_dir and the title string are presumably
# used to build the saved figure's path -- verify in mq.
# NOTE(review): the title says "Median_normalized", but the cleaning cell
# applies mean/std normalization (the median step is commented out) --
# confirm the title matches the normalization actually used.
mq.make_seaborn_boxplot(df, image_dir, 'Median_normalized_boxplots', column_to_color)

## Test various classifiers using cross-validation

### Decision Tree

### KNN

### Logistic Regression

### Naive Bayes
* Gaussian
* Multinomial

### SVC variations

### Aggregations
* Decision Tree
* Gradient Boosting

## Tune parameters of best models (if applicable)
* Check accuracy score and F1 score (measure of precision and recall)

##  Confusion matrices of best models

## Top expressed proteins/peptides per tissue

## Save model
* Save array/dataframe of features (via pickle?) along with final model
* Write a script to classify new data -- load the saved features and fit the new data on them