# Building Classifiers

In [15]:
import Classification_Utils as cu
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Load and combine data from all tissues

In [16]:
# Read the tab-separated peptide quantification table; rows are keyed by the
# 'Peptide' column and every remaining column is one sample.
df = pd.read_csv(
    'FullPeptideQuant.txt',
    index_col='Peptide',
    sep='\t',
)
print(df.shape)

(68623, 253)


## Map each column to a corresponding label

In [17]:
# Tissue names to classify; the sample column names shown below start with these
# strings (how the helper matches them is defined in Classification_Utils).
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [18]:
# Sample (column) names in file order, then one label per column as produced by
# the helper from the tissue-to-column mapping.
column_names = df.columns.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

In [19]:
# Preview the first five peptides across all samples.
df.head(n=5)

Unnamed: 0_level_0,Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13,Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25,Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW006_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW007_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW008_8Apr16_Arwen_16-01-03,...,Temporal_Lobe_Alz_FX1P159_Guan_1_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_2_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_3_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_1_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_2_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_3_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX2P57_IMAC_153_9May11_Hawk_11-04-02p,Temporal_Lobe_Alz_FX2P57_IMAC_161_20Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_187_26Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_212_11May11_Hawk_11-04-02p
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-.DIQM*TQSPSTLSASVGDR.V,26.731951,22.187643,3.022208,29.328345,29.916272,3.022208,27.586706,30.458361,29.00413,31.023004,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQM*TQSPSTLSASVGDRVTITCR.A,3.022208,3.022208,3.022208,30.633308,30.815586,3.022208,29.483431,32.564995,30.319263,32.368436,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQMTQSPSTLSASVGDR.V,26.387537,28.015792,3.022208,3.022208,31.403752,3.022208,27.695976,29.779972,29.747784,30.255299,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQMTQSPSTLSASVGDRVTITCR.A,3.022208,3.022208,3.022208,3.022208,32.585511,3.022208,3.022208,30.845879,30.983525,27.526416,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.EVQLVETGGGLIQPGGSLR.L,24.54622,3.022208,3.022208,3.022208,26.732727,3.022208,28.163126,23.836245,3.022208,28.436388,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208


In [20]:
# Full list of sample column names, in file order.
df.columns.tolist()

['Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13',
 'Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25',
 'Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW006_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW007_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW008_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW009_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW010_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW011_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW012_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW013_19Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW014_8Apr16_Arwen_16-01-03',
 'Blood_Plasma_OMICS_EBV_HP_UW015_8Apr16_Arwen_16-01-03'

## Make train-test split

In [6]:
# Hold out a stratified test set. df is peptides x samples, so transpose to get
# one row per sample before splitting.
train_df, test_df, train_labels, test_labels = train_test_split(
    df.T, labels,
    test_size=0.30,    # 30% of the data held out in test set
    random_state=0,    # Setting random_state ensures the same train/test split occurs each time this is run
    stratify=labels)   # Maintain ratio of tissues represented in each set

# Remember the peptide (feature) order so other datasets can be aligned to it later.
train_features = train_df.columns.values.tolist()

# Fit each scaler on the TRAINING data only, then apply the learned parameters to
# the test data. Re-fitting on the test set would scale it with its own statistics,
# so the models would see features on a different scale than they were trained on.
std_scaler = StandardScaler()
train_df = std_scaler.fit_transform(train_df)
test_df = std_scaler.transform(test_df)

# The min-max scaler is fitted on the already-standardized training matrix.
# NOTE(review): transformed test values can fall slightly outside [0, 1] when a
# test sample exceeds the training range -- confirm this is acceptable downstream.
min_max_scaler = MinMaxScaler()
min_max_train_df = min_max_scaler.fit_transform(train_df)
min_max_test_df = min_max_scaler.transform(test_df)

## Train various classifiers, using cross-validation to produce an accuracy score

In [7]:
# Shared cross-validation setting, passed to every cu.*_crossval call below.
NUM_SPLITS = 100 # number of train/test splits in cross validation

### KNN

In [8]:
# Cross-validate a KNN classifier on the standardized training data. The returned
# object is reused further down for predict()/score() on held-out data.
knn = cu.knn_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.81 (+/- 0.10)


### Logistic Regression

In [9]:
# Cross-validate a logistic regression classifier on the standardized training
# data. The returned object is reused further down for predict()/score().
lr = cu.logistic_regression_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.99 (+/- 0.03)


### Naive Bayes
* Gaussian
* Multinomial

In [10]:
# Cross-validate a Gaussian Naive Bayes classifier on the standardized training
# data. The returned object is reused further down for predict()/score().
gnb = cu.bayes_gaussian_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.97 (+/- 0.05)


In [11]:
# Cross-validate a multinomial Naive Bayes classifier. Unlike the other models it
# is given the min-max scaled matrix -- presumably because multinomial NB needs
# non-negative inputs (TODO confirm in Classification_Utils). Any later
# predict()/score() call on this model must therefore also use min-max scaled data.
mnb = cu.bayes_multinomial_model_crossval(min_max_train_df, train_labels, NUM_SPLITS)

accuracy: 0.95 (+/- 0.06)


### SVC 

In [12]:
# Cross-validate an SVC classifier on the standardized training data. The
# returned object is reused further down for predict()/score().
svc = cu.SVC_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.98 (+/- 0.04)


### Aggregations
* Random Forest
* Gradient Boosting

In [13]:
# Cross-validate a random forest classifier on the standardized training data.
# The returned object is reused further down for predict()/score().
rf = cu.randomforest_model_crossval(train_df, train_labels, NUM_SPLITS)

accuracy: 0.97 (+/- 0.04)


In [14]:
# Cross-validate a gradient boosting classifier on the standardized training data.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so `gbc`
# may never have been assigned; every later cell that uses `gbc` will then fail
# with NameError on a fresh kernel. Let this cell finish (or drop gbc downstream).
gbc = cu.gradient_boosting_crossval(train_df, train_labels, NUM_SPLITS)

KeyboardInterrupt: 

## Classify Test Set

### Use models from notebook to predict new data

In [None]:
# Evaluate every trained model on the held-out test set: *_pred holds the
# predicted labels, *_result the value returned by the model's score() method.
# Each model must be given features scaled the same way as its training data.
lr_pred = lr.predict(test_df)
lr_result = lr.score(test_df, test_labels)

# mnb was trained on the min-max scaled matrix, so both predict() and score()
# must use min_max_test_df (scoring on test_df would feed it standardized data).
mnb_pred = mnb.predict(min_max_test_df)
mnb_result = mnb.score(min_max_test_df, test_labels)

rf_pred = rf.predict(test_df)
rf_result = rf.score(test_df, test_labels)

svc_pred = svc.predict(test_df)
svc_result = svc.score(test_df, test_labels)

# NOTE(review): requires the gradient boosting cell above to have completed.
gbc_pred = gbc.predict(test_df)
gbc_result = gbc.score(test_df, test_labels)

gnb_pred = gnb.predict(test_df)
gnb_result = gnb.score(test_df, test_labels)

knn_pred = knn.predict(test_df)
knn_result = knn.score(test_df, test_labels)

In [None]:
# One test-set score per line, in the order: lr, mnb, rf, svc, gbc, gnb, knn.
print(
    lr_result,
    mnb_result,
    rf_result,
    svc_result,
    gbc_result,
    gnb_result,
    knn_result,
    sep='\n',
)

## Confusion matrices of the Gaussian Naive Bayes model's predictions on the test set

In [None]:
# Every tissue label that occurs in either the predictions or the ground truth.
# Sorted so the matrix axes come out in the same order on every kernel run
# (iteration order of a set of strings is not stable across Python processes).
# The set union also works whether test_labels is a list or an array.
cm_labels = sorted(set(gnb_pred.tolist()) | set(test_labels))

cu.show_confusion_matrices(test_labels, gnb_pred, cm_labels)

## Classify Liver Cell Line Data

In [None]:
# Load the quantification table that includes the cell line samples and keep only
# the columns whose name contains 'Cell_Line'.
cell_line_df = pd.read_csv('TrainTestCellLineQuant.txt', sep='\t', index_col='Peptide')
cell_line_df = cell_line_df.filter(like='Cell_Line', axis=1) # Break off cell line data

# One row per sample, columns restricted to (and ordered like) the training features.
cell_line_df = cell_line_df.T[train_features]

# Apply the scalers as already fitted earlier in the notebook instead of
# re-fitting them here: re-fitting would scale the cell line samples with their
# own statistics, putting them on a different scale than the models were trained on.
cell_line_df = std_scaler.transform(cell_line_df)
min_max_cell_line_df = min_max_scaler.transform(cell_line_df)

# Every cell line sample is expected to be classified as liver; size the label
# list from the data rather than assuming a fixed number of samples.
cell_line_labels = ['Liver'] * cell_line_df.shape[0]
cell_line_df.shape

In [None]:
def _cell_line_eval(model, features):
    """Return (predictions, score) of `model` on a cell line feature matrix.

    The score is whatever `model.score` returns when compared against
    `cell_line_labels`.
    """
    return model.predict(features), model.score(features, cell_line_labels)


# Same order as the test-set evaluation; mnb gets the min-max scaled matrix,
# every other model the standardized one.
lr_cell_line_pred, lr_cell_line_result = _cell_line_eval(lr, cell_line_df)

mnb_cell_line_pred, mnb_cell_line_result = _cell_line_eval(mnb, min_max_cell_line_df)

rf_cell_line_pred, rf_cell_line_result = _cell_line_eval(rf, cell_line_df)

svc_cell_line_pred, svc_cell_line_result = _cell_line_eval(svc, cell_line_df)

gbc_cell_line_pred, gbc_cell_line_result = _cell_line_eval(gbc, cell_line_df)

gnb_cell_line_pred, gnb_cell_line_result = _cell_line_eval(gnb, cell_line_df)

knn_cell_line_pred, knn_cell_line_result = _cell_line_eval(knn, cell_line_df)

In [None]:
# One cell line score per line, in the order: lr, mnb, rf, svc, gbc, gnb, knn.
print(
    lr_cell_line_result,
    mnb_cell_line_result,
    rf_cell_line_result,
    svc_cell_line_result,
    gbc_cell_line_result,
    gnb_cell_line_result,
    knn_cell_line_result,
    sep='\n',
)

In [None]:
# Predicted tissue per cell line sample, one model per line
# (order: lr, mnb, rf, svc, gbc, gnb, knn).
print(
    lr_cell_line_pred,
    mnb_cell_line_pred,
    rf_cell_line_pred,
    svc_cell_line_pred,
    gbc_cell_line_pred,
    gnb_cell_line_pred,
    knn_cell_line_pred,
    sep='\n',
)

## Confusion matrices of the logistic regression model's cell line predictions

In [None]:
# The expected label ('Liver') plus every label the logistic regression model
# actually predicted. Sorted so the matrix axes come out in the same order on
# every kernel run (iteration order of a set of strings is not stable across
# Python processes).
cellline_cm_labels = sorted({'Liver'} | set(lr_cell_line_pred.tolist()))

cu.show_confusion_matrices(cell_line_labels, lr_cell_line_pred, cellline_cm_labels)