# Comparing human lung data to mouse tissues
Variations tested:
* Based on protein abundance
* Normalizing all data together
* iBAQ abundance values

Variations to test:
* Based on peptide abundance
* Normalizing mouse and human data separately
* LFQ abundance values

In [1]:
import os
import sys

# Append the absolute path of the parent directory to the import search path,
# but only if it is not already listed (presumably so that helper modules
# stored one level above this notebook can be imported by later cells).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import pandas as pd

## 1. Load mouse data

In [3]:
# Raw string, like the other Windows paths in this notebook: the backslash is
# kept literally instead of relying on '\p' being an unrecognised (and
# deprecated) escape sequence. The resulting path is unchanged.
mouse_protein_file = r"D:\proteinGroups.txt"

# Load the mouse proteinGroups table and run it through the project's
# cleaning helpers (see the MaxQuant_Postprocessing_Functions module).
mouse_protein_df = mq.load_df(mouse_protein_file)
mouse_protein_df = mq.clean_weakly_identified(mouse_protein_df)
mouse_protein_df = mq.remove_dup_proteinIDs(mouse_protein_df)

# One frame per quantification type. NOTE(review): the trailing space in
# 'iBAQ ' looks deliberate (to select only per-sample columns) -- confirm
# against mq.slice_by_column.
mouse_iBAQ_df = mq.slice_by_column(mouse_protein_df, 'protein', 'iBAQ ')
mouse_LFQ_df = mq.slice_by_column(mouse_protein_df, 'protein', 'LFQ')

mouse_iBAQ_df.columns = cu.rename_columns(mouse_iBAQ_df, 'Adult', 'Mouse')
mouse_LFQ_df.columns = cu.rename_columns(mouse_LFQ_df, 'Adult', 'Mouse')

mouse_groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
mouse_organ_to_columns = {}
mouse_organ_counts = {}

# Drop the last 6 characters of every protein ID (the species suffix), then
# index both frames by the shortened ID.
mouse_iBAQ_df['Majority protein IDs'] = mouse_iBAQ_df['Majority protein IDs'].str[:-6] # strip off '_Mouse'
mouse_LFQ_df['Majority protein IDs'] = mouse_LFQ_df['Majority protein IDs'].str[:-6] # strip off '_Mouse'
mouse_iBAQ_df.set_index('Majority protein IDs', inplace = True)
mouse_LFQ_df.set_index('Majority protein IDs', inplace = True)

## 2. Load human data

* Human dataset info:
    * Instrument: QExactHF03
    * Separation Type: LC-Waters-Formic_3hr
    * Tool: MSGFPlus_MzMl
    * Jobs: 1498824-1498852
    * Param file: MSGFDB_PartTryp_MetOx_StatCysAlk_10ppmParTol.txt
    * Unlabelled samples

In [4]:
human_lung_protein_file = r'F:\Human_Lung_Raw_Files\LungMAP\combined\txt\human_lung_proteinGroups.txt'
human_groups = ['Human_Lung']

# Load the human lung proteinGroups table and pass it through the same two
# cleaning helpers that are applied to the mouse table.
human_lung_df = mq.remove_dup_proteinIDs(
    mq.clean_weakly_identified(
        mq.load_df(human_lung_protein_file)
    )
)

human_lung_organ_columns = dict()
human_lung_organ_counts = dict()

# iBAQ frame: cut the last 6 characters off each protein ID, then index by it.
human_lung_iBAQ_df = mq.slice_by_column(human_lung_df, 'protein', 'iBAQ ')
human_lung_iBAQ_df['Majority protein IDs'] = (
    human_lung_iBAQ_df['Majority protein IDs'].str.slice(stop=-6)
)
human_lung_iBAQ_df.set_index('Majority protein IDs', inplace=True)

# LFQ frame: identical treatment.
human_lung_LFQ_df = mq.slice_by_column(human_lung_df, 'protein', 'LFQ')
human_lung_LFQ_df['Majority protein IDs'] = (
    human_lung_LFQ_df['Majority protein IDs'].str.slice(stop=-6)
)
human_lung_LFQ_df.set_index('Majority protein IDs', inplace=True)

## 3. Load human-mouse correspondence data

In [5]:
mapping_file = r'D:\Human_Mouse_Mapping.txt'

# Rows are split on '\r'; any '\n' that remains inside a cell afterwards is
# removed by the regex replace right after loading.
mapping_df = pd.read_csv(
    mapping_file,
    usecols=['Matched Term', 'Symbol', 'Species'],
    sep='\t',
    lineterminator='\r',
    encoding='latin1',
)
mapping_df = mapping_df.replace(r'\n', '', regex=True)

# Keep rows whose "Species" is missing or mentions 'Human'; everything else is
# filtered out. na=False keeps the mask strictly boolean for missing species.
mapping_df = mapping_df[
    mapping_df['Species'].isnull()
    | mapping_df['Species'].str.contains('Human', na=False)
]
mapping_df.set_index('Matched Term', inplace=True)
mapping_df.drop(['Species'], axis=1, inplace=True)

# Remove the trailing ' (includes others)' comment from symbols.
# Series.replace() only matches whole cell values, so it never touched symbols
# such as 'ABC (includes others)'; a literal substring replacement is needed.
# Assigning the result back also avoids an in-place edit on a selected column.
mapping_df['Symbol'] = mapping_df['Symbol'].str.replace(
    ' (includes others)', '', regex=False
)

In [6]:
#########################
#
# Change mouse proteinIDs to common symbol
#
#########################

mouse_proteins = mouse_iBAQ_df.index.values.tolist()
human_proteins = human_lung_iBAQ_df.index.values.tolist()
raw_mappings = mapping_df.to_dict('index') # {mouse protein: {'Symbol': common protein}}
mappings = {}

# Break up rows with multiple mouse proteins: a whitespace-separated
# "Matched Term" yields one entry per individual term, all sharing the
# same mapping record.
for old_key, val in raw_mappings.items():
    for new_key in old_key.split():
        mappings[new_key] = val

# Set for O(1) membership tests; human_proteins itself stays a list.
human_protein_set = set(human_proteins)

# Collect every rename first and apply them in a single pass below. Calling
# DataFrame.replace once per protein rescans the whole frame each time and
# lets an earlier rename be renamed again by a later one, depending on
# iteration order.
renames = {}
for protein in mouse_proteins:
    if protein in human_protein_set:
        continue  # ID already matches a human ID; leave it untouched
    mapping = mappings.get(protein + '_MOUSE')
    if mapping is None:
        continue  # no correspondence entry for this mouse protein
    new_sym = mapping['Symbol']
    # Skip missing/empty symbols so an ID is never replaced by NaN or ''.
    if isinstance(new_sym, str) and new_sym:
        renames[protein] = new_sym

# Index labels without an entry in `renames` are left as they are.
# NOTE(review): only mouse_iBAQ_df is renamed here; mouse_LFQ_df keeps its
# original IDs.
mouse_iBAQ_df.rename(index=renames, inplace=True)

In [7]:
# Preview the first rows of the mouse iBAQ table after the ID renaming step.
print(mouse_iBAQ_df.head())

                      iBAQ Mouse_04_Liver  iBAQ Mouse_05_Liver  \
Majority protein IDs                                             
1433B                          80377000.0          106810000.0   
1433E                         251680000.0          225180000.0   
1433F                          32883000.0           46963000.0   
1433G                         175610000.0          166310000.0   
1433S                          53834000.0           62327000.0   

                      iBAQ Mouse_06_Liver  iBAQ Mouse_07_Brain  \
Majority protein IDs                                             
1433B                         129430000.0         6.599400e+08   
1433E                         266450000.0         1.231800e+09   
1433F                          44594000.0         7.019100e+08   
1433G                         193140000.0         1.754000e+09   
1433S                          93074000.0         5.072200e+08   

                      iBAQ Mouse_07_Heart  iBAQ Mouse_07_Kidney  \
Majorit

## 4. Combine data 

### a. Normalize Separately 

### b. Normalize Together 

In [8]:
#########################
#
# Join mouse data to human data
#
#########################

# DataFrame.join defaults to a left join on the index: every mouse protein row
# is kept, proteins found only in the human table are dropped, and mouse rows
# without a human match get NaN in the human columns.
combined_df = mouse_iBAQ_df.join(human_lung_iBAQ_df)

# One column-name pattern per tissue group. The '.*' suggests the mq helpers
# match these as regular expressions -- TODO confirm.
all_organs = ['Mouse.*Brain', 'Mouse.*Heart', 'Mouse.*Kidney', 'Mouse.*Liver', 'Mouse.*Lung', 'Human_Lung']
# Passed in empty and read again further down without any assignment in
# between, so filter_low_observed presumably fills them -- verify in mq.
organs_to_columns = {}
organs_to_observed_counts = {}

combined_df = mq.filter_low_observed(combined_df, all_organs, organs_to_columns, organs_to_observed_counts)
# The return values of the two normalisation helpers are discarded, so they
# are expected to modify combined_df in place -- verify in mq.
mq.log2_normalize(combined_df)
mq.median_normalize(combined_df)
combined_df = mq.reorder_columns(combined_df, all_organs, organs_to_columns)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## 5. Plots: Normalized Boxplot, PCA, Pearson matrix

In [9]:
# A raw string cannot end in a single backslash, hence the doubled one; the
# resulting value therefore ends in two literal backslash characters.
base_dir = r'D:\Images\Classifier\\'
# Used as a path *prefix* handed to the plotting helpers, not as a directory.
combined_dir = base_dir + 'Human_Lung_Mouse_Tissues_'
combined_color_mapping = mq.map_colors(all_organs, organs_to_columns)

# The boxplot is drawn before imputation, i.e. on the data with missing values.
mq.make_seaborn_boxplot(combined_df, combined_dir, 'Median Normalized Boxplot', combined_color_mapping)

# From here on combined_df is the imputed table; later cells use this version.
combined_df = mq.impute_missing(combined_df)

all_columns = combined_df.columns.values.tolist()

In [10]:
# Run the PCA helper on the imputed, combined table.
combined_pca, combined_pca_data = mq.do_pca(combined_df, 'protein')

# The scree-plot helper returns two values that are passed straight on to the
# PCA scatter plot (presumably per-component variance and axis labels --
# confirm in mq).
combined_per_var, combined_labels = mq.make_scree_plot(combined_pca, combined_dir)
mq.draw_pca_graph(all_columns, combined_pca_data, combined_dir, combined_color_mapping, combined_per_var, combined_labels)

In [11]:
# Pearson matrix of the combined table, using the same output path prefix as
# the other plots; `dimensions` is presumably the figure size -- confirm in mq.
mq.make_pearson_matrix(combined_df, combined_dir, dimensions=(20,15))

## 6. Classifiers 

In [12]:
#########################
#
# Split off mouse data for training and human data for testing
#
#########################

# Column names identify which species each column of combined_df came from.
mouse_cols = mouse_iBAQ_df.columns.tolist()
human_lung_cols = human_lung_iBAQ_df.columns.tolist()

# Transpose so that each row is one sample and each column one protein.
human_lung_data = combined_df[human_lung_cols].transpose()
mouse_data = combined_df[mouse_cols].transpose()

In [13]:
# Split the organ -> columns mapping by the species named in each key.
mouse_organs_to_columns = {
    organ: organs_to_columns[organ]
    for organ in organs_to_columns
    if 'Mouse' in organ
}
human_organs_to_columns = {
    organ: organs_to_columns[organ]
    for organ in organs_to_columns
    if 'Human' in organ
}

In [14]:
#########################
#
# Get mouse (training) labels and human (test) labels
#
#########################

# Training labels: the literal 'Mouse.*' prefix is removed from each label
# (plain substring replacement, not a regex).
mouse_labels = [
    raw_label.replace('Mouse.*', '')
    for raw_label in cu.get_labels(mouse_data, mouse_cols, mouse_organs_to_columns)
]

# Test labels: the literal 'Human_' prefix is removed from each label.
human_lung_labels = [
    raw_label.replace('Human_', '')
    for raw_label in cu.get_labels(human_lung_data, human_lung_cols, human_organs_to_columns)
]

### SVC variations

In [16]:
# Mouse samples are passed as training data and the human lung samples as
# test data; the helper prints each model with its score and predictions
# (see the captured output below).
models = cu.SVC_models(mouse_data, mouse_labels, human_lung_data, human_lung_labels)


*** Model:  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

score 1.0
pred label ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung'
 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'Lung']
actual ['Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']

*** Model:  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

score 1.0
pred label ['Lung' 'Lung' 'Lung' 'Lung' 'Lung' 'L

### KNN

In [None]:
# Build the KNN model from the mouse samples only. NOTE(review): the 4 is
# presumably the number of cross-validation folds -- confirm in
# Classification_Utils.
knn = cu.knn_model_crossval(mouse_data, mouse_labels, 4)

In [None]:
# Apply the KNN model to the human lung samples.
human_lung_pred = cu.make_test_prediction(knn, human_lung_data, human_lung_labels)

print("\n")
# NOTE(review): the last argument (4) presumably selects which sample to
# report -- confirm in Classification_Utils.
cu.show_prediction_probabilities(knn, human_lung_data, 4)

### Decision Tree

In [None]:
# Build the decision-tree model from the mouse samples only. NOTE(review): the
# 4 is presumably the number of cross-validation folds -- confirm in
# Classification_Utils.
dt = cu.decisiontree_model_crossval(mouse_data, mouse_labels, 4)

In [None]:
# Apply the decision-tree model to the human lung samples.
human_lung_pred = cu.make_test_prediction(dt, human_lung_data, human_lung_labels)

print("\n")
cu.show_prediction_probabilities(dt, human_lung_data, 0)

# human_lung_data has one sample per row and one protein per column, the same
# orientation as the mouse training data. predict_proba must therefore be
# given a row slice (1 x proteins); the transposed column slice used before is
# (proteins x 1) and does not match the feature count the model was fitted
# with. Assumes `dt` follows the scikit-learn estimator API (it exposes
# predict_proba and classes_).
first_sample = human_lung_data.iloc[:1]
print(first_sample.T)  # same protein-by-sample view that was printed before
pred_probabilities = dt.predict_proba(first_sample)
classes = dt.classes_

print('Prediction probabilities for sample:')
# predict_proba returns one row per sample; pair each class with its
# probability for the single sample rather than with whole rows.
for label, prob in zip(classes, pred_probabilities[0]):
    print(label, ':', prob)

## 7. Highly expressed proteins
**Feature selection**
* SelectKBest
* SelectPercentile
* MutualInfoClassif