In [3]:
# Automatically reload imported modules whenever their source changes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import pandas as pd
import numpy as np
import pathlib
import pickle
from scipy.ndimage import median_filter
from sklearn.metrics.pairwise import cosine_similarity
import glob
import re
from collections import Counter
from results_processing_on_dl_results import *

This notebook analyzes the results from three separate experiments:
1. Authentics standard spotting (VS, n=9):
metaspace_MSMS_VS_9_spotted_datasets.ipynb

2. Wholebody MSI (various, n=10):
metaspace_MSMS_whole_body_matched_II.ipynb

3. High-quality datasets in METASPACE (top labs, n=328):
metaspace_MSMS_high_quality_datasets.ipynb

These experiments were processed through the METASPACE MS/MS workflow. Output dataframes and image arrays are concatenated here to answer a series of scientific questions about the workflow:
https://docs.google.com/document/d/1QDb5LIYcyF2fl_EFr8TJowLTrUtZYziBpA34Xtf6DrY/edit?usp=sharing

Next notebook in series is:
http://localhost:8888/notebooks/PycharmProjects/word2vec/score_datasets_and_ids.ipynb

In [None]:
# Read the per-experiment metadata tables (high-quality datasets, spotted
# standards, whole-body datasets) and stack them into one frame.
hq_ds_ids, spotted_df, wb_df = (
    pd.read_pickle(metadata_pickle)
    for metadata_pickle in (
        'combined_analysis/hq_ds_ids.pickle',
        'combined_analysis/spot_ds_ids_v2.pickle',
        'to_metaspace/wholebody_matched/wholebody_ms2_df.pickle',
    )
)
all_metadata = pd.concat([hq_ds_ids, spotted_df, wb_df], sort=True)

In [None]:
# Dataset ids to score for each experiment.  x / z come from the metadata
# table; y is the fixed list of the nine spotted-standard datasets.
x = all_metadata.loc[all_metadata.expt_type == 'high_quality', 'ds_id_out'].tolist()
y = [
    '2020-05-14_16h32m01s', '2020-05-14_16h32m04s', '2020-05-14_16h32m07s',
    '2020-05-14_16h32m10s', '2020-05-14_16h32m14s', '2020-05-14_16h32m16s',
    '2020-05-14_16h32m19s', '2020-05-14_16h32m22s', '2020-05-14_16h32m26s',
]
z = all_metadata.loc[all_metadata.expt_type == 'wholebody', 'ds_id_out'].tolist()

# One single-entry mapping per experiment: image-array name -> dataset ids.
to_score_wcos = [
    {'METASPACE_img_df_2020_May_18_good_quality_arr': x},
    {'METASPACE_img_df_2020_May_20_spotted_9x_arr': y},
    {'METASPACE_img_df_2020_May_26_wholebody_arr': z},
]

# Score every experiment; the third argument is the array name with
# '_df.pickle' appended.
df_list = [
    annotate_cos_parent_fragment(arr_name, arr_ds_ids, arr_name + '_df.pickle')
    for expt in to_score_wcos
    for arr_name, arr_ds_ids in expt.items()
]
master_df = pd.concat(df_list, sort=True)

# Drop 2 bad results out of 600k, add formula annotations
master_df = master_df[master_df.cos != 'error!'].copy(deep=True)
master_df['delta_formula'] = master_df.apply(
    lambda row: delta_formula(row.par_formula, row.formula), axis=1)

# Join with metadata.
# The scored frame built above is used directly.  Previously it was replaced
# here by pd.read_pickle('data_analysis/master_df'), i.e. by the file this
# same cell writes at its end: that threw away the scoring results, failed
# when the file did not exist yet, and on a re-run renamed 'ds_id' onto an
# already-present 'ds_id_out' column.
master_df = master_df.rename(columns={'ds_id': 'ds_id_out'})
master_df = pd.merge(master_df, all_metadata, on='ds_id_out', how='left')

# Drop wholebody derivatized dataset, few annotations, inappropriate db
master_df = master_df[master_df.name_y != 'servier_TT_mouse_wb_fmpts_derivatization_CHCA'].copy(deep=True)

# Filter HQ-DS for 100 IDs at 10% FDR (parent) 318 --> 264
# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configurable data directory.
hq_annotations = pd.read_pickle('/Users/dis/PycharmProjects/neutral_loss/high_quality.pickle')
passing_fdr = hq_annotations[hq_annotations.fdr <= 0.1]

# Count annotation rows per dataset directly.  The previous approach summed
# bool(fdr), which silently skipped any row whose fdr value is 0.
n_ids_per_ds = passing_fdr.groupby('ds_id').size()
good_ds_id = list(n_ids_per_ds[n_ids_per_ds >= 100].index)

# Keep rows of the retained datasets, then re-attach every row that does not
# belong to the high-quality experiment (same row order as before: retained
# rows first, remaining experiments after).
df = master_df[master_df.ds_id.isin(good_ds_id)].copy(deep=True)
master_df = master_df[master_df.expt_type != 'high_quality'].copy(deep=True)
master_df = pd.concat([df, master_df])

master_df.to_pickle('data_analysis/master_df')

In [5]:
from molmass import Formula

def ion_mass(formula, polarity):
    """Return the exact mass of an ion from its formula and polarity.

    Parameters
    ----------
    formula : str
        Ion formula, passed unchanged to ``Formula``.
    polarity : str
        ``'positive'`` subtracts one electron mass from the formula's
        monoisotopic mass; any other value (``'negative'`` in this notebook)
        adds one electron mass.

    Returns
    -------
    float
        ``Formula(formula).isotope.mass`` corrected by one electron mass.
    """
    # Electron mass in Da, at the precision used throughout this notebook.
    electron_mass = 0.00055
    formula_mass = Formula(formula).isotope.mass
    if polarity == 'positive':
        # A cation has lost an electron.
        return formula_mass - electron_mass
    # An anion has gained an electron.  Previously this branch subtracted the
    # electron mass as well, making both polarities identical.
    return formula_mass + electron_mass

In [7]:
# Annotate cm3_msms with exact mass of ions, isobars, and isobar counts
def _annotate_cm3_msms(csv_path, polarity):
    """Read one tab-separated cm3_msms table and add the polarity, ion_mass,
    db_isobar and db_n_isobar columns."""
    ions = pd.read_csv(csv_path, sep='\t')
    ions['polarity'] = polarity
    ions['ion_mass'] = ions.apply(
        lambda row: ion_mass(row.formula, row.polarity), axis=1)
    ions = find_3_ppm_overlap_in_ds(ions, expt=False, out_col='db_isobar')
    return n_isobars(ions, 'db_n_isobar', 'db_isobar')


pos_df = _annotate_cm3_msms('to_metaspace/cm3_msms_all_pos.csv', 'positive')
neg_df = _annotate_cm3_msms('to_metaspace/cm3_msms_all_neg.csv', 'negative')

# Stack both polarities and drop the first column of the combined frame
# (presumably a leftover index column -- TODO confirm).
both_polarities = pd.concat([pos_df, neg_df], sort=True)
cm3_msms_df = both_polarities.iloc[:, 1:]
cm3_msms_df.to_pickle('to_metaspace/cm3_msms_all_both.pickle')

In [9]:
# Load df's to join:
master_df = pd.read_pickle('data_analysis/master_df')
cm3_msms_df = pd.read_pickle('to_metaspace/cm3_msms_all_both.pickle')
cm3_msms_df = cm3_msms_df[['polarity', 'id', 'formula',
                           'db_n_isobar', 'ion_mass', 'db_isobar']].copy(deep=True)

# Makes key column to join on!
master_df['id'] = master_df['id_x'] + "_" + master_df['par_frag']
merged_df = master_df.merge(cm3_msms_df, how='left',
                            on=['polarity', 'id', 'formula'])

# Drop 1411 of 686,821 rows that didn't join successfully.
# Take an explicit copy, like every other filter in this notebook, so that
# columns added to merged_df later are not written onto a slice of the merge
# result (which can raise SettingWithCopyWarning).
merged_df = merged_df[merged_df.ion_mass.notnull()].copy(deep=True)

In [11]:
# Flag isobars inside each dataset (expt=True), then add the per-row isobar
# count derived from the 'ds_isobar' column.
ds_isobar_df = find_3_ppm_overlap_in_ds(merged_df, expt=True, out_col='ds_isobar')
merged_df = n_isobars(ds_isobar_df, 'ds_n_isobar', 'ds_isobar')

0 2017-07-21_14h46m19s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[out_col] = df[col].apply(lambda x: find_3_ppm(x, col, target, df))


1 2016-12-01_18h42m02s
2 2017-08-08_08h49m46s
3 2019-07-09_11h26m26s
4 2017-08-16_11h15m04s
5 2019-07-23_23h51m54s
6 2017-07-18_10h11m20s
7 2017-02-17_14h41m43s
8 2019-09-12_00h25m56s
9 2018-10-15_11h30m30s
10 2017-03-14_05h58m20s
11 2016-10-14_17h51m23s
12 2018-06-14_00h45m31s
13 2017-08-08_09h05m39s
14 2017-08-08_09h04m00s
15 2018-08-01_14h19m26s
16 2017-07-20_14h22m38s
17 2017-08-01_07h51m51s
18 2018-08-01_13h35m03s
19 2017-08-01_07h50m35s
20 2018-10-20_22h35m05s
21 2017-07-07_07h37m30s
22 2018-08-01_14h18m42s
23 2019-10-29_21h29m01s
24 2018-09-14_22h42m37s
25 2016-09-22_11h16m27s
26 2017-07-21_09h18m43s
27 2019-10-29_16h53m33s
28 2019-03-08_16h46m27s
29 2019-05-10_20h18m39s
30 2018-09-04_00h48m58s
31 2017-12-20_16h28m12s
32 2018-01-04_15h10m02s
33 2017-08-09_10h07m05s
34 2017-08-01_07h48m41s
35 2019-07-19_19h41m43s
36 2016-11-18_06h01m50s
37 2018-07-23_12h10m59s
38 2017-02-23_08h53m07s
39 2017-12-20_16h49m07s
40 2018-11-21_21h36m35s
41 2017-07-21_09h23m08s
42 2017-08-08_14h28m01s
4

In [14]:
# Persist the isobar-annotated frame to disk.
merged_df.to_pickle('data_analysis/scored_df.pickle')

Next notebook is: http://localhost:8888/notebooks/PycharmProjects/word2vec/score_datasets_and_ids.ipynb