In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import requests
import zipfile
import glob
import pickle
import matplotlib.pyplot as plt

from parse_reference_spectra import parse_gnps_json, output_loop
from spectra_to_sirius import master_loop
from clone_ds_to_beta import copy_beta
from results_from_metaspace_msms_process import logon_metaspace
from metaspace.sm_annotation_utils import SMInstance
from results_from_metaspace_msms_process import split_data_frame_list
from sirius_to_metaspace_db import primary_loop
from cosine_spectra import score_alignment, read_ms_file
from metaspace_msms_mirror_spectra import mirror_main

Workflow steps on tissue data for the METASPACE MS/MS publication.

https://github.com/DinosaurInSpace/metaspace_msms_paper/blob/master/README.md
II. Steps 1-7

In [None]:
## 0. Load well behaved molecules from I.
# Intended input (currently disabled): presumably the validated spotted-compound
# hits saved by part I of the workflow -- TODO confirm and re-enable.
# well_behaved_hits = read_pickle('intermediate/val_spotted.pickle')

# for testing:
# Read the spotted-compound table. The file has a .csv extension but is parsed
# as tab-separated. Its `id` column is used later in this notebook to filter
# the reference spectra.
spotted_cmpds_df = pd.read_csv('input/spotted_cmpds.csv', sep="\t")

In [None]:
## 1. List of ds_ids and metadata for selected experiments.
# Maps each dataset id (timestamp-style string) to the dataset's name.
# Insertion order is significant: the `polarity` and `group` lists assigned to
# `wholebody_ds_df` are matched to these entries purely by position, so adding,
# removing or reordering an entry here requires the same change in those lists.
cmv3_wb_ds_name_dict = {'2016-09-21_16h07m45s': 'Technologie_ServierTES-WBrat-vehicle',
                        '2017-05-17_19h49m04s': 'whole body xenograft (1) [RMS norm]',
                        '2017-05-17_19h50m07s': 'wb xenograft trp pathway dosed- rms_corrected',
                        '2017-05-29_07h28m52s': 'servier_TT_mouse_wb_fmpts_derivatization_CHCA',
                        '2017-07-24_19h42m31s': 'Servier_Ctrl_mouse_wb_lateral_plane_9aa',
                        '2017-07-26_18h25m14s': 'Servier_Ctrl_mouse_wb_median_plane_9aa',
                        '2017-08-03_15h09m06s': 'Servier_Ctrl_mouse_wb_median_plane_chca',
                        '2017-08-03_15h09m51s': 'Servier_Ctrl_mouse_wb_lateral_plane_chca',
                        '2017-08-11_07h59m58s': 'Servier_Ctrl_mouse_wb_lateral_plane_DHB',
                        '2017-08-11_08h01m02s': 'Servier_Ctrl_mouse_wb_median_plane_DHB'
                        }

In [None]:
# Assemble one metadata row per whole-body dataset.
# `polarity` and `group` are positional: entry i describes the i-th key of
# `cmv3_wb_ds_name_dict`, so they must follow the dict's insertion order.
wholebody_ds_df = pd.DataFrame()
# Materialise the dict views as lists. A keys view is set-like rather than a
# sequence, and not every pandas release accepts one as column data.
wholebody_ds_df['ds_id'] = list(cmv3_wb_ds_name_dict.keys())
wholebody_ds_df['polarity'] = ['positive', 'positive', 'positive', 
                               'positive', 'negative', 'negative', 
                               'positive', 'positive', 'positive', 
                               'positive']
wholebody_ds_df['group'] = ['Servier', 'Genentech', 'Genentech',
                           'Servier', 'Servier', 'Servier',
                           'Servier', 'Servier', 'Servier',
                           'Servier']
# Scalars are broadcast to every row.
wholebody_ds_df['analyzer'] = 'FTICR'
wholebody_ds_df['expt_type'] = 'wholebody'
wholebody_ds_df['ds_id_in'] = wholebody_ds_df['ds_id']
# Left empty here; filled in later via `wholebody_ds_df.update(out_df)`.
wholebody_ds_df['ds_id_out'] = None
wholebody_ds_df['search'] = 'METASPACE_MSMS'
wholebody_ds_df['name'] = list(cmv3_wb_ds_name_dict.values())

wholebody_ds_df

In [None]:
## 2. Download METASPACE results.
# Clones to beta and searches with cm3
# Iterate over the dataset ids declared in step 1. The loop previously read
# `cmv3_wb_ds_pol_dict`, a name that is never defined in this notebook, so the
# cell raised NameError on a fresh kernel.
out_dict_list = []
for ds in cmv3_wb_ds_name_dict.keys():
    # 'HNaKM' is presumably the adduct selection understood by copy_beta -- confirm.
    x = copy_beta(ds, 'HNaKM', 'core_metabolome_v3')
    out_dict_list.append(x)
    
# One row per cloned dataset; persisted so later steps can be re-run without re-cloning.
out_df = pd.DataFrame(out_dict_list)
out_df.to_pickle('intermediate/wholebody_cm3_df.pickle')

In [None]:
# Downloads results
# Connect and authenticate once: neither call depends on the dataset being
# fetched, so repeating them on every iteration only adds round-trips.
sm = SMInstance(host='https://beta.metaspace2020.eu')
sm = logon_metaspace(sm)

# Collect one annotation table per cloned dataset, tagged with the id of the
# original dataset it was cloned from.
df_list = []
for ms_ds_id in out_df.ds_id_out:
    # First `ds_id_in` recorded for this cloned dataset id.
    ds_id_in = list(out_df[out_df.ds_id_out == ms_ds_id].ds_id_in)[0]
    ds = sm.dataset(id=ms_ds_id)
    results_df = ds.results(database='core_metabolome_v3').reset_index()
    results_df['ds_id_in'] = ds_id_in
    df_list.append(results_df)

In [None]:
# Filters MS1 results for FDR <=0.2
# Stack the per-dataset tables, keep rows at or below the FDR cut-off, reduce to
# the two identifying columns, then let split_data_frame_list expand 'moleculeIds'.
combined_results_df = pd.concat(df_list)
within_fdr = combined_results_df['fdr'] <= 0.2
id_columns = ['ds_id_in', 'moleculeIds']
ms1_df = split_data_frame_list(combined_results_df.loc[within_fdr, id_columns],
                               'moleculeIds')

In [None]:
## 3. Generating custom MS2 database using observed compounds.
# For every dataset, restrict the database to the molecule ids observed in that
# dataset's filtered MS1 results and build it in the dataset's own polarity.
for ds_id in list(wholebody_ds_df.ds_id):
    limit_list = list(ms1_df[ms1_df.ds_id_in == ds_id].moleculeIds.unique())
    polarity = list(wholebody_ds_df[wholebody_ds_df.ds_id == ds_id].polarity)[0]
    # Pass the current dataset id. The call previously passed `ds`, a variable
    # left over from the download loop, so every iteration received the same
    # stale object instead of the dataset being processed.
    primary_loop(limit_list,
                 ds_id,
                 'II_wholebody',
                 polarity,
                 'intermediate/sirius_out/exp_positive.pickle',
                 'intermediate/sirius_out/theo_positive.pickle',
                 'intermediate/sirius_out/exp_negative.pickle',
                 'intermediate/sirius_out/theo_negative.pickle',
                 'intermediate/databases/ref_expt_df.pickle',
                 'intermediate/databases/ref_theo_df.pickle'
                 )

# Send databases to Vitaly for uploading

In [None]:
## 4. Running and interpreting the METASPACE MSMS results.
# `copy_beta` is already imported in the notebook's import cell, so it is not
# re-imported here.
# Reuse the dataset ids declared in step 1 rather than keeping a second
# hand-typed copy of the same ten ids that could drift out of sync.
ori_ds_db = list(cmv3_wb_ds_name_dict.keys())

out_dict_list = []
for ds in ori_ds_db:
    print(ds)
    # The database argument is the dataset id itself -- presumably the custom
    # MS2 database built in step 3 is named after its dataset; confirm.
    x = copy_beta(ds, 'M', ds)
    out_dict_list.append(x)
    
# One row per cloned dataset; persisted for later re-use.
out_df = pd.DataFrame(out_dict_list)
out_df.to_pickle('intermediate/wholebody_ms2_df.pickle')

In [None]:
# Fold the clone results into the dataset table. DataFrame.update modifies
# `wholebody_ds_df` in place, aligning on index and column labels and taking
# the non-NA values from `out_df`.
# NOTE(review): which columns get overwritten depends on what copy_beta returns
# (presumably ds_id_in / ds_id_out); rows are matched by positional index, so
# this assumes `out_df` is in the same dataset order -- confirm.
wholebody_ds_df.update(out_df)

In [None]:
## 5. Generate colocalization-weighted pseudo-MS/MS spectra from ISF data.

In [None]:
# Test inputs, replace with actual data when available.
# Placeholder values for a single dataset; all seven names are passed to
# mirror_main further down.
original_ds_id_on_prod = '2020-03-12_17h55m21s'  # Update to run!
db_id_on_beta = '2020-05-13_17h50m21s'  # Update to run!
ds_id_on_beta = '2020-05-14_16h32m01s'  # Update to run!
path_to_reports = 'TEMP/reporting/'  # Update to run!
# Choose an option by changing the trailing index; [0] selects 'positive'.
polarity = ['positive', 'negative'][0]  # Update to run!
# Same selector idiom; [0] selects 'binary'.
psuedo_y_axis = ['binary', 'fdr', 'msm', 'cos', 'intensity'][0]
# Reference spectra table, loaded from a local pickle.
ref_spectra_df = pd.read_pickle("input/cm3_reference_spectra_df.pickle")

In [None]:
# Real inputs as data becomes available
# NOTE(review): the three ids are still None placeholders. When the notebook is
# run top to bottom this cell overwrites the test inputs set in the cell before
# it, so mirror_main is called with None ids until these are filled in.
original_ds_id_on_prod = None  # Update to run!
db_id_on_beta = None  # Update to run!
ds_id_on_beta = None  # Update to run!
path_to_reports = 'TEMP/reporting/'  # Update to run!
# Choose an option by changing the trailing index; [0] selects 'positive'.
polarity = ['positive', 'negative'][0]  # Update to run!
# Same selector idiom; [0] selects 'binary'.
psuedo_y_axis = ['binary', 'fdr', 'msm', 'cos', 'intensity'][0]
# Re-reads the reference spectra table, discarding any earlier filtering of it.
ref_spectra_df = pd.read_pickle("input/cm3_reference_spectra_df.pickle")


In [None]:
# Filter reference spectra and results on good spotted from above!
# Keep only the reference spectra whose `id` appears in the spotted-compound table.
spotted_ids = list(spotted_cmpds_df['id'])
is_spotted = ref_spectra_df['id'].isin(spotted_ids)
ref_spectra_df = ref_spectra_df[is_spotted]

In [None]:
# Have to run some kind of loop for each dataset!
# Single call for now: every argument is taken from the input cells above.
mirror_main(
    original_ds_id_on_prod,
    db_id_on_beta,
    ds_id_on_beta,
    path_to_reports,
    polarity,
    psuedo_y_axis,
    ref_spectra_df,
)

In [None]:
## 6. Score predicted, experimental, and pseudo-MS/MS spectra together for well-behaved subset.

# What scoring method?

In [None]:
## 7. Plot examples.
# Make a list of good examples to plot for paper...
def load_arr_pickle(path):
    """Unpickle and return the object stored in the file at `path`.

    The notebook uses this for pickled np.array images. The file is opened in
    binary mode and closed again before returning. Only use it on pickles you
    created yourself: unpickling can execute arbitrary code.
    """
    with open(path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# '*.pickle' is a glob pattern, not a file name: open() does not expand
# wildcards, so passing it straight to load_arr_pickle fails with
# FileNotFoundError. Expand the pattern and draw each matching array on its own
# figure, titled with the file it came from.
# TODO: narrow the pattern to the curated examples chosen for the paper.
for example_path in sorted(glob.glob('*.pickle')):
    fig, ax = plt.subplots()
    ax.imshow(load_arr_pickle(example_path))
    ax.set_title(example_path)

To do:
1. Run through it and check to make sure steps work.