In [1]:
import pandas as pd
import numpy as np
import sys
import os, os.path

The purpose of this experiment is to figure out how best to automate a larger number of searches on the beta server. The goal is to search all existing public datasets on the beta server with 'core_metabolome_v3' for the 'H2O' neutral loss, at FDR=0.5. Data will later be filtered to FDR<=0.2 for the parent and FDR<=0.5 for the neutral loss, since the current FDR calculations may unfairly penalize neutral losses.

Here, we will use a subset of "high-quality" datasets from the top four labs submitting Orbitrap/FTMS data in the positive and negative ion modes.

Data will be searched against a "core metabolome DB".

# Off-line steps:
1. Metadata as csv was downloaded from: https://beta.metaspace2020.eu/ on 2020 Feb 06.
2. Data were imported into Google Sheets, because the export is comma-separated but some of the text fields themselves contain commas:
https://docs.google.com/spreadsheets/d/1DOLikG1euG-brCrMB5jrKwiu7bknPUVGVf8JsZXY3hM/edit?usp=sharing
3. Data were further filtered for quality as described above in: "neutral_loss/good_nl_reports/high_quality_data_investigations.ipynb"
4. Core metabolome was calculated as in: "core_metabolome/core_metabolome_v3.pickle" 

In [2]:
# Metadata export covering every dataset on the beta server (tab-separated file).
beta_raw = pd.read_csv(
    '/Users/dis/PycharmProjects/neutal_loss_2/Metaspace_beta_2020_Feb.csv',
    sep='\t',
)

# Dataset IDs flagged as good quality, taken from the 'good_datasets' column
# of the list file.
good_ds_list = list(pd.read_csv('good_ds_2020_Feb_25.txt')['good_datasets'])

# Keep only the rows whose datasetId appears in the good-quality list.
beta_raw = beta_raw.loc[beta_raw['datasetId'].isin(good_ds_list)]

# Core metabolome (loading is currently disabled)
# core_metabolome = pd.read_pickle('core_metabolome_v1.pickle')

In [3]:
# Per-dataset metadata table: ID, group, analyzer, polarity and the 'FDR@10%'
# column (renamed to 'FDR10-v4'), pickled to disk.
meta = (
    beta_raw
    .loc[:, ['datasetId', 'group', 'analyzer', 'polarity', 'FDR@10%']]
    .rename(columns={'FDR@10%': 'FDR10-v4'})
)
meta.to_pickle('ds_id_meta.pickle')

In [4]:
# Remove every comma from the string cells -- presumably so that the
# comma-joined query built below can later be split on ',' safely.
beta_df = beta_raw.replace({',': ''}, regex=True)

# Metadata fields that make up the query string, in this order.
columns = [
    'datasetId', 'datasetName', 'polarity', 'organism',
    'organismPart', 'analyzer', 'ionisationSource', 'maldiMatrix',
]
temp_df = beta_df[columns].copy(deep=True)


def _join_fields(fields):
    """Comma-join the non-null values of one row of query fields."""
    return ','.join(fields.dropna().values.tolist())


# One query string per dataset, plus an empty placeholder column for results.
beta_df['query'] = temp_df.apply(_join_fields, axis=1)
beta_df['result'] = ''

In [5]:
# Sanity check: (number of rows, number of columns) of the filtered frame.
beta_df.shape

(433, 20)

In [None]:
#%%capture cap_out
i = 0 # 0
j = 432 # Max = 3248

while i <= j:
    x = "Current row is: " + str(i) + " of 432"
    print(x)
    i += 1
    row = beta_df.iloc[i,:]

    with open('/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/neutral_loss_report_beta_test_3.py', 'r') as file:
        filedata = file.read()

    filedata = filedata.replace('Literal_to_replace_ds_id', 
                                row.query)
    filedata = filedata.replace('Literal_to_replace_out', 
                                row.query.split(',')[0])

    with open('/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/neutral_loss_report_beta_test_4.py', 'w') as file:
        file.write(filedata)
        
    %run -i 'neutral_loss_report_beta_test_4.py'

Current row is: 0 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 637 annotations for 2018-10-12_09h25m16s @ 0.5
Current row is: 1 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 626 annotations for 2018-10-16_09h53m21s @ 0.5
Current row is: 2 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 680 annotations for 2018-10-31_10h26m49s @ 0.5
Current row is: 3 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 739 annotations for 2018-11-08_15h29m23s @ 0.5
Current row is: 4 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 678 annotations for 2019-01-17_19h54m30s @ 0.5
Current row is: 5 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 1533 annotations for 2019-03-08_16h46m27s @ 0.5
Current row is: 6 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 2406 annotations 

Exception: 2019-11-19_09h49m15s not found

Current row is: 18 of 432
Unauthorized. Only public but not private datasets will be accessible.


Exception: 2019-11-19_17h59m17s not found

Current row is: 19 of 432
Unauthorized. Only public but not private datasets will be accessible.


Exception: 2019-11-27_11h08m37s not found

Current row is: 20 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 677 annotations for 2017-08-04_12h38m01s @ 0.5
Current row is: 21 of 432
Unauthorized. Only public but not private datasets will be accessible.


Exception: 2018-06-18_10h46m15s not found

Current row is: 22 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 324 annotations for 2018-09-14_18h15m42s @ 0.5
Current row is: 23 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 2190 annotations for 2016-10-10_10h50m00s @ 0.5
Current row is: 24 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 2153 annotations for 2016-11-15_12h00m00s @ 0.5
Current row is: 25 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 273 annotations for 2018-03-26_22h32m23s @ 0.5
Current row is: 26 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 951 annotations for 2018-02-22_10h25m27s @ 0.5
Current row is: 27 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 1063 annotations for 2016-10-14_17h51m23s @ 0.5
Current row is: 28 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 1580 ann

Exception: 2016-09-22_11h16m16s not found

Current row is: 65 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 596 annotations for 2017-08-30_14h30m02s @ 0.5
Current row is: 66 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 385 annotations for 2017-08-25_07h57m51s @ 0.5
Current row is: 67 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 302 annotations for 2017-08-18_16h00m58s @ 0.5
Current row is: 68 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 319 annotations for 2017-08-16_11h15m04s @ 0.5
Current row is: 69 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 364 annotations for 2017-08-16_08h30m18s @ 0.5
Current row is: 70 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 129 annotations for 2017-08-16_07h50m21s @ 0.5
Current row is: 71 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 252 annotat

Got 722 annotations for 2017-08-01_07h45m52s @ 0.5
Current row is: 121 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 709 annotations for 2017-08-01_07h45m10s @ 0.5
Current row is: 122 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 721 annotations for 2017-08-01_07h44m05s @ 0.5
Current row is: 123 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 695 annotations for 2017-08-01_07h43m16s @ 0.5
Current row is: 124 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 759 annotations for 2017-08-01_07h42m41s @ 0.5
Current row is: 125 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 722 annotations for 2017-08-01_07h41m55s @ 0.5
Current row is: 126 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 747 annotations for 2017-08-01_07h41m18s @ 0.5
Current row is: 127 of 432
Unauthorized. Only public bu

Got 98 annotations for 2017-03-11_19h45m33s @ 0.5
Current row is: 176 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 198 annotations for 2017-03-11_19h48m10s @ 0.5
Current row is: 177 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 835 annotations for 2016-12-21_14h25m44s @ 0.5
Current row is: 178 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 156 annotations for 2016-12-21_14h07m58s @ 0.5
Current row is: 179 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 866 annotations for 2016-12-21_09h49m12s @ 0.5
Current row is: 180 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 234 annotations for 2016-10-21_07h18m06s @ 0.5
Current row is: 181 of 432
Unauthorized. Only public but not private datasets will be accessible.
Got 494 annotations for 2016-09-22_11h16m15s @ 0.5
Current row is: 182 of 432
Unauthorized. Only public but

In [None]:
# Report how far the loop got. A bare `i` on this line would be discarded, because
# Jupyter only echoes the last expression of a cell, so print it explicitly.
print(i)
# Replay the captured loop output. `cap_out` only exists if the `%%capture cap_out`
# magic was enabled on the loop cell.
cap_out.show()

In [12]:
# Count the regular files (sub-directories excluded) directly inside the reports folder.
DIR = '/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/reports'
print(sum(1 for entry in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, entry))))

318


In [10]:
# Save the captured loop output to a text file by temporarily pointing sys.stdout
# at it (assumes `cap_out` is the object produced by IPython's `%%capture cap_out`,
# whose show() writes the captured text to sys.stdout).
original = sys.stdout
with open('cap_out_Feb__14_2020.txt', 'w') as log_file:
    sys.stdout = log_file
    try:
        cap_out.show()
    finally:
        # Always restore the notebook's stdout, even if show() raises; the `with`
        # block then closes the log file so its contents are flushed to disk.
        sys.stdout = original

In [None]:
# Repeat download after all processing complete with reprocess turned off!
# Download to a fresh new folder

In [None]:
# Next script is:
# (the bare string below is the cell's last expression, so Jupyter echoes it as
# output; it is the URL of the follow-up notebook on a local Jupyter server)
'http://localhost:8888/notebooks/PycharmProjects/neutral_loss/nl_0_3_clean_join_nb.ipynb'