In [1]:
import pandas as pd
import numpy as np
import sys
import os, os.path

The purpose of this experiment is to figure out how best to automate a large number of searches on the beta server.  The goal is to search all existing public datasets on the beta server with 'core_metabolome_v3' for the 'H2O' neutral loss, at FDR=0.5.  Data will later be filtered to FDR<=0.2 for the parent, and FDR<=0.5 for the neutral loss, since the current FDR calculations may unfairly penalize neutral losses.

Here, we will use a subset of "high-quality" datasets from the top-4 labs submitting orbitrap/FTMS in the positive and negative ion modes.

Data will be searched against a "core metabolome DB".

# Off-line steps:
1. Metadata as csv was downloaded from: https://beta.metaspace2020.eu/ on 2020 Feb 06.
2. Data were imported into Google Sheets, because the export is comma-separated but the text itself also contains commas:
https://docs.google.com/spreadsheets/d/1DOLikG1euG-brCrMB5jrKwiu7bknPUVGVf8JsZXY3hM/edit?usp=sharing
3. Data were further filtered for quality as described above in: "neutral_loss/good_nl_reports/high_quality_data_investigations.ipynb"
4. Core metabolome was calculated as in: "core_metabolome/core_metabolome_v3.pickle" 

In [2]:
# Metadata export covering every dataset on the beta server (read as tab-delimited).
beta_raw = pd.read_csv(
    '/Users/dis/PycharmProjects/neutal_loss_2/Metaspace_beta_2020_Feb.csv',
    sep='\t',
)

# Ids of the good-quality datasets, taken from the `good_datasets` column.
good_ds_list = list(pd.read_csv('good_ds_2020_Feb_25.txt')['good_datasets'])

# Keep only the rows whose datasetId appears in the good-quality list.
is_good = beta_raw['datasetId'].isin(good_ds_list)
beta_raw = beta_raw[is_good]

# Core metabolome (loading is currently disabled)
# core_metabolome = pd.read_pickle('core_metabolome_v1.pickle')

In [3]:
# Per-dataset metadata subset; the 'FDR@10%' column is stored under the name 'FDR10-v4'.
meta_columns = ['datasetId', 'group', 'analyzer', 'polarity', 'FDR@10%']
meta = beta_raw[meta_columns].rename(columns={'FDR@10%': 'FDR10-v4'})

# Persist the subset as a pickle file in the working directory.
meta.to_pickle('ds_id_meta.pickle')

In [4]:
def _join_present_fields(record):
    """Return the non-missing values of one row joined with commas."""
    return ','.join(record.dropna().values.tolist())


# Strip every comma from string cells: the comma is the field separator of `query`.
beta_df = beta_raw.replace({',': ''}, regex=True)

# Metadata fields that make up the per-dataset query string, in order.
columns = [
    'datasetId',
    'datasetName',
    'polarity',
    'organism',
    'organismPart',
    'analyzer',
    'ionisationSource',
    'maldiMatrix',
]
temp_df = beta_df[columns].copy(deep=True)

# One comma-separated string per dataset; missing fields are skipped.
beta_df['query'] = temp_df.apply(_join_present_fields, axis=1)
# Empty placeholder column.
beta_df['result'] = ''

In [5]:
# Show the (rows, columns) dimensions of beta_df.
beta_df.shape

(433, 20)

In [6]:
#%%capture cap_out
i = 0 # 0
j = 432 # Max = 3248

while i <= j:
    x = "Current row is: " + str(i) + " of 432"
    print(x)
    i += 1
    row = beta_df.iloc[i,:]

    with open('/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/neutral_loss_report_beta_test_3.py', 'r') as file:
        filedata = file.read()

    filedata = filedata.replace('Literal_to_replace_ds_id', 
                                row.query)
    filedata = filedata.replace('Literal_to_replace_out', 
                                row.query.split(',')[0])

    with open('/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/neutral_loss_report_beta_test_4.py', 'w') as file:
        file.write(filedata)
        
    %run -i 'neutral_loss_report_beta_test_4.py'

Current row is: 0 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 1 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 2 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 3 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 4 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 5 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 6 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 7 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 8 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 9 of 432
Una

Authorized.
Current row is: 76 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 77 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 78 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 79 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 80 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 81 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 82 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 83 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 84 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current

Authorized.
Current row is: 151 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 152 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 153 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 154 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 155 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 156 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 157 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 158 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 159 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized

Authorized.
Current row is: 226 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 227 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 228 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 229 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 230 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 231 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 232 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 233 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 234 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized

Authorized.
Current row is: 301 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 302 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 303 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 304 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 305 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 306 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 307 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 308 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 309 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized

Authorized.
Current row is: 376 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 377 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 378 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 379 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 380 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 381 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 382 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 383 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized.
Current row is: 384 of 432
Unauthorized. Only public but not private datasets will be accessible.
Authorized

IndexError: single positional indexer is out-of-bounds

In [None]:
# NOTE(review): only a cell's last expression is echoed, so this bare `i` displays nothing.
i
# NOTE(review): `cap_out` exists only if the `%%capture cap_out` magic at the top of the
# search-loop cell is enabled (it is commented out there) — confirm before running.
cap_out.show()

In [12]:
# Count the regular files (sub-directories excluded) in the reports folder.
DIR = '/Users/dis/PycharmProjects/neutral_loss/good_nl_reports/reports'
report_files = [
    entry for entry in os.listdir(DIR)
    if os.path.isfile(os.path.join(DIR, entry))
]
print(len(report_files))

318


In [10]:
# Write the output of cap_out.show() to a text file by temporarily pointing
# sys.stdout at that file.
# NOTE(review): `cap_out` is presumably the object created by `%%capture cap_out`;
# that magic is commented out in the search-loop cell — confirm it is defined.
original = sys.stdout
# The `with` block closes the log file (previously the handle was leaked).
with open('cap_out_Feb__14_2020.txt', 'w') as log_file:
    sys.stdout = log_file
    try:
        cap_out.show()
    finally:
        # Restore stdout even if show() raises, so later cells can still print.
        sys.stdout = original

In [None]:
# Repeat download after all processing complete with reprocess turned off!
# Download to a fresh new folder

In [None]:
# Next script is:
# (left as a bare string so the notebook URL is echoed as this cell's output)
'http://localhost:8888/notebooks/PycharmProjects/neutral_loss/nl_0_3_clean_join_nb.ipynb'