# 3. Generating a set of random chemicals

In [1]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting is visible in this part of the notebook -- confirm this is still needed.
%matplotlib inline

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the local package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path
import glob

In [4]:
import sys
# Add the directory two levels above the working directory to the module search path.
# Presumably this is the repository root, so the `vimms` imports in the next cell
# resolve to the local checkout -- TODO confirm.
sys.path.append('../..')

In [5]:
from vimms.DataGenerator import extract_hmdb_metabolite, get_data_source, get_spectral_feature_database
from vimms.MassSpec import IndependentMassSpectrometer
from vimms.Controller import SimpleMs1Controller
from vimms.Common import *
from vimms.Roi import make_roi, RoiToChemicalCreator, extract_roi
from vimms.Chemicals import ChemicalCreator

In [6]:
# Use debug-level logging so the sampling progress messages are shown.
# Switch to set_log_level_info() for less verbose output.
set_log_level_debug()

### Load Existing Chromatogram and Fragment Pickle File

You can either load the existing pickle file, or generate your own pickle file by following Section 1.

In [7]:
# The fixtures folder sits two levels above the current working directory,
# under tests/integration/fixtures.
data_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, 'tests', 'integration', 'fixtures')
)
# Load the existing pickle file from the fixtures folder (see the section text above).
ps = load_obj(Path(data_dir) / 'peak_sampler_mz_rt_int_beerqcb_fullscan.p')

Define the output folder where our results will be saved.

In [8]:
# `base_dir` is used by this cell and by later cells, but it was never defined in
# this notebook, so running it on a fresh kernel failed here with a NameError.
# Assumes the data lives in an 'example_data' folder inside the working directory,
# which matches the path shown in the saved output of the HMDB cell -- adjust if needed.
base_dir = Path.cwd() / 'example_data'

# Folder where the results produced by this notebook are written.
out_dir = Path(base_dir, 'results', 'MS1_single')

### Download HMDB sample

In [9]:
# Local cache of the extracted HMDB compounds; reused on later runs so the
# download below only happens once.
compound_file = Path(base_dir, 'hmdb_compounds.p')
hmdb_compounds = load_obj(compound_file)
if hmdb_compounds is None: # if file does not exist

    # download the entire HMDB metabolite database
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'

    out_file = download_file(url)
    # delete=True presumably removes the downloaded archive after extraction -- TODO confirm
    compounds = extract_hmdb_metabolite(out_file, delete=True)
    save_obj(compounds, compound_file)

    # Bind the result to the same name as the cached path; previously
    # `hmdb_compounds` was left as None whenever the download branch ran.
    hmdb_compounds = compounds

else:
    print('Loaded %d DatabaseCompounds from %s' % (len(hmdb_compounds), compound_file))

Loaded 114087 DatabaseCompounds from C:\Users\Vinny\work\vimms\demo\01. Data\example_data\hmdb_compounds.p


In [10]:
# Read the stored HMDB compounds from the data folder.
hmdb = load_obj(Path(base_dir) / 'hmdb_compounds.p')

### Create a new HMDB sample

In [11]:
# Where to find the ROI sources prepared by the earlier notebook '01. Download Data.ipynb'
ROI_Sources = [str(Path(base_dir) / 'beers' / 'datasets')]

# Lowest MS1 intensity a chemical may have
min_ms1_intensity = 1.75e5

# Allowed RT and m/z windows for the chemicals, each a list of (low, high) pairs
rt_range = [(0, 1440)]
mz_range = [(0, 1050)]

# How many chemicals the sample should contain
n_chems = 6500

# Highest MS level to generate; with a value of 1 no fragmentation peaks are produced
ms_level = 1

In [12]:
# Build the chemical creator from the loaded peak sampler, the ROI sources and the HMDB compounds.
chems = ChemicalCreator(ps, ROI_Sources, hmdb)

# Sample the chemicals using the parameters defined in the previous cell.
hmdb_dataset = chems.sample(mz_range, rt_range, min_ms1_intensity, n_chems, ms_level)

# Store the sampled dataset in the output folder.
save_obj(hmdb_dataset, Path(out_dir) / 'hmdb_dataset.p')

2020-07-17 16:59:40.868 | DEBUG    | vimms.Chemicals:__init__:239 - Sorting database compounds by masses
2020-07-17 16:59:45.482 | DEBUG    | vimms.Chemicals:sample:272 - 6500 chemicals to be created.
2020-07-17 16:59:46.312 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 0/6500
2020-07-17 16:59:53.743 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 500/6500
2020-07-17 17:00:05.688 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 1000/6500
2020-07-17 17:00:11.647 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 1500/6500
2020-07-17 17:00:17.380 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 2000/6500
2020-07-17 17:00:23.574 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 2500/6500
2020-07-17 17:00:29.422 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampling formula 3000/6500
2020-07-17 17:00:34.493 | DEBUG    | vimms.Chemicals:_sample_formulae:346 - Sampli

In [13]:
# Preview the first ten sampled chemicals, one per line.
for chem in hmdb_dataset[:10]:
    print(chem)

KnownChemical - 'C8H10N2O3S' rt=706.81 max_intensity=1540634.01
KnownChemical - 'C16H15N5O7S2' rt=628.85 max_intensity=29865355.23
KnownChemical - 'C17H14F3N3O2S' rt=656.33 max_intensity=5392742.37
KnownChemical - 'H3NO3S' rt=248.51 max_intensity=248833.42
KnownChemical - 'C11H19NS' rt=251.28 max_intensity=3417417.09
KnownChemical - 'C17H17N3O3S' rt=235.21 max_intensity=379594.54
KnownChemical - 'C14H25N3O4S' rt=482.16 max_intensity=583376.06
KnownChemical - 'C5H10OS' rt=283.33 max_intensity=497063.32
KnownChemical - 'C14H18O3' rt=260.40 max_intensity=1155595.19
KnownChemical - 'C18H18O4' rt=795.26 max_intensity=620436.20
