## Import packages

In [6]:
import os, sys
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import gc

sys.path.insert(0, os.path.dirname(os.path.abspath('..')))
from mass_automation.experiment import Spectrum, Experiment

### Vectorization 
Vectorization is a procedure in which a mass spectrum is encoded as a vector.


In MEDUSA, the vectorization algorithm divides a spectrum into intervals (bins) and computes a measure (maximal intensity, mean intensity, etc.) in each interval. The measure computed for each bin gives one component of the resulting vector.

This procedure can be performed easily with the spectrum method *vectorize*.

If you have many mass spectra, you can automate this procedure and create a dataset of vectors.

<b>!! Specify your directory path manually in the cell below. By default, the cell contains an example of a possible path on Windows !! </b>

In [1]:
# Directory holding the input spectra files; replace with your own location.
dir_path = "D:\\mass_spectra\\TEA_dataset\\"

Unfortunately, this procedure takes a long time for a large number of spectra.

In [None]:
# Build a {filename: vector} dictionary for every mzXML spectrum found in dir_path.
# The listing is sorted because os.listdir returns entries in arbitrary order;
# sorting makes the processing order (and the dictionary order) reproducible.
spectra_names = sorted(os.listdir(dir_path))

spec_vecs = {}
for spectra_name in tqdm(spectra_names):

    # Skip every directory entry that does not carry the .mzXML extension
    if not spectra_name.endswith('.mzXML'):
        continue

    # os.path.join keeps the path valid even if dir_path has no trailing separator
    spec_path = os.path.join(dir_path, spectra_name)
    exp = Experiment(spec_path, 32, 2)    # Create experiment object with 32 scans and 2m points (optional parameters)
    spec = exp.summarize(4, 9)            # Summarize some spectra of the experiment into one spectrum object
    vector = spec.vectorize(min_mass=150,
                            max_mass=1000,
                            delta_mass=1,
                            method=np.max  # Vectorize spectrum with maximal intensity method
                            )
    # Drop the large objects before the next file is loaded to avoid MemoryError
    del spec, exp
    gc.collect()
    spec_vecs[spectra_name] = vector      # Store the vector under its source filename
        

HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 26.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 37.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 34.51it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32.40it/s]


### Now we can save the resulting dataset in pickle format to use it for clustering with PCA and cluster maps

<b> !! Specify the path for saving the pickle file manually in the cell below. By default, the dataset is saved to *data/plot_pca_files/new_spec_vecs_dictionary_user_try.pkl* !! </b>

In [7]:
# Destination of the pickled dataset. The base directory is obtained by taking
# the absolute path of '..' (relative to the current working directory) and
# stepping one more level up with dirname.
base_dir = os.path.dirname(os.path.abspath('..'))
save_path = os.path.join(
    base_dir, 'data', 'plot_pca_files', 'new_spec_vecs_dictionary_user_try.pkl'
)

In [None]:
# Create the destination directory first: open(..., 'wb') raises
# FileNotFoundError when the parent directory does not exist.
save_dir = os.path.dirname(save_path)
if save_dir:  # dirname is '' for a bare filename, and os.makedirs('') would raise
    os.makedirs(save_dir, exist_ok=True)

# Serialize the {filename: vector} dictionary to disk in pickle format
with open(save_path, 'wb') as f:
    pkl.dump(spec_vecs, f)