# Select subset of compounds for disentanglement

In [14]:
import os
import numpy as np
import pandas as pd
from specvae import utils
from specvae.dataset import Spectra
from IPython.display import display
import plotly.express as px

In [15]:
# Parameters
dataset = "HMDB"

## Load data

In [16]:
if dataset == 'MoNA':
    base_path = utils.get_project_path() / '.data' / 'MoNA'
    data_path = base_path / 'MoNA_full.csv'
    metadata_path = base_path / 'MoNA_meta.npy'
elif dataset == 'HMDB':
    base_path = utils.get_project_path() / '.data' / 'HMDB'
    data_path = base_path / 'HMDB_full.csv'
    metadata_path = base_path / 'HMDB_meta.npy'

metadata = None
if os.path.exists(metadata_path):
    metadata = np.load(metadata_path, allow_pickle=True).item()

## Dataset

In [17]:
df = Spectra.open(data_path)
df.describe()

Unnamed: 0.1,Unnamed: 0,collision_energy,ionization_mode,ionization_mode_id,kingdom_id,superclass_id,class_id,subclass_id
count,92916.0,92916.0,92916.0,92916.0,92916.0,92916.0,92916.0,92916.0
mean,46457.5,23.333333,0.5,0.5,0.997159,9.04559,103.739119,240.160855
std,26822.683143,12.472258,0.500003,0.500003,0.053228,4.776928,48.438618,143.486602
min,0.0,10.0,0.0,0.0,0.0,0.0,-1.0,-1.0
25%,23228.75,10.0,0.0,0.0,1.0,7.0,87.0,133.0
50%,46457.5,20.0,0.5,0.5,1.0,7.0,87.0,198.0
75%,69686.25,40.0,1.0,1.0,1.0,12.0,115.0,395.0
max,92915.0,40.0,1.0,1.0,1.0,21.0,252.0,404.0


## Select by collision energy

In [18]:
df1 = df[df['collision_energy'].isin([5*i for i in range(1, 21)])].copy()

In [19]:
df1.groupby('collision_energy').size().reset_index(name='counts')

Unnamed: 0,collision_energy,counts
0,10,30972
1,20,30972
2,40,30972


In [20]:
fig = px.histogram(df1, x='collision_energy', width=1000, height=500)
fig.show()

In [21]:
df1 = df1[df1['instrument_type_id'].isin([1, 0, 7, 2, 10, 17])].copy()
df_it = df1.groupby('instrument_type_id').size().reset_index(name='counts')
df_it

KeyError: 'instrument_type_id'

In [None]:
fig = px.pie(df_it, values='counts', names='instrument_type_id', hover_data=None, width=1000, height=500)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [22]:
df_im = df1.groupby('ionization_mode_id').size().reset_index(name='counts')
df_im

Unnamed: 0,ionization_mode_id,counts
0,0,46458
1,1,46458


In [23]:
fig = px.pie(df_im, values='counts', names='ionization_mode_id', hover_data=None, width=1000, height=500)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [24]:
df1

Unnamed: 0.1,Unnamed: 0,spectrum,collision_energy,HMDB_map,SMILES,split,id,ionization_mode,ionization_mode_id,kingdom,superclass,class,subclass,kingdom_id,superclass_id,class_id,subclass_id,HMDB
0,0,29.03912516:0.7243706179 57.03403978:4.5379311...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
1,1,29.03912516:9.587454265 55.01838972:2.54439610...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
2,2,27.0234751:6.640847978 29.03912516:6.54806256 ...,40,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,1,1,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
3,3,27.0234751:0.03173920628 29.00273965:0.0149975...,10,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
4,4,27.0234751:0.3171135208 29.00273965:0.47242358...,20,HMDB:HMDB0031492,CCC(=O)C(=O)CC,train,HMDB31492,0,0,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,1,14,159,87,HMDB31492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92911,92911,41.03912516:3.321276734 43.05477522:1.41515277...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92912,92912,27.0234751:1.818499485 41.03912516:10.00329657...,40,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,1,1,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92913,92913,17.00273965:0.1727702035 43.05477522:0.1824458...,10,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534
92914,92914,44.99765427:0.9915595017 59.01330434:5.8014561...,20,HMDB:HMDB0012534,CC\C=C/C\C=C/C=C/[C@H](O)C\C=C/C\C=C/CCCC(O)=O,test,HMDB12534,0,0,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,1,7,78,137,HMDB12534


In [None]:
filepath = base_path / ('%s_dis.csv' % dataset)
df1.drop(columns=['Unnamed: 0'], inplace=True)
df1.to_csv(filepath)