In [1]:
from src.data.mgf_tools.mgf_get import *
from src.config import *
from src.utils import *
import pandas as pd
from deepmol.splitters import MultiTaskStratifiedSplitter
from deepmol.compound_featurization import MorganFingerprint
from deepmol.datasets import SmilesDataset
from src.config import *
import pickle

# DATA SPLIT


In [2]:
mgf_data = mgf_path


In [3]:
spectra = mgf_get_spectra(mgf_data)

In [4]:
smiles_data = mgf_get_smiles(spectra)

smiles_df = pd.DataFrame(smiles_data)

In [5]:
processed_spectra = mgf_deconvoluter(mgf_data=spectra, mz_vocabs=mz_vocabs, min_num_peaks=5, max_num_peaks=max_num_peaks, noise_rmv_threshold=0.01, mass_error=0.01, allowed_spectral_entropy=True, log=False)


In [6]:
spectrum_ids = [spec[0] for spec in processed_spectra]
print(f"Espectros processados: {len(spectrum_ids)}")

filtered_smiles= smiles_df[smiles_df['spectrum_id'].isin(spectrum_ids)]
print(f"SMILES filtrados: {len(filtered_smiles)}")

In [7]:
smiles_list = filtered_smiles['smiles'].tolist()
ids_list = filtered_smiles['spectrum_id'].tolist()

In [8]:
dataset = SmilesDataset(smiles=smiles_list, ids=ids_list)
print(f"Dataset inicial: {len(dataset)} samples")

In [9]:
dataset = MorganFingerprint().featurize(dataset)
print(f"Fingerprints gerados: {len(dataset)}")

In [10]:
dataset.X

In [11]:
dataset._y = dataset.X

In [12]:
dataset.y

In [13]:
train_dataset, val_dataset, test_dataset = MultiTaskStratifiedSplitter().train_valid_test_split(
    dataset, frac_train=0.8, frac_val=0.1, frac_test=0.1, seed=0)

In [14]:
print(f"SPLIT")
print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")
print(f"Test: {len(test_dataset)} samples")

In [15]:
def generate_stats(y_train: np.ndarray, y_test: np.ndarray, y_val: np.ndarray=None):
    """
    Parameters
    ----------
    y_train : np.ndarray
        Labels of the train set
    y_test : np.ndarray
        Labels of the test set
    y_val : np.ndarray, optional
        Labels of the validation set, by default None
    
    Returns
    -------
    Tuple[pd.DataFrame, Any]
        DataFrame with the stats of the split, styled table
    """
    y_test_sum = np.sum(y_test, axis=0)
    y_train_sum = np.sum(y_train, axis=0)

    sum_of_all = pd.DataFrame([y_train_sum, y_test_sum], index=["train", "test"])

    if y_val is not None:
        y_val_sum = np.sum(y_val, axis=0)
        sum_of_all = pd.DataFrame([y_train_sum, y_test_sum, y_val_sum], index=["train", "test", "validation"])
        sum_of_all.loc['Validation relative split', :] = sum_of_all.loc['validation', :] / (sum_of_all.loc['train', :] + sum_of_all.loc['test', :] + sum_of_all.loc['validation', :]) * 100
        sum_of_all.loc['Test relative split', :] = sum_of_all.loc['test', :] / (sum_of_all.loc['train', :] + sum_of_all.loc['test', :]+ sum_of_all.loc['validation', :]) * 100
        sum_of_all.loc['Train relative split', :] = sum_of_all.loc['train', :] / (sum_of_all.loc['train', :] + sum_of_all.loc['test', :]+ sum_of_all.loc['validation', :]) * 100

    else:
        sum_of_all.loc['Test relative split', :] = sum_of_all.loc['test', :] / (sum_of_all.loc['train', :] + sum_of_all.loc['test', :]) * 100
        sum_of_all.loc['Train relative split', :] = sum_of_all.loc['train', :] / (sum_of_all.loc['train', :] + sum_of_all.loc['test', :]) * 100

    df = pd.melt(sum_of_all.T.reset_index(), id_vars=['index']).rename(columns={'index': 'EC', 'value': 'Percentage of data'})
    if y_val is not None:
        df = df[(df["variable"]!="train") & (df["variable"]!="validation") & (df["variable"]!="test")]
    else: 
        df = df[(df["variable"]!="train") & (df["variable"]!="test")]

    df1 = sum_of_all.loc['Test relative split', :].describe()
    df2 = sum_of_all.loc['Train relative split', :].describe()
    if y_val is not None:
        df3 = sum_of_all.loc['Validation relative split', :].describe()
        stats_table = pd.concat([df1, df2, df3], axis=1)
    else:
        stats_table = pd.concat([df1, df2], axis=1)

    stats_table.drop(['count'], inplace=True)
    table_styled = stats_table.style.background_gradient(cmap="YlGn")
    

    return df, table_styled


train_labels = train_dataset.y
val_labels = val_dataset.y
test_labels = test_dataset.y


df, table_styled = generate_stats(train_labels, test_labels, val_labels)

table_styled

## Getting Test set SMILES

In [5]:
from pathlib import Path

mgf_spectra = mgf_get_spectra(mgf_path)

REPO_ROOT = Path().cwd().parent

artifacts_dir = REPO_ROOT / "src/data/artifacts"
split_pkl = artifacts_dir / 'split_ids.pkl'

if not split_pkl.exists():
        raise FileNotFoundError("Split file not found")

with open(split_pkl, 'rb') as f:
    splits = pickle.load(f)
    print(f"Loaded splits: Train({len(splits['train'])}), Val({len(splits['val'])}), Test({len(splits['test'])})")
        
test_ids = set(splits['test'])        
test_spectra = [spec for spec in mgf_spectra if spec['params'].get('spectrum_id') in test_ids]

smiles_test_df = mgf_get_smiles(test_spectra, as_dataframe=True)

smiles_test_df.to_csv('test_set_smiles.csv', index=False)

# GNPS dataset

In [1]:
import json
from pathlib import Path

seed = 5
artifacts_dir = 'src/data/artifacts'


artifacts_dir = Path(artifacts_dir) / str(seed)

with open(artifacts_dir / 'pipeline_config.json', 'r') as f:
            pipeline_config = json.load(f)

max_num_peaks = pipeline_config['max_num_peaks']
max_seq_len = pipeline_config['max_seq_len']
mz_vocabs = pipeline_config['mz_vocabs']
vocab_size = pipeline_config['vocab_size']

In [2]:
import pandas as pd
import pickle

# 1. Carregar os IDs dos splits
with open('src/data/artifacts/5/split_ids.pkl', 'rb') as f:
    splits = pickle.load(f)

# 2. Carregar o ficheiro de fingerprints que já tens no disco
all_fps = pd.read_pickle('src/data/artifacts/5/fingerprints.pkl')

# 3. Filtrar apenas os espectros que foram parar ao treino
train_fps = all_fps[all_fps['spectrum_id'].isin(splits['train'])]

# 4. Remover a coluna do ID e ver quantas fingerprints diferentes sobram
unique_molecules = train_fps.drop(columns=['spectrum_id']).drop_duplicates()

print(f"Total de espectros no treino: {len(train_fps)}")
print(f"Moléculas (Fingerprints) ÚNICAS no treino: {len(unique_molecules)}")

Total de espectros no treino: 216311
Moléculas (Fingerprints) ÚNICAS no treino: 15086


In [10]:
type(train_fps)

fp_columns = [col for col in train_fps.columns if col != 'spectrum_id']

fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')

print("\n TOP 10 Moléculas mais frequentes")
print(fp_counts['frequencia'].head(10))

print("\n Distribuição")
print(fp_counts['frequencia'].describe())

  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index


 TOP 10 Moléculas mais frequentes
0    685
1    637
2    581
3    541
4    536
5    523
6    498
7    494
8    475
9    467
Name: frequencia, dtype: int64

 Distribuição
count    15086.000000
mean        14.338526
std         33.308494
min          1.000000
25%          1.000000
50%          4.000000
75%         13.000000
max        685.000000
Name: frequencia, dtype: float64


  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')
  fp_counts=train_fps.value_counts(subset=fp_columns).reset_index(name='frequencia')


# IDSL_Mint dataset


In [10]:
with open('src/data/artifacts/4/split_ids.pkl', 'rb') as f:
    splits = pickle.load(f)

# 2. Carregar o ficheiro de fingerprints que já tens no disco
all_fps = pd.read_pickle('src/data/artifacts/4/fingerprints.pkl')

# 3. Filtrar apenas os espectros que foram parar ao treino
train_fps = all_fps[all_fps['spectrum_id'].isin(splits['train'])]

# 4. Remover a coluna do ID e ver quantas fingerprints diferentes sobram
unique_molecules = train_fps.drop(columns=['spectrum_id']).drop_duplicates()

print(f"Total de espectros no treino: {len(train_fps)}")
print(f"Moléculas (Fingerprints) ÚNICAS no treino: {len(unique_molecules)}")

Total de espectros no treino: 63540
Moléculas (Fingerprints) ÚNICAS no treino: 1711
