In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import wandb
import os
from sklearn.decomposition import PCA
import GPUtil
import itertools
import io

from collections import Counter

# Loading Data:
---

In [2]:
# Preprocessed IMS data, already split into train / validation / test CSVs.
file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/train_data.csv'
train = pd.read_csv(file_path)

file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/val_data.csv'
val = pd.read_csv(file_path)

file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/test_data.csv'
test = pd.read_csv(file_path)

# Lookup table with Name / SMILES / embedding columns per chemical.
file_path = '../data/name_smiles_embedding_file.csv'
name_smiles_embedding_df = pd.read_csv(file_path)

# MoNA embeddings; two upper-case column headers are renamed to title case.
file_path = '/mnt/usb/cmdunham/MoNA_embeddings_big_df.csv'
mass_spec_embeddings = pd.read_csv(file_path).rename(
    columns={
        'METHYL PROPIONATE': 'Methyl Propionate',
        'DIETHYL MALEATE': 'Diethyl Maleate',
    }
)

# Encoder-generated embeddings; the 'Unnamed: 0' column from the CSV is discarded.
file_path = '../data/mass_spec_encoder_generated_embeddings.csv'
mass_spec_encoder_generated_embeddings = pd.read_csv(file_path).drop(columns='Unnamed: 0')

In [3]:
# Set the df index to be the chemical abbreviations in col 'Unnamed: 0'.
# Guarded and non-inplace so the cell is idempotent: once 'Unnamed: 0' has become
# the index it is no longer a column, and an unguarded set_index would raise
# KeyError when the cell is re-run.
if 'Unnamed: 0' in name_smiles_embedding_df.columns:
    name_smiles_embedding_df = name_smiles_embedding_df.set_index('Unnamed: 0')
name_smiles_embedding_df.head()

Unnamed: 0_level_0,Name,SMILES,embedding
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BKG,background,,
DEM,Diethyl Malonate,CCOC(=O)CC(=O)OCC,"[0.3809721, 0.0005454041, 0.25539744, -0.24272..."
DEB,"1,2,3,4-Diepoxybutane",C1C(O1)C2CO2,"[0.06318794, 0.009022224, 0.42160064, 0.195722..."
MES,2-(N-morpholino)ethanesulfonic acid,C1COCCN1CCS(=O)(=O)O,"[-0.32520828, 0.009838344, -0.15108332, 0.2845..."
DMMP,Dimethyl methylphosphonate,COP(=O)(C)OC,"[0.12106811, 0.00294244, -0.14450458, 0.072665..."


In [4]:
def _parse_embedding(raw):
    """Convert a stringified embedding such as "[0.1, 0.2]" into a list of np.float32.

    Only the text between the first '[' and the first following ']' is parsed.
    """
    inner = raw.split('[')[1].split(']')[0]
    return [np.float32(num) for num in inner.split(',')]


# Build one parsed embedding per row of the lookup table.
# 'BKG' has no embedding and is stored as None; any other row whose embedding is
# missing (NaN) is treated the same way instead of crashing on `.split`.
# Iterating with .items() reads each row's own value, avoiding chained
# `df['embedding'][label]` lookups (which return a Series for duplicate labels).
embedding_floats = [
    None if (chem_name == 'BKG' or pd.isna(raw)) else _parse_embedding(raw)
    for chem_name, raw in name_smiles_embedding_df['embedding'].items()
]

name_smiles_embedding_df['Embedding Floats'] = embedding_floats

## Setting up GPU:
---

In [5]:
# Choose the compute device and store it in `device`:
# the CUDA GPU with the most free memory when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    # Get the list of GPUs
    gpus = GPUtil.getGPUs()

    if gpus:
        # Find the GPU with the most free memory
        best_gpu = max(gpus, key=lambda gpu: gpu.memoryFree)

        # Print details about the selected GPU
        print(f"Selected GPU ID: {best_gpu.id}")
        print(f"  Name: {best_gpu.name}")
        print(f"  Memory Free: {best_gpu.memoryFree} MB")
        print(f"  Memory Used: {best_gpu.memoryUsed} MB")
        print(f"  GPU Load: {best_gpu.load * 100:.2f}%")

        # Set the device for later use
        # NOTE(review): this assumes GPUtil's GPU id matches PyTorch's CUDA device
        # index; that may not hold if CUDA_VISIBLE_DEVICES / CUDA_DEVICE_ORDER
        # reorder devices -- confirm on multi-GPU machines.
        device = torch.device(f'cuda:{best_gpu.id}')
    else:
        # GPUtil reported no GPUs even though CUDA is available; keep PyTorch's
        # current device instead of failing in max() on an empty list.
        device = torch.device(f'cuda:{torch.cuda.current_device()}')
    print('Current device ID: ', device)

    # Set the current device in PyTorch
    torch.cuda.set_device(device)

    # Confirm the currently selected device in PyTorch.
    # These torch.cuda queries are only valid when CUDA is available, so they
    # live inside this branch; calling them on a CPU-only machine raises.
    print("PyTorch current device ID:", torch.cuda.current_device())
    print("PyTorch current device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device('cpu')
    print('Using CPU')

Selected GPU ID: 0
  Name: NVIDIA GeForce RTX 4090
  Memory Free: 21682.0 MB
  Memory Used: 2534.0 MB
  GPU Load: 98.00%
Current device ID:  cuda:0
PyTorch current device ID: 0
PyTorch current device name: NVIDIA GeForce RTX 4090


# Creating Chads:
---

### Spectra:
---

It might make more sense to split each sample into two rows, one for the positive spectrum, one for the negative.

In [None]:
# Column layout of each split: the first two columns ('Unnamed:0' and 'index') and
# the last nine ('Label' plus the OneHot encodings) are not spectra; the final
# eight columns are taken as the chemical encodings.
SPECTRA_COLS = slice(2, -9)
ENCODING_COLS = slice(-8, None)

train_spectra, train_chem_encodings = train.iloc[:, SPECTRA_COLS], train.iloc[:, ENCODING_COLS]
val_spectra, val_chem_encodings = val.iloc[:, SPECTRA_COLS], val.iloc[:, ENCODING_COLS]
test_spectra, test_chem_encodings = test.iloc[:, SPECTRA_COLS], test.iloc[:, ENCODING_COLS]

In [None]:
def build_split_tensors(split_df, spectra_df, chem_encodings_df):
    """Build the label list and float32 tensors for one data split.

    Args:
        split_df: full split DataFrame; its 'Label' column holds the chemical
            label used to look up the true embedding.
        spectra_df: spectra columns of the same split.
        chem_encodings_df: chemical-encoding columns of the same split.

    Returns:
        (chem_labels, embeddings_tensor, spectra_tensor, chem_encodings_tensor),
        with all tensors float32 and placed on `device`.

    Raises:
        KeyError: if a label is not present in `name_smiles_embedding_df`.
    """
    chem_labels = list(split_df['Label'])
    # One dict lookup per row instead of a chained pandas lookup per row.
    embedding_lookup = name_smiles_embedding_df['Embedding Floats'].to_dict()
    embeddings = np.asarray(
        [embedding_lookup[chem_name] for chem_name in chem_labels],
        dtype=np.float32,
    )
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32, device=device)
    spectra_tensor = torch.tensor(spectra_df.values, dtype=torch.float32, device=device)
    chem_encodings_tensor = torch.tensor(chem_encodings_df.values, dtype=torch.float32, device=device)
    return chem_labels, embeddings_tensor, spectra_tensor, chem_encodings_tensor


# create tensors of spectra, true embeddings, and chemical name encodings for train, val and test
(train_chem_labels, train_embeddings_tensor,
 train_spectra_tensor, train_chem_encodings_tensor) = build_split_tensors(
    train, train_spectra, train_chem_encodings)

(val_chem_labels, val_embeddings_tensor,
 val_spectra_tensor, val_chem_encodings_tensor) = build_split_tensors(
    val, val_spectra, val_chem_encodings)

(test_chem_labels, test_embeddings_tensor,
 test_spectra_tensor, test_chem_encodings_tensor) = build_split_tensors(
    test, test_spectra, test_chem_encodings)