In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import wandb
import os
from sklearn.decomposition import PCA
import itertools

from collections import Counter

In [2]:
import importlib
import functions as f
# `functions` is the project-local helper module (functions.py, aliased `f` in every
# later cell). Reloading it makes edits to that file take effect in the running kernel
# without a restart.
importlib.reload(f)

<module 'functions' from '/home/cmdunham/ChemicalDataGeneration/models/functions.py'>

# Loading Data:
---

In [3]:
def load_carl_split(split_name, data_dir='../data/carls', suffix='_carls_one_per_spec'):
    """Read one CARL split from a feather file and drop its ``level_0`` column.

    Parameters
    ----------
    split_name : str
        Split prefix used in the file name: ``'train'``, ``'val'`` or ``'test'``.
    data_dir : str
        Directory holding the feather files.
    suffix : str
        File-name part between the split name and ``.feather``. The default selects
        the ``*_carls_one_per_spec.feather`` files; pass ``'_carls'`` to read the
        ``*_carls.feather`` files instead.

    Returns
    -------
    pandas.DataFrame
        The split without the ``level_0`` column.

    Raises
    ------
    KeyError
        If the file has no ``level_0`` column (same behaviour as the previous
        inline ``drop`` calls).
    """
    split_path = f'{data_dir}/{split_name}{suffix}.feather'
    return pd.read_feather(split_path).drop(columns=['level_0'])


# One call per split replaces three copy-pasted read/drop stanzas.
train_carls = load_carl_split('train')
val_carls = load_carl_split('val')
test_carls = load_carl_split('test')

In [4]:
# file_path = '../data/MoNA_embeddings_multiple_instrument_types.csv'
# mass_spec_embeddings = pd.read_csv(file_path)
# mass_spec_embeddings = mass_spec_embeddings.rename(columns={
#     'METHYL PROPIONATE': 'Methyl Propionate', 'DIETHYL MALEATE':'Diethyl Maleate'
#     })

# file_path = '../data/mass_spec_encoder_output.csv'
# mass_spec_encoder_generated_embeddings = pd.read_csv(file_path)

In [5]:
# Lookup table for the IMS chemicals: Name, SMILES and a stringified embedding per row.
file_path = '../data/name_smiles_embedding_file.csv'
name_smiles_embedding_df = (
    pd.read_csv(file_path)
    # the unnamed first CSV column holds the chemical abbreviations; index by it
    .set_index('Unnamed: 0')
)
name_smiles_embedding_df.head()

Unnamed: 0_level_0,Name,SMILES,embedding
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BKG,Background,,
DEM,Diethyl Malonate,CCOC(=O)CC(=O)OCC,"[0.3809719, 0.0005454039, 0.25539753, -0.24272..."
DEB,"1,2,3,4-Diepoxybutane",C1C(O1)C2CO2,"[0.06318759, 0.009022221, 0.42160067, 0.195722..."
MES,2-(N-morpholino)ethanesulfonic acid,C1COCCN1CCS(=O)(=O)O,"[-0.32520813, 0.009838342, -0.15108328, 0.2845..."
DMMP,Dimethyl methylphosphonate,COP(=O)(C)OC,"[0.12106821, 0.0029424396, -0.14450483, 0.0726..."


In [6]:
# file_path = '../data/mass_spec_data_for_kevin_steven.csv'
# mass_spec_encoder_generated_embeddings = pd.read_csv(file_path)

# Lookup table for the mass-spec chemicals: SMILES, stringified embedding and Name per row.
file_path = '../data/mass_spec_name_smiles_embedding_file.csv'
mass_spec_name_smiles_embedding_df = (
    pd.read_csv(file_path)
    # the unnamed first CSV column holds the chemical identifiers; index by it
    .set_index('Unnamed: 0')
)
mass_spec_name_smiles_embedding_df.head()

Unnamed: 0_level_0,SMILES,embedding,Name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(5R,11R)-5,11-Dimethylpentacosane",CCCCCCCCCCCCCCC(C)CCCCCC(C)CCCC,"[-0.048933726, 0.0017029907, -0.272093, -0.369...","(5R,11R)-5,11-Dimethylpentacosane"
"1,3-Diaminopropane",C(CN)CN,"[-0.22048391, -0.0011410714, 0.06557533, 0.193...","1,3-Diaminopropane"
"1,4-Butanediamine",C(CCN)CN,"[-0.21609126, -0.0005372154, 0.24562673, 0.293...","1,4-Butanediamine"
1-Hexanol,CCCCCCO,"[-0.23157702, 0.0011296304, 0.21073784, -0.179...",1-Hexanol
1-Octanol,CCCCCCCCO,"[-0.2599937, 0.0021180445, 0.109357715, -0.293...",1-Octanol


In [7]:
def _parse_embedding(raw_embedding):
    """Convert a stringified embedding such as ``"[0.1, -0.2]"`` to a list of ``np.float32``.

    Parameters
    ----------
    raw_embedding : str or missing value
        Text of one cell of the ``embedding`` column.

    Returns
    -------
    list of numpy.float32, or None
        ``None`` when the cell is missing (NaN/None), otherwise the numbers found
        between the first ``[`` and the following ``]``.
    """
    if pd.isna(raw_embedding):
        return None
    inner_text = raw_embedding.split('[')[1].split(']')[0]
    return [np.float32(num) for num in inner_text.split(',')]


# Iterate over (label, value) pairs instead of looking each label up again with
# df['embedding'][label]: that lookup returns a Series (and then fails on .split)
# whenever an index label is duplicated.
# 'BKG' (background) never gets an embedding; any other row with a missing embedding
# now also yields None instead of raising AttributeError.
embedding_floats = [
    None if chem_name == 'BKG' else _parse_embedding(raw_embedding)
    for chem_name, raw_embedding in name_smiles_embedding_df['embedding'].items()
]

name_smiles_embedding_df['Embedding Floats'] = embedding_floats

In [8]:
# Parse every stringified embedding ("[0.1, -0.2, ...]") into a list of np.float32.
# The column values are iterated directly rather than fetched per index label:
# df['embedding'][label] returns a Series when a chemical name occurs more than once,
# which would make the .split call fail.
mass_spec_embedding_floats = [
    [
        np.float32(num)
        # keep only the text between the first '[' and the following ']'
        for num in raw_embedding.split('[')[1].split(']')[0].split(',')
    ]
    for raw_embedding in mass_spec_name_smiles_embedding_df['embedding']
]

mass_spec_name_smiles_embedding_df['Embedding Floats'] = mass_spec_embedding_floats

In [9]:
# Build one column per chemical (rows = embedding dimensions) from the parsed embeddings.
# NOTE: the former "fewer than 5 embeddings" filter is disabled, so every mass-spec
# chemical is kept; the `filtered_` prefix is retained only to keep the variable name stable.
mass_spec_floats = mass_spec_name_smiles_embedding_df['Embedding Floats']
filtered_mass_spec_embeddings = pd.DataFrame(
    mass_spec_floats.tolist(), index=mass_spec_floats.index
).T

# Combine embeddings for IMS simulants and mass spec chems to use for plotting pca.
# Rows without an embedding (the background entry is stored as None) are dropped by
# value rather than with a positional [1:] slice, so the result no longer depends on
# the background row being the first row of the CSV.
ims_floats = name_smiles_embedding_df['Embedding Floats'].dropna()
ims_embeddings = pd.DataFrame(ims_floats.tolist(), index=ims_floats.index).T

all_true_embeddings = pd.concat([ims_embeddings, filtered_mass_spec_embeddings], axis=1)
all_true_embeddings.head()

Unnamed: 0,DEM,DEB,MES,DMMP,DPM,JP8,TEPO,DtBP,"(5R,11R)-5,11-Dimethylpentacosane","1,3-Diaminopropane",...,Methyl Octadecanoate,Naphthalene,Pentadecane,Phenanthrene,Pyrene,Spermidine,Succinic Acid,Testosterone,Tryptophan,Undecane
0,0.380972,0.063188,-0.325208,0.121068,-0.023968,0.025142,0.193039,0.22609,-0.048934,-0.220484,...,0.169634,0.468407,0.117451,0.579996,0.444973,-0.182377,0.064577,-0.049732,0.188416,0.110121
1,0.000545,0.009022,0.009838,0.002942,0.00272,0.011977,0.000974,0.000808,0.001703,-0.001141,...,0.005854,-0.00069,0.001458,-0.00171,-0.004033,0.002279,0.035474,0.068347,0.001636,0.000952
2,0.255398,0.421601,-0.151083,-0.144505,0.158301,0.542705,0.066288,-0.06216,-0.272093,0.065575,...,-0.12364,0.257259,-0.039999,0.064326,0.064121,0.435218,-0.251742,-0.038517,-0.450764,-0.037651
3,-0.242728,0.195723,0.284503,0.072665,-0.010084,0.364973,-0.195223,-0.045342,-0.369603,0.193559,...,-0.355696,0.331527,-0.441098,0.421061,0.428294,0.452678,0.044983,-0.575486,0.635695,-0.432776
4,-0.305107,-0.167326,-0.099838,-0.107286,-0.067723,-0.236272,-0.272051,-0.07588,-0.023527,-0.015358,...,-0.038613,-0.050993,-0.022548,-0.034104,-0.019335,-0.019928,-0.010587,-0.412169,-0.049434,-0.02541


# Training Encoder on Carls:
---

In [10]:
# Pick the compute device through the project helper; the returned value is handed to
# the tensor-building and training calls in later cells.
# NOTE(review): presumably a torch device (the printed output reports cuda:0) — confirm in functions.py.
device = f.set_up_gpu()

Selected GPU ID: 0
  Name: NVIDIA H100 PCIe
  Memory Free: 80991.0 MB
  Memory Used: 3.0 MB
  GPU Load: 0.00%
Current device ID:  cuda:0
PyTorch current device ID: 0
PyTorch current device name: NVIDIA H100 PCIe


In [11]:
# file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/train_carls_dif_backgrounds.csv'
# train_embeddings_tensor, train_carl_tensor, train_chem_encodings_tensor, train_carl_indices_tensor = f.create_dataset_tensors_with_dask(file_path, name_smiles_embedding_df, device, carl=True)
# file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/val_carls_dif_backgrounds.csv'
# val_embeddings_tensor, val_carl_tensor, val_chem_encodings_tensor, val_carl_indices_tensor = f.create_dataset_tensors_with_dask(file_path, name_smiles_embedding_df, device, carl=True)
# file_path='/mnt/usb/cmdunham/preprocessed_ims_data/test_carls_dif_backgrounds.csv'
# test_embeddings_tensor, test_carl_tensor, test_chem_encodings_tensor, test_carl_indices_tensor = f.create_dataset_tensors_with_dask(file_path, name_smiles_embedding_df, device, carl=True)

In [12]:
def _carl_tensors(split_df):
    """Build the tensors for one CARL split via ``f.create_dataset_tensors`` with ``carl=True``."""
    return f.create_dataset_tensors(split_df, name_smiles_embedding_df, device, carl=True)


# Each call yields, in order: embeddings, CARL, chemical-encoding and CARL-index tensors.
(
    train_embeddings_tensor,
    train_carl_tensor,
    train_chem_encodings_tensor,
    train_carl_indices_tensor,
) = _carl_tensors(train_carls)

(
    val_embeddings_tensor,
    val_carl_tensor,
    val_chem_encodings_tensor,
    val_carl_indices_tensor,
) = _carl_tensors(val_carls)

(
    test_embeddings_tensor,
    test_carl_tensor,
    test_chem_encodings_tensor,
    test_carl_indices_tensor,
) = _carl_tensors(test_carls)

In [13]:
# The trailing columns of the CARL frame are taken as the chemical names.
# NOTE(review): presumably one label column per chemical — confirm against the feather schema.
N_CHEM_COLUMNS = 8
sorted_chem_names = train_carls.columns[-N_CHEM_COLUMNS:].tolist()

# Release the DataFrames; the training cell works with the tensors built from them.
# Re-running this cell alone raises NameError until the data is loaded again.
del train_carls
del val_carls
del test_carls

In [None]:
# Things that need to be changed for each encoder/dataset/target embedding
notebook_name = '/home/cmdunham/ChemicalDataGeneration/models/carl_encoder.ipynb'
# notebook_name = 'C:/Users/cmdunham/OneDrive/Documents/phd_program/ChemicalDataGeneration/models/carl_encoder.ipynb'
architecture = 'carl_encoder'
dataset_type = 'carls'
target_embedding = 'ChemNet'
encoder_path = 'trained_models/carl_to_chemnet_encoder_reparameterization.pth'

# Settings handed to f.train_model as `config`.
config = {
    'wandb_entity': 'catemerfeld',
    'wandb_project': 'ims_encoder_decoder',
    'gpu':True,
    'threads':1,
}

# Set the notebook name wandb should associate with runs started from this kernel.
os.environ['WANDB_NOTEBOOK_NAME'] = notebook_name

# SECURITY: the API key must never be stored in the notebook. It is read from the
# WANDB_API_KEY environment variable; when that is unset, key=None lets wandb fall
# back to its own credential lookup (e.g. an existing ~/.netrc entry or a prompt).
# The key that was previously committed here should be revoked and regenerated.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

[34m[1mwandb[0m: Currently logged in as: [33mcatemerfeld[0m. Use [1m`wandb login --relogin`[0m to force relogin




[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/cmdunham/.netrc


True

In [15]:
# Reload functions.py again so helper edits made since the earlier reload are
# picked up by the training call in the next cell.
importlib.reload(f)

<module 'functions' from '/home/cmdunham/ChemicalDataGeneration/models/functions.py'>

In [None]:
# Passed to f.train_model as `early_stop_threshold` and also logged with the run metadata.
early_stopping_threshold = 20

# Run metadata forwarded to f.train_model as `wandb_kwargs`.
wandb_kwargs = {
    'architecture': architecture,
    'optimizer': 'AdamW',
    'loss': 'MSELoss',
    'dataset': dataset_type,
    'target_embedding': target_embedding,
    'early stopping threshold': early_stopping_threshold,
}

# Each hyperparameter maps to a list of candidate values (one value each here).
model_hyperparams = {
    'batch_size': [32],
    'epochs': [500],
    'learning_rate': [1e-5],
}

# Every split is bundled in the same tensor order:
# CARL input, chemical encodings, target embeddings, CARL indices.
train_data = TensorDataset(
    train_carl_tensor,
    train_chem_encodings_tensor,
    train_embeddings_tensor,
    train_carl_indices_tensor,
)
val_data = TensorDataset(
    val_carl_tensor,
    val_chem_encodings_tensor,
    val_embeddings_tensor,
    val_carl_indices_tensor,
)
test_data = TensorDataset(
    test_carl_tensor,
    test_chem_encodings_tensor,
    test_embeddings_tensor,
    test_carl_indices_tensor,
)

# Train the encoder; the helper's return value is kept as `best_hyperparams`.
best_hyperparams = f.train_model(
    'Encoder',
    train_data,
    val_data,
    test_data,
    device,
    config,
    wandb_kwargs,
    all_true_embeddings,
    name_smiles_embedding_df,
    model_hyperparams,
    sorted_chem_names,
    encoder_path,
    save_emb_pca_to_wandb=True,
    early_stop_threshold=early_stopping_threshold,
    input_type='Carl',
    show_wandb_run_name=True,
    lr_scheduler=True,
)

In [17]:
# best_model = f.Encoder().to(device)
# best_model.load_state_dict(torch.load(encoder_path))
# encoder_criterion = nn.MSELoss()
# print(best_model)

In [18]:
# encoder_path = '../models/best_carl_to_chemnet_encoder.pth'
# torch.save(best_model.state_dict(), encoder_path)

## Generating Carl Embedding Predictions:
---

In [19]:
# file_path='/mnt/usb/cmdunham/preprocessed_ims_data/train_carls_avg_backgrounds.csv'
# train_carls = pd.read_csv(file_path)
# file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/val_carls_avg_backgrounds.csv'
# val_carls = pd.read_csv(file_path)
# file_path='/mnt/usb/cmdunham/preprocessed_ims_data/test_carls_avg_backgrounds.csv'
# test_carls = pd.read_csv(file_path)

# train_embeddings_tensor, train_carl_tensor, train_chem_encodings_tensor, train_carl_indices_tensor = f.create_dataset_tensors(train_carls, name_smiles_embedding_df, device, carl=True)
# val_embeddings_tensor, val_carl_tensor, val_chem_encodings_tensor, val_carl_indices_tensor = f.create_dataset_tensors(val_carls, name_smiles_embedding_df, device, carl=True)
# test_embeddings_tensor, test_carl_tensor, test_chem_encodings_tensor, test_carl_indices_tensor = f.create_dataset_tensors(test_carls, name_smiles_embedding_df, device, carl=True)

In [20]:
# # encoder_path = '../models/carl_to_chemnet_encoder_used_for_results.pth'
# encoder_path = '../models/best_carl_to_chemnet_encoder.pth'
# best_model = f.Encoder().to(device)
# best_model.load_state_dict(torch.load(encoder_path, map_location='cuda:0'))
# encoder_criterion = nn.MSELoss()

In [21]:
# # batch_size = best_hyperparams['batch_size']
# batch_size = 32

# train_dataset = DataLoader(
#     TensorDataset(
#         train_carl_tensor, 
#         train_chem_encodings_tensor, 
#         train_embeddings_tensor,
#         train_carl_indices_tensor
#         ), 
#         batch_size=batch_size, 
#         shuffle=False
#         )
# predicted_embeddings, output_name_encodings, average_loss, input_carl_indices = f.predict_embeddings(train_dataset, best_model, device, encoder_criterion)
# # input_carl_indices = [idx.cpu().detach().numpy() for idx_list in input_carl_indices for idx in idx_list]
# input_carl_indices = [idx for idx_list in input_carl_indices for idx in idx_list]
# # predicted_embeddings = [emb.cpu().detach().numpy() for emb_list in predicted_embeddings for emb in emb_list]
# predicted_embeddings = [emb for emb_list in predicted_embeddings for emb in emb_list]
# # output_name_encodings = [enc.cpu().detach().numpy() for enc_list in output_name_encodings for enc in enc_list]
# output_name_encodings = [enc for enc_list in output_name_encodings for enc in enc_list]
# train_preds_df = pd.DataFrame(predicted_embeddings)
# train_preds_df.insert(0, 'index', input_carl_indices)
# name_encodings_df = pd.DataFrame(output_name_encodings)
# # name_encodings_df.columns = train_carls.columns[-8:]
# name_encodings_df.columns = sorted_chem_names
# train_preds_df = pd.concat([train_preds_df, name_encodings_df], axis=1)

In [22]:
# # batch_size = best_hyperparams['batch_size']
# batch_size = 32

# val_dataset = DataLoader(
#     TensorDataset(
#         val_carl_tensor, 
#         val_chem_encodings_tensor, 
#         val_embeddings_tensor,
#         val_carl_indices_tensor
#         ), 
#         batch_size=batch_size, 
#         shuffle=False
#         )
# predicted_embeddings, output_name_encodings, average_loss, input_carl_indices = f.predict_embeddings(val_dataset, best_model, device, encoder_criterion)
# # input_carl_indices = [idx.cpu().detach().numpy() for idx_list in input_carl_indices for idx in idx_list]
# input_carl_indices = [idx for idx_list in input_carl_indices for idx in idx_list]
# # predicted_embeddings = [emb.cpu().detach().numpy() for emb_list in predicted_embeddings for emb in emb_list]
# predicted_embeddings = [emb for emb_list in predicted_embeddings for emb in emb_list]
# # output_name_encodings = [enc.cpu().detach().numpy() for enc_list in output_name_encodings for enc in enc_list]
# output_name_encodings = [enc for enc_list in output_name_encodings for enc in enc_list]
# val_preds_df = pd.DataFrame(predicted_embeddings)
# val_preds_df.insert(0, 'index', input_carl_indices)
# name_encodings_df = pd.DataFrame(output_name_encodings)
# # name_encodings_df.columns = val_carls.columns[-8:]
# name_encodings_df.columns = sorted_chem_names
# val_preds_df = pd.concat([val_preds_df, name_encodings_df], axis=1)

In [23]:
# # batch_size = best_hyperparams['batch_size']
# batch_size = 32

# test_dataset = DataLoader(
#     TensorDataset(
#         test_carl_tensor, 
#         test_chem_encodings_tensor, 
#         test_embeddings_tensor,
#         test_carl_indices_tensor
#         ), 
#         batch_size=best_hyperparams['batch_size'], 
#         shuffle=False
#         )
# predicted_embeddings, output_name_encodings, average_loss, input_carl_indices = f.predict_embeddings(test_dataset, best_model, device, encoder_criterion)
# # input_carl_indices = [idx.cpu().detach().numpy() for idx_list in input_carl_indices for idx in idx_list]
# input_carl_indices = [idx for idx_list in input_carl_indices for idx in idx_list]
# # predicted_embeddings = [emb.cpu().detach().numpy() for emb_list in predicted_embeddings for emb in emb_list]
# predicted_embeddings = [emb for emb_list in predicted_embeddings for emb in emb_list]
# # output_name_encodings = [enc.cpu().detach().numpy() for enc_list in output_name_encodings for enc in enc_list]
# output_name_encodings = [enc for enc_list in output_name_encodings for enc in enc_list]
# test_preds_df = pd.DataFrame(predicted_embeddings)
# test_preds_df.insert(0, 'index', input_carl_indices)
# name_encodings_df = pd.DataFrame(output_name_encodings)
# # name_encodings_df.columns = val_carls.columns[-8:]
# name_encodings_df.columns = sorted_chem_names
# test_preds_df = pd.concat([test_preds_df, name_encodings_df], axis=1)

In [24]:
# file_path = '../data/encoder_embedding_predictions/train_preds.csv'
# train_preds_df.to_csv(file_path, index=False)
# file_path = '../data/encoder_embedding_predictions/val_preds.csv'
# val_preds_df.to_csv(file_path, index=False)
# file_path = '../data/encoder_embedding_predictions/test_preds.csv'
# test_preds_df.to_csv(file_path, index=False)

In [25]:
# encoder_path = 'trained_models/carl_to_chemnet_encoder_used_for_results.pth'
# torch.save(best_model.state_dict(), encoder_path)

## Viewing Encoder Results:
---

In [26]:
# # sorted_chem_names = list(train_carls.columns[-8:])

# # train_data = TensorDataset(train_carl_tensor, train_chem_encodings_tensor, train_embeddings_tensor, train_carl_indices_tensor)

# # # batch_size = best_hyperparams['batch_size']
# # batch_size = 32

# f.plot_pca(
#     train_data, batch_size, best_model, device, 
#     encoder_criterion, sorted_chem_names, all_true_embeddings, 
#     name_smiles_embedding_df, 'Train', input_type='IMS', show_wandb_run_name = False, log_wandb=False
#     )

In [27]:
# sorted_chem_names = list(val_carls.columns[-8:])

# val_data = TensorDataset(val_carl_tensor, val_chem_encodings_tensor, val_embeddings_tensor, val_carl_indices_tensor)

# # batch_size = best_hyperparams['batch_size']
# batch_size = 32

# f.plot_pca(
#     val_data, batch_size, best_model, device, 
#     encoder_criterion, sorted_chem_names, all_true_embeddings, 
#     name_smiles_embedding_df, 'Validation', input_type='IMS', show_wandb_run_name = False, log_wandb=False
#     )

In [28]:
# sorted_chem_names = list(test_carls.columns[-8:])

# test_data = TensorDataset(test_carl_tensor, test_chem_encodings_tensor, test_embeddings_tensor, test_carl_indices_tensor)

# # batch_size = best_hyperparams['batch_size']
# batch_size = 32

# f.plot_pca(
#     test_data, batch_size, best_model, device, 
#     encoder_criterion, sorted_chem_names, all_true_embeddings, 
#     name_smiles_embedding_df, 'Test', input_type='IMS', show_wandb_run_name = False, log_wandb=False
#     )