In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import numpy as np

import torch
import torch.nn as nn
# import torch.optim as optim
# import torchvision
# from ray import tune
from torch.utils.data import DataLoader, TensorDataset #, Dataset
# import torch.nn.functional as F

import wandb
import os
from sklearn.decomposition import PCA
import GPUtil
import itertools
import io

from collections import Counter

In [2]:
# Loading Data:
# Read the validation CSV into the `val` DataFrame used by the later cells.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at exactly this location.
file_path = '/mnt/usb/cmdunham/preprocessed_ims_data/val_data.csv'
val = pd.read_csv(file_path)

In [None]:
class Autoencoder(nn.Module):
  """Single linear projection from `in_features` to `out_features`.

  Despite its name, the module defines only an `encoder` stage (one
  `nn.Linear` wrapped in `nn.Sequential`); there is no decoder.

  Args:
    in_features: size of the last dimension of the input (default 1676).
    out_features: size of the last dimension of the output (default 652).
  """

  def __init__(self, in_features=1676, out_features=652):
    super().__init__()
    # Kept inside nn.Sequential so parameter names stay `encoder.0.weight` /
    # `encoder.0.bias`, matching any previously saved state dicts.
    self.encoder = nn.Sequential(
      nn.Linear(in_features, out_features),
    )

  def forward(self, x):
    """Apply the linear layer to `x` and return the result."""
    return self.encoder(x)

In [4]:
# Pick the compute device: the CUDA GPU with the most free memory when CUDA is
# available, otherwise the CPU. The result is stored in `device`.
if torch.cuda.is_available():
    # Get the list of GPUs
    gpus = GPUtil.getGPUs()

    if gpus:
        # Find the GPU with the most free memory
        # NOTE(review): GPUtil reports ids as seen by nvidia-smi; if
        # CUDA_VISIBLE_DEVICES remaps devices these may not match PyTorch's
        # indices -- confirm on the target machine.
        best_gpu = max(gpus, key=lambda gpu: gpu.memoryFree)

        # Print details about the selected GPU
        print(f"Selected GPU ID: {best_gpu.id}")
        print(f"  Name: {best_gpu.name}")
        print(f"  Memory Free: {best_gpu.memoryFree} MB")
        print(f"  Memory Used: {best_gpu.memoryUsed} MB")
        print(f"  GPU Load: {best_gpu.load * 100:.2f}%")

        # Set the device for later use
        device = torch.device(f'cuda:{best_gpu.id}')
        print('Current device ID: ', device)

        # Set the current device in PyTorch
        torch.cuda.set_device(best_gpu.id)
    else:
        # GPUtil found no GPUs even though CUDA is available; calling max() on
        # the empty list would raise, so keep PyTorch's current CUDA device.
        device = torch.device(f'cuda:{torch.cuda.current_device()}')
        print('Current device ID: ', device)

    # Confirm the currently selected device in PyTorch.
    # These calls are only valid when CUDA is available, so they live in this
    # branch instead of running unconditionally.
    print("PyTorch current device ID:", torch.cuda.current_device())
    print("PyTorch current device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device('cpu')
    print('Using CPU')

Selected GPU ID: 1
  Name: NVIDIA GeForce RTX 4090
  Memory Free: 17082.0 MB
  Memory Used: 7135.0 MB
  GPU Load: 0.00%
Current device ID:  cuda:1
PyTorch current device ID: 1
PyTorch current device name: NVIDIA GeForce RTX 4090


In [None]:
def train_one_epoch(train_dataset, device, model, criterion, optimizer, epoch, combo):
  """Run one optimisation pass over `train_dataset`.

  Each batch yields `(true_spectra, name_encodings, true_embeddings)`. The
  model is fed the embeddings and its output is scored against the spectra
  with `criterion`; `optimizer` is stepped once per batch.

  Args:
    train_dataset: iterable of 3-tuples of tensors that also supports `len()`
      (the number of batches).
    device: device every batch tensor is moved to.
    model: callable mapping embeddings to predicted spectra.
    criterion: loss callable `criterion(predicted, target)`.
    optimizer: optimizer providing `zero_grad()` and `step()`.
    epoch: zero-based index of the current epoch.
    combo: hyperparameter dict; only `combo['epochs']` is read.

  Returns:
    The mean per-batch loss. When `epoch + 1 == combo['epochs']` (the final
    epoch) a 4-tuple is returned instead:
    `(mean_loss, predicted_spectra, name_encodings, original_spectra)`, where
    the last three are lists with one numpy array per training sample.
  """
  is_final_epoch = (epoch + 1) == combo['epochs']

  def as_numpy(row):
    # move to host memory and drop the autograd graph before converting
    return row.cpu().detach().numpy()

  running_loss = 0
  kept_predictions = []
  kept_encodings = []
  kept_targets = []

  for target_batch, encoding_batch, embedding_batch in train_dataset:
    # place the whole batch on the training device
    target_batch = target_batch.to(device)
    encoding_batch = encoding_batch.to(device)
    embedding_batch = embedding_batch.to(device)

    # clear gradients left over from the previous batch
    optimizer.zero_grad()

    # forward pass and loss
    predicted_batch = model(embedding_batch)
    batch_loss = criterion(predicted_batch, target_batch)
    running_loss += batch_loss.item()

    # backpropagation and parameter update
    batch_loss.backward()
    optimizer.step()

    # only the final epoch keeps per-sample outputs for later inspection
    if is_final_epoch:
      for encoding_row, predicted_row, target_row in zip(encoding_batch, predicted_batch, target_batch):
        kept_encodings.append(as_numpy(encoding_row))
        kept_predictions.append(as_numpy(predicted_row))
        kept_targets.append(as_numpy(target_row))

  # average over the number of batches
  mean_loss = running_loss / len(train_dataset)
  if not is_final_epoch:
    return mean_loss
  return mean_loss, kept_predictions, kept_encodings, kept_targets

In [None]:
# Positional column layout of `val` assumed by the slices below:
#   first 2 cols  -> 'Unnamed:0' and 'index' (dropped)
#   middle cols   -> spectra
#   last 9 cols   -> 'Label' followed by the 8 one-hot chemical encodings
val_spectra = val.iloc[:, 2:-9]
val_chem_encodings = val.iloc[:, -8:]

# One chemical label per validation row, used to look up its true embedding.
val_chem_labels = [chem_name for chem_name in val['Label']]
embedding_lookup = name_smiles_embedding_df['Embedding Floats']
val_embeddings = [embedding_lookup[chem_name] for chem_name in val_chem_labels]

# Tensors of true embeddings, spectra and chemical name encodings, all on `device`.
val_embeddings_tensor = torch.Tensor(val_embeddings).to(device)
val_spectra_tensor = torch.Tensor(val_spectra.to_numpy()).to(device)
val_chem_encodings_tensor = torch.Tensor(val_chem_encodings.to_numpy()).to(device)

In [None]:
# Hyperparameter sweep driver: for every combination in `model_config`, train a
# fresh generator, log per-epoch train/validation loss to wandb, and remember
# the combination with the lowest final training loss.

# set var deciding if results plot for this run is saved to wandb
log_wandb = True

# Last 8 cols of the df are the chem names
sorted_chem_names = list(val.columns[-8:])

# Every value is a list of candidates; the sweep runs their Cartesian product.
model_config = {
    'batch_size': [32],
    'epochs': [30],
    'learning_rate': [.001],
}

# NOTE(review): `train_dataset`, `config` and `run_with_wandb` are not defined
# in the cells visible here; they are assumed to come from another cell or
# module -- confirm before running from a fresh kernel.

# Track the best combination across the sweep (by final-epoch training loss).
lowest_loss = np.inf
best_hyperparams = {}

for combo_values in itertools.product(*model_config.values()):
  # one concrete hyperparameter setting, e.g. {'batch_size': 32, ...}
  combo = dict(zip(model_config.keys(), combo_values))

  val_dataset = DataLoader(TensorDataset(val_spectra_tensor, val_chem_encodings_tensor, val_embeddings_tensor), batch_size=combo['batch_size'], shuffle=False)
  generator = Autoencoder().to(device)

  generator_optimizer = torch.optim.AdamW(generator.parameters(), lr = combo['learning_rate'])
  generator_criterion = nn.MSELoss()

  wandb_kwargs = {
    'learning_rate': combo['learning_rate'],
    'epochs': combo['epochs'],
    'batch_size': combo['batch_size'],
    'model_architecture': 'generator',
    'optimizer':'AdamW',
    'loss': 'MSELoss'
  }

  run_with_wandb(config, **wandb_kwargs)

  print('--------------------------')
  print('--------------------------')
  print('New run with hyperparameters:')
  for key in combo:
    print(key, ' : ', combo[key])
  print('--------------------------')
  print('--------------------------')

  for epoch in range(combo['epochs']):
    # Set model to training mode
    generator.train(True)

    # do a pass over the data
    # at last epoch train_one_epoch also returns predictions, encodings and targets
    if (epoch + 1) == combo['epochs']:
      average_loss, predicted_spectra, output_name_encodings, original_spectra = train_one_epoch(
        train_dataset, device, generator, generator_criterion, generator_optimizer, epoch, combo
        )
    else:
      average_loss = train_one_epoch(
        train_dataset, device, generator, generator_criterion, generator_optimizer, epoch, combo
        )

    epoch_val_loss = 0
    # evaluate model on validation data
    generator.eval() # Set model to evaluation mode
    with torch.no_grad():
      for val_true_spectra, val_name_encodings, val_true_embeddings in val_dataset:
        val_true_spectra = val_true_spectra.to(device)
        val_name_encodings = val_name_encodings.to(device)
        val_true_embeddings = val_true_embeddings.to(device)

        val_batch_predicted_spectra = generator(val_true_embeddings)

        val_loss = generator_criterion(val_batch_predicted_spectra, val_true_spectra)
        # accumulate epoch validation loss
        epoch_val_loss += val_loss.item()

    # divide by number of batches to calculate average loss
    val_average_loss = epoch_val_loss/len(val_dataset)

    # log losses to wandb
    wandb.log({"Generator Training Loss": average_loss, "Generator Validation Loss": val_average_loss})

    if (epoch + 1) % 10 == 0:
      print('Epoch[{}/{}]:'.format(epoch+1, combo['epochs']))
      print(f'   Training loss: {average_loss}')
      print(f'   Validation loss: {val_average_loss}')
      print('-------------------------------------------')

  # Compare combinations on the training loss of their final epoch.
  if average_loss < lowest_loss:
    lowest_loss = average_loss
    best_hyperparams = combo

  # Close the wandb run once per combination, after all epochs are logged;
  # finishing inside the epoch loop would end the run after the first epoch.
  wandb.finish()

print('Hyperparameters for best model: ')
for key in best_hyperparams:
  print('   ', key, ' : ', best_hyperparams[key])