In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

import csv

from sklearn.decomposition import PCA

In [4]:
# Directory that holds the input CSVs. Kept in one place so the notebook can be
# pointed at another checkout by editing a single line.
DATA_DIR = '/home/cmdunham/mass_spec/mass_spec_repo/data'

# Per-chemical embeddings (one column per chemical name, per the docstrings below).
embeddings = pd.read_csv(f'{DATA_DIR}/MoNA_embeddings_multiple_instrument_types.csv')
# Observations as columns: spectrum rows, then label rows, then condition-encoding rows.
spectra = pd.read_csv(f'{DATA_DIR}/scaled_spectra_with_instrument_type.csv')

In [95]:
def create_dataset_sections(spectra, embeddings, columns, batch_size=32, noise = None, spectrum_len=915, condition_len=12):
    """
    Build a shuffled DataLoader for the requested spectra columns.

    Every selected column of `spectra` is read as one observation laid out
    top to bottom as: `spectrum_len` spectrum rows, then the label rows, then
    `condition_len` condition-encoding rows. The model input for an
    observation is spectrum + condition encoding + the embedding of its
    chemical; the label is the block of rows between spectrum and condition.

    Args:
    - spectra (pd.DataFrame): Mass spec data with observations as columns.
    - embeddings (pd.DataFrame): Column headers are chemical names. Each column represents one chemical's Chemception embedding.
    - columns (iterable of str): Column names of `spectra` to include. The chemical name
      used to look up the embedding is the part of the column name before the first '.'.
    - batch_size (int): Batch size of the returned DataLoader.
    - noise (str): 'spectrum', 'embedding' or 'condition' - that section of the input is
      replaced with zeros. Any other value (default None) leaves the data untouched.
    - spectrum_len (int): Number of leading rows that hold the spectrum.
    - condition_len (int): Number of trailing rows that hold the condition encoding.

    Returns:
    - DataLoader: shuffled batches of (input, label) float tensors, where input has
      spectrum_len + condition_len + embedding-length features per observation.
    """
    # Materialise once: sets and generators are then traversed a single time and
    # features, labels and embeddings are guaranteed to share the same ordering.
    columns = list(columns)

    # First row of the trailing condition block (clamped for very short frames).
    condition_start = max(len(spectra) - condition_len, 0)

    selected = spectra[columns]
    spectrum_data = selected.iloc[:spectrum_len].to_numpy()
    # label is chemical name encoding: one row of `labels` per observation
    labels = selected.iloc[spectrum_len:condition_start].to_numpy().T
    condition_encodings = selected.iloc[condition_start:].to_numpy()

    # One embedding column per observation, in the same order as `columns`.
    chem_names = [col.split('.')[0] for col in columns]
    embedding_data = embeddings[chem_names].to_numpy()

    # Blank out the requested section of the input.
    if noise == 'spectrum':
        spectrum_data = np.zeros_like(spectrum_data)
    elif noise == 'condition':
        condition_encodings = np.zeros_like(condition_encodings)
    elif noise == 'embedding':
        embedding_data = np.zeros_like(embedding_data)

    # combine spectrum data, condition encoding and chemception embedding;
    # transpose so that each row is one observation
    features = np.vstack((spectrum_data, condition_encodings, embedding_data)).T

    dataset = TensorDataset(
        torch.tensor(features, dtype=torch.float),
        torch.tensor(labels, dtype=torch.float),
    )
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [96]:
########### Chem names + instrument encodings as labels

def create_dataset(spectra, embeddings, test_chem = 'Succinic Acid', instrument_idx = 8, batch_size=32, training_chems = None, noise = None):
  """
  Split the spectra into a held-out test set and a training set, each wrapped in a DataLoader.

  A column goes to the test set when its chemical name (the column header up to
  the first '.') equals `test_chem` AND the first 1 in its instrument encoding
  (the last 12 rows) sits at position `instrument_idx`. Every other column is a
  training candidate. Training columns keep the column order of `spectra`, so
  the dataset contents are reproducible from run to run.

  Args:
  - spectra (pd.DataFrame): Mass spec data with observations as columns.
  - embeddings (pd.DataFrame): Column headers are chemical names. Each column represents one chemical's Chemception embedding.
  - test_chem: (str) Name of the chemical to be set aside for testing
  - instrument_idx: (int) Encoded index of instrument to be set aside for testing
  - batch_size: (int) Batch size to use for training
  - training_chems: (list) When training small group models this param can be used to specify which chems to include in the dataset
  - noise: (str) 'spectrum', 'embedding' or 'condition' - Location for noise in the dataset.
    Applied to the training data only; the test data is always built without noise.
  Returns:
    Tuple:
    - train_input (DataLoader object): loader over the training columns, as built by create_dataset_sections
    - test_input (DataLoader object): loader over the held-out columns, as built by create_dataset_sections
  """
  # separate specified chemical and instrument type for testing
  test_cols = []
  for col in spectra.columns:
    # Compare the whole chemical name: a substring test would also pull in
    # other chemicals whose names merely contain `test_chem`.
    if col.split('.')[0] != test_chem:
      continue
    instrument_encoding = list(spectra[col].iloc[-12:])
    # filter out the test instrument type; a column with no instrument flag
    # at all is simply not a test column (list.index would raise on it)
    if 1 in instrument_encoding and instrument_encoding.index(1) == instrument_idx:
      test_cols.append(col)

  # use either specified chemicals (for small group training) or everything except test cols for training data
  held_out = set(test_cols)
  train_cols = [col for col in spectra.columns if col not in held_out]
  if training_chems:
    train_cols = [col for col in train_cols if col.split('.')[0] in training_chems]

  train_input = create_dataset_sections(spectra, embeddings, train_cols, batch_size=batch_size, noise=noise)
  test_input = create_dataset_sections(spectra, embeddings, test_cols, batch_size=batch_size, noise=None)

  return train_input, test_input


# Build the train/test loaders using create_dataset's default held-out chemical and instrument.
train_input, test_input = create_dataset(spectra, embeddings)

In [64]:
########### Chem names + instrument encodings as labels

def create_dataset(spectra, embeddings, test_chem = 'Succinic Acid', instrument_idx = 8, batch_size=32, training_chems = None, noise = None):
  """
  Split the spectra into a held-out test set and a training set, each wrapped in a shuffled DataLoader.

  A column goes to the test set when its chemical name (the column header up to
  the first '.') equals `test_chem` AND the first 1 in its instrument encoding
  (the last 12 rows) sits at position `instrument_idx`. Every other column is a
  training candidate. Training columns keep the column order of `spectra`.

  Each observation's input is its first 915 rows (spectrum), its last 12 rows
  (condition encoding) and the embedding of its chemical; its label is the
  block of rows in between.

  NOTE(review): this notebook defines `create_dataset` in two cells; when the
  cells run top to bottom this definition replaces the earlier one. Here
  `noise` is applied to the TEST data and the training data is left untouched,
  whereas the earlier definition applies it to the training data.

  Args:
  - spectra (pd.DataFrame): Mass spec data with observations as columns.
  - embeddings (pd.DataFrame): Column headers are chemical names. Each column represents one chemical's Chemception embedding.
  - test_chem: (str) Name of the chemical to be set aside for testing
  - instrument_idx: (int) Encoded index of instrument to be set aside for testing
  - batch_size: (int) Batch size to use for training
  - training_chems: (list) When training small group models this param can be used to specify which chems to include in the dataset
  - noise: (str) 'spectrum', 'embedding' or 'condition' - section of the test input to replace with zeros
  Returns:
    Tuple:
    - train_input (DataLoader object): spectrum, encoding information and embedding, labels - chem names
    - test_input (DataLoader object): spectrum, encoding information and embedding (one section zeroed when `noise` is set), labels - chem names
  """
  def _make_loader(cols, noise_location):
    # Assemble (input, label) tensors for `cols` and wrap them in a shuffled DataLoader.
    selected = spectra[cols]
    spectrum_data = selected.iloc[:915].to_numpy()
    # label is chemical name encoding: one row per observation after the transpose
    labels = selected.iloc[915:-12].to_numpy().T
    condition_encodings = selected.iloc[-12:].to_numpy()
    embedding_data = embeddings[[col.split('.')[0] for col in cols]].to_numpy()

    if noise_location == 'spectrum':
      spectrum_data = np.zeros_like(spectrum_data)
    elif noise_location == 'condition':
      condition_encodings = np.zeros_like(condition_encodings)
    elif noise_location == 'embedding':
      embedding_data = np.zeros_like(embedding_data)

    # combine spectrum data, condition encoding and chemception embedding
    features = np.vstack((spectrum_data, condition_encodings, embedding_data)).T
    dataset = TensorDataset(
      torch.tensor(features, dtype=torch.float),
      torch.tensor(labels, dtype=torch.float),
    )
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

  # separate specified chemical and instrument type for testing
  test_cols = []
  for col in spectra.columns:
    # Compare the whole chemical name: a substring test would also pull in
    # other chemicals whose names merely contain `test_chem`.
    if col.split('.')[0] != test_chem:
      continue
    instrument_encoding = list(spectra[col].iloc[-12:])
    # filter out the test instrument type; a column with no instrument flag
    # at all is simply not a test column (list.index would raise on it)
    if 1 in instrument_encoding and instrument_encoding.index(1) == instrument_idx:
      test_cols.append(col)

  # use either specified chemicals (for small group training) or everything except test cols for training data
  held_out = set(test_cols)
  train_cols = [col for col in spectra.columns if col not in held_out]
  if training_chems:
    train_cols = [col for col in train_cols if col.split('.')[0] in training_chems]

  test_input = _make_loader(test_cols, noise)
  train_input = _make_loader(train_cols, None)
  return train_input, test_input


# Smoke-test call of the definition above: `thing` receives the training loader and `stuff` the test loader.
# NOTE(review): these look like throwaway names - confirm nothing later in the notebook depends on them.
thing, stuff = create_dataset(spectra, embeddings)

In [46]:
class Encoder(nn.Module):
  """
  Fully connected network that maps an `input_size`-wide vector through a stack
  of LeakyReLU-activated hidden layers and back to `input_size` outputs.

  The module also owns its training utilities: an Adam optimizer over its own
  parameters (`self.optimizer`) and an MSE loss (`self.criterion`).

  Args:
  - learning_rate (float): Learning rate handed to the Adam optimizer.
  - input_size (int): Width of the input and of the output. The default, 1439,
    presumably corresponds to 915 spectrum + 12 condition + 512 embedding
    values per observation - confirm against the dataset in use.
  - hidden_sizes (sequence of int): Output width of each hidden Linear layer, in
    order. The default reproduces the original fixed architecture
    (878 down to 545 in steps of 37).
  """
  def __init__(self, learning_rate, input_size=1439, hidden_sizes=(878, 841, 804, 767, 730, 693, 656, 619, 582, 545)):
    super().__init__()

    # Linear -> LeakyReLU pairs for every hidden width, then a final Linear back
    # to input_size. Layer order (and hence state_dict keys such as
    # 'encoder.0.weight') matches the hand-written stack this replaces.
    layers = []
    in_features = input_size
    for out_features in hidden_sizes:
      layers.append(nn.Linear(in_features, out_features))
      layers.append(nn.LeakyReLU(inplace=True))
      in_features = out_features
    layers.append(nn.Linear(in_features, input_size))
    self.encoder = nn.Sequential(*layers)

    self.optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
    self.criterion = nn.MSELoss()

  def forward(self, x):
    """Run `x` (last dimension of size input_size) through the layer stack."""
    return self.encoder(x)

1439