In [47]:
import ast
import json
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split

# CSV file the dataset is read from
CSV_PATH = 'spectra.csv'

# Functional group names; a name's position in this list is its index in a label vector
label_map = ['phenol', 'aldehyde']
num_classes = len(label_map)

def _parse_label_collection(labels):
    """Turn one raw label cell into an object that is safe for membership tests.

    - None / NaN (an empty CSV cell) -> empty tuple, i.e. "no labels".
    - A string holding a JSON or Python-literal list/tuple/set -> that collection,
      so membership is an exact match on each label instead of a substring match.
    - A string holding a single quoted label -> a one-element tuple.
    - Any other string is returned unchanged (membership stays a substring test,
      which keeps plain formats such as 'phenol,aldehyde' working as before).
    - Non-string inputs (lists, sets, ...) are returned unchanged.
    """
    if labels is None:
        return ()
    # NaN is the only float that is not equal to itself
    if isinstance(labels, float) and labels != labels:
        return ()
    if isinstance(labels, str):
        for parse in (json.loads, ast.literal_eval):
            try:
                parsed = parse(labels)
            except (ValueError, SyntaxError, TypeError):
                continue
            if isinstance(parsed, (list, tuple, set, frozenset)):
                return parsed
            if isinstance(parsed, str):
                return (parsed,)
    return labels


# Torch expects every label list to have the same length. Not all samples have the same amount of labels, so instead
# we convert the labels to a multi hot vector v, where v_i = 1 if the sample has label i and v_i = 0 otherwise
def labels_to_multi_hot_vector(labels, classes=None):
    """Encode the labels of one sample as a fixed-length multi-hot vector.

    Args:
        labels: Collection of label names, a string encoding such a collection
            (JSON or Python literal), or None/NaN for a sample without labels.
        classes: Ordered label names defining the vector layout. Defaults to the
            module-level ``label_map``.

    Returns:
        A float32 tensor of length ``len(classes)`` whose i-th entry is 1 when
        ``classes[i]`` is among ``labels`` and 0 otherwise.
    """
    if classes is None:
        classes = label_map
    present = _parse_label_collection(labels)
    return torch.tensor(
        [1.0 if name in present else 0.0 for name in classes],
        dtype=torch.float32,
    )

class IRDataset(Dataset):
    """Map-style dataset backed by a CSV file with 'spectrum' and 'labels' columns.

    Both columns are decoded once at construction time: 'spectrum' from JSON text,
    'labels' through ``labels_to_multi_hot_vector``. Optional callables can
    post-process the spectrum (``transform``) and the labels
    (``target_transform``) each time an item is fetched.
    """

    def __init__(self, csv_path, transform=None, target_transform=None):
        frame = pd.read_csv(csv_path)
        # Decode the JSON text stored for every spectrum
        frame['spectrum'] = frame['spectrum'].apply(json.loads)
        # Replace every raw label entry with its multi hot encoding
        frame['labels'] = frame['labels'].apply(labels_to_multi_hot_vector)
        self.df = frame
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of samples (rows of the underlying frame)."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``(spectrum, labels)`` pair stored at position ``idx``."""
        # The decoded spectrum is a plain list; torch needs a tensor
        sample = torch.tensor(self.df['spectrum'].iloc[idx], dtype=torch.float32)
        target = self.df['labels'].iloc[idx]

        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            target = self.target_transform(target)
        return sample, target
    
dataset = IRDataset(CSV_PATH)

# Seed for the train/test split, so the same samples land in each subset on every run
# and reported scores are comparable between runs.
SPLIT_SEED = 42
BATCH_SIZE = 64

# TODO: make this split more fair by ensuring groups are evenly split between train/test
# (also add validation set?)
train_size = int(0.8 * len(dataset))
# Derive the test size from the remainder so the two sizes always sum to len(dataset)
test_size = len(dataset) - train_size
train_set, test_set = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training data is shuffled; evaluation order does not affect the metric
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# TODO: find a way to use the gpu because this is slow
# (disabled candidate: torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
device = 'cpu'
print(f'Using {device}')

class NeuralNetwork(nn.Module):
    """Fully connected classifier: two hidden ReLU layers and a linear output layer.

    ``forward`` returns raw logits (no sigmoid is applied), one per class.

    Args:
        input_size: Number of values in one input sample. Defaults to 3600.
        hidden_size: Width of both hidden layers. Defaults to 512.
        output_size: Number of logits produced per sample. Defaults to the
            module-level ``num_classes``.
    """

    def __init__(self, input_size=3600, hidden_size=512, output_size=None):
        super().__init__()
        if output_size is None:
            # Resolved at construction time so the default follows the label map
            output_size = num_classes
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        """Map inputs whose last dimension is ``input_size`` to per-class logits."""
        return self.linear_relu_stack(x)
    
model = NeuralNetwork().to(device)
# BCEWithLogitsLoss takes the raw logits the model returns and scores each class independently
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

num_epochs = 1
# Explicitly select training mode so re-running this cell after evaluation trains correctly
model.train()
for epoch in range(num_epochs):
    for batch_inputs, batch_labels in train_loader:
        # The model lives on `device`; batches must be moved there too, otherwise
        # any non-CPU device fails with a device-mismatch error.
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        
# Exact-match accuracy: a sample counts as correct only when every class is predicted correctly
correct = 0
total = 0

model.eval()
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        # Keep the batch on the same device as the model
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_inputs)
        # The model returns logits; sigmoid turns them into per-class probabilities
        probabilities = torch.sigmoid(outputs)
        predicted = (probabilities > 0.5).float()

        correct += (predicted == batch_labels).all(dim=1).sum().item()
        total += batch_labels.size(0)

# An empty test set yields NaN instead of a ZeroDivisionError
exact_match_accuracy = correct / total if total else float('nan')
print(exact_match_accuracy)

Using cpu
0.9267425320056899
