In [2]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split

CSV_PATH = 'spectra.csv'

# Index -> functional group name
label_map = [
    'phenol',
    'aldehyde',
    'arene'
]
num_classes = len(label_map)

# Torch expects every label list to have the same length. Not all samples have the same amount of labels, so instead
# we convert the labels to a multi hot vector v, where v_i = 1 if the sample has label i and v_i = 0 otherwise
def labels_to_multi_hot_vector(labels):
    multi_hot_vector = []
    for i in range(len(label_map)):
        if label_map[i] in labels:
            multi_hot_vector.append(1)
        else:
            multi_hot_vector.append(0)
    return torch.tensor(multi_hot_vector, dtype=torch.float32)

class IRDataset(Dataset):
    def __init__(self, csv_path, transform=None, target_transform=None):
        self.df = pd.read_csv(csv_path)
        # Parse all the json encoded spectra
        self.df['spectrum'] = self.df['spectrum'].apply(json.loads)
        # Convert the string labels to a multi hot vector
        self.df['labels'] = self.df['labels'].apply(labels_to_multi_hot_vector)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        spectrum = self.df['spectrum'].iloc[idx]
        # Torch expects spectra as a tensor, not a list
        spectrum = torch.tensor(spectrum, dtype=torch.float32)
        
        labels = self.df['labels'].iloc[idx]
        
        if (self.transform):
            spectrum = self.transform(spectrum)
        if (self.target_transform):
            labels = self.target_transform(labels)
        return spectrum, labels    
    
dataset = IRDataset(CSV_PATH)

# TODO: make this split more fair by ensuring groups are evenly split between train/test
# (also add validation set?)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_set, test_set = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
import ast
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# 75% train 15% test 10% validation
TRAIN_SIZE = 0.75
VALIDATION_SIZE = 0.4

# Counting how many spectra contain each group (just for logging)
counts = {}
for i in range(len(label_map)):
    counts[label_map[i]] = dataset.df['labels'].apply(lambda L: L[i].item() == 1).sum()
print('Spectra counts per group:')
for name, cnt in counts.items():
    print(f'{name}: {cnt}')

try:
    # Stage 1 (stratifier): (1 - TRAIN_SIZE)% for “remainder”, TRAIN_SIZE% for training
    strat1 = MultilabelStratifiedShuffleSplit(
        n_splits=1, 
        test_size=1 - TRAIN_SIZE,    # (1 - TRAIN_SIZE)% goes to df_remain
        random_state=42
    )
    
    # scikit-learn doesn't work with torch tensors, so we need to convert df['labels'] to a numpy array (messy)
    train_idx, remain_idx = next(strat1.split(dataset.df, np.stack(dataset.df['labels'].apply(lambda x: x.numpy()).values)))
    df_train = dataset.df.iloc[train_idx].reset_index(drop=True)
    Y_train  = dataset.df['labels'][train_idx]
    df_remain = dataset.df.iloc[remain_idx].reset_index(drop=True)
    Y_remain  = dataset.df['labels'][remain_idx]

    print(f"\nAfter Stage 1 → Train: {len(df_train)} (≈{TRAIN_SIZE * 100.0}%), Remainder: {len(df_remain)} (≈{(1 - TRAIN_SIZE) * 100.0}%)")

    strat2 = MultilabelStratifiedShuffleSplit(
        n_splits=1,
        test_size=VALIDATION_SIZE,   # VALIDATION_SIZE% of df_remain → validation
        random_state=42
    )
    
    # Same deal as before, Y_remain is a column of tensors, scikit-learn can't work with them
    remain_train_idx, remain_test_idx = next(strat2.split(df_remain, np.stack(Y_remain.apply(lambda x: x.numpy()).values)))

    # “remain_train_idx” is 75% of df_remain i.e. the test set
    df_test  = df_remain.iloc[remain_train_idx].reset_index(drop=True)
    # “remain_test_idx” is 25% of df_remain i.e. the validation set
    df_val   = df_remain.iloc[remain_test_idx].reset_index(drop=True)

    print(f"After Stage 2 → Test: {len(df_test)} (≈{(1 - TRAIN_SIZE) * (1 - VALIDATION_SIZE) * 100.0}%), Validation: {len(df_val)} (≈{(1 - TRAIN_SIZE) * VALIDATION_SIZE * 100.0}%)")

except ImportError:
    # Fallback (non‐stratified) splitting if iterstrat is missing
    # Stage 1: TRAIN_SIZE% train, (1 - TRAIN_SIZE)% remainder
    df_train, df_remain = train_test_split(
        dataset.df, 
        test_size=1 - TRAIN_SIZE,
        random_state=42,
        shuffle=True
    )
    print(f"\nAfter Stage 1 (fallback) → Train: {len(df_train)}, Remainder: {len(df_remain)}")

    # Stage 2: within remainder, do (1 - VALIDATION_SIZE)% for test and VALIDATION_SIZE% for val
    df_test, df_val = train_test_split(
        df_remain,
        test_size=VALIDATION_SIZE,   # VALIDATION_SIZE of “remainder” is the validation set 
        random_state=42,
        shuffle=True
    )
    print(f"After Stage 2 (fallback) → Test: {len(df_test)}, Validation: {len(df_val)}")

print()
for i in range(len(label_map)):
    name = label_map[i]
    orig_tot = dataset.df['labels'].apply(lambda L: L[i].item() == 1).sum()
    train_tot = df_train['labels'].apply(lambda L: L[i].item() == 1).sum()
    test_tot  = df_test['labels'].apply(lambda L: L[i].item() == 1).sum()
    val_tot   = df_val['labels'].apply(lambda L: L[i].item() == 1).sum()
    print(f"Label = {label_map[i]}")
    print(f" Overall prevalence: {orig_tot / len(dataset.df):.2%}, total: {orig_tot}")
    print(f"→ Train prevalence: {train_tot / len(df_train):.2%}, total: {train_tot}")
    print(f"→ Test prevalence: {test_tot / len(df_test):.2%}, total: {test_tot}")
    print(f"→ Val prevalence: {val_tot / len(df_val):.2%}, total: {val_tot}\n")

Spectra counts per group:
phenol: 555
aldehyde: 249
arene: 5369

After Stage 1 → Train: 8577 (≈75.0%), Remainder: 2859 (≈25.0%)
After Stage 2 → Test: 1715 (≈15.0%), Validation: 1144 (≈10.0%)

Label = phenol
 Overall prevalence: 4.85%, total: 555
→ Train prevalence: 4.85%, total: 416
→ Test prevalence: 4.84%, total: 83
→ Val prevalence: 4.90%, total: 56

Label = aldehyde
 Overall prevalence: 2.18%, total: 249
→ Train prevalence: 2.18%, total: 187
→ Test prevalence: 2.16%, total: 37
→ Val prevalence: 2.19%, total: 25

Label = arene
 Overall prevalence: 46.95%, total: 5369
→ Train prevalence: 46.95%, total: 4027
→ Test prevalence: 46.94%, total: 805
→ Val prevalence: 46.94%, total: 537



In [2]:
def precision(predictions, true_labels, label_index):
    true_positive = sum((predictions[i][label_index] == 1).item() and (true_labels[i][label_index] == 1).item() for i in range(len(predictions)))
    false_positive = sum((predictions[i][label_index] == 1).item() and (true_labels[i][label_index] == 0).item() for i in range(len(predictions)))
    if true_positive == 0:
        return 0.0
    return true_positive / (true_positive + false_positive)

def recall(predictions, true_labels, label_index):
    true_positive = sum((predictions[i][label_index] == 1).item() and (true_labels[i][label_index] == 1).item() for i in range(len(predictions)))
    false_negative = sum((predictions[i][label_index] == 0).item() and (true_labels[i][label_index] == 1).item() for i in range(len(predictions)))
    if true_positive == 0:
        return 0.0
    return true_positive / (true_positive + false_negative)
    
def f1_score(predictions, true_labels, label_index):
    precision_score = precision(predictions, true_labels, label_index)
    recall_score = recall(predictions, true_labels, label_index)
    if precision_score == 0 and recall_score == 0:
        return 0.0
    return 2 * precision_score * recall_score / (precision_score + recall_score)

def EMR(predictions, true_labels):
    return (predictions == true_labels).all(axis=1).mean()
                
def evaluate(model, data_loader, batch_transform, device = 'cpu'):
    model.eval()
    
    all_true_labels = []
    all_predictions = []
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            batch_inputs = batch_transform(batch_inputs).to(device)
            batch_labels = batch_labels.to(device)
            outputs = model(batch_inputs)
            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > 0.5).float()
        
            all_true_labels.append(batch_labels)
            all_predictions.append(predictions)
    
    y_true = torch.cat(all_true_labels).cpu().numpy()
    y_pred = torch.cat(all_predictions).cpu().numpy()

    results = {}
    f1_scores = []
    for i in range(len(label_map)):
        results[label_map[i]] = {}
        results[label_map[i]]['precision'] = precision(y_pred, y_true, i)
        results[label_map[i]]['recall'] = recall(y_pred, y_true, i)
        f1 = f1_score(y_pred, y_true, i)
        results[label_map[i]]['f1_score'] = f1
        f1_scores.append(f1)
    results['macro'] = {}
    results['macro']['f1_score'] = np.mean(f1_scores)
    results['macro']['EMR'] = EMR(y_pred, y_true)
    
    return results

In [8]:
import optuna 

INPUT_LENGTH = 3600
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using %s' % (device))

class NeuralNetwork(nn.Module):
    def __init__(self, conv_config, fc_config):
        super().__init__()

        conv_layers = []
        last_out_channels = 1
        conv_out_len = INPUT_LENGTH
        for config in conv_config:
            conv_layers.append(nn.Conv1d(in_channels=last_out_channels, out_channels=config['conv_outchannels'], kernel_size=config['conv_kernelsize']))
            conv_layers.append(nn.BatchNorm1d(config["conv_outchannels"]))
            conv_layers.append(nn.ReLU())
            conv_layers.append(nn.MaxPool1d(2))
            last_out_channels = config['conv_outchannels']
            conv_out_len = (conv_out_len - (config['conv_kernelsize'] - 1)) // 2
        self.conv_stack = nn.Sequential(*conv_layers)

        fc_layers = []
        last_out = conv_out_len * last_out_channels
        fc_layers.append(nn.Flatten())
        for config in fc_config:
            fc_layers.append(nn.Linear(last_out, config['fc_size']))
            fc_layers.append(nn.ReLU())
            fc_layers.append(nn.Dropout(config['dropout']))
            last_out = config['fc_size']
        fc_layers.append(nn.Linear(last_out, num_classes))
        self.fc_stack = nn.Sequential(*fc_layers)
    
    def forward(self, x):
        x = self.conv_stack(x)
        x = self.fc_stack(x)
        return x
    
def batch_to_conv_input(batch_input):
    return batch_input.unsqueeze(1)

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    conv_layers = trial.suggest_int('conv_layers', 2, 4)
    conv_config = []
    for i in range(conv_layers):
        conv_config.append({})
        conv_config[i]['conv_outchannels'] = trial.suggest_categorical('conv%i_outchannels' % (i), [8, 16, 32, 48, 64])
        conv_config[i]['conv_kernelsize'] = trial.suggest_categorical('conv%i_kernelsize' % (i), [5, 7, 9, 11, 13])

    fc_layers = trial.suggest_int('fc_layers', 1, 3)
    fc_config = []
    for i in range(fc_layers):
        fc_config.append({})
        fc_config[i]['fc_size'] = trial.suggest_categorical('fc%i_size' % (i), [64, 128, 256, 512])
        max_dropout = max(0.1, 0.5 - i * 0.2) # Give later layers a lower dropout
        fc_config[i]['dropout'] = trial.suggest_float('dropout%i' % (i), 0.0, max_dropout)
    
    model = NeuralNetwork(
        conv_config = conv_config,
        fc_config = fc_config
    ).to(device)
    
    # Increasing pos_weight punishes false negatives more heavily
    pos_weight = torch.full((num_classes,), 2.0, device=device) 
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, min_lr=1e6)
    
    model.train()
    
    # For hyperparameter optimization, 10 epochs will be enough to see if the model is good or bad
    num_epochs=10
    for i in range(num_epochs):
        for batch_inputs, batch_labels in train_loader:
            # The CNN expects batch_inputs to have a channel dimension (like [batch_size, 1, input_length]), but currently
            # batch_input has no channel and the dimensions of batch_inputs are [64, 3600].
            # unsqueeze adds a channel dimension in the middle, so batch_input's dimensions become [64, 1, 3600]
            batch_inputs = batch_to_conv_input(batch_inputs).to(device)
            batch_labels = batch_labels.to(device)
        
            optimizer.zero_grad()
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            scheduler.step(loss)
    
        results = evaluate(model, test_loader, batch_to_conv_input)
        f1 = results['macro']['f1_score']
        
        # Print some additional metrics at the end of a run
        if (i == num_epochs - 1):
            for j in range(num_classes):
                print('%s, recall: %f, precision: %f' % (label_map[j], results[label_map[j]]['recall'], results[label_map[j]]['precision']))
            print('Macro, F1: %f, EMR: %f' % (results['macro']['f1_score'], results['macro']['EMR']))
        trial.report(f1, step=i)
        
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
    return f1
    

pruner = optuna.pruners.MedianPruner(n_warmup_steps=2)
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=25)

best_trial = study.best_trial
print(best_trial.params)
print(best_trial.value)

[I 2025-06-04 12:48:57,427] A new study created in memory with name: no-name-387cc3be-9711-43dd-8db8-43961380b166


False
Using cpu


[I 2025-06-04 12:53:41,866] Trial 0 finished with value: 0.8044066645949774 and parameters: {'learning_rate': 0.00023893597396861367, 'conv_layers': 2, 'conv0_outchannels': 16, 'conv0_kernelsize': 11, 'conv1_outchannels': 16, 'conv1_kernelsize': 9, 'fc_layers': 3, 'fc0_size': 128, 'dropout0': 0.24303916847160184, 'fc1_size': 128, 'dropout1': 0.09353508310383343, 'fc2_size': 256, 'dropout2': 0.020368387553490564}. Best is trial 0 with value: 0.8044066645949774.


phenol, recall: 0.684685, precision: 0.791667
aldehyde, recall: 0.687500, precision: 0.891892
arene, recall: 0.931066, precision: 0.875540
Macro, F1: 0.804407, EMR: 0.880682


[I 2025-06-04 13:08:26,793] Trial 1 finished with value: 0.8140188439133157 and parameters: {'learning_rate': 0.00021625533428173412, 'conv_layers': 2, 'conv0_outchannels': 48, 'conv0_kernelsize': 9, 'conv1_outchannels': 64, 'conv1_kernelsize': 9, 'fc_layers': 3, 'fc0_size': 64, 'dropout0': 0.4413340799780823, 'fc1_size': 64, 'dropout1': 0.2537198012491565, 'fc2_size': 512, 'dropout2': 0.012078761792962557}. Best is trial 1 with value: 0.8140188439133157.


phenol, recall: 0.711712, precision: 0.790000
aldehyde, recall: 0.729167, precision: 0.853659
arene, recall: 0.960478, precision: 0.858669
Macro, F1: 0.814019, EMR: 0.881119


[I 2025-06-04 13:16:17,694] Trial 2 finished with value: 0.7848253477573904 and parameters: {'learning_rate': 0.007886108483520898, 'conv_layers': 4, 'conv0_outchannels': 8, 'conv0_kernelsize': 11, 'conv1_outchannels': 64, 'conv1_kernelsize': 13, 'conv2_outchannels': 32, 'conv2_kernelsize': 7, 'conv3_outchannels': 16, 'conv3_kernelsize': 7, 'fc_layers': 2, 'fc0_size': 64, 'dropout0': 0.3099955120598508, 'fc1_size': 256, 'dropout1': 0.23612838787078957}. Best is trial 1 with value: 0.8140188439133157.


phenol, recall: 0.648649, precision: 0.692308
aldehyde, recall: 0.687500, precision: 0.916667
arene, recall: 0.944853, precision: 0.857381
Macro, F1: 0.784825, EMR: 0.872378


[I 2025-06-04 13:27:29,548] Trial 3 finished with value: 0.7859436664186005 and parameters: {'learning_rate': 1.2610404115178327e-05, 'conv_layers': 2, 'conv0_outchannels': 48, 'conv0_kernelsize': 11, 'conv1_outchannels': 16, 'conv1_kernelsize': 9, 'fc_layers': 2, 'fc0_size': 512, 'dropout0': 0.18209848841096804, 'fc1_size': 512, 'dropout1': 0.1624020281895291}. Best is trial 1 with value: 0.8140188439133157.


phenol, recall: 0.828829, precision: 0.713178
aldehyde, recall: 0.562500, precision: 0.900000
arene, recall: 0.939338, precision: 0.861720
Macro, F1: 0.785944, EMR: 0.872378


[I 2025-06-04 13:33:20,214] Trial 4 finished with value: 0.8074544461802012 and parameters: {'learning_rate': 0.00016232356173120818, 'conv_layers': 4, 'conv0_outchannels': 16, 'conv0_kernelsize': 9, 'conv1_outchannels': 16, 'conv1_kernelsize': 7, 'conv2_outchannels': 32, 'conv2_kernelsize': 9, 'conv3_outchannels': 8, 'conv3_kernelsize': 9, 'fc_layers': 1, 'fc0_size': 64, 'dropout0': 0.14069673745657274}. Best is trial 1 with value: 0.8140188439133157.


phenol, recall: 0.855856, precision: 0.650685
aldehyde, recall: 0.666667, precision: 0.914286
arene, recall: 0.976103, precision: 0.855761
Macro, F1: 0.807454, EMR: 0.882867


[I 2025-06-04 13:35:46,030] Trial 5 pruned. 
[I 2025-06-04 13:45:09,452] Trial 6 finished with value: 0.8245929753783158 and parameters: {'learning_rate': 0.000759940947059361, 'conv_layers': 4, 'conv0_outchannels': 8, 'conv0_kernelsize': 11, 'conv1_outchannels': 48, 'conv1_kernelsize': 7, 'conv2_outchannels': 64, 'conv2_kernelsize': 9, 'conv3_outchannels': 8, 'conv3_kernelsize': 5, 'fc_layers': 2, 'fc0_size': 512, 'dropout0': 0.03653465374241188, 'fc1_size': 64, 'dropout1': 0.028391032162909013}. Best is trial 6 with value: 0.8245929753783158.


phenol, recall: 0.702703, precision: 0.787879
aldehyde, recall: 0.750000, precision: 0.900000
arene, recall: 0.961397, precision: 0.868771
Macro, F1: 0.824593, EMR: 0.889860


[I 2025-06-04 13:46:33,460] Trial 7 pruned. 
[I 2025-06-04 13:49:12,226] Trial 8 pruned. 
[I 2025-06-04 14:55:41,909] Trial 9 finished with value: 0.833229528814681 and parameters: {'learning_rate': 5.1457655488275636e-05, 'conv_layers': 4, 'conv0_outchannels': 16, 'conv0_kernelsize': 7, 'conv1_outchannels': 48, 'conv1_kernelsize': 5, 'conv2_outchannels': 16, 'conv2_kernelsize': 11, 'conv3_outchannels': 32, 'conv3_kernelsize': 9, 'fc_layers': 1, 'fc0_size': 512, 'dropout0': 0.10207644405034622}. Best is trial 9 with value: 0.833229528814681.


phenol, recall: 0.756757, precision: 0.831683
aldehyde, recall: 0.666667, precision: 0.969697
arene, recall: 0.945772, precision: 0.890138
Macro, F1: 0.833230, EMR: 0.897290


[W 2025-06-04 15:02:15,768] Trial 10 failed with parameters: {'learning_rate': 0.001011707732783943, 'conv_layers': 4, 'conv0_outchannels': 64, 'conv0_kernelsize': 7, 'conv1_outchannels': 48, 'conv1_kernelsize': 5, 'conv2_outchannels': 16, 'conv2_kernelsize': 13, 'conv3_outchannels': 32, 'conv3_kernelsize': 11, 'fc_layers': 1, 'fc0_size': 512, 'dropout0': 0.2697687452412213} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\20234238\AppData\Local\anaconda3\envs\cbl_ir\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\20234238\AppData\Local\Temp\ipykernel_21232\1854491737.py", line 91, in objective
    results = evaluate(model, test_loader, batch_to_conv_input)
  File "C:\Users\20234238\AppData\Local\Temp\ipykernel_21232\1620697563.py", line 34, in evaluate
    outputs = model(batch_inputs)
  File "C:\Users\20234238\AppData\Local\anaconda3\envs\cbl_ir\lib\site-pa

KeyboardInterrupt: 