#### Data Preprocessing

In [1]:
from boxsers.preprocessing import savgol_smoothing, cosmic_filter

def preprocessing_method(x):
    """Clean the spectrum (or spectra) `x` in two steps and return the result.

    1. `cosmic_filter` with `ks=3` - a median filter that removes cosmic rays.
    2. `savgol_smoothing` with the arguments `7, p=3, degree=0` - smooths
       the filtered spectra.

    NOTE(review): the exact meaning of `ks`, `p` and `degree` is defined by
    the boxsers library - confirm against its documentation before tuning.
    """
    despiked = cosmic_filter(x, ks=3)
    return savgol_smoothing(despiked, 7, p=3, degree=0)

2024-08-02 01:13:58.326217: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 01:13:58.326246: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 01:13:58.326960: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


#### Loading Dataset

In [2]:
import numpy as np

##### Train Dataset

In [3]:
# Training ("reference") split of the Bacteria-ID data: features and labels
# are stored as two separate .npy files and loaded in that order.
X_fn, y_fn = (
    "./data/Bacteria-ID/X_reference.npy",
    "./data/Bacteria-ID/y_reference.npy",
)
X_train_raw, y_train_raw = (np.load(npy_path) for npy_path in (X_fn, y_fn))

##### Test Dataset

In [4]:
# Held-out test split of the Bacteria-ID data.
# The file paths get their own names (mirroring X_fn / y_fn of the train
# split) so that X_test / y_test only ever refer to the loaded arrays and
# never to a path string.
X_test_fn = "./data/Bacteria-ID/X_test.npy"
y_test_fn = "./data/Bacteria-ID/y_test.npy"
X_test = np.load(X_test_fn)
y_test = np.load(y_test_fn)

#### Model Setting

In [5]:
from model.Variant_LeNet_without_linear import Variant_LeNet_without_linear
from tqdm import tqdm
import torch
import pandas as pd
from torch.autograd import Variable
from functools import partial
from deep_SLDA import slda_loss, SLDA
from imblearn.metrics import specificity_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import torch.optim as optim
import seaborn as sns
from sklearn.metrics import confusion_matrix
import math
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    auc,
    roc_curve,
)
from plot import plot_loss_metrics, plot_metrics

# --- Experiment configuration -------------------------------------------
n_classes = 30    # number of target classes
batch_size = 800  # samples per mini-batch for every dataloader

# --- Per-fold result accumulators (one entry appended per CV fold) ------
train_avg_accuracy = []
val_avg_accuracy = []
avg_accuracy = []  # test accuracy per fold
avg_roc = []       # mean one-vs-rest AUC per fold

# Placeholder confusion matrix; sized from n_classes so both stay in sync.
C = np.zeros((n_classes, n_classes))

In [6]:
class Solver:
    """Drive training, validation and testing of a feature network with SLDA.

    The solver owns:

    * ``net``        - the feature extractor, moved to ``device``;
    * ``slda_layer`` - an ``SLDA`` instance that, per batch, yields projection
      directions and eigenvalues (semantics defined in ``deep_SLDA``);
    * ``criterion``  - ``slda_loss`` with ``n_classes`` pre-bound;
    * ``optimizer``  - Adam (lr=1e-4, betas=(0.5, 0.999)) over ``net``;
    * ``clf`` / ``dirs`` - the scikit-learn LDA classifier and projection
      directions produced by the most recent *training batch*. They only
      exist after at least one training batch has run (or after ``test``
      restored them from a checkpoint).

    Args:
        dataloaders: Mapping from phase name to an iterable of
            ``(inputs, targets)`` batches. ``train`` reads the ``"train"``
            and ``"val"`` entries, ``test`` reads ``"test"``.
        model: The network to optimise.
        model_path: File the best checkpoint is written to / read from.
        device: Device passed to ``.to(...)`` for the model and every batch.
        n_classes: Number of target classes.
    """

    def __init__(
        self,
        dataloaders,
        model,
        model_path,
        device,
        n_classes,
    ):
        self.dataloaders = dataloaders
        self.device = device
        self.net = model.to(self.device)

        self.criterion = partial(slda_loss, n_classes=n_classes)

        self.optimizer = optim.Adam(self.net.parameters(), lr=1e-4, betas=(0.5, 0.999))
        self.model_path = model_path
        self.n_classes = n_classes
        self.slda_layer = SLDA(self.n_classes)

    @staticmethod
    def _to_numpy(tensor):
        """Return `tensor` as a NumPy array, detached from autograd, on CPU."""
        return tensor.detach().cpu().numpy()

    def iterate(self, epoch, phase, scheduler=None):
        """Run one pass over the ``phase`` dataloader.

        In the ``"train"`` phase, for every batch: the SLDA layer is fit, the
        features are projected onto the returned directions, a *fresh*
        ``LinearDiscriminantAnalysis`` is fit on the projected batch and then
        used to predict that same batch (so the training accuracy is measured
        on the data the classifier was just fit to), and one optimizer step is
        taken on the SLDA loss.

        In any other phase the network is put in eval mode and the directions
        and classifier left behind by the last training batch are reused.
        Gradient tracking is not disabled here; the caller is expected to wrap
        the call in ``torch.no_grad()`` where appropriate.

        Args:
            epoch: Unused; kept for interface compatibility.
            phase: Key into ``self.dataloaders``; ``"train"`` enables training.
            scheduler: Unused; kept for interface compatibility.

        Returns:
            ``(avg_loss, accuracy)`` - the loss averaged over batches and the
            fraction of correctly predicted samples.

        Raises:
            ZeroDivisionError: If the dataloader yields no batches.
        """
        is_train = phase == "train"
        if is_train:
            self.net.train()
        else:
            self.net.eval()

        total_loss = 0.0
        n_batches = 0
        correct = 0
        total = 0

        for inputs, targets in self.dataloaders[phase]:
            inputs = inputs.to(self.device)
            targets = targets.long().to(self.device)
            targets_np = targets.cpu().numpy()

            feas = self.net(inputs)

            if is_train:
                dirs, range_eigenvalue, null_eigenvalue = self.slda_layer.fit(feas, targets, phase)
                projected = self._to_numpy(torch.matmul(feas, dirs.T))
                # A new classifier is fit on every training batch; whichever
                # batch comes last in the epoch is what validation sees.
                self.clf = LinearDiscriminantAnalysis()
                self.clf.fit(projected, targets_np)
                # Store a detached copy: the directions are only needed as
                # constants later, and keeping the attached tensor would keep
                # this batch's autograd graph referenced.
                self.dirs = dirs.detach()
            else:
                # NOTE(review): outside training SLDA.fit is unpacked into two
                # values (no directions) - confirm against deep_SLDA.
                range_eigenvalue, null_eigenvalue = self.slda_layer.fit(feas, targets, phase)
                projected = self._to_numpy(torch.matmul(feas, self.dirs.T))

            predictions = self.clf.predict(projected)
            loss = self.criterion(range_eigenvalue, null_eigenvalue)

            if is_train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            total_loss += loss.item()
            n_batches += 1
            total += targets.size(0)
            # Predictions and labels are compared on the CPU; no need to move
            # the scikit-learn output to the device just to count matches.
            correct += int((predictions == targets_np).sum())

        avg_loss = total_loss / n_batches
        total_acc = correct / total

        return avg_loss, total_acc

    def train(self, epochs):
        """Train for ``epochs`` epochs, checkpointing the best validation epoch.

        After every training epoch the learning-rate scheduler is stepped and
        a validation pass is run under ``torch.no_grad()``. Whenever the
        validation accuracy improves (and always on the first epoch) a
        checkpoint with the epoch, validation loss, current ``dirs``, current
        ``clf`` and the network weights is written to ``self.model_path``.

        Args:
            epochs: Number of epochs to run.

        Returns:
            ``(train_acc, best_acc, useful_stuff)`` where ``train_acc`` is the
            training accuracy of the *last* epoch (0 if ``epochs`` is 0),
            ``best_acc`` the best validation accuracy, and ``useful_stuff`` a
            dict of per-epoch loss / accuracy histories.
        """
        best_acc = 0
        train_acc = 0  # defined up front so that epochs == 0 still returns

        useful_stuff = {
            "training_loss": [],
            "validation_loss": [],
            "train_metrics": [],
            "validation_metrics": [],
        }

        def lr_factor(epoch):
            # Multiply the base lr by 0.9 every 10 epochs; for epochs 0-9 the
            # exponent is 0, i.e. the factor is 1.0.
            return 0.9 ** (epoch // 10)

        self.scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_factor)

        for epoch in tqdm(range(epochs)):
            train_loss, train_acc = self.iterate(epoch, "train")
            useful_stuff["training_loss"].append(train_loss)
            useful_stuff["train_metrics"].append(train_acc)

            self.scheduler.step()

            with torch.no_grad():
                val_loss, val_acc = self.iterate(epoch, "val")
            useful_stuff["validation_loss"].append(val_loss)
            useful_stuff["validation_metrics"].append(val_acc)

            if val_acc > best_acc or epoch == 0:
                best_acc = val_acc
                checkpoint = {
                    "epoch": epoch,
                    "val_loss": val_loss,
                    "dirs": self.dirs,
                    "clf": self.clf,
                    "state_dict": self.net.state_dict(),
                }
                torch.save(checkpoint, self.model_path)

        return train_acc, best_acc, useful_stuff

    def test_iterate(self, epoch, phase):
        """Predict every batch of the ``phase`` dataloader with the current model.

        Uses ``self.net``, ``self.dirs`` and ``self.clf`` as they currently
        are; batches are moved to ``self.device`` (not unconditionally to
        CUDA), and no fixed dataset size or class count is assumed.

        Args:
            epoch: Unused; kept for interface compatibility.
            phase: Key into ``self.dataloaders``.

        Returns:
            ``(y_pred, y_true, y_pred_prob)`` - flat arrays of predicted and
            true labels, and a 2-D array with one row of
            ``clf.predict_proba`` output per sample.
        """
        self.net.eval()
        y_pred = []
        y_true = []
        y_pred_prob = []
        with torch.no_grad():
            for inputs, targets in self.dataloaders[phase]:
                inputs = inputs.to(self.device)
                targets = targets.long().to(self.device)

                feas = self.net(inputs)
                projected = self._to_numpy(torch.matmul(feas, self.dirs.T))

                y_pred.append(np.asarray(self.clf.predict(projected)).ravel())
                y_true.append(targets.cpu().numpy())
                y_pred_prob.append(np.asarray(self.clf.predict_proba(projected)))

        # Concatenating the per-batch (batch, n_columns) probability blocks
        # already gives the (n_samples, n_columns) layout.
        return (
            np.concatenate(y_pred).flatten(),
            np.concatenate(y_true).flatten(),
            np.concatenate(y_pred_prob),
        )

    def test(self):
        """Restore the best checkpoint and evaluate it on the ``"test"`` loader.

        Prints the checkpoint's epoch / validation loss, the overall accuracy
        and the accuracy of each class ``0 .. n_classes - 1`` (``nan`` for a
        class with no test samples).

        Returns:
            ``(confusion_matrix, y_true, y_pred, accuracy, y_pred_prob)``.
        """
        # NOTE(review): the checkpoint contains a pickled scikit-learn
        # estimator, so loading it executes pickle code - only load files you
        # trust. PyTorch releases whose torch.load defaults to
        # weights_only=True will need weights_only=False here; confirm against
        # the installed version.
        checkpoint = torch.load(self.model_path, map_location=self.device)
        epoch = checkpoint["epoch"]
        val_loss = checkpoint["val_loss"]
        self.dirs = checkpoint["dirs"]
        self.clf = checkpoint["clf"]

        self.net.load_state_dict(checkpoint["state_dict"])
        print("load model at epoch {}, with val loss: {:.3f}".format(epoch, val_loss))
        y_pred, y_true, y_pred_prob = self.test_iterate(epoch, "test")
        overall_acc = accuracy_score(y_true, y_pred)
        print("total", overall_acc)
        for i in range(self.n_classes):
            idx = y_true == i
            if idx.any():
                print("class", i, accuracy_score(y_true[idx], y_pred[idx]))
            else:
                # No sample of this class in the test data: nothing to score.
                print("class", i, float("nan"))

        return (
            confusion_matrix(y_true, y_pred),
            y_true,
            y_pred,
            overall_acc,
            y_pred_prob,
        )

In [7]:
from torch.utils.data import sampler

class StratifiedSampler(sampler.Sampler):
    """Sampler that yields a class-stratified permutation of the dataset indices.

    Each iteration draws one ``StratifiedShuffleSplit`` with ``test_size=0.5``
    and concatenates its two halves. Each half therefore mirrors the overall
    class proportions; individual mini-batches cut from the permutation are
    only approximately balanced.
    """

    def __init__(self, class_vector: np.ndarray, batch_size: int):
        """
        Args:
            class_vector: Class label of every sample. A NumPy array is
                expected; anything ``np.asarray`` can convert (e.g. a CPU
                torch tensor) is accepted as well.
            batch_size (int): Batch size. Only used to derive ``n_splits``.
        """
        labels = np.asarray(class_vector)
        self.n_splits = int(labels.size / batch_size)
        self.class_vector = labels

    def gen_sample_array(self):
        """Return one stratified permutation of ``range(len(self))`` as an array."""
        from sklearn.model_selection import StratifiedShuffleSplit

        # Only the first split is ever consumed, so a single split is
        # requested. This also keeps datasets smaller than one batch
        # (n_splits == 0) from producing no split at all.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
        # The split depends on the labels only; X just has to supply the
        # number of samples, so a zero placeholder is enough.
        placeholder_X = np.zeros((self.class_vector.size, 1))

        first_half, second_half = next(splitter.split(placeholder_X, self.class_vector))
        return np.hstack([first_half, second_half])

    def __iter__(self):
        return iter(self.gen_sample_array())

    def __len__(self):
        return len(self.class_vector)

In [8]:
from sklearn.model_selection import StratifiedKFold
from datasets_spectrum import spectral_dataloader
from config import ORDER, STRAINS

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# 5-fold cross-validation over the training data. Every fold trains a fresh
# model, restores its best-validation checkpoint, evaluates it on the same
# held-out test set and appends its scores to the module-level accumulators.
for fold_index, (train_idx, valid_idx) in enumerate(
    kfold.split(X_train_raw, y_train_raw), start=1
):

    print(fold_index)
    x_train, y_train = X_train_raw[train_idx], y_train_raw[train_idx]
    x_valid, y_valid = X_train_raw[valid_idx], y_train_raw[valid_idx]

    print("train size: ", len(x_train))
    print("validation size: ", len(x_valid))
    print("test size: ", len(y_test))

    # Train and validation loaders each get their own stratified sampler;
    # the test loader keeps the original sample order.
    stratified_train_batch_sampler = StratifiedSampler(y_train, batch_size)
    dl_tr = spectral_dataloader(
        x_train, y_train, idxs=None, batch_size=batch_size, shuffle=False,
        sampler=stratified_train_batch_sampler,
    )
    stratified_valid_batch_sampler = StratifiedSampler(y_valid, batch_size)
    dl_val = spectral_dataloader(
        x_valid, y_valid, idxs=None, batch_size=batch_size, shuffle=False,
        sampler=stratified_valid_batch_sampler,
    )
    dl_test = spectral_dataloader(X_test, y_test, batch_size=batch_size, shuffle=False)

    dataloaders = {"train": dl_tr, "val": dl_val, "test": dl_test}
    model = Variant_LeNet_without_linear(in_channels=1)

    model_path = f"best_variant_lenet_model_{fold_index}.pt"
    solver = Solver(
        dataloaders, model, model_path, "cuda", n_classes
    )
    train_accuracy, val_accuracy, useful_stuff = solver.train(200)
    C, y_true, y_pred, test_accuracy, y_pred_prob = solver.test()
    train_avg_accuracy.append(train_accuracy)
    val_avg_accuracy.append(val_accuracy)
    avg_accuracy.append(test_accuracy)

    # One-vs-rest ROC / AUC per class, scored against the labels returned by
    # solver.test() so labels and probabilities come from the same pass.
    # NOTE(review): assumes column i of y_pred_prob belongs to class i, i.e.
    # the classifier saw every class 0..k-1 - confirm.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(np.unique(y_true).shape[0]):
        fpr[i], tpr[i], _ = roc_curve(y_true == i, y_pred_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    valid_aucs = [v for v in roc_auc.values() if not math.isnan(v)]
    # Always bind auc_score for this fold: without a valid AUC record NaN
    # instead of failing on an unbound name or silently re-using the value
    # of the previous fold.
    auc_score = sum(valid_aucs) / len(valid_aucs) if valid_aucs else float("nan")
    avg_roc.append(auc_score)

    # Row-normalised (percent) confusion matrix in display order, plus the
    # matching strain labels. Nothing is drawn in this cell.
    cm = confusion_matrix(y_true, y_pred, labels=ORDER)
    sns.set_context("talk", rc={"font": "Helvetica", "font.size": 12})
    label = [STRAINS[i] for i in ORDER]
    cm = 100 * cm / cm.sum(axis=1)[:, np.newaxis]

    # Aggregate (micro-averaged) metrics for this fold.
    accuracy = accuracy_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred, average="micro", zero_division=0)
    # Specificity is taken from the multi-class scorer; a 2x2 formula on
    # cells of the 30-class percentage matrix is not a specificity.
    specificity = specificity_score(y_true, y_pred, average="micro")
    f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)

    # Per-class metrics (one array per row) alongside the overall accuracy.
    df = pd.DataFrame(
        {
            "Accuracy": [np.round(accuracy, 4)],
            "Recall": [
                recall_score(y_true, y_pred, average=None, zero_division=0).round(4)
            ],
            "Specificity": [specificity_score(y_true, y_pred, average=None).round(4)],
            "Precision": [
                precision_score(y_true, y_pred, average=None, zero_division=0).round(4)
            ],
            "F1 Score": [
                f1_score(y_true, y_pred, average=None, zero_division=0).round(4)
            ],
        }
    )
    print(df.transpose())

    plot_metrics(training_results=useful_stuff, fold_index=fold_index, fold_name="variant_lenet")
    plot_loss_metrics(training_results=useful_stuff, fold_index=fold_index, fold_name="variant_lenet")

1
train size:  48000
validation size:  12000
test size:  3000


100%|██████████| 200/200 [1:17:58<00:00, 23.39s/it]

load model at epoch 129, with val loss: 4.245





total 0.4663333333333333
class 0 0.86
class 1 0.0
class 2 0.01
class 3 0.99
class 4 0.42
class 5 1.0
class 6 0.64
class 7 0.59
class 8 0.03
class 9 0.0
class 10 0.45
class 11 0.06
class 12 0.14
class 13 0.01
class 14 1.0
class 15 1.0
class 16 0.0
class 17 0.99
class 18 0.77
class 19 0.94
class 20 0.86
class 21 0.0
class 22 0.01
class 23 0.87
class 24 0.0
class 25 0.7
class 26 0.41
class 27 0.86
class 28 0.07
class 29 0.31
                                                             0
Accuracy                                                0.4663
Recall       [0.86, 0.0, 0.01, 0.99, 0.42, 1.0, 0.64, 0.59,...
Specificity  [0.9659, 1.0, 0.9972, 0.9817, 0.9931, 0.9941, ...
Precision    [0.4649, 0.0, 0.1111, 0.6513, 0.6774, 0.8547, ...
F1 Score     [0.6035, 0.0, 0.0183, 0.7857, 0.5185, 0.9217, ...
2
train size:  48000
validation size:  12000
test size:  3000


100%|██████████| 200/200 [1:19:00<00:00, 23.70s/it]

load model at epoch 129, with val loss: 4.247





total 0.48966666666666664
class 0 0.81
class 1 0.0
class 2 0.05
class 3 1.0
class 4 0.27
class 5 1.0
class 6 0.63
class 7 0.67
class 8 0.02
class 9 0.03
class 10 0.51
class 11 0.09
class 12 0.2
class 13 0.0
class 14 0.96
class 15 1.0
class 16 0.0
class 17 0.99
class 18 0.85
class 19 1.0
class 20 0.97
class 21 0.0
class 22 0.04
class 23 0.87
class 24 0.0
class 25 0.63
class 26 0.47
class 27 0.81
class 28 0.1
class 29 0.72
                                                             0
Accuracy                                                0.4897
Recall       [0.81, 0.0, 0.05, 1.0, 0.27, 1.0, 0.63, 0.67, ...
Specificity  [0.9659, 1.0, 0.9997, 0.9752, 0.9921, 0.9969, ...
Precision    [0.45, 0.0, 0.8333, 0.5814, 0.54, 0.9174, 0.34...
F1 Score     [0.5786, 0.0, 0.0943, 0.7353, 0.36, 0.9569, 0....
3
train size:  48000
validation size:  12000
test size:  3000


100%|██████████| 200/200 [1:18:29<00:00, 23.55s/it]

load model at epoch 155, with val loss: 4.218





total 0.49066666666666664
class 0 0.81
class 1 0.0
class 2 0.17
class 3 1.0
class 4 0.31
class 5 1.0
class 6 0.7
class 7 0.71
class 8 0.05
class 9 0.0
class 10 0.55
class 11 0.0
class 12 0.12
class 13 0.01
class 14 1.0
class 15 1.0
class 16 0.01
class 17 0.96
class 18 0.82
class 19 0.94
class 20 0.96
class 21 0.0
class 22 0.03
class 23 0.94
class 24 0.0
class 25 0.47
class 26 0.66
class 27 0.89
class 28 0.06
class 29 0.55
                                                             0
Accuracy                                                0.4907
Recall       [0.81, 0.0, 0.17, 1.0, 0.31, 1.0, 0.7, 0.71, 0...
Specificity  [0.9679, 1.0, 0.9866, 0.99, 0.9976, 0.991, 0.9...
Precision    [0.4655, 0.0, 0.3036, 0.7752, 0.8158, 0.7937, ...
F1 Score     [0.5912, 0.0, 0.2179, 0.8734, 0.4493, 0.885, 0...
4
train size:  48000
validation size:  12000
test size:  3000


100%|██████████| 200/200 [1:18:43<00:00, 23.62s/it]

load model at epoch 169, with val loss: 4.213





total 0.48833333333333334
class 0 0.84
class 1 0.0
class 2 0.01
class 3 0.99
class 4 0.74
class 5 1.0
class 6 0.67
class 7 0.57
class 8 0.12
class 9 0.02
class 10 0.34
class 11 0.01
class 12 0.18
class 13 0.01
class 14 0.98
class 15 0.99
class 16 0.01
class 17 0.97
class 18 0.77
class 19 0.96
class 20 0.77
class 21 0.0
class 22 0.01
class 23 0.92
class 24 0.0
class 25 0.67
class 26 0.61
class 27 0.9
class 28 0.12
class 29 0.47
                                                             0
Accuracy                                                0.4883
Recall       [0.84, 0.0, 0.01, 0.99, 0.74, 1.0, 0.67, 0.57,...
Specificity  [0.9721, 1.0, 1.0, 0.9941, 0.9807, 0.9869, 0.9...
Precision    [0.5091, 0.0, 1.0, 0.8534, 0.5692, 0.7246, 0.4...
F1 Score     [0.634, 0.0, 0.0198, 0.9167, 0.6435, 0.8403, 0...
5
train size:  48000
validation size:  12000
test size:  3000


100%|██████████| 200/200 [1:18:48<00:00, 23.64s/it]

load model at epoch 171, with val loss: 4.233





total 0.4613333333333333
class 0 0.81
class 1 0.0
class 2 0.03
class 3 1.0
class 4 0.32
class 5 1.0
class 6 0.76
class 7 0.41
class 8 0.02
class 9 0.01
class 10 0.57
class 11 0.0
class 12 0.16
class 13 0.02
class 14 0.98
class 15 1.0
class 16 0.03
class 17 0.98
class 18 0.77
class 19 0.92
class 20 0.96
class 21 0.0
class 22 0.01
class 23 0.95
class 24 0.0
class 25 0.35
class 26 0.64
class 27 0.89
class 28 0.1
class 29 0.15
                                                             0
Accuracy                                                0.4613
Recall       [0.81, 0.0, 0.03, 1.0, 0.32, 1.0, 0.76, 0.41, ...
Specificity  [0.9679, 1.0, 0.9983, 0.9769, 0.9941, 0.9897, ...
Precision    [0.4655, 0.0, 0.375, 0.5988, 0.6531, 0.7692, 0...
F1 Score     [0.5912, 0.0, 0.0556, 0.7491, 0.4295, 0.8696, ...


In [9]:
# Mean and standard deviation over the folds for every tracked score,
# printed as "<name> mean:" / "<name> std:" pairs rounded to 4 decimals.
summary_rows = (
    ("train mean:", "train std:", train_avg_accuracy),
    ("val mean:", "val std:", val_avg_accuracy),
    ("test mean:", "test std:", avg_accuracy),
    ("auc mean:", "auc std:", avg_roc),
)
for mean_label, std_label, fold_scores in summary_rows:
    print(mean_label, round(np.mean(fold_scores), 4))
    print(std_label, round(np.std(fold_scores), 4))

train mean: 1.0
train std: 0.0
val mean: 0.9362
val std: 0.0011
test mean: 0.4793
test std: 0.0127
auc mean: 0.8523
auc std: 0.0088
