In [1]:
!git clone https://github.com/ElanaPearl/interPLM.git
%cd interPLM
!pip install -e .
!pip install biopython

fatal: destination path 'interPLM' already exists and is not an empty directory.
/content/interPLM
Obtaining file:///content/interPLM
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: interplm
  Attempting uninstall: interplm
    Found existing installation: interplm 1.0.0
    Uninstalling interplm-1.0.0:
      Successfully uninstalled interplm-1.0.0
  Running setup.py develop for interplm
Successfully installed interplm-1.0.0


## Pobieranie danych

## Zbiór Danych: CATH Protein Domains


W bazie CATH istnieje ścisła hierarchia.

| Poziom | Oznaczenie | Nazwa | Liczba Klas* |
| --- | --- | --- | --- |
| **C** | `1` | **Class** | 5 |
| **A** | `1.10` | **Architecture** | 26 |
| **T** | `1.10.8` | **Topology** | 520  |
| **H** | `1.10.8.10` | **Homology** | 671 |

**Liczby klas są przybliżone i zależą od wersji bazy CATH.*

Będziemy trenować model na poziomie C.A.T, ponieważ niższe poziomy są zbyt ogólne, a poziom C.A.T.H jest zbyt rozdrobniony.


In [2]:
%%bash
mkdir -p cath_data
cd cath_data

echo "Pobieranie etykiet..."
wget -q -nc ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/latest-release/cath-classification-data/cath-domain-list.txt

echo "Pobieranie sekwencji..."
wget -q -nc ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/latest-release/sequence-data/cath-domain-seqs.fa

ls -lh

Pobieranie etykiet...
Pobieranie sekwencji...
total 154M
-rw-r--r-- 1 root root  43M Jan 21 14:29 cath-domain-list.txt
-rw-r--r-- 1 root root 112M Jan 21 14:34 cath-domain-seqs.fa


In [3]:
!head -20 cath_data/cath-domain-list.txt

#---------------------------------------------------------------------
# FILE NAME:    CathDomainList.v4.4.0
# FILE DATE:    16.12.2024
#
# CATH VERSION: v4.4.0
# VERSION DATE: 16.12.2024
#
# FILE FORMAT:  Cath List File (CLF) Format 2.0
#
# FILE DESCRIPTION:
# Contains all classified protein domains in CATH
# for class 1 (mainly alpha), class 2 (mainly beta),
# class 3 (alpha and beta) and class 4 (few secondary structures).
#
# See 'README.file_formats' for file format information
#---------------------------------------------------------------------
1oaiA00     1    10     8    10     1     1     1     1     1    59 1.000
1go5A00     1    10     8    10     1     1     1     1     2    69 999.000
3frhA01     1    10     8    10     2     1     1     1     1    58 1.200
3friA01     1    10     8    10     2     1     1     1     2    54 1.800


In [4]:
import pandas as pd
from Bio import SeqIO

LABEL_FILE = 'cath_data/cath-domain-list.txt'
SEQ_FILE = 'cath_data/cath-domain-seqs.fa'

column_names = [
    'domain_id', 'class_C', 'arch_A', 'top_T', 'hom_H',
    's35', 's60', 's95', 's100', 's100_count', 'domain_len', 'resolution'
]

df_labels = pd.read_csv(
    LABEL_FILE,
    sep=r'\s+',
    comment='#',
    header=None,
    names=column_names,
    usecols=['domain_id', 'class_C', 'arch_A', 'top_T', 'hom_H']
)
print(df_labels[['class_C', 'arch_A', 'top_T', 'hom_H']].nunique())
df_labels['target_label'] = (
    df_labels['class_C'].astype(str) + "." +
    df_labels['arch_A'].astype(str) + "." +
    df_labels['top_T'].astype(str)
)
df_labels.drop(columns=['class_C', 'arch_A', 'top_T', 'hom_H'], inplace=True, errors='ignore')


print(f"Wczytywanie sekwencji z {SEQ_FILE}")
seq_data = []

with open(SEQ_FILE, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        original_id = record.id
        # original_id = cath|4_4_0|3avrA01/886-901_1246-1312_1380-1395
        try:
            part_with_id = original_id.split('|')[2]
            # part_with_id = 3avrA01/886-901_1246-1312_1380-1395

            clean_id = part_with_id.split('/')[0]
            # clean_id = 3avrA01
            seq_data.append({
                'domain_id': clean_id,
                'sequence': str(record.seq)
            })
        except IndexError:
            print(f"Pominięto nietypowy nagłówek: {original_id}")
            continue

df_seqs = pd.DataFrame(seq_data)


print("Łączenie sekwencji z etykietami")
full_dataset = pd.merge(df_labels, df_seqs, on='domain_id', how='inner')

print("-" * 50)
print(f"GOTOWY ZBIÓR DANYCH: {len(full_dataset)} próbek.")
print("-" * 50)
pd.set_option('display.max_colwidth', 50)
print(full_dataset.head())

class_C      5
arch_A      26
top_T      520
hom_H      671
dtype: int64
Wczytywanie sekwencji z cath_data/cath-domain-seqs.fa
Łączenie sekwencji z etykietami
--------------------------------------------------
GOTOWY ZBIÓR DANYCH: 601328 próbek.
--------------------------------------------------
  domain_id target_label                                           sequence
0   1oaiA00       1.10.8  PTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHL...
1   1go5A00       1.10.8  PAPTPSSSPVPTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDY...
2   3frhA01       1.10.8  YPMNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEA...
3   3friA01       1.10.8  YPMNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEA...
4   3b89A01       1.10.8  SLNINDALTSILASKKYRALCPDTVRRILTEEWGRHKSPKQTVEAA...


In [5]:
import torch
from transformers import AutoModel, AutoTokenizer
from interplm.sae.inference import load_sae_from_hf

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Używam urządzenia: {DEVICE}")

MODEL_NAME = "esm2-8m"
HF_MODEL_NAME = "facebook/esm2_t6_8M_UR50D"
LAYER_ID = 6

print("Ładowanie modelu ESM-2")
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
base_model = AutoModel.from_pretrained(HF_MODEL_NAME).to(DEVICE)
base_model.eval()

print(f"Ładowanie SAE dla warstwy {LAYER_ID}...")
sae = load_sae_from_hf(plm_model=MODEL_NAME, plm_layer=LAYER_ID)
sae = sae.to(DEVICE)
sae.eval()


Używam urządzenia: cuda
Ładowanie modelu ESM-2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ładowanie SAE dla warstwy 6...


layer_6/ae_normalized.pt:   0%|          | 0.00/26.3M [00:00<?, ?B/s]

Loading configs from /root/.cache/huggingface/hub/models--Elana--InterPLM-esm2-8m/snapshots/81d2429cd9dae7175f1dcd8b4c649a20cdc06c8c/layer_6/config.yaml
Loaded data type: <class 'interplm.train.configs.TrainingRunConfig'>
Data keys: Not a dict


ReLUSAE(
  (encoder): Linear(in_features=320, out_features=10240, bias=True)
  (decoder): Linear(in_features=10240, out_features=320, bias=False)
)

In [6]:
def get_feature_dim(example_seq):
    inputs = tokenizer([example_seq], return_tensors="pt", padding=True,
                       truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = base_model(**inputs, output_hidden_states=True)
        dense_acts = outputs.hidden_states[LAYER_ID]
        sae_acts = sae.encode(dense_acts)
        return sae_acts.shape[-1]



In [7]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import os
import pickle

# --- OGRANICZENIE DANYCH DO TESTÓW ---
SAMPLE_SIZE = 100_000
SAMPLE_SIZE = None

if SAMPLE_SIZE:
    df_subset = full_dataset.sample(n=min(SAMPLE_SIZE, len(full_dataset)), random_state=42).copy()
else:
    df_subset = full_dataset.copy()

FEATURE_DIM = get_feature_dim(df_subset['sequence'].iloc[0])
N = len(df_subset)

print(f"Przetwarzanie {len(df_subset)} próbek...")

# Ścieżki do zapisu tymczasowego
X_tmp_file = "X_tmp.dat"

# --- Label encoding dla etykiet ---
y_labels = df_subset['target_label'].values

X_mmap = np.memmap(
    X_tmp_file,
    dtype="float32",
    mode="w+",
    shape=(N, FEATURE_DIM)
)

def extract_sae_features_stream(sequences, batch_size=32):
    idx = 0
    for i in tqdm(range(0, len(sequences), batch_size)):
        batch_seqs = sequences[i:i+batch_size]

        inputs = tokenizer(
            batch_seqs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(DEVICE)

        with torch.no_grad():
            outputs = base_model(**inputs, output_hidden_states=True)
            dense_acts = outputs.hidden_states[LAYER_ID]
            sae_acts = sae.encode(dense_acts)

            mask = inputs['attention_mask'].unsqueeze(-1).float()
            sum_features = torch.sum(sae_acts * mask, dim=1)
            count_tokens = torch.clamp(mask.sum(dim=1), min=1e-9)
            mean_features = sum_features / count_tokens

            batch_features = mean_features.cpu().numpy().astype("float32")

            b = batch_features.shape[0]
            X_mmap[idx:idx+b] = batch_features
            idx += b

    X_mmap.flush()


extract_sae_features_stream(df_subset['sequence'].tolist(), batch_size=32)

# print(f"\nWymiary macierzy cech X zapisanej w {X_tmp_file}: {X_loaded.shape}")
# print(f"Liczba etykiet y: {len(y_encoded)}")


Przetwarzanie 601328 próbek...


100%|██████████| 18792/18792 [28:53<00:00, 10.84it/s]


In [8]:
import numpy as np
from collections import Counter

unique_classes, counts = np.unique(y_labels, return_counts=True)
valid_classes = unique_classes[counts > 1]

valid_mask = np.isin(y_labels, valid_classes)
N_new = valid_mask.sum()

print(f"Oryginalnie: {len(y_labels)} próbek")
print(f"Po filtracji: {N_new} próbek")


Oryginalnie: 601328 próbek
Po filtracji: 601232 próbek


In [9]:
N = len(y_labels)
FEATURE_DIM = X_mmap.shape[1]  # albo znana wcześniej wartość

X_in = np.memmap(
    "X_tmp.dat",
    dtype="float32",
    mode="r",
    shape=(N, FEATURE_DIM)
)


In [10]:
X_out = np.memmap(
    "X_filtered.dat",
    dtype="float32",
    mode="w+",
    shape=(N_new, FEATURE_DIM)
)

y_out = y_labels[valid_mask]


In [None]:
batch_size = 2048
write_idx = 0

for i in range(0, N, batch_size):
    j = min(i + batch_size, N)

    X_batch = X_in[i:j]
    mask_batch = valid_mask[i:j]

    if not mask_batch.any():
        continue

    X_kept = X_batch[mask_batch]

    b = len(X_kept)
    X_out[write_idx:write_idx+b] = X_kept
    write_idx += b

X_out.flush()

print(f"Zapisano {write_idx} próbek po filtracji.")


In [None]:
# Filtrowanie rzadkich klas
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import classification_report, accuracy_score
# import pandas as pd
# import numpy as np

# unique_classes, counts = np.unique(y, return_counts=True)
# valid_classes = unique_classes[counts > 1]
# mask_valid = np.isin(y, valid_classes)
# X_filtered = X[mask_valid]
# y_filtered = y[mask_valid]

# print(f"Oryginalna liczba próbek: {len(y)}")
# print(f"Po usunięciu pojedynczych klas: {len(y_filtered)}")
# print(f"Liczba unikalnych klas do przewidzenia: {len(valid_classes)}")

# le = LabelEncoder()
# y_encoded = le.fit_transform(y_filtered)

# # Zapisz macierz cech X i wektor etykiet y
# np.save('X_features.npy', X_filtered)
# np.save('y_labels.npy', y_encoded)


## Trening na GPU.


In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import numpy as np
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder

# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Trenowanie na: {DEVICE}")

# X_train_t = torch.tensor(X_train, dtype=torch.float32).to(DEVICE)
# y_train_t = torch.tensor(y_train, dtype=torch.long).to(DEVICE)
# X_test_t = torch.tensor(X_test, dtype=torch.float32).to(DEVICE)

# input_dim = X_train.shape[1]
# output_dim = len(valid_classes)

# class LogisticRegressionPyTorch(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super(LogisticRegressionPyTorch, self).__init__()
#         self.linear = nn.Linear(input_dim, output_dim)

#     def forward(self, x):
#         return self.linear(x)

# model = LogisticRegressionPyTorch(input_dim, output_dim).to(DEVICE)


# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.01)
# EPOCHS = 1000

# for epoch in range(EPOCHS):
#     model.train()

#     outputs = model(X_train_t)
#     loss = criterion(outputs, y_train_t)

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     if (epoch+1) % 100 == 0:
#         print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}')


# print("Ewaluacja")
# model.eval()
# with torch.no_grad():
#     outputs = model(X_test_t)
#     _, predicted = torch.max(outputs.data, 1)
#     y_pred = predicted.cpu().numpy()


# acc = accuracy_score(y_test, y_pred)
# print("-" * 30)
# print(f"Accuracy: {acc:.2%}")
# print("-" * 30)

# unique_test_labels = np.unique(y_test)
# target_names = le.inverse_transform(unique_test_labels)

# print(classification_report(
#     y_test,
#     y_pred,
#     labels=unique_test_labels,
#     target_names=target_names,
#     zero_division=0
# ))

# Task
Save the filtered labels `y_out` to a NumPy file, then load the dimensions of the feature matrix from `X_filtered.dat` and the labels from the saved `y_out` file. Split the data indices into stratified training and testing sets. Define a custom PyTorch `Dataset` called `NumpyMemmapDataset` to efficiently load features from `X_filtered.dat` and labels from the `y_out` file using memory-mapping for given indices. Instantiate `DataLoader` objects for both training and testing datasets. Define a PyTorch logistic regression model using `nn.Module`, with input dimensions from `FEATURE_DIM` and output dimensions equal to the number of unique classes, moving the model to the GPU. Train the model using `nn.CrossEntropyLoss` and `optim.Adam`, printing the loss per epoch. Finally, evaluate the model by calculating and printing the overall accuracy and a detailed classification report using `sklearn.metrics.accuracy_score` and `sklearn.metrics.classification_report`.

## Prepare Data for Training and Testing

### Subtask:
Save the filtered labels (y_out) to a NumPy file. Then, load the dimensions of the feature matrix (X_filtered.dat) and the labels from the saved file. Split the data indices into training and testing sets in a stratified manner to maintain class distribution.


**Reasoning**:
The subtask requires saving the filtered labels, determining dimensions, and splitting data indices. I will first save the `y_out` array to a file, then retrieve the number of unique classes and the dimensions of the feature matrix, and finally split the data indices into training and testing sets using stratified sampling to maintain class distribution.



In [None]:
import numpy as np
from sklearn.model_selection import train_test_split


# 2. (Optional) Load the y_filtered.npy file back, or directly use y_out
le = LabelEncoder()
y_encoded = le.fit_transform(y_out)
y_loaded = y_encoded

# 3. Determine the number of unique classes
output_dim = len(np.unique(y_loaded))
print(f"Number of unique classes (output_dim): {output_dim}")

# 4. Get the total number of samples (N_new) and the feature dimension (FEATURE_DIM)
N_new = y_loaded.shape[0] # Total number of samples after filtering
FEATURE_DIM = X_out.shape[1] # Feature dimension from the memory-mapped array

print(f"Total number of filtered samples (N_new): {N_new}")
print(f"Feature dimension (FEATURE_DIM): {FEATURE_DIM}")

# 5. Generate a sequence of indices
indices = np.arange(N_new)

# 6. Split these indices into training and testing sets in a stratified manner
train_indices, test_indices, _, _ = train_test_split(
    indices, # Indices to split
    y_loaded, # Labels for stratification
    test_size=0.2,
    random_state=42,
    stratify=y_loaded # Ensure balanced class distribution
)

print(f"Train indices shape: {train_indices.shape}")
print(f"Test indices shape: {test_indices.shape}")


## Create Custom PyTorch Dataset

### Subtask:
Define a custom PyTorch Dataset class, `NumpyMemmapDataset`, which will efficiently load features from `X_filtered.dat` using memory-mapping and labels from the saved y_out file. This dataset will accept indices for specific subsets of data (training or testing).


**Reasoning**:
The subtask requires defining a custom PyTorch Dataset class. I will define `NumpyMemmapDataset` inheriting from `torch.utils.data.Dataset`, implementing `__init__`, `__len__`, and `__getitem__` methods as specified to efficiently load data from memory-mapped files and arrays.



In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np

class NumpyMemmapDataset(Dataset):
    def __init__(self, feature_mmap_path, labels, indices, feature_dim):
        self.labels = labels
        self.indices = indices
        self.feature_dim = feature_dim

        # Initialize the memory-mapped array for features
        # The shape needs to correspond to the full original memmap size (N_new, FEATURE_DIM)
        # which is len(labels) by feature_dim
        self.X_mmap = np.memmap(
            feature_mmap_path,
            dtype='float32',
            mode='r',
            shape=(len(labels), feature_dim)
        )

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        # Retrieve the actual index from the stored indices array
        actual_idx = self.indices[idx]

        # Load the feature vector from the memory-mapped array
        feature = torch.tensor(self.X_mmap[actual_idx], dtype=torch.float32)

        # Load the label from the stored labels array
        label = torch.tensor(self.labels[actual_idx], dtype=torch.long)

        return feature, label

print("NumpyMemmapDataset class defined.")

## Instantiate DataLoaders

### Subtask:
Create instances of the `NumpyMemmapDataset` for both training and testing subsets. Then, create `torch.utils.data.DataLoader` objects for these datasets, enabling batch-wise loading of data into the model, thus optimizing RAM usage.


**Reasoning**:
The subtask requires instantiating the custom `NumpyMemmapDataset` for both training and testing data, and then creating `DataLoader` objects for efficient batch processing during model training and evaluation.



In [None]:
from torch.utils.data import DataLoader

# 1. Define BATCH_SIZE
BATCH_SIZE = 128

# 2. Instantiate NumpyMemmapDataset for training data
train_dataset = NumpyMemmapDataset(
    feature_mmap_path="X_filtered.dat",
    labels=y_loaded,
    indices=train_indices,
    feature_dim=FEATURE_DIM
)

# 3. Instantiate NumpyMemmapDataset for testing data
test_dataset = NumpyMemmapDataset(
    feature_mmap_path="X_filtered.dat",
    labels=y_loaded,
    indices=test_indices,
    feature_dim=FEATURE_DIM
)

# 4. Create DataLoader for the training dataset
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2 # Adjust based on your system's capabilities
)

# 5. Create DataLoader for the testing dataset
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False, # No need to shuffle test data
    num_workers=2 # Adjust based on your system's capabilities
)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
print(f"Number of batches in training loader: {len(train_loader)}")
print(f"Number of batches in testing loader: {len(test_loader)}")

**Reasoning**:
Now that the data loaders are set up, the next step is to define the PyTorch logistic regression model as specified in the main task. This involves creating a class that inherits from `nn.Module` and defines the linear layer.



In [None]:
import torch.nn as nn

# 1. Define the Logistic Regression Model
class LogisticRegressionPyTorch(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegressionPyTorch, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

# 2. Instantiate the model and move it to the GPU
model = LogisticRegressionPyTorch(FEATURE_DIM, output_dim).to(DEVICE)

print(f"Logistic Regression Model created with input_dim={FEATURE_DIM} and output_dim={output_dim}")
print(f"Model is on device: {next(model.parameters()).device}")

**Reasoning**:
The subtask requires training the model and then evaluating it. First, I will define the criterion (loss function) and optimizer. Then, I will implement the training loop, iterating through epochs and batches from the `train_loader`, calculating the loss, performing backpropagation, and updating model weights. Finally, I will evaluate the model on the `test_loader` and print the accuracy and a classification report.



In [None]:
import torch.optim as optim
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn

# 1. Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 2. Define Training Parameters
EPOCHS = 5

# 3. Training Loop
print("Starting model training...")
for epoch in range(EPOCHS):
    model.train() # Set the model to training mode
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {running_loss/len(train_loader):.4f}')

print("Training complete.")

# 4. Evaluation
print("Starting model evaluation...")
model.eval() # Set the model to evaluation mode
y_true = []
y_pred = []

with torch.no_grad(): # Disable gradient calculation for evaluation
    for inputs, labels in test_loader:
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

acc = accuracy_score(y_true, y_pred)
print("-" * 30)
print(f"Accuracy on test set: {acc:.2%}")
print("-" * 30)

# Get unique labels from y_true to pass to classification_report
unique_test_labels = np.unique(y_true)

# print(classification_report(
#     y_true,
#     y_pred,
#     labels=unique_test_labels, # Ensure only present labels are reported
#     zero_division=0
# ))


**Reasoning**:
The CUDA error 'device-side assert triggered' during `loss.backward()` typically indicates an issue with the labels provided to `nn.CrossEntropyLoss`, specifically when label values are outside the expected range (0 to num_classes-1). After filtering classes, the integer labels from the initial `LabelEncoder` might no longer be contiguous. Re-encoding the `y_out` array with a new `LabelEncoder` will ensure that the labels are remapped to a contiguous range starting from 0, resolving the out-of-bounds access issue.

