## Imports

In [4]:
import pandas as pd
import numpy as np
import time

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# from torch.amp import GradScaler

# dtype = torch.float
# device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.set_default_device(device)
# torch.get_default_device()

from datetime import timedelta
import math

import gc

# ToDo!

- [ ] Salvar lista de classes possíveis no original, de acordo com a config de amostragem
- [ ] Remover classes não presentes no original

## Data Load

### Sequence encoder

In [5]:
base_map = {
    "A":[1.0, 0.0, 0.0, 0.0],
    "T":[0.0, 1.0, 0.0, 0.0],
    "G":[0.0, 0.0, 1.0, 0.0],
    "C":[0.0, 0.0, 0.0, 1.0],

    'W':[0.5, 0.5, 0.0, 0.0],
    'S':[0.0, 0.0, 0.5, 0.5],
    'M':[0.5, 0.0, 0.0, 0.5],
    'K':[0.0, 0.5, 0.5, 0.0],
    'R':[0.5, 0.0, 0.5, 0.0],
    'Y':[0.0, 0.5, 0.0, 0.5],
    
    'B':[0.0, 0.3, 0.3, 0.3],
    'D':[0.3, 0.3, 0.3, 0.0],
    'H':[0.3, 0.3, 0.0, 0.3],
    'V':[0.3, 0.0, 0.3, 0.3],

    'N':[0.25, 0.25, 0.25, 0.25],
}

def encode_sequence(sequence):
    encoded_seq = []

    for base in sequence:
        encoded_seq.append(base_map[base])
    
    return torch.tensor(encoded_seq)

### PyTorch dataset object to load Sequences and Classification Data

In [None]:
class SequenceDataset(Dataset):    
    def __init__(self, classes, level, dataset):
        self.classes = classes
        self.level = level

        dataset = dataset.loc[dataset[level].isin(classes)]

        self.labels = dataset[level]
        self.sequences = SequenceDataset.__sequences__(dataset)
        self.encoded_labels = SequenceDataset.__encoded_labels__(self.classes, self.labels)

    def __encoded_labels__(classes, labels):
        return torch.nn.functional.one_hot(torch.tensor([classes.index(l) for l in labels]), len(classes)).type(torch.cuda.FloatTensor)
    
    def __sequences__(ds):
        sequences = []
        for _, row in ds.iterrows():
            sequences.append(encode_sequence(row["truncated_sequence"]))        
        return torch.stack(sequences, dim=0)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return   self.sequences[idx], self.encoded_labels[idx]
    
    def __getitems__(self, ids):
        idx = torch.tensor(ids, device=torch.device('cuda:0'))
        return   list(zip(torch.index_select(self.sequences, 0, idx), torch.index_select(self.encoded_labels, 0, idx)))
    

### Generate PyTorch DataLoader objects

In [25]:
def loaders_generator(ds_train, ds_test, bs = 128):
    train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True, generator=torch.Generator(device='cuda'))
    test_loader = DataLoader(ds_test, batch_size=bs, shuffle=True, generator=torch.Generator(device='cuda'))

    return train_loader, test_loader

## Models

In [None]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=4):
        super(ResidualBlock, self).__init__()
        
        # Padding to maintain input size
        self.padding = nn.CircularPad1d((1,2))
        
        # Convolutional layers
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size)
        self.bn1 = nn.BatchNorm1d(out_channels)
        
        # Shortcut connection
        self.shortcut = nn.Sequential()
        if in_channels != out_channels:
            self.shortcut = nn.Conv1d(in_channels, out_channels, kernel_size=1)
        
        # Activation
        self.relu = nn.ReLU(inplace=True)
    
    def forward(self, x):
        # Store the input for the residual connection
        residual = x
        
        # Main path
        out = self.padding(x)
        out = self.conv1(out)
        out = self.bn1(out)
        
        # Shortcut connection
        residual = self.shortcut(residual)
        
        # Add residual connection
        out += residual
        out = self.relu(out)
        
        return out

In [27]:
class SimplestCNNClassifier_8layers_Residual(nn.Module):
    def __init__(self, nClasses):
        super(SimplestCNNClassifier_8layers_Residual, self).__init__()
        
        # Residual blocks with adaptive pooling
        self.residual_block1 = ResidualBlock(4, 16)
        self.adAvgPool1 = nn.AdaptiveAvgPool1d(450)
        
        self.residual_block2 = ResidualBlock(16, 32)
        self.adAvgPool2 = nn.AdaptiveAvgPool1d(225)
        
        self.residual_block3 = ResidualBlock(32, 64)
        self.adAvgPool3 = nn.AdaptiveAvgPool1d(112)
        
        self.residual_block4 = ResidualBlock(64, 128)
        self.adAvgPool4 = nn.AdaptiveAvgPool1d(56)
        
        self.residual_block5 = ResidualBlock(128, 256)
        self.adAvgPool5 = nn.AdaptiveAvgPool1d(28)
        
        self.residual_block6 = ResidualBlock(256, 512)
        self.adAvgPool6 = nn.AdaptiveAvgPool1d(14)
        
        # Two additional residual blocks
        self.residual_block7 = ResidualBlock(512, 1024)
        self.adAvgPool7 = nn.AdaptiveAvgPool1d(7)
        
        self.residual_block8 = ResidualBlock(1024, 2048)
        self.adAvgPool8 = nn.AdaptiveAvgPool1d(3)
        
        # Activation and fully connected layers
        self.act = nn.ReLU()
        
        # Calculate the input size for linear layers
        # Note: You might need to adjust this based on your specific input dimensions
        self.linear1 = nn.Linear(6144, 6144)
        self.linear2 = nn.Linear(6144, nClasses)
    
    def forward(self, x):
        # Move channel dimension
        x = torch.movedim(x, -1, -2)
        
        # First residual block
        x = self.residual_block1(x)
        x = self.adAvgPool1(x)
        
        # Second residual block
        x = self.residual_block2(x)
        x = self.adAvgPool2(x)
        
        # Third residual block
        x = self.residual_block3(x)
        x = self.adAvgPool3(x)
        
        # Fourth residual block
        x = self.residual_block4(x)
        x = self.adAvgPool4(x)
        
        # Fifth residual block
        x = self.residual_block5(x)
        x = self.adAvgPool5(x)
        
        # Sixth residual block
        x = self.residual_block6(x)
        x = self.adAvgPool6(x)
        
        # Seventh residual block
        x = self.residual_block7(x)
        x = self.adAvgPool7(x)
        
        # Eighth residual block
        x = self.residual_block8(x)
        x = self.adAvgPool8(x)
        
        # Flatten and fully connected layers
        x = torch.flatten(x, 1)
        x = self.linear1(x)
        x = self.act(x)
        x = self.linear2(x)
        
        return x

## Run

In [None]:
# Global references
_model_ = None
_lossfunction_ = None
_optimizer_ = None

# Function to clean cache
def clear():
    global _model_, _lossfunction_, _optimizer_
    
    torch.cuda.empty_cache()
    torch.compiler.reset()
    torch._dynamo.reset()

    if _model_:
        del _model_
        _model_ = None
    if _lossfunction_:
        del _lossfunction_
        _lossfunction_ = None
    if _optimizer_:
        del _optimizer_
        _optimizer_ = None
    
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
def run_once(level, dataset_path, model_path):
    
    # Clean Models Cache
    clear()

    times_log = {}
    times_log["Start_Time"] = time.time()

    # Load Data
    data = pd.read_csv(dataset_path)
    droped_sequences_by_null = data.shape[0]
    data = data.loc[~data[level].isna()]
    droped_sequences_by_null = droped_sequences_by_null - data.shape[0]
    classes = pd.read_csv("./Classes/"+level+".csv")
    dataset = SequenceDataset(classes=classes, level=level, dataset=data)
    droped_sequences_by_class = data.shape[0] - dataset.__len__()
    batch_size = 50000
    dataloader = DataLoader(dataset, batch_size, shuffle=True, generator=torch.Generator(device='cuda'))
    times_log["Data_Loaded_Time"] = time.time()

    # Load Model
    _model_ = torch.compile(SimplestCNNClassifier_8layers_Residual(dataset.encoded_labels.shape[1]))
    _model_.load_state_dict(torch.load(model_path, weights_only=True))
    _model_.eval()
    times_log["Model_Loaded_Time"] = time.time()
    
    # Run Predict
    pred_acc = 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = _model_(X)
            pred_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
    pred_acc /= len(dataloader.dataset)
    times_log["Prediction_Finished_Time"] = time.time()

    
    # Save Results
    results = {
        "acc":0,
        "n_classes":len(classes.shape[0]),
        "n_sequences":SequenceDataset.__len__(),
        "droped_sequences_by_null": droped_sequences_by_null,
        "droped_sequences_by_class": droped_sequences_by_class,
        "batch_size":batch_size,
        "reserved_memory": torch.cuda.memory_reserved() / 1024 / 1024,
        **times_log
    }

    return results


In [None]:
def run_batch(levels, models_list, data_path):
    results = []

    for level in levels:

        level_models = models_list.loc[models_list["level"] == level]
        for index, level_model in level_models.iterrows():

            # Run
            run = run_once(
                level = level,
                dataset_path = data_path,
                model_path = "./Models/"+str(level).capitalize()+"/"+str(level_model["experiment_id"])+"_"+str(index)+"_"+level_model["model"]+".pth",
            )
            
            # Save Results
            results.append(
                {
                    "id":str(index),
                    "level":level_model["level"],
                    "model":level_model["model"],
                } | run
            )

            print("Level: "+level+"\tModel: "+level_model["model"]+"\tAccuracy: "+str(run["acc"])+"\tTotal time: "+str(timedelta(seconds=math.floor(run["Prediction_Finished_Time"]-run["Start_Time"]))))

    pd.DataFrame(results).to_csv("./Results/"+str(int(time.time()))+"_results.csv", index=False)


Level: class	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: class	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: class	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: order	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: order	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: order	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: family	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: family	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: family	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: genus	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00:00
Level: genus	Model: SimplestCNNClassifier_8layers_Residual	Accuracy: 0	Total time: 0:00

## Params

In [None]:
levels = [
    "class", 
    "order", 
    "family", 
    "genus",
    "species",
]

In [None]:
models_list = pd.read_csv("./Models/Models_List.csv", index_col="id")

In [None]:
data_path = "../sequences.csv"

# Prediction

Class
Order
Family
Genus
Species
