## Imports

In [None]:
import gc
import itertools
import os
import time
import traceback

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Default floating-point dtype name kept for convenience.
dtype = torch.float
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Every tensor/module created without an explicit device now lands on `device`.
torch.set_default_device(device)
# Last expression of the cell: displays the device that was actually selected.
torch.get_default_device()

# Data Load

In [2]:
# Per-base encoding of IUPAC nucleotide codes as 4 weights in the channel
# order [A, T, G, C].
base_map = {
    # Unambiguous bases: plain one-hot.
    "A":[1.0, 0.0, 0.0, 0.0],
    "T":[0.0, 1.0, 0.0, 0.0],
    "G":[0.0, 0.0, 1.0, 0.0],
    "C":[0.0, 0.0, 0.0, 1.0],

    # Two-way ambiguity codes: 0.5 on each candidate base.
    'W':[0.5, 0.5, 0.0, 0.0],
    'S':[0.0, 0.0, 0.5, 0.5],
    'M':[0.5, 0.0, 0.0, 0.5],
    'K':[0.0, 0.5, 0.5, 0.0],
    'R':[0.5, 0.0, 0.5, 0.0],
    'Y':[0.0, 0.5, 0.0, 0.5],

    # Three-way ambiguity codes: 0.3 on each candidate base.
    # NOTE(review): these rows sum to 0.9, not 1.0 (1/3 each would normalise
    # them). Values are left untouched here -- confirm whether this is intended.
    'B':[0.0, 0.3, 0.3, 0.3],
    'D':[0.3, 0.3, 0.3, 0.0],
    'H':[0.3, 0.3, 0.0, 0.3],
    'V':[0.3, 0.0, 0.3, 0.3],

    # Fully unknown base: uniform over the four channels.
    'N':[0.25, 0.25, 0.25, 0.25],
}

def encode_sequence(sequence):
    """Encode a nucleotide string as a ``(len(sequence), 4)`` float tensor.

    Each character is looked up in ``base_map`` case-insensitively, so
    soft-masked (lowercase) input encodes the same as uppercase input.

    Args:
        sequence: iterable of single-character IUPAC nucleotide codes.

    Returns:
        Tensor with one row per base, on the current default device. An empty
        sequence yields a ``(0, 4)`` tensor so the second dimension is stable.

    Raises:
        KeyError: if a character has no entry in ``base_map``.
    """
    encoded_seq = [base_map[base.upper()] for base in sequence]

    if not encoded_seq:
        # torch.tensor([]) would be 1-D; keep the channel dimension explicit.
        return torch.empty((0, len(base_map["N"])))

    return torch.tensor(encoded_seq)

In [3]:
# Taxonomic ranks from the most general to the most specific.
levels = ["domain", "class", "order", "family", "genus", "species"]

class MultilevelSequenceDataset(Dataset):
    """Training split for one taxonomic level, conditioned on the previous level.

    Each item is a triple ``(sequence, previous_level_one_hot, level_one_hot)``.
    The class vocabularies are built from the union of the train and test
    frames so both splits share the same one-hot layout. The matching test
    split is built eagerly and exposed through ``get_test()``.

    Args:
        train: DataFrame with the columns ``level``, the level preceding it in
            ``levels``, and ``"truncated_sequence"``.
        test: DataFrame with the same columns as ``train``.
        level: entry of ``levels`` other than the first one.

    Raises:
        ValueError: if ``level`` is not in ``levels`` or is the first level
            (which has no previous level to condition on).
    """

    def __init__(self, train, test, level):
        level_position = levels.index(level)
        if level_position == 0:
            # Index -1 would silently wrap around to the *last* level.
            raise ValueError(
                "level " + repr(level) + " has no previous level; "
                "use one of " + str(levels[1:])
            )

        self.level = level
        self.classes = sorted(pd.concat([train[level], test[level]]).unique().tolist())

        self.labels = train[level]
        self.encoded_labels = MultilevelSequenceDataset.__encoded_labels__(self.classes, self.labels)
        self.sequences = MultilevelSequenceDataset.__sequences__(train)

        self.previous_level = levels[level_position - 1]
        self.previous_classes = sorted(
            pd.concat([train[self.previous_level], test[self.previous_level]]).unique().tolist()
        )
        self.previous_encoded_labels = MultilevelSequenceDataset.__encoded_labels__(
            self.previous_classes, train[self.previous_level]
        )

        self.test = MultilevelSequenceDatasetTest(
            labels = test[level],
            classes = self.classes,
            encoded_labels = MultilevelSequenceDataset.__encoded_labels__(self.classes, test[level]),
            sequences = MultilevelSequenceDataset.__sequences__(test),
            previous_classes = self.previous_classes,
            previous_encoded_labels = MultilevelSequenceDataset.__encoded_labels__(
                self.previous_classes, test[self.previous_level]
            ),
            previous_level = self.previous_level
            )

    @staticmethod
    def __encoded_labels__(classes, labels):
        """One-hot encode ``labels`` against the ordered list ``classes``.

        Returns a float tensor of shape ``(len(labels), len(classes))`` on the
        default device (no CUDA-only tensor type, so it also works on CPU).

        Raises:
            ValueError: if a label does not appear in ``classes``.
        """
        # Dict lookup instead of list.index keeps this linear in len(labels).
        class_to_index = {}
        for position, name in enumerate(classes):
            class_to_index.setdefault(name, position)

        try:
            indices = [class_to_index[label] for label in labels]
        except KeyError as missing:
            raise ValueError(str(missing.args[0]) + " is not in classes") from None

        # Explicit long dtype so an empty label list does not become a float tensor.
        index_tensor = torch.tensor(indices, dtype=torch.long)
        return torch.nn.functional.one_hot(index_tensor, len(classes)).to(torch.float)

    @staticmethod
    def __sequences__(ds):
        """Encode the ``"truncated_sequence"`` column and stack it into one tensor."""
        return torch.stack([encode_sequence(seq) for seq in ds["truncated_sequence"]], dim=0)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return   self.sequences[idx], self.previous_encoded_labels[idx], self.encoded_labels[idx]

    def __getitems__(self, ids):
        """Batched fetch used by ``DataLoader``: one ``index_select`` per tensor.

        Returns a list of ``(sequence, previous_one_hot, one_hot)`` triples, in
        the order given by ``ids``.
        """
        # Index tensor follows the data's device instead of assuming cuda:0.
        idx = torch.tensor(ids, dtype=torch.long, device=self.sequences.device)
        return list(zip(
            torch.index_select(self.sequences, 0, idx),
            torch.index_select(self.previous_encoded_labels, 0, idx),
            torch.index_select(self.encoded_labels, 0, idx),
        ))

    def get_test(self):
        """Return the test-split dataset built alongside this training split."""
        return self.test

class MultilevelSequenceDatasetTest(MultilevelSequenceDataset):
    """Test-split view that reuses the parent's item access on pre-built tensors.

    The parent constructor is not called: every attribute is taken directly
    from the arguments, which the training dataset has already computed.
    """

    def __init__(self, labels, classes, encoded_labels, sequences, previous_level, previous_classes, previous_encoded_labels ):
        # Current-level targets and inputs.
        self.labels, self.classes = labels, classes
        self.encoded_labels, self.sequences = encoded_labels, sequences

        # Previous-level conditioning information.
        self.previous_level, self.previous_classes = previous_level, previous_classes
        self.previous_encoded_labels = previous_encoded_labels

In [4]:
def loaders_generator(ds_train, ds_test, bs = 128):
    """Build shuffled train and test ``DataLoader``s with the same batch size.

    The shuffling generators are created on the current default device rather
    than on a hard-coded ``'cuda'``, so this also works on CPU-only machines.

    Args:
        ds_train: dataset for the training loader.
        ds_test: dataset for the test loader.
        bs: batch size used for both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    # A freshly created tensor reports whichever device is the current default.
    sampler_device = torch.empty(0).device

    train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True, generator=torch.Generator(device=sampler_device))
    # NOTE(review): shuffling the test split does not change the averaged
    # metrics; kept as-is to preserve the existing behaviour.
    test_loader = DataLoader(ds_test, batch_size=bs, shuffle=True, generator=torch.Generator(device=sampler_device))

    return train_loader, test_loader

## Models

Entrada:
    sequência (tamanho: 900)
    OneHot do nível anterior (tamanho: n_classes)



CNN na sequência
flatten na saída da CNN
Concatena com o OneHot anterior


Primeiro Nível roda com o modelo Simplest
Níveis seguintes rodam com o modelo MultilevelSimplest


Obs:
- Usar o script de treino do batch (teve alterações)
- Qual o impacto de usar softmax como saída do modelo? É melhor usar depois do retorno?

In [5]:
class MultilevelSimplestCNNClassifier(nn.Module):
    """Three-stage 1-D CNN whose features are joined with the previous level's one-hot.

    Each stage is circular padding -> Conv1d(kernel 4) -> adaptive average
    pooling. The pooled output (128 channels x 225 positions = 28800 values)
    goes through a ReLU-activated linear layer, is concatenated with ``px`` and
    projected to ``nClasses`` raw scores (no softmax is applied).
    """

    def __init__(self, nClasses, nPreviousClasses):
        super().__init__()

        print(f"nClasses: {nClasses}")
        print(f"nPreviousClasses: {nPreviousClasses}")

        wrap = (1, 2)                    # 3 wrapped positions offset the 3 lost to kernel_size=4
        flat_features = 28800            # 128 channels * 225 pooled positions
        hidden_features = flat_features * 2

        self.padding1 = nn.CircularPad1d(wrap)
        self.conv1 = nn.Conv1d(4, 8, kernel_size=4)
        self.adAvgPool1 = nn.AdaptiveAvgPool1d(450)

        self.padding2 = nn.CircularPad1d(wrap)
        self.conv2 = nn.Conv1d(8, 32, kernel_size=4)
        self.adAvgPool2 = nn.AdaptiveAvgPool1d(225)

        self.padding3 = nn.CircularPad1d(wrap)
        self.conv3 = nn.Conv1d(32, 128, kernel_size=4)
        self.adAvgPool3 = nn.AdaptiveAvgPool1d(225)

        self.act4 = nn.ReLU()

        self.linear1 = nn.Linear(flat_features, hidden_features)
        self.linear2 = nn.Linear(hidden_features + nPreviousClasses, nClasses)

    def forward(self, x, px):
        """Score ``x`` given the previous-level encoding ``px``.

        Args:
            x: sequence batch whose last dimension holds the 4 base channels;
                the last two dimensions are swapped before the convolutions.
            px: previous-level encoding, concatenated on dim 1 after ``linear1``.

        Returns:
            Tensor of ``nClasses`` raw scores per sample.
        """
        features = torch.movedim(x, -1, -2)

        stages = (
            (self.padding1, self.conv1, self.adAvgPool1),
            (self.padding2, self.conv2, self.adAvgPool2),
            (self.padding3, self.conv3, self.adAvgPool3),
        )
        for pad, conv, pool in stages:
            features = pool(conv(pad(features)))

        hidden = self.act4(self.linear1(torch.flatten(features, 1)))
        joined = torch.cat([hidden, px], dim=1)

        return self.linear2(joined)

## Test Params

In [6]:
# Taxonomic levels to run. Each active entry is used by the batch-execution
# cell to read ../new_data/<level>/train_dataset.csv and test_dataset.csv.
_levels_ = [
    # "class", 
    "order", 
    # "family", 
    # "genus"
]

In [7]:
# Batch sizes to try. An integer is used for the whole run; the string
# "dynamic" makes Train_Test step through its built-in batch-size schedule.
_batch_sizes_ = [
    # 64,
    # 128,
    # 256,
    # 512,
    # 2048,
    "dynamic"
]

In [8]:
# Number of training epochs per run (one run per active entry).
_epochs_ = [
    # 2,
    # 5,
    # 50,
    # 100,
    200,
    # 500
]

In [9]:
# Model classes to instantiate; each is called as
# model(n_classes, n_previous_classes) by the batch-execution cell.
# NOTE(review): the commented-out classes are not defined in this notebook --
# presumably they come from an earlier version; confirm before re-enabling.
_models_list_ = [
    MultilevelSimplestCNNClassifier,
    # SimplestCNNClassifier,
    # SimpleCNNClassifier,
    # SimpleCNNWithDropoutClassifier,
    # UnetBasedCNNClassifier,
    # UnetBasedCNNWithDropoutClassifier,
    # UnetBasedCNNWithDilationClassifier,
    # UnetBasedCNNWithDropoutAndDilationClassifier,
    # BaseCNNClassifier, 
]

In [10]:
# Loss functions keyed by display name.
#   "function":        the loss class to instantiate.
#   "function_params": keyword -> (callable, *args); the batch-execution cell
#                      calls callable(*args) and passes the result as that keyword.
#   "params":          not read by the code visible in this notebook.
_loss_functions_ = {
    "CrossEntropyLoss":{
        "function":nn.CrossEntropyLoss,
        "params":{},
        "function_params":{}
    },
}

In [11]:
# Learning rates passed to the optimizer as `lr` (one run per active entry).
_learning_rates_ = [
    # 1e-2,
    # 5e-2,
    # 1e-3,
    # 5e-3,
    # 1e-4,
    5e-4
]

In [12]:
# Optimizers to try: "optim" is the optimizer class, "params" (optional) are
# extra keyword arguments passed alongside `lr`.
_optimizers_ = [
    {
        "optim":torch.optim.AdamW,
        "params":{
            "weight_decay":1e-2,
            "amsgrad":True
        }
    },
]

In [13]:
# Full hyper-parameter grid: the batch-execution cell trains one model for
# every combination of the values listed under these keys.
hiperparams = {
    "levels": _levels_,
    "batch_size": _batch_sizes_,
    "epochs": _epochs_,
    "model": _models_list_,
    "loss_function": _loss_functions_,
    "learning_rate": _learning_rates_,
    "optimizer": _optimizers_    
}

In [14]:
# Display the assembled grid so the exact configuration is stored with the output.
hiperparams

{'levels': ['order'],
 'batch_size': ['dynamic'],
 'epochs': [200],
 'model': [__main__.MultilevelSimplestCNNClassifier],
 'loss_function': {'CrossEntropyLoss': {'function': torch.nn.modules.loss.CrossEntropyLoss,
   'params': {},
   'function_params': {}}},
 'learning_rate': [0.0005],
 'optimizer': [{'optim': torch.optim.adamw.AdamW,
   'params': {'weight_decay': 0.01, 'amsgrad': True}}]}

## Batch Execution

In [15]:
def _model_display_name(model):
    """Return the class name of ``model``, looking through a ``torch.compile`` wrapper."""
    if model._get_name() == "OptimizedModule":
        return model.__dict__["_modules"]["_orig_mod"].__class__.__name__
    return model._get_name()


def Train_Test(
        model,
        loss_fn,
        optimizer,
        epochs,
        learning_rate,
        batch_size,
        train_data,
        test_data,
        id="",
        ):
    """Train ``model`` for ``epochs`` epochs, evaluating on the test split after each.

    Args:
        model: module called as ``model(X, prevy)``.
        loss_fn: loss module called as ``loss_fn(pred, y)``.
        optimizer: optimizer already bound to the model parameters.
        epochs: number of epochs; ``0`` skips training and returns the initial
            ``best`` record.
        learning_rate: only reported in the logs/results (the optimizer is
            expected to be configured with it already).
        batch_size: an integer, or ``"dynamic"`` to step through the schedule
            ``[1000, 500, 250, 250, 500, 1000, 1000, 1000]`` in equal slices of
            the run (truncated to ``epochs`` entries when there are fewer epochs).
        train_data: dataset yielding ``(X, prevy, y)`` triples; ``y`` is
            compared through ``argmax(1)``, i.e. expected to be one-hot-like.
        test_data: dataset with the same item layout as ``train_data``.
        id: run identifier; only used by the commented-out per-epoch CSV export.

    Returns:
        Dict with keys ``epoch``, ``train_acc``, ``train_loss``, ``test_acc``
        and ``test_loss`` for the epoch with the highest test accuracy (ties
        broken by higher train accuracy).
    """
    model_name = _model_display_name(model)

    print("Model: \t\t\t"+model_name)
    print("  Loss Func.: \t\t"+loss_fn._get_name())
    print("  Optimizer: \t\t"+type(optimizer).__name__)
    print("  Epochs: \t\t"+str(epochs))
    print("  Learning Rate: \t"+str(learning_rate))

    print("\nModel Arch: ")
    print(str(model))
    print("\n\n\n")

    epochs_results = []
    current = {
        "model":model_name,
        "loss_function":loss_fn._get_name(),
        "epoch":None,
        "learning_rate":learning_rate,
        "batch_size":None,
        "train_size":None,
        "test_size":None,
        "optimizer":type(optimizer).__name__,
        "train_acc":None,
        "train_loss":None,
        "test_acc":None,
        "test_loss":None,
    }

    if batch_size == "dynamic":
        bss = [1000, 500, 250, 250, 500, 1000, 1000, 1000]
    else:
        bss = [batch_size]
    # Never schedule more batch sizes than there are epochs.
    bss = bss[:max(epochs, 0)]
    print("Batch Sizes List: "+str(bss))
    # Epochs spent on each scheduled batch size; guarded so that epochs == 0
    # (empty schedule) does not divide by zero.
    batch_lim = max(1, epochs // len(bss)) if bss else 1

    t_start = time.time()

    best = {
        "epoch":0,
        "train_acc":0,
        "train_loss":10000000,
        "test_acc":0,
        "test_loss":10000000,
    }

    train_loader = None
    test_loader = None

    # Wrap the optimizer step once; the optimizer does not change between
    # epochs, so re-wrapping it on every epoch is loop-invariant work.
    compiled_step = torch.compile(optimizer.step)

    # Epochs
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}\n-------------------------------")

        # Move to the next scheduled batch size at the start of each slice.
        if epoch % batch_lim == 0 and bss:
            batch_size = bss.pop(0)
            train_loader, test_loader = loaders_generator(train_data, test_data, batch_size)

        print("Batch Size: "+str(batch_size))

        # Train --------------------------------------------------------------------------
        model.train()
        train_loss = 0
        train_acc = 0

        for X, prevy, y in train_loader:
            optimizer.zero_grad()

            # Compute prediction and loss
            pred = model(X, prevy)
            loss = loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            compiled_step()

            # Update results
            train_loss += loss.item()
            train_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

        # Train results: loss averaged per batch, accuracy per sample.
        train_loss /= len(train_loader)
        train_acc /= len(train_loader.dataset)
        print(f"Train: \n Accuracy: {(100*train_acc):>0.1f}%, Avg loss: {train_loss:>8f} \n")

        # Test
        model.eval()
        test_loss = 0
        test_acc = 0

        with torch.no_grad():
            for X, prevy, y in test_loader:
                pred = model(X, prevy)
                test_loss += loss_fn(pred, y).item()
                test_acc += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

        # Test results
        test_loss /= len(test_loader)
        test_acc /= len(test_loader.dataset)
        print(f"Test: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

        # Update Results
        if best["test_acc"] < test_acc or (best["test_acc"] == test_acc and best["train_acc"] < train_acc):
            best["epoch"] = epoch+1
            best["test_acc"] = test_acc
            best["test_loss"] = test_loss
            best["train_acc"] = train_acc
            best["train_loss"] = train_loss

        current["epoch"] = epoch+1
        current["batch_size"] = batch_size
        current["train_size"] = len(train_loader.dataset)
        current["test_size"] = len(test_loader.dataset)
        current["train_acc"] = train_acc
        current["train_loss"] = train_loss
        current["test_acc"] = test_acc
        current["test_loss"] = test_loss

        # Copy: `current` is mutated in place on the next epoch.
        epochs_results.append(current.copy())

    # pd.DataFrame(epochs_results).to_csv("./results/epochs/"+str(id)+"__"+current["model"]+"_train_test.csv")

    print("\n\n")
    print(f"Best Epoch:{best['epoch']} \n\tAccuracy: {(100*best['test_acc']):>0.1f}%, Avg loss: {best['test_loss']:>8f} \n")
    print("Train and Test execution time: "+str(format(time.time()-t_start, '.4f'))+"s")
    print("Done!")

    return best

In [None]:
_model_ = None
_lossfunction_ = None
_optimizer_ = None

def clear():
    global _model_, _lossfunction_, _optimizer_
    
    torch.cuda.empty_cache()

    torch.compiler.reset()
    torch._dynamo.reset()

    if _model_:
        del _model_
        _model_ = None
    if _lossfunction_:
        del _lossfunction_
        _lossfunction_ = None
    if _optimizer_:
        del _optimizer_
        _optimizer_ = None
    torch.cuda.empty_cache()

# One summary row per trained configuration.
results = []
current = {}

id = 0
# Timestamp shared by every output file of this execution.
time_id = str(int(time.time()))
print("Time ID: "+str(time_id))

# Create the output folder up front: a missing directory would otherwise only
# fail at the first to_csv call, i.e. after a complete training run.
summary_dir = "./results/summarized/"
os.makedirs(summary_dir, exist_ok=True)

for level in hiperparams["levels"]:
    clear()

    train_data = pd.read_csv("../new_data/"+level+"/train_dataset.csv")
    test_data = pd.read_csv("../new_data/"+level+"/test_dataset.csv")
    print(level)
    print(train_data.shape)
    print(test_data.shape)

    dataset = MultilevelSequenceDataset(
        train=train_data,
        test=test_data,
        level=level)

    # Same iteration order as nesting the loops in this order (the last
    # factor varies fastest).
    grid = itertools.product(
        hiperparams["batch_size"],
        hiperparams["epochs"],
        hiperparams["model"],
        hiperparams["loss_function"].items(),
        hiperparams["learning_rate"],
        hiperparams["optimizer"],
    )

    for batch_size, epochs, model, (loss_function_name, loss_function), learning_rate, optimizer in grid:

        optim = optimizer["optim"]
        optim_params = optimizer.get("params", {})

        current = {
                "id": id,
                "start_time":time.time(),
                "end_time": None,
                "level": level,
                "batch_size": batch_size,
                "epochs": epochs,
                "model": model.__name__,
                "loss_function": loss_function_name+" ("+str(loss_function["function"])+")",
                "learning_rate": learning_rate,
                "optimizer": optim.__name__+" (params: "+str(optim_params)+")",
                # NOTE(review): presumably the train:test split ratio -- confirm.
                "obs": "9:1",
                "error": None
            }

        try:
            clear()
            # torch.set_float32_matmul_precision('high')

            _model_ = torch.compile(model(dataset.encoded_labels.shape[1], dataset.previous_encoded_labels.shape[1]))
            # Each "function_params" entry is (callable, *args); it is evaluated
            # here and passed to the loss constructor under its key.
            _lossfunction_ = loss_function["function"](**{func:params[0](*params[1:]) for func,params in loss_function["function_params"].items()})
            _optimizer_ = optim(_model_.parameters(), lr=learning_rate, **optim_params)

            # ---- Run ----
            result = Train_Test(
                model=_model_,
                loss_fn=_lossfunction_,
                optimizer=_optimizer_,
                epochs=epochs,
                learning_rate=learning_rate,
                batch_size=batch_size,
                train_data=dataset,
                test_data=dataset.get_test(),
                id=time_id+"_"+str(id),
                )

            current["end_time"] = time.time()
            current["best_epoch"] = result["epoch"]
            current["train_acc_best_epoch"] = result["train_acc"]
            current["train_loss_best_epoch"] = result["train_loss"]
            current["test_acc_best_epoch"] = result["test_acc"]
            current["test_loss_best_epoch"] = result["test_loss"]

            # torch.save(_model_.state_dict(), "/media/stark/Models/Gustavo/"+level+"/"+time_id+"_"+str(id)+"_"+current["model"]+".pth")
            clear()

        except Exception as e:
            # Best effort: record the failure and continue with the next
            # configuration, but keep the traceback visible for debugging.
            print(e)
            traceback.print_exc()
            current["end_time"] = time.time()
            current["error"] = str(e)

        results.append(current)
        # Rewritten after every run so partial results survive an interruption.
        pd.DataFrame(results).to_csv(summary_dir+str(time_id)+"_models_train_test_"+str(len(results))+".csv")

        id = id+1

clear()

Time ID: 1731818850
order
(90507, 11)
(10056, 11)
nClasses: 293
nPreviousClasses: 98
Model: 			MultilevelSimplestCNNClassifier
  Loss Func.: 		CrossEntropyLoss
  Optimizer: 		AdamW
  Epochs: 		200
  Learning Rate: 	0.0005

Model Arch: 
OptimizedModule(
  (_orig_mod): MultilevelSimplestCNNClassifier(
    (padding1): CircularPad1d((1, 2))
    (conv1): Conv1d(4, 8, kernel_size=(4,), stride=(1,))
    (adAvgPool1): AdaptiveAvgPool1d(output_size=450)
    (padding2): CircularPad1d((1, 2))
    (conv2): Conv1d(8, 32, kernel_size=(4,), stride=(1,))
    (adAvgPool2): AdaptiveAvgPool1d(output_size=225)
    (padding3): CircularPad1d((1, 2))
    (conv3): Conv1d(32, 128, kernel_size=(4,), stride=(1,))
    (adAvgPool3): AdaptiveAvgPool1d(output_size=225)
    (act4): ReLU()
    (linear1): Linear(in_features=28800, out_features=57600, bias=True)
    (linear2): Linear(in_features=57698, out_features=293, bias=True)
  )
)




Batch Sizes List: [1000, 500, 250, 250, 500, 1000, 1000, 1000]
Epoch 1
---------

W1117 01:49:14.931000 131157631936320 torch/_logging/_internal.py:1034] [1/0] Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored


Train: 
 Accuracy: 42.2%, Avg loss: 3.753450 

Test: 
 Accuracy: 70.4%, Avg loss: 1.397917 

Epoch 2
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 80.0%, Avg loss: 0.894259 

Test: 
 Accuracy: 84.9%, Avg loss: 0.700706 

Epoch 3
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 90.2%, Avg loss: 0.395329 

Test: 
 Accuracy: 88.1%, Avg loss: 0.549935 

Epoch 4
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 95.5%, Avg loss: 0.169951 

Test: 
 Accuracy: 89.4%, Avg loss: 0.522351 

Epoch 5
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 97.6%, Avg loss: 0.090977 

Test: 
 Accuracy: 90.1%, Avg loss: 0.516815 

Epoch 6
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 98.4%, Avg loss: 0.061119 

Test: 
 Accuracy: 90.2%, Avg loss: 0.540080 

Epoch 7
-------------------------------
Batch Size: 1000
Train: 
 Accuracy: 98.8%, Avg loss: 0.046574 

Test: 
 Accuracy: 90.9%, Avg loss: 0.491783 

Epoch 8

KeyboardInterrupt: 