In [1]:
import pandas as pd
import numpy as np

In [3]:
from datetime import timedelta
import math

# Data from Naive Bayes experiments

In [2]:
# Naive Bayes results, one row per (level, splitter) combination.
# Values are recorded literally; nothing here is recomputed.
naive = pd.DataFrame(
    data=[
        ('class', 'RandomSplit', 12349, 11670, 679, 0.9450157907522876),
        ('order', 'RandomSplit', 10056, 9530, 526, 0.9476929196499603),
        ('family', 'RandomSplit', 8194, 7708, 486, 0.9406883085184281),
        ('genus', 'RandomSplit', 4168, 3468, 700, 0.8320537428023033),
        ('species', 'RandomSplit', 1167, 1, 1166, 0.000856898029134533),
        ('class', 'StratifiedSplit', 12349, 11674, 675, 0.9453397036197263),
        ('order', 'StratifiedSplit', 10057, 9531, 526, 0.9476981207119419),
        ('family', 'StratifiedSplit', 8194, 7727, 467, 0.9430070783500122),
        ('genus', 'StratifiedSplit', 4169, 3519, 650, 0.8440873111057807),
        ('species', 'StratifiedSplit', 1167, 1035, 132, 0.8868894601542416),
    ],
    columns=['level', 'splitter', 'total_of_sequences', 'correct', 'wrong', 'accuracy'],
)
naive

Unnamed: 0,level,splitter,total_of_sequences,correct,wrong,accuracy
0,class,RandomSplit,12349,11670,679,0.945016
1,order,RandomSplit,10056,9530,526,0.947693
2,family,RandomSplit,8194,7708,486,0.940688
3,genus,RandomSplit,4168,3468,700,0.832054
4,species,RandomSplit,1167,1,1166,0.000857
5,class,StratifiedSplit,12349,11674,675,0.94534
6,order,StratifiedSplit,10057,9531,526,0.947698
7,family,StratifiedSplit,8194,7727,467,0.943007
8,genus,StratifiedSplit,4169,3519,650,0.844087
9,species,StratifiedSplit,1167,1035,132,0.886889


# Data from CNN experiments

In [2]:
# 1-based position of each classification level, used as a sort key so that
# tables list levels from "class" through "species" instead of alphabetically.
sort_levels = {
    level: position
    for position, level in enumerate(
        ("class", "order", "family", "genus", "species"), start=1
    )
}

In [4]:
def format(row):
    """Return a copy of ``row`` with a shortened optimizer name and duration fields.

    Written to be used as ``DataFrame.apply(format, axis=1)``.

    NOTE: this name shadows the built-in ``format`` for the rest of the
    notebook; it is kept because the loading cell calls it by this name.

    Parameters
    ----------
    row : pandas.Series (or any mapping with ``.copy()``)
        Must provide ``"optimizer"`` (sliceable), ``"start_time"`` and
        ``"end_time"`` (numeric; their difference is interpreted as seconds).

    Returns
    -------
    A copy of ``row`` in which
      * ``"optimizer"`` is cut to its first six characters,
      * ``"delta_t"`` holds ``end_time - start_time``,
      * ``"elapsed_time"`` holds ``delta_t`` floored to whole seconds and
        rendered with ``str(timedelta(...))``.
    """
    # pandas does not support UDFs that mutate the object handed to them by
    # ``apply``; work on a private copy so the caller's row is left untouched.
    row = row.copy()

    row["optimizer"] = row["optimizer"][:6]
    row["delta_t"] = row["end_time"] - row["start_time"]
    row["elapsed_time"] = str(timedelta(seconds=math.floor(row["delta_t"])))

    return row

In [5]:
# Shorter column headers, applied only to the table displayed at the end of this cell.
rename = {
    "test_acc_best_epoch": "test_acc",
    "train_acc_best_epoch": "train_acc",
    "learning_rate": "lr",
}

# Load the summarized runs, order them by splitter, level position and accuracy
# (best first), then add the duration fields and drop the helper columns.
best_cnn = (
    pd.read_csv(
        "./results/summarized/1734322688_models_train_test_400.csv",
        usecols=[
            'start_time', 'end_time', 'level', 'splitter', 'augmentation',
            'batch_size', 'epochs', 'model', 'learning_rate', 'optimizer',
            'reserved_memory', 'best_epoch',
            'train_acc_best_epoch', 'test_acc_best_epoch',
        ],
    )
    .assign(sort_level=lambda frame: frame["level"].map(sort_levels))
    .sort_values(
        by=["splitter", "sort_level", "test_acc_best_epoch", "train_acc_best_epoch"],
        ascending=[True, True, False, False],
    )
    .apply(format, axis=1)
    .drop(columns=["start_time", "end_time", "sort_level"])
    .reset_index(drop=True)
)

# Show a reordered, renamed view; `best_cnn` itself keeps the original column names.
best_cnn[[
    'splitter', 'elapsed_time', 'level', 'best_epoch', 'test_acc_best_epoch',
    'augmentation', 'batch_size', 'epochs', 'model', 'learning_rate',
    'optimizer', 'reserved_memory', 'train_acc_best_epoch',
]].rename(columns=rename)

Unnamed: 0,splitter,elapsed_time,level,best_epoch,test_acc,augmentation,batch_size,epochs,model,lr,optimizer,reserved_memory,train_acc
0,prop_0-05/min_10/RandomSplit_0,1:42:41,class,307,0.991416,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,16782.0,1.000000
1,prop_0-05/min_10/RandomSplit_0,1:23:22,order,296,0.985879,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,17058.0,0.999990
2,prop_0-05/min_10/RandomSplit_0,1:09:50,family,299,0.975348,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,16858.0,0.999974
3,prop_0-05/min_10/RandomSplit_0,0:38:42,genus,619,0.898273,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,16302.0,0.994091
4,prop_0-05/min_10/RandomSplit_0,0:09:49,species,589,0.922813,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,12370.0,0.999910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,prop_0-2/min_5/StratifiedSplit2_92,1:32:52,class,304,0.990413,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,19852.0,0.999990
396,prop_0-2/min_5/StratifiedSplit2_92,1:17:27,order,308,0.985145,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,20156.0,1.000000
397,prop_0-2/min_5/StratifiedSplit2_92,1:03:39,family,308,0.973784,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,20048.0,1.000000
398,prop_0-2/min_5/StratifiedSplit2_92,0:41:23,genus,630,0.876229,False,dynamic,700,SimplestCNNClassifier_8layers_Residual,0.005,AdamW,17654.0,0.997620


In [None]:
def _as_hms(seconds):
    """Floor a duration in seconds and render it with ``str(timedelta(...))``."""
    return str(timedelta(seconds=math.floor(seconds)))


# Per-level min/max/mean of run duration and min/max/mean/median of the
# best-epoch test accuracy. groupby already indexes the result by "level".
a = best_cnn.groupby('level').agg({
    'delta_t': ['min', 'max', 'mean'],
    'test_acc_best_epoch': ['min', 'max', 'mean', "median"]
})

# Replace each duration column as a whole. Writing the formatted strings through
# ``.loc`` into the existing float64 columns triggers pandas' incompatible-dtype
# FutureWarning and becomes an error in later pandas versions.
for stat in ('min', 'max', 'mean'):
    a[("delta_t", stat)] = a[("delta_t", stat)].map(_as_hms)

# Display rows in level order rather than alphabetically (not assigned back to `a`).
a.sort_index(key=lambda idx: idx.map(lambda level: sort_levels[level]))

  a.loc[:,("delta_t", "min")] = a.loc[:,("delta_t", "min")].map(lambda x: str(timedelta(seconds=math.floor(x))))
  a.loc[:,("delta_t", "max")] = a.loc[:,("delta_t", "max")].map(lambda x: str(timedelta(seconds=math.floor(x))))
  a.loc[:,("delta_t", "mean")] = a.loc[:,("delta_t", "mean")].map(lambda x: str(timedelta(seconds=math.floor(x))))


Unnamed: 0_level_0,delta_t,delta_t,delta_t,test_acc_best_epoch,test_acc_best_epoch,test_acc_best_epoch,test_acc_best_epoch
Unnamed: 0_level_1,min,max,mean,min,max,mean,median
level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
class,1:32:02,1:43:30,1:36:37,0.987569,0.992227,0.990224,0.990282
order,1:15:54,1:26:08,1:20:23,0.977577,0.990654,0.985265,0.985358
family,1:02:43,1:13:02,1:06:36,0.970831,0.980962,0.975718,0.975685
genus,0:34:33,0:44:05,0:38:21,0.84224,0.922302,0.891597,0.903982
species,0:08:51,0:15:47,0:11:40,0.851479,0.953688,0.91295,0.927378


### Total time of CNN batch experiment:

In [7]:
# Raw timestamps in seconds, copied verbatim.
# NOTE(review): presumably the start and end of the whole CNN batch run — confirm the source of these values.
batch_start_ts = 1734322803.666693
batch_end_ts = 1735818894.7526095
str(timedelta(seconds=math.floor(batch_end_ts - batch_start_ts)))

'17 days, 7:34:51'