In [1]:
from src.generate_data import Data
from src.evaluate import *
from src.models import *

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
%autosave 120

Autosaving every 120 seconds


# Get the synthetic dataset

In [4]:
%%time
data = Data(layer_size=6)

CPU times: total: 0 ns
Wall time: 30.5 ms




In [5]:
%%time
train, val, test = data.get_splits(['random_subsample'], [[0.25, 0.75]])

train_unshuffled = train.copy()
np.random.shuffle(train)

x_train, y_train = data.get_x_y(train)
x_val, y_val = data.get_x_y(val)
x_test, y_test = data.get_x_y(test)

train_original = data.reverse_encoding(data.get_x_y(train_unshuffled)[0])
val_original = data.reverse_encoding(x_val)
test_original = data.reverse_encoding(x_test)

Number of samples: 132 train | 79 val | 318 test
CPU times: total: 0 ns
Wall time: 15.9 ms


#### Peek at unshuffled train data

In [6]:
train_original[:20]

[['_,.,,_', '_uw,,_', ',dr.__', 'A'],
 ['.___,_', '._,,w.', '__cr,,', 'V'],
 [',.__._', 'uw,,_.', '_.._cr', 'V'],
 ['_,,._,', '.wu_.,', '_cr._.', 'V'],
 [',,.,,_', '___wu.', ',.,.,_', 'V'],
 ['_,_.,,', '_._,__', '_._r._', 'C'],
 ['_,__._', '.___,,', '.r._,,', 'C'],
 ['.__._.', 'wu,,,.', ',__,,,', 'V'],
 ['_,,__,', '__uw,,', ',.,__r', 'C'],
 [',,,__.', '.__.w_', '_cr__,', 'V'],
 [',__._,', '.,,_w,', '_,dr_,', 'D'],
 ['_,.,_,', ',,____', ',,r.,_', 'C'],
 ['..___.', ',_,...', '_._dr,', 'D'],
 ['..,._,', '.__.w,', '__,,cr', 'V'],
 ['._..,.', ',.,,.w', '_,_,,,', 'V'],
 ['_,.___', ',,__..', ',,,r,,', 'C'],
 ['._,.,,', '.,_.,,', 'dr,,._', 'D'],
 ['_._.._', '.....,', '_.,_,_', 'V'],
 ['._..,_', ',,,_wu', '..,,,,', 'V'],
 ['.___,,', ',wu_.,', '_,,,cr', 'V']]

In [7]:
pos_train_ratio = get_stats_and_ratio(train_original)

Number of samples by case:
F3 dr:    F2-wu 4 | F2-uw 7 | F2-w 8 | F2-noop 12 | 
F3 cr:    F2-wu 6 | F2-uw 6 | F2-w 9 | F2-noop 6 | 
F3 r:    F2-wu 4 | F2-uw 4 | F2-w 11 | F2-noop 10 | 
F3 noop:    F2-wu 10 | F2-uw 9 | F2-w 12 | F2-noop 14 | 

Positive samples count: 56
Total samples count: 132
Positive class ratio: 0.42424242424242425


#### Peek at test data

In [8]:
test_original[:15]

[['___._.', '__..,_', ',__.cr', 'V'],
 [',,__,,', '_._.,w', ',,,r_.', 'C'],
 [',,,._.', '__.,uw', '.cr_._', 'V'],
 ['_.._,,', '__,,uw', ',,.._r', 'C'],
 ['_,._..', '.._w.,', ',_,_dr', 'D'],
 ['__._.,', '_.,_uw', '_,.,_.', 'V'],
 ['.__,..', ',_wu._', 'r___..', 'C'],
 [',_..,,', ',_w._.', '.,_dr.', 'D'],
 ['_.,_,_', '__,,,,', '_cr._.', 'V'],
 ['.__.__', '.,w.__', '_._.r.', 'C'],
 [',._,__', ',,wu__', ',.,_cr', 'V'],
 ['.,,.,,', '_.,uw.', '.___._', 'V'],
 ['.,._,_', '_,.wu.', ',_,.dr', 'V'],
 [',_.,._', '.,,uw_', '__._cr', 'V'],
 [',_,,__', '.__uw.', ',r,,._', 'C']]

In [9]:
pos_test_ratio = get_stats_and_ratio(test_original)

Number of samples by case:
F3 dr:    F2-wu 19 | F2-uw 15 | F2-w 19 | F2-noop 20 | 
F3 cr:    F2-wu 16 | F2-uw 14 | F2-w 17 | F2-noop 25 | 
F3 r:    F2-wu 21 | F2-uw 22 | F2-w 20 | F2-noop 25 | 
F3 noop:    F2-wu 20 | F2-uw 20 | F2-w 22 | F2-noop 23 | 

Positive samples count: 142
Total samples count: 318
Positive class ratio: 0.44654088050314467


#### Peek at val data

In [10]:
val_original[:15]

[['.,,._,', '.._,__', '.,,_cr', 'V'],
 ['__,._,', '__,...', ',._cr,', 'V'],
 [',.,,,_', '....,,', '_,_,,,', 'V'],
 ['.,._,,', ',._uw_', '.,.,,r', 'C'],
 [',.._,_', ',.wu..', '._._dr', 'V'],
 ['__..__', '_._w.,', '_,.,r.', 'C'],
 ['.,.,..', '__uw,_', '_.___.', 'V'],
 ['.,.,..', '.,_.._', ',._dr,', 'D'],
 ['__.,,,', '_._w__', ',._,,_', 'V'],
 ['_,_,._', '_....,', 'r.._.,', 'C'],
 ['..,,.,', ',,__,,', 'dr,,.,', 'D'],
 [',,_.__', '__..uw', '_,,cr,', 'V'],
 [',_,,,.', '._w__,', '__.,__', 'V'],
 ['_,,,..', '_wu_,_', '.,_.._', 'V'],
 ['.._,,_', '.,_.uw', '.,__cr', 'V']]

In [11]:
pos_val_ratio = get_stats_and_ratio(val_original)

Number of samples by case:
F3 dr:    F2-wu 2 | F2-uw 3 | F2-w 3 | F2-noop 3 | 
F3 cr:    F2-wu 3 | F2-uw 5 | F2-w 4 | F2-noop 4 | 
F3 r:    F2-wu 5 | F2-uw 4 | F2-w 5 | F2-noop 7 | 
F3 noop:    F2-wu 5 | F2-uw 6 | F2-w 8 | F2-noop 12 | 

Positive samples count: 30
Total samples count: 79
Positive class ratio: 0.379746835443038


#### Send label arrays to device

In [12]:
from sklearn.preprocessing import LabelEncoder

# Codificar etiquetas categóricas a índices numéricos
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Convertir a tensores de PyTorch
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
y_val = torch.tensor(y_val, dtype=torch.long).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Verificar clases codificadas
print("Clases codificadas:", label_encoder.classes_)


Clases codificadas: ['A' 'C' 'D' 'V']


#### Get data in normal format (same as CNN format)

In [13]:
x_train_normal = data.to_conv_format(x_train)
x_val_normal = data.to_conv_format(x_val)
x_test_normal = data.to_conv_format(x_test)
for i in range(len(x_train_normal)):
    x_train_normal[i] = x_train_normal[i].to(device)
    x_val_normal[i] = x_val_normal[i].to(device)
    x_test_normal[i] = x_test_normal[i].to(device)

In [14]:
x_train_normal[0].shape

torch.Size([132, 48])

#### Get data in convolutional format, send to device

In [15]:
x_train_cnn = data.to_conv_format(x_train)
x_val_cnn = data.to_conv_format(x_val)
x_test_cnn = data.to_conv_format(x_test)
for i in range(len(x_train_cnn)):
    x_train_cnn[i] = x_train_cnn[i].to(device)
    x_val_cnn[i] = x_val_cnn[i].to(device)
    x_test_cnn[i] = x_test_cnn[i].to(device)

In [16]:
x_train_cnn[0].shape

torch.Size([132, 48])

#### Get data in LSTM format, send to device

In [17]:
x_train_lstm = data.to_lstm_format(x_train)
x_val_lstm = data.to_lstm_format(x_val)
x_test_lstm = data.to_lstm_format(x_test)
for i in range(len(x_train_lstm)):
    x_train_lstm[i] = x_train_lstm[i].to(device)
    x_val_lstm[i] = x_val_lstm[i].to(device)
    x_test_lstm[i] = x_test_lstm[i].to(device)

In [18]:
x_train_lstm[0].shape

torch.Size([132, 6, 8])

### Define training parameters

In [19]:
num_experiments = 20
epochs = 200
early_stopping_limit = 100

experiment_name = "25per"

### Define training procedure for each model

In [20]:
def train_models(constructor, x_train, x_val, x_test, weight_decay, *argv):
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    wrong_preds = []
    best_accuracy = 0
    
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    for i in range(num_experiments):
        model = constructor(*argv)
        model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay)

        train_losses.append([])
        val_losses.append([])
        train_accs.append([])
        val_accs.append([])
        
        best_acc = 0

        early_stopping_cnt = 0

        for epoch in range(1, epochs + 1):
            train_loss, train_acc = train_epoch(model, x_train, y_train, criterion, optimizer, epoch, 10, verbose=False)
            val_loss, val_acc = eval_epoch(model, x_val, y_val, criterion, 'Validation', verbose=False)

            
            train_losses[-1].append(train_loss)
            val_losses[-1].append(val_loss)
            train_accs[-1].append(train_acc)
            val_accs[-1].append(val_acc)
            
            model_name = constructor.__name__[:constructor.__name__.find('_')]
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(), f'./{model_name}_model_TEMP_' + experiment_name)    
                early_stopping_cnt = 0
            else:
                early_stopping_cnt += 1

            if early_stopping_cnt >= early_stopping_limit:
                break


        model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))
    
        accuracies.append(get_accuracy_by_cases(model, x_test, y_test, test_original))
        precisions.append(get_precision_by_cases(model, x_test, y_test, test_original))
        recalls.append(get_recall_by_cases(model, x_test, y_test, test_original))
        f1_scores.append(get_f1_by_cases(model, x_test, y_test, test_original))
        
        wrong_preds.append(get_wrong_predictions(model, x_test, y_test, test_original))

        if accuracies[-1]['Overall'] > best_accuracy:
            torch.save(model.state_dict(), f'./best_{model_name}_model_' + experiment_name)    
            best_accuracy = accuracies[-1]['Overall']

        print(i + 1, "/", num_experiments, "models trained | Current model test accuracy:", accuracies[-1]['Overall'])
    
    return accuracies, precisions, recalls, f1_scores, wrong_preds, [train_losses, val_losses, train_accs, val_accs]
    #return accuracies, precisions, wrong_preds, [train_losses, val_losses, train_accs, val_accs]

# Train models

In [21]:
%%time
cnn_accuracies, cnn_precisions, cnn_recalls, cnn_f1_scores, cnn_wrong_preds, cnn_epoch_stats = train_models(
    CNN_Model, x_train_cnn, x_val_cnn, x_test_cnn, 0.0001, data, 64, 128, 4, -1, 'keras')
    
#cnn_accuracies, cnn_precisions, cnn_wrong_preds, cnn_epoch_stats = train_models(CNN_Model, x_train_cnn, x_val_cnn, x_test_cnn, 0.0001, data, 64, 128, 4, -1, 'keras')

  model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))


1 / 20 models trained | Current model test accuracy: 0.9213836477987422
2 / 20 models trained | Current model test accuracy: 0.8647798742138365
3 / 20 models trained | Current model test accuracy: 0.8144654088050315
4 / 20 models trained | Current model test accuracy: 0.9559748427672956
5 / 20 models trained | Current model test accuracy: 0.9308176100628931
6 / 20 models trained | Current model test accuracy: 0.949685534591195
7 / 20 models trained | Current model test accuracy: 0.9622641509433962
8 / 20 models trained | Current model test accuracy: 0.9559748427672956
9 / 20 models trained | Current model test accuracy: 0.9811320754716981
10 / 20 models trained | Current model test accuracy: 0.9905660377358491
11 / 20 models trained | Current model test accuracy: 0.9591194968553459
12 / 20 models trained | Current model test accuracy: 0.9433962264150944
13 / 20 models trained | Current model test accuracy: 0.8962264150943396
14 / 20 models trained | Current model test accuracy: 0.92452

In [22]:
print("Metrics for CNN:")
for i in range(len(cnn_accuracies)):
    print(f"Experiment {i + 1}:")
    print(f"  Accuracy: {cnn_accuracies[i]['Overall']}")
    print(f"  Precision: {cnn_precisions[i]['Overall']}")
    print(f"  Recall: {cnn_recalls[i]['Overall']}")
    print(f"  F1-score: {cnn_f1_scores[i]['Overall']}")

Metrics for CNN:
Experiment 1:
  Accuracy: 0.9213836477987422
  Precision: 0.9627118644067797
  Recall: 0.9213836477987422
  F1-score: 0.9415944825088969
Experiment 2:
  Accuracy: 0.8647798742138365
  Precision: 0.8802588996763754
  Recall: 0.8647798742138365
  F1-score: 0.8724507351097269
Experiment 3:
  Accuracy: 0.8144654088050315
  Precision: 0.8269230769230769
  Recall: 0.8144654088050315
  F1-score: 0.820646967798983
Experiment 4:
  Accuracy: 0.9559748427672956
  Precision: 0.9700996677740864
  Recall: 0.9559748427672956
  F1-score: 0.9629854632241268
Experiment 5:
  Accuracy: 0.9308176100628931
  Precision: 0.9792387543252595
  Recall: 0.9308176100628931
  F1-score: 0.9544144287846508
Experiment 6:
  Accuracy: 0.949685534591195
  Precision: 0.9540983606557377
  Recall: 0.949685534591195
  F1-score: 0.9518868333261123
Experiment 7:
  Accuracy: 0.9622641509433962
  Precision: 0.9832775919732442
  Recall: 0.9622641509433962
  F1-score: 0.9726573902889949
Experiment 8:
  Accuracy: 0

In [23]:
%%time
lstm_accuracies, lstm_precisions, lstm_recalls, lstm_f1Scores, lstm_wrong_preds, lstm_epoch_stats = train_models(LSTM_Model, x_train_lstm, x_val_lstm, x_test_lstm, 0.0001, data, 16, 32, 8, 'keras')

  model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))


1 / 20 models trained | Current model test accuracy: 0.9245283018867925
2 / 20 models trained | Current model test accuracy: 0.9591194968553459
3 / 20 models trained | Current model test accuracy: 0.9056603773584906
4 / 20 models trained | Current model test accuracy: 0.9213836477987422
5 / 20 models trained | Current model test accuracy: 0.9308176100628931
6 / 20 models trained | Current model test accuracy: 0.9088050314465409
7 / 20 models trained | Current model test accuracy: 0.89937106918239
8 / 20 models trained | Current model test accuracy: 0.940251572327044
9 / 20 models trained | Current model test accuracy: 0.9716981132075472
10 / 20 models trained | Current model test accuracy: 0.949685534591195
11 / 20 models trained | Current model test accuracy: 0.9150943396226415
12 / 20 models trained | Current model test accuracy: 0.9119496855345912
13 / 20 models trained | Current model test accuracy: 0.9308176100628931
14 / 20 models trained | Current model test accuracy: 0.92138364

In [24]:
print("Metrics for LSTM:")
for i in range(len(lstm_accuracies)):
    print(f"Experiment {i + 1}:")
    print(f"  Accuracy: {lstm_accuracies[i]['Overall']}")
    print(f"  Precision: {lstm_precisions[i]['Overall']}")
    print(f"  Recall: {lstm_recalls[i]['Overall']}")
    print(f"  F1-score: {lstm_f1Scores[i]['Overall']}")

Metrics for LSTM:
Experiment 1:
  Accuracy: 0.9245283018867925
  Precision: 0.9628378378378378
  Recall: 0.9245283018867925
  F1-score: 0.9432942686345369
Experiment 2:
  Accuracy: 0.9591194968553459
  Precision: 0.9865771812080537
  Recall: 0.9591194968553459
  F1-score: 0.9726545975203653
Experiment 3:
  Accuracy: 0.9056603773584906
  Precision: 0.9362416107382551
  Recall: 0.9056603773584906
  F1-score: 0.9206971228214911
Experiment 4:
  Accuracy: 0.9213836477987422
  Precision: 0.9658703071672355
  Recall: 0.9213836477987422
  F1-score: 0.9431026540720984
Experiment 5:
  Accuracy: 0.9308176100628931
  Precision: 0.9759450171821306
  Recall: 0.9308176100628931
  F1-score: 0.9528472977874501
Experiment 6:
  Accuracy: 0.9088050314465409
  Precision: 0.9131832797427653
  Recall: 0.9088050314465409
  F1-score: 0.9109888951168483
Experiment 7:
  Accuracy: 0.89937106918239
  Precision: 0.921311475409836
  Recall: 0.89937106918239
  F1-score: 0.9102090742292793
Experiment 8:
  Accuracy: 0.

In [25]:
%%time
deepset_accuracies, deepset_precisions, deepset_recalls, deepset_f1Scores, deepset_wrong_preds, deepset_epoch_stats = train_models(DEEPSET_Model, x_train_normal, x_val_normal, x_test_normal, 0.005, data, 128, 32, 8, 'keras')

  model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))


1 / 20 models trained | Current model test accuracy: 0.8207547169811321
2 / 20 models trained | Current model test accuracy: 0.8584905660377359
3 / 20 models trained | Current model test accuracy: 0.8301886792452831
4 / 20 models trained | Current model test accuracy: 0.8773584905660378
5 / 20 models trained | Current model test accuracy: 0.8238993710691824
6 / 20 models trained | Current model test accuracy: 0.8647798742138365
7 / 20 models trained | Current model test accuracy: 0.8364779874213837
8 / 20 models trained | Current model test accuracy: 0.8930817610062893
9 / 20 models trained | Current model test accuracy: 0.8805031446540881
10 / 20 models trained | Current model test accuracy: 0.8742138364779874
11 / 20 models trained | Current model test accuracy: 0.8930817610062893
12 / 20 models trained | Current model test accuracy: 0.8710691823899371
13 / 20 models trained | Current model test accuracy: 0.8742138364779874
14 / 20 models trained | Current model test accuracy: 0.8742

In [26]:
print("Metrics for DeepSet:")
for i in range(len(deepset_accuracies)):
    print(f"Experiment {i + 1}:")
    print(f"  Accuracy: {deepset_accuracies[i]['Overall']}")
    print(f"  Precision: {deepset_precisions[i]['Overall']}")
    print(f"  Recall: {deepset_recalls[i]['Overall']}")
    print(f"  F1-score: {deepset_f1Scores[i]['Overall']}")

Metrics for DeepSet:
Experiment 1:
  Accuracy: 0.8207547169811321
  Precision: 0.8403908794788274
  Recall: 0.8207547169811321
  F1-score: 0.8304567401076642
Experiment 2:
  Accuracy: 0.8584905660377359
  Precision: 0.8673139158576052
  Recall: 0.8584905660377359
  F1-score: 0.862879685813859
Experiment 3:
  Accuracy: 0.8301886792452831
  Precision: 0.8571428571428571
  Recall: 0.8301886792452831
  F1-score: 0.8434504792332269
Experiment 4:
  Accuracy: 0.8773584905660378
  Precision: 0.8896103896103896
  Recall: 0.8773584905660378
  F1-score: 0.8834419636666204
Experiment 5:
  Accuracy: 0.8238993710691824
  Precision: 0.8453947368421053
  Recall: 0.8238993710691824
  F1-score: 0.8345086569087957
Experiment 6:
  Accuracy: 0.8647798742138365
  Precision: 0.8852459016393442
  Recall: 0.8647798742138365
  F1-score: 0.8748932158953663
Experiment 7:
  Accuracy: 0.8364779874213837
  Precision: 0.8501628664495114
  Recall: 0.8364779874213837
  F1-score: 0.8432649095105066
Experiment 8:
  Accur

In [27]:
%%time
deepsetv2_accuracies, deepsetv2_precisions, deepsetv2_recalls, deepsetv2_f1Scores, deepsetv2_wrong_preds, deepsetv2_epoch_stats = train_models(DEEPSETV2_Model, x_train_normal, x_val_normal, x_test_normal, 0.0001, data, 16, 8, 'keras')

  model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))


1 / 20 models trained | Current model test accuracy: 0.6729559748427673
2 / 20 models trained | Current model test accuracy: 0.7767295597484277
3 / 20 models trained | Current model test accuracy: 0.550314465408805
4 / 20 models trained | Current model test accuracy: 0.6855345911949685
5 / 20 models trained | Current model test accuracy: 0.19811320754716982
6 / 20 models trained | Current model test accuracy: 0.5974842767295597
7 / 20 models trained | Current model test accuracy: 0.5534591194968553
8 / 20 models trained | Current model test accuracy: 0.6132075471698113
9 / 20 models trained | Current model test accuracy: 0.7421383647798742
10 / 20 models trained | Current model test accuracy: 0.8805031446540881
11 / 20 models trained | Current model test accuracy: 0.5534591194968553
12 / 20 models trained | Current model test accuracy: 0.7484276729559748
13 / 20 models trained | Current model test accuracy: 0.8176100628930818
14 / 20 models trained | Current model test accuracy: 0.5534

In [28]:
print("Metrics for DeepSetV2:")
for i in range(len(deepsetv2_accuracies)):
    print(f"Experiment {i + 1}:")
    print(f"  Accuracy: {deepsetv2_accuracies[i]['Overall']}")
    print(f"  Precision: {deepsetv2_precisions[i]['Overall']}")
    print(f"  Recall: {deepsetv2_recalls[i]['Overall']}")
    print(f"  F1-score: {deepsetv2_f1Scores[i]['Overall']}")

Metrics for DeepSetV2:
Experiment 1:
  Accuracy: 0.6729559748427673
  Precision: 0.6893203883495146
  Recall: 0.6729559748427673
  F1-score: 0.6810398924249216
Experiment 2:
  Accuracy: 0.7767295597484277
  Precision: 0.7767295597484277
  Recall: 0.7767295597484277
  F1-score: 0.7767295597484277
Experiment 3:
  Accuracy: 0.550314465408805
  Precision: 0.570957095709571
  Recall: 0.550314465408805
  F1-score: 0.5604457649552478
Experiment 4:
  Accuracy: 0.6855345911949685
  Precision: 0.7697841726618705
  Recall: 0.6855345911949685
  F1-score: 0.7252207436885959
Experiment 5:
  Accuracy: 0.19811320754716982
  Precision: 0.4396551724137931
  Recall: 0.19811320754716982
  F1-score: 0.27314460596786533
Experiment 6:
  Accuracy: 0.5974842767295597
  Precision: 0.5974842767295597
  Recall: 0.5974842767295597
  F1-score: 0.5974842767295597
Experiment 7:
  Accuracy: 0.5534591194968553
  Precision: 0.5534591194968553
  Recall: 0.5534591194968553
  F1-score: 0.5534591194968553
Experiment 8:
  Ac

In [29]:
%%time
feedforward_accuracies, feedforward_precisions, feedforward_recalls, feedforward_f1Scores, feedforward_wrong_preds, feedforward_epoch_stats = train_models(FEEDFORWARD_Model, x_train_normal, x_val_normal, x_test_normal, 0.0001, data, 128, 32, 8, 'keras')

  model.load_state_dict(torch.load(f'./{model_name}_model_TEMP_' + experiment_name))


1 / 20 models trained | Current model test accuracy: 0.7295597484276729
2 / 20 models trained | Current model test accuracy: 0.7484276729559748
3 / 20 models trained | Current model test accuracy: 0.7264150943396226
4 / 20 models trained | Current model test accuracy: 0.6572327044025157
5 / 20 models trained | Current model test accuracy: 0.660377358490566
6 / 20 models trained | Current model test accuracy: 0.6729559748427673
7 / 20 models trained | Current model test accuracy: 0.7578616352201258
8 / 20 models trained | Current model test accuracy: 0.7389937106918238
9 / 20 models trained | Current model test accuracy: 0.6509433962264151
10 / 20 models trained | Current model test accuracy: 0.7044025157232704
11 / 20 models trained | Current model test accuracy: 0.6855345911949685
12 / 20 models trained | Current model test accuracy: 0.7955974842767296
13 / 20 models trained | Current model test accuracy: 0.7138364779874213
14 / 20 models trained | Current model test accuracy: 0.73899

In [30]:
print("Metrics for FeedForward:")
for i in range(len(feedforward_accuracies)):
    print(f"Experiment {i + 1}:")
    print(f"  Accuracy: {feedforward_accuracies[i]['Overall']}")
    print(f"  Precision: {feedforward_precisions[i]['Overall']}")
    print(f"  Recall: {feedforward_recalls[i]['Overall']}")
    print(f"  F1-score: {feedforward_f1Scores[i]['Overall']}")

Metrics for FeedForward:
Experiment 1:
  Accuracy: 0.7295597484276729
  Precision: 0.7356687898089171
  Recall: 0.7295597484276729
  F1-score: 0.7326015337716838
Experiment 2:
  Accuracy: 0.7484276729559748
  Precision: 0.7694805194805194
  Recall: 0.7484276729559748
  F1-score: 0.7588080984731284
Experiment 3:
  Accuracy: 0.7264150943396226
  Precision: 0.7508196721311475
  Recall: 0.7264150943396226
  F1-score: 0.7384157959756276
Experiment 4:
  Accuracy: 0.6572327044025157
  Precision: 0.6666666666666666
  Recall: 0.6572327044025157
  F1-score: 0.6619160728424386
Experiment 5:
  Accuracy: 0.660377358490566
  Precision: 0.6698717948717948
  Recall: 0.660377358490566
  F1-score: 0.6650906941855707
Experiment 6:
  Accuracy: 0.6729559748427673
  Precision: 0.6815286624203821
  Recall: 0.6729559748427673
  F1-score: 0.6772151898734177
Experiment 7:
  Accuracy: 0.7578616352201258
  Precision: 0.7717041800643086
  Recall: 0.7578616352201258
  F1-score: 0.7647202702434703
Experiment 8:
  Ac

# Evaluation

### Get best 50% performing models

In [None]:
top_half = int(num_experiments / 2)
best_cnn_accs = filter_top_k_accuracies(cnn_accuracies, top_half)
best_lstm_accs = filter_top_k_accuracies(lstm_accuracies, top_half)
best_deepset_accs = filter_top_k_accuracies(deepset_accuracies, top_half)
best_deepsetv2_accs = filter_top_k_accuracies(deepsetv2_accuracies, top_half)
best_feedforward_accs = filter_top_k_accuracies(feedforward_accuracies, top_half)

## Accuracy breakdown by cases for all the models

In [None]:
all_accuracies = [cnn_accuracies, lstm_accuracies, deepset_accuracies, deepsetv2_accuracies, feedforward_accuracies]
model_names = ['CNN', 'LSTM', 'DeepSet(like in paper)', 'DeepSet(sum at start)', 'Feedforward']

In [None]:
get_stats_df(all_accuracies, model_names, test_original)

## Accuracy breakdown by cases for top 50% of models

In [None]:
best_accuracies = [best_cnn_accs, best_lstm_accs, best_deepset_accs, best_deepsetv2_accs, best_feedforward_accs]
model_names = ['CNN', 'LSTM', 'DeepSet(like in paper)', 'DeepSet(sum at start)', 'Feedforward']
collapsed_cases = ['dr', 'r', 'cr', 'noop']

In [None]:
get_stats_df(best_accuracies, model_names, test_original, collapsed_cases)

In [None]:
best_accuracies = [best_cnn_accs, best_lstm_accs, best_deepset_accs, best_deepsetv2_accs, best_feedforward_accs]
model_names = ['CNN', 'LSTM', 'DeepSet(like in paper)', 'DeepSet(sum at start)', 'Feedforward']
collapsed_cases = ['r', 'cr', 'noop']

In [None]:
get_stats_df(best_accuracies, model_names, test_original, collapsed_cases)

## Accuracies per CNN model

In [None]:
get_stats_per_model(cnn_accuracies, ['CNN #' + str(i) for i in range(len(cnn_accuracies))], test_original, ['cr', 'dr', 'noop', 'r'])

## Accuracies per LSTM model

In [None]:
get_stats_per_model(lstm_accuracies, ['LSTM #' + str(i) for i in range(len(lstm_accuracies))], test_original, ['cr', 'dr', 'noop', 'r'])

## Accuracies per DeepSets V1 model

In [None]:
get_stats_per_model(deepset_accuracies, ['DeepSet(like in paper) #' + str(i) for i in range(len(deepset_accuracies))], test_original, ['cr', 'dr', 'noop', 'r'])

## Accuracies per DeepSets V2 model

In [None]:
get_stats_per_model(deepsetv2_accuracies, ['DeepSet(sum at start) #' + str(i) for i in range(len(deepsetv2_accuracies))], test_original, ['cr', 'dr', 'noop', 'r'])

## Accuracies per FeedForward model

In [None]:
get_stats_per_model(feedforward_accuracies, ['Feedforward #' + str(i) for i in range(len(feedforward_accuracies))], test_original, ['cr', 'dr', 'noop', 'r'])

# Training evolution

### CNN

In [None]:
display_epochs_stats(cnn_epoch_stats, num_experiments, display_train_loss=False, display_val_loss=False)

### LSTM

In [None]:
display_epochs_stats(lstm_epoch_stats, num_experiments, display_train_loss=False, display_val_loss=False)

### DeepSets V1

In [None]:
display_epochs_stats(deepset_epoch_stats, num_experiments, display_train_loss=False, display_val_loss=False)

### DeepSets V2

In [None]:
display_epochs_stats(deepsetv2_epoch_stats, num_experiments, display_train_loss=False, display_val_loss=False)

### FeedForward

In [None]:
display_epochs_stats(feedforward_epoch_stats, num_experiments, display_train_loss=False, display_val_loss=False)

# Best performing models

In [None]:
best_lstm_model = LSTM_Model(data, 16, 32, 8).to(device)
best_lstm_model.load_state_dict(torch.load('best_LSTM_model_' + experiment_name))
best_cnn_model = CNN_Model(data, 64, 128, 4, -1,).to(device)
best_cnn_model.load_state_dict(torch.load('best_CNN_model_' + experiment_name))
best_deepset_model = DEEPSET_Model(data).to(device)
best_deepset_model.load_state_dict(torch.load('best_DEEPSET_model_' + experiment_name))
best_deepsetv2_model = DEEPSETV2_Model(data, 16, 8).to(device)
best_deepsetv2_model.load_state_dict(torch.load('best_DEEPSETV2_model_' + experiment_name))
best_feedforward_model = FEEDFORWARD_Model(data).to(device)
best_feedforward_model.load_state_dict(torch.load('best_FEEDFORWARD_model_' + experiment_name))

## Top wrong predictions for best performing CNN model

In [None]:
print_wrong_preds([get_wrong_predictions(best_cnn_model, x_test_cnn, y_test, test_original)], top_k=10)

## Top wrong predictions for best performing LSTM model

In [None]:
print_wrong_preds([get_wrong_predictions(best_lstm_model, x_test_lstm, y_test, test_original)], top_k=10)

## Top wrong predictions for best performing DeepSets V1 model

In [None]:
print_wrong_preds([get_wrong_predictions(best_deepset_model, x_test_normal, y_test, test_original)], top_k=10)

## Top wrong predictions for best performing DeepSets V2 model

In [None]:
print_wrong_preds([get_wrong_predictions(best_deepsetv2_model, x_test_normal, y_test, test_original)], top_k=10)

## Top wrong predictions for best performing Feedforward model

In [None]:
print_wrong_preds([get_wrong_predictions(best_feedforward_model, x_test_normal, y_test, test_original)], top_k=10)

# Saving Notebook State

In [None]:
import dill
dill.dump_session('notebook_env_' + experiment_name + '.db')

# Loading Notebook State

In [None]:
import dill
experiment_name = "25per"
# dill.load_session('notebook_env_' + experiment_name + '.db')