In [1]:
import pandas as pd
import numpy as np
import json
import joblib
import torch
import torch.nn as nn

from sklearn.preprocessing import LabelEncoder

from src.data.tokenizers.basic_tokenizer import BasicWordTokenizer, CustomWordCharTokenizer
from src.data.datastruct import Batch, Sample
from src.data.collate import create_batch, create_samples
from src.models.neural_text_classifier import TicketTextClassifierV01
from src.training.train_neural import train_model, train_model_cuda
from src.evaluation.metrics import format_cm, evaluate
from src.evaluation.neural_eval import evaluate as neural_eval, evaluate_cuda as neural_eval_cuda

#### Similarity index: neural classifiers trained on the similarity-augmented ticket data

In [2]:
# Use the GPU when torch reports one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
# --- Load the similarity-augmented ticket data ------------------------------
# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# platform's default locale.
with open("../data/processed/agumented_ticketdata_similarity_v01.json", "r", encoding="utf-8") as f:
    agumented_data = json.load(f)

X_train_agumented = agumented_data["train"]["X"]
X_test_agumented = agumented_data["test"]["X"]

y_train = agumented_data["train"]["y"]
y_test = agumented_data["test"]["y"]

# Fail fast if a split is misaligned instead of surfacing it later as a
# confusing error (or silently truncated data) inside sample creation.
assert len(X_train_agumented) == len(y_train), "train split: X and y lengths differ"
assert len(X_test_agumented) == len(y_test), "test split: X and y lengths differ"

# --- Encode the labels with the previously fitted encoder -------------------
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts produced by this project.
labelencode: LabelEncoder = joblib.load("../artifacts/labelencoder_neural_v01.pkl")

y_train_encoded = labelencode.transform(y_train)
y_test_encoded = labelencode.transform(y_test)

# Loss shared by every model trained in this notebook.
criterion = nn.CrossEntropyLoss()

# --- Tokenizers: both are fitted on the training texts only -----------------
tokenizer_uni = BasicWordTokenizer()
tokenizer_uni_char3_5 = CustomWordCharTokenizer()

tokenizer_uni.fit(X_train_agumented)
tokenizer_uni_char3_5.fit(X_train_agumented)

# --- Samples: one train/test set per tokenizer ------------------------------
train_samples_uni = create_samples(X_train_agumented, y_train_encoded, tokenizer_uni)
test_samples_uni = create_samples(X_test_agumented, y_test_encoded, tokenizer_uni)

train_samples_uni_char3_5 = create_samples(X_train_agumented, y_train_encoded, tokenizer_uni_char3_5)
test_samples_uni_char3_5 = create_samples(X_test_agumented, y_test_encoded, tokenizer_uni_char3_5)


# --- Hyperparameters --------------------------------------------------------
EPOCHS = 50
# Taken from the fitted encoder rather than hard-coded, so the output layer
# always matches the label set (8 classes for this encoder).
N_CLASS = len(labelencode.classes_)
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 64
EMBEDDING_DIM = 256

Tokenizer fitted. Vocab size:  11608
Tokenizer fitted. Vocab size:  30002


In [4]:
# Seed torch's global RNG before building the model so the initial weights are
# identical on every "Restart Kernel -> Run All" (assuming the classifier uses
# torch's default initialisers). 43 matches the random_state used for batching
# and training elsewhere in this notebook.
torch.manual_seed(43)

# Classifier over the word-level tokenizer. The first positional argument is
# the tokenizer's length (presumably the vocabulary size -- confirm against
# TicketTextClassifierV01's signature), followed by the embedding width.
model_uni = TicketTextClassifierV01(
    len(tokenizer_uni), EMBEDDING_DIM, pad_id=tokenizer_uni.get_pad_id(), n_classes=N_CLASS
).to(device)

In [5]:
# Evaluation batches for the word-level model: built once from the test
# samples, unshuffled, with the tokenizer's pad id and the selected device.
test_batches_uni = create_batch(
    TEST_BATCH_SIZE,
    in_samples=test_samples_uni,
    pad_id=tokenizer_uni.get_pad_id(),
    device=device,
    shuffle=False,
    random_state=43,
)

In [6]:
# Train the word-level model for at most EPOCHS epochs with early stopping.
# NOTE(review): test_batches_uni is built from the test samples and is passed
# here together with early_stop=True. If early stopping monitors these
# batches, the test split is steering model selection -- consider a separate
# validation split.
history_uni = train_model(
    model=model_uni,
    total_epochs=EPOCHS,
    train_samples=train_samples_uni,
    training_batch_size=TRAIN_BATCH_SIZE,
    pad_id=tokenizer_uni.get_pad_id(),
    device=device,
    test_batches=test_batches_uni,
    criterion=criterion,
    early_stop=True,
    random_state=43,
)

100%|██████████| 1196/1196 [00:05<00:00, 233.16it/s]
  2%|▏         | 1/50 [00:11<09:20, 11.43s/it]


        Epoch 1
        Train Loss: 1.6904 | Train F1: 0.2654
        Val   Loss: 1.5601 | Val   F1: 0.3829
        


100%|██████████| 1196/1196 [00:04<00:00, 239.49it/s]
  4%|▍         | 2/50 [00:22<08:44, 10.93s/it]


        Epoch 2
        Train Loss: 1.4780 | Train F1: 0.4355
        Val   Loss: 1.5049 | Val   F1: 0.4439
        


100%|██████████| 1196/1196 [00:05<00:00, 216.64it/s]
  6%|▌         | 3/50 [00:32<08:30, 10.86s/it]


        Epoch 3
        Train Loss: 1.4293 | Train F1: 0.4831
        Val   Loss: 1.4854 | Val   F1: 0.4688
        


100%|██████████| 1196/1196 [00:05<00:00, 230.96it/s]
  8%|▊         | 4/50 [00:43<08:14, 10.75s/it]


        Epoch 4
        Train Loss: 1.4023 | Train F1: 0.5068
        Val   Loss: 1.4741 | Val   F1: 0.4835
        


100%|██████████| 1196/1196 [00:05<00:00, 238.82it/s]
 10%|█         | 5/50 [00:53<07:59, 10.67s/it]


        Epoch 5
        Train Loss: 1.3837 | Train F1: 0.5198
        Val   Loss: 1.4665 | Val   F1: 0.4888
        


100%|██████████| 1196/1196 [00:04<00:00, 259.37it/s]
 12%|█▏        | 6/50 [01:04<07:42, 10.51s/it]


        Epoch 6
        Train Loss: 1.3699 | Train F1: 0.5293
        Val   Loss: 1.4611 | Val   F1: 0.4955
        


100%|██████████| 1196/1196 [00:04<00:00, 262.26it/s]
 14%|█▍        | 7/50 [01:13<07:19, 10.22s/it]


        Epoch 7
        Train Loss: 1.3594 | Train F1: 0.5366
        Val   Loss: 1.4574 | Val   F1: 0.5019
        


100%|██████████| 1196/1196 [00:04<00:00, 248.78it/s]
 16%|█▌        | 8/50 [01:23<07:01, 10.03s/it]


        Epoch 8
        Train Loss: 1.3487 | Train F1: 0.5422
        Val   Loss: 1.4544 | Val   F1: 0.5047
        


100%|██████████| 1196/1196 [00:04<00:00, 252.93it/s]
 18%|█▊        | 9/50 [01:33<06:50, 10.01s/it]


        Epoch 9
        Train Loss: 1.3392 | Train F1: 0.5439
        Val   Loss: 1.4525 | Val   F1: 0.5082
        


100%|██████████| 1196/1196 [00:04<00:00, 250.06it/s]
 20%|██        | 10/50 [01:43<06:38,  9.97s/it]


        Epoch 10
        Train Loss: 1.3318 | Train F1: 0.5500
        Val   Loss: 1.4508 | Val   F1: 0.5096
        


100%|██████████| 1196/1196 [00:04<00:00, 257.91it/s]
 22%|██▏       | 11/50 [01:52<06:23,  9.84s/it]


        Epoch 11
        Train Loss: 1.3246 | Train F1: 0.5533
        Val   Loss: 1.4498 | Val   F1: 0.5116
        


100%|██████████| 1196/1196 [00:04<00:00, 258.48it/s]
 24%|██▍       | 12/50 [02:02<06:09,  9.71s/it]


        Epoch 12
        Train Loss: 1.3163 | Train F1: 0.5551
        Val   Loss: 1.4491 | Val   F1: 0.5124
        


100%|██████████| 1196/1196 [00:04<00:00, 254.04it/s]
 26%|██▌       | 13/50 [02:11<05:59,  9.73s/it]


        Epoch 13
        Train Loss: 1.3107 | Train F1: 0.5588
        Val   Loss: 1.4484 | Val   F1: 0.5129
        


100%|██████████| 1196/1196 [00:04<00:00, 246.92it/s]
 28%|██▊       | 14/50 [02:21<05:50,  9.75s/it]


        Epoch 14
        Train Loss: 1.3033 | Train F1: 0.5611
        Val   Loss: 1.4482 | Val   F1: 0.5142
        


100%|██████████| 1196/1196 [00:05<00:00, 235.60it/s]
 30%|███       | 15/50 [02:33<06:01, 10.32s/it]


        Epoch 15
        Train Loss: 1.2997 | Train F1: 0.5628
        Val   Loss: 1.4481 | Val   F1: 0.5161
        


100%|██████████| 1196/1196 [00:04<00:00, 265.56it/s]
 32%|███▏      | 16/50 [02:42<05:42, 10.06s/it]


        Epoch 16
        Train Loss: 1.2952 | Train F1: 0.5648
        Val   Loss: 1.4481 | Val   F1: 0.5181
        


100%|██████████| 1196/1196 [00:04<00:00, 267.79it/s]
 34%|███▍      | 17/50 [02:52<05:27,  9.92s/it]


        Epoch 17
        Train Loss: 1.2882 | Train F1: 0.5669
        Val   Loss: 1.4490 | Val   F1: 0.5189
        


100%|██████████| 1196/1196 [00:04<00:00, 259.45it/s]
 36%|███▌      | 18/50 [03:01<05:13,  9.81s/it]


        Epoch 18
        Train Loss: 1.2836 | Train F1: 0.5692
        Val   Loss: 1.4490 | Val   F1: 0.5197
        


100%|██████████| 1196/1196 [00:04<00:00, 250.54it/s]
 38%|███▊      | 19/50 [03:11<05:02,  9.76s/it]


        Epoch 19
        Train Loss: 1.2784 | Train F1: 0.5707
        Val   Loss: 1.4498 | Val   F1: 0.5213
        


100%|██████████| 1196/1196 [00:04<00:00, 255.84it/s]
 40%|████      | 20/50 [03:20<04:48,  9.62s/it]


        Epoch 20
        Train Loss: 1.2736 | Train F1: 0.5724
        Val   Loss: 1.4503 | Val   F1: 0.5210
        


100%|██████████| 1196/1196 [00:04<00:00, 256.15it/s]
 42%|████▏     | 21/50 [03:30<04:40,  9.67s/it]


        Epoch 21
        Train Loss: 1.2684 | Train F1: 0.5732
        Val   Loss: 1.4513 | Val   F1: 0.5210
        


100%|██████████| 1196/1196 [00:04<00:00, 253.54it/s]
 42%|████▏     | 21/50 [03:40<05:04, 10.50s/it]


        Epoch 22
        Train Loss: 1.2646 | Train F1: 0.5726
        Val   Loss: 1.4521 | Val   F1: 0.5207
        
Early stopping at epoch: 22





In [19]:
# Run the trained word-level model over the test batches.
loss_uni, y_pred_uni, y_true_uni = neural_eval(
    model=model_uni,
    batches=test_batches_uni,
    criterion=criterion,
)

# Hand the evaluation result to format_cm, labelled with the encoder's
# original class names; normalize=True gives per-true-class fractions.
eval_result_uni = evaluate(y_true_uni, y_pred_uni)
format_cm(eval_result_uni, class_names=list(labelencode.classes_), normalize=True)

              precision    recall  f1-score   support

           0     0.5553    0.5284    0.5415      1425
           1     0.5252    0.3551    0.4237       352
           2     0.5581    0.5525    0.5552      2183
           3     0.5256    0.6358    0.5755      2724
           4     0.5651    0.4198    0.4817       424
           5     0.5004    0.4483    0.4729      1412
           6     0.6159    0.5659    0.5899       493
           7     0.5216    0.4559    0.4865       555

    accuracy                         0.5392      9568
   macro avg     0.5459    0.4952    0.5159      9568
weighted avg     0.5399    0.5392    0.5365      9568



Unnamed: 0,Pred: Access,Pred: Administrative rights,Pred: HR Support,Pred: Hardware,Pred: Internal Project,Pred: Miscellaneous,Pred: Purchase,Pred: Storage
True: Access,0.528421,0.011228,0.118596,0.210526,0.01614,0.076491,0.017544,0.021053
True: Administrative rights,0.065341,0.355114,0.113636,0.332386,0.005682,0.068182,0.028409,0.03125
True: HR Support,0.076042,0.005955,0.552451,0.215758,0.01191,0.083372,0.018781,0.035731
True: Hardware,0.080029,0.013216,0.132893,0.63583,0.015051,0.082599,0.017621,0.022761
True: Internal Project,0.075472,0.014151,0.158019,0.212264,0.419811,0.087264,0.011792,0.021226
True: Miscellaneous,0.070113,0.014873,0.143768,0.265581,0.015581,0.4483,0.021955,0.01983
True: Purchase,0.046653,0.020284,0.087221,0.188641,0.016227,0.046653,0.565923,0.028398
True: Storage,0.075676,0.01982,0.127928,0.210811,0.027027,0.057658,0.025225,0.455856


In [4]:
# Hyperparameters for the CustomWordCharTokenizer run. Only TRAIN_BATCH_SIZE
# changes relative to the earlier configuration (16 instead of 32); the other
# values are restated so this section can be read on its own.
EPOCHS = 50
# Taken from the fitted label encoder rather than hard-coded, so the output
# layer always matches the label set (8 classes for this encoder).
N_CLASS = len(labelencode.classes_)
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 64
EMBEDDING_DIM = 256

In [5]:
# Seed torch's global RNG before building the model so the initial weights are
# identical on every "Restart Kernel -> Run All" (assuming the classifier uses
# torch's default initialisers). 43 matches the random_state used for batching
# and training elsewhere in this notebook.
torch.manual_seed(43)

# Classifier over the CustomWordCharTokenizer vocabulary. The first positional
# argument is the tokenizer's length (presumably the vocabulary size --
# confirm against TicketTextClassifierV01's signature).
model_uni_char3_5 = TicketTextClassifierV01(
    len(tokenizer_uni_char3_5), EMBEDDING_DIM, pad_id=tokenizer_uni_char3_5.get_pad_id(), n_classes=N_CLASS
).to(device)

In [6]:
# Evaluation batches for the CustomWordCharTokenizer model, unshuffled.
# Unlike the word-level run these are created with device="cpu"; presumably
# the *_cuda train/eval helpers move each batch to the GPU themselves --
# TODO confirm.
test_batches_uni_char3_5 = create_batch(
    TEST_BATCH_SIZE,
    in_samples=test_samples_uni_char3_5,
    pad_id=tokenizer_uni_char3_5.get_pad_id(),
    device="cpu",
    shuffle=False,
    random_state=43,
)

In [7]:
# Train the CustomWordCharTokenizer model with the CUDA training helper, for
# at most EPOCHS epochs with early stopping.
# NOTE(review): test_batches_uni_char3_5 is built from the test samples and is
# passed here together with early_stop=True. If early stopping monitors these
# batches, the test split is steering model selection -- consider a separate
# validation split.
history_uni_char3_5 = train_model_cuda(
    model=model_uni_char3_5,
    total_epochs=EPOCHS,
    train_samples=train_samples_uni_char3_5,
    training_batch_size=TRAIN_BATCH_SIZE,
    pad_id=tokenizer_uni_char3_5.get_pad_id(),
    test_batches=test_batches_uni_char3_5,
    criterion=criterion,
    early_stop=True,
    random_state=43,
)

100%|██████████| 2392/2392 [00:39<00:00, 60.16it/s]
  2%|▏         | 1/50 [01:06<54:21, 66.56s/it]


        Epoch 1
        Train Loss: 1.6674 | Train F1: 0.2660
        Val   Loss: 1.5356 | Val   F1: 0.3864
        


100%|██████████| 2392/2392 [00:30<00:00, 78.91it/s]
  4%|▍         | 2/50 [02:03<48:52, 61.09s/it]


        Epoch 2
        Train Loss: 1.4501 | Train F1: 0.4520
        Val   Loss: 1.4823 | Val   F1: 0.4510
        


100%|██████████| 2392/2392 [00:31<00:00, 75.24it/s]
  6%|▌         | 3/50 [03:01<46:41, 59.61s/it]


        Epoch 3
        Train Loss: 1.4037 | Train F1: 0.5000
        Val   Loss: 1.4643 | Val   F1: 0.4801
        


100%|██████████| 2392/2392 [00:32<00:00, 73.92it/s]
  8%|▊         | 4/50 [04:02<45:55, 59.91s/it]


        Epoch 4
        Train Loss: 1.3804 | Train F1: 0.5206
        Val   Loss: 1.4545 | Val   F1: 0.4910
        


100%|██████████| 2392/2392 [00:31<00:00, 76.76it/s]
 10%|█         | 5/50 [05:00<44:34, 59.43s/it]


        Epoch 5
        Train Loss: 1.3631 | Train F1: 0.5335
        Val   Loss: 1.4482 | Val   F1: 0.5015
        


100%|██████████| 2392/2392 [00:32<00:00, 74.74it/s]
 12%|█▏        | 6/50 [05:59<43:32, 59.38s/it]


        Epoch 6
        Train Loss: 1.3509 | Train F1: 0.5400
        Val   Loss: 1.4450 | Val   F1: 0.5069
        


100%|██████████| 2392/2392 [00:31<00:00, 76.33it/s]
 14%|█▍        | 7/50 [07:00<42:45, 59.66s/it]


        Epoch 7
        Train Loss: 1.3409 | Train F1: 0.5463
        Val   Loss: 1.4419 | Val   F1: 0.5102
        


100%|██████████| 2392/2392 [00:32<00:00, 72.55it/s]
 16%|█▌        | 8/50 [07:58<41:34, 59.40s/it]


        Epoch 8
        Train Loss: 1.3321 | Train F1: 0.5510
        Val   Loss: 1.4401 | Val   F1: 0.5113
        


100%|██████████| 2392/2392 [00:31<00:00, 76.60it/s]
 18%|█▊        | 9/50 [08:55<40:00, 58.56s/it]


        Epoch 9
        Train Loss: 1.3246 | Train F1: 0.5550
        Val   Loss: 1.4395 | Val   F1: 0.5132
        


100%|██████████| 2392/2392 [00:30<00:00, 77.45it/s]
 20%|██        | 10/50 [09:54<39:02, 58.56s/it]


        Epoch 10
        Train Loss: 1.3172 | Train F1: 0.5588
        Val   Loss: 1.4390 | Val   F1: 0.5147
        


100%|██████████| 2392/2392 [00:30<00:00, 77.73it/s]
 22%|██▏       | 11/50 [10:51<37:44, 58.08s/it]


        Epoch 11
        Train Loss: 1.3100 | Train F1: 0.5605
        Val   Loss: 1.4388 | Val   F1: 0.5155
        


100%|██████████| 2392/2392 [00:31<00:00, 76.22it/s]
 24%|██▍       | 12/50 [11:48<36:33, 57.71s/it]


        Epoch 12
        Train Loss: 1.3033 | Train F1: 0.5625
        Val   Loss: 1.4387 | Val   F1: 0.5157
        


100%|██████████| 2392/2392 [00:31<00:00, 75.83it/s]
 26%|██▌       | 13/50 [12:44<35:15, 57.18s/it]


        Epoch 13
        Train Loss: 1.2982 | Train F1: 0.5645
        Val   Loss: 1.4395 | Val   F1: 0.5166
        


100%|██████████| 2392/2392 [00:30<00:00, 77.54it/s]
 28%|██▊       | 14/50 [13:38<33:45, 56.27s/it]


        Epoch 14
        Train Loss: 1.2924 | Train F1: 0.5661
        Val   Loss: 1.4399 | Val   F1: 0.5183
        


100%|██████████| 2392/2392 [00:30<00:00, 77.24it/s]
 30%|███       | 15/50 [14:34<32:50, 56.29s/it]


        Epoch 15
        Train Loss: 1.2863 | Train F1: 0.5684
        Val   Loss: 1.4406 | Val   F1: 0.5185
        


100%|██████████| 2392/2392 [00:31<00:00, 75.82it/s]
 32%|███▏      | 16/50 [15:32<32:06, 56.67s/it]


        Epoch 16
        Train Loss: 1.2816 | Train F1: 0.5697
        Val   Loss: 1.4419 | Val   F1: 0.5203
        


100%|██████████| 2392/2392 [00:30<00:00, 78.34it/s]
 34%|███▍      | 17/50 [16:29<31:21, 57.00s/it]


        Epoch 17
        Train Loss: 1.2755 | Train F1: 0.5705
        Val   Loss: 1.4434 | Val   F1: 0.5203
        


100%|██████████| 2392/2392 [00:31<00:00, 76.02it/s]
 36%|███▌      | 18/50 [17:27<30:32, 57.25s/it]


        Epoch 18
        Train Loss: 1.2711 | Train F1: 0.5710
        Val   Loss: 1.4446 | Val   F1: 0.5207
        


100%|██████████| 2392/2392 [00:36<00:00, 65.09it/s]
 38%|███▊      | 19/50 [18:33<30:57, 59.93s/it]


        Epoch 19
        Train Loss: 1.2665 | Train F1: 0.5727
        Val   Loss: 1.4458 | Val   F1: 0.5208
        


100%|██████████| 2392/2392 [00:37<00:00, 63.26it/s]
 40%|████      | 20/50 [19:38<30:43, 61.46s/it]


        Epoch 20
        Train Loss: 1.2609 | Train F1: 0.5735
        Val   Loss: 1.4476 | Val   F1: 0.5209
        


100%|██████████| 2392/2392 [00:36<00:00, 64.76it/s]
 42%|████▏     | 21/50 [20:45<30:27, 63.01s/it]


        Epoch 21
        Train Loss: 1.2568 | Train F1: 0.5756
        Val   Loss: 1.4498 | Val   F1: 0.5203
        


100%|██████████| 2392/2392 [00:35<00:00, 66.89it/s]
 44%|████▍     | 22/50 [21:47<29:13, 62.64s/it]


        Epoch 22
        Train Loss: 1.2519 | Train F1: 0.5760
        Val   Loss: 1.4512 | Val   F1: 0.5202
        


100%|██████████| 2392/2392 [00:36<00:00, 65.84it/s]
 44%|████▍     | 22/50 [22:51<29:05, 62.32s/it]


        Epoch 23
        Train Loss: 1.2476 | Train F1: 0.5775
        Val   Loss: 1.4530 | Val   F1: 0.5197
        
Early stopping at epoch: 23





In [8]:
# Run the trained CustomWordCharTokenizer model over the test batches with the
# CUDA evaluation helper.
loss_uni_char3_5, y_pred_uni_char3_5, y_true_uni_char3_5 = neural_eval_cuda(
    model=model_uni_char3_5,
    batches=test_batches_uni_char3_5,
    criterion=criterion,
)

# Hand the evaluation result to format_cm, labelled with the encoder's
# original class names; normalize=True gives per-true-class fractions.
eval_result_uni_char3_5 = evaluate(y_true_uni_char3_5, y_pred_uni_char3_5)
format_cm(eval_result_uni_char3_5, class_names=list(labelencode.classes_), normalize=True)

              precision    recall  f1-score   support

           0     0.5509    0.5354    0.5431      1425
           1     0.4965    0.3977    0.4416       352
           2     0.5534    0.5552    0.5543      2183
           3     0.5434    0.6156    0.5773      2724
           4     0.5353    0.4292    0.4764       424
           5     0.4950    0.4561    0.4748      1412
           6     0.6137    0.5801    0.5965       493
           7     0.5116    0.4775    0.4939       555

    accuracy                         0.5402      9568
   macro avg     0.5375    0.5059    0.5197      9568
weighted avg     0.5394    0.5402    0.5385      9568



Unnamed: 0,Pred: Access,Pred: Administrative rights,Pred: HR Support,Pred: Hardware,Pred: Internal Project,Pred: Miscellaneous,Pred: Purchase,Pred: Storage
True: Access,0.535439,0.016842,0.127719,0.183158,0.016842,0.079298,0.017544,0.023158
True: Administrative rights,0.0625,0.397727,0.119318,0.286932,0.005682,0.073864,0.028409,0.025568
True: HR Support,0.074668,0.008704,0.555199,0.201099,0.014659,0.087952,0.01924,0.038479
True: Hardware,0.084802,0.016153,0.136197,0.615639,0.01909,0.084068,0.01909,0.024963
True: Internal Project,0.087264,0.016509,0.153302,0.183962,0.429245,0.09434,0.014151,0.021226
True: Miscellaneous,0.072946,0.017705,0.141643,0.249292,0.016997,0.456091,0.021955,0.023371
True: Purchase,0.05071,0.024341,0.091278,0.150101,0.018256,0.05071,0.580122,0.034483
True: Storage,0.073874,0.01982,0.131532,0.187387,0.027027,0.057658,0.025225,0.477477
