In [47]:
# import the required libraries
import re
import torch
import pickle
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt_tab')

from importlib import reload as r
import config as cfg
import processing as prep
import Source.utils as utils
import Source.multiplethreading as mt
import Source.model as modeller
import Source.data as creator

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [48]:
acc_labels = 'Output/label_encoder_accounttype_20241108.pkl'
sms_labels = 'Output/label_encoder_smstype20241108.pkl'
smssub_labels = 'Output/label_encoder_smssubtype_2024110819.pkl'
transactiontype_labels = 'Output/label_encoder_transactiontype_2024110915.pkl'
transactionchannel_labels = 'Output/label_encoder_transactionchannel_2024111015.pkl'


In [49]:
import pickle
encoders = []

for url in [acc_labels, sms_labels, smssub_labels, transactionchannel_labels, transactiontype_labels]:
    encoders.append(pickle.load(open(url, 'rb')))

In [50]:
for ec in encoders:
    print(len(ec.classes_))

48
8
77
16
37


In [51]:
len(list(set([x.lower() for x in _classes])))

15

In [52]:
_classes = list(set(encoders[3].classes_))

_classes = [x for x in _classes if x != 'nan']

{str(_class).lower(): i for i, _class in enumerate(_classes)}

{'aeps': 0,
 'unknown': 1,
 'upi': 2,
 'imps': 3,
 'cheque': 4,
 'credit-card': 5,
 'debit-card': 6,
 'other': 7,
 'pos': 8,
 'pos-machine': 9,
 'cash-pickup': 10,
 'online-transfer': 11,
 'net-banking': 12,
 'netbanking': 13,
 'auto-debit': 14}

In [53]:
r(cfg)
RANDOM_STATE = cfg.RANDOM_STATE
PADDING_LENGTH = cfg.PADDING_LENGTH
LEARNING_RATE = cfg.LEARNING_RATE
INPUT_SIZE = cfg.INPUT_SIZE
NUM_EPOCHS = cfg.NUM_EPOCHS
HIDDEN_SIZE = cfg.HIDDEN_SIZE

CURRENT_DATETIME = datetime.now().strftime("%Y%m%d%H")
DATA_PATH = "Input/000.parquet"
GLOVE_VECTOR_PATH = cfg.GLOVE_VECTOR_PATH
DATA_S3_PATH = cfg.DATA_S3_PATH
PRIMARY_LABEL = cfg.PRIMARY_LABEL
SECONDARY_LABEL1 = cfg.SECONDARY_LABEL1
TERTIARY_LABEL1 = cfg.TERTIARY_LABEL1
TERTIARY_LABEL2 = cfg.TERTIARY_LABEL2
TERTIARY_LABEL3 = cfg.TERTIARY_LABEL3

TEXT_COLUMN = cfg.TEXT_COLUMN

label = PRIMARY_LABEL
SUFFIX = f'_{label}_{CURRENT_DATETIME}'

EMBEDDINGS_PATH = cfg.EMBEDDINGS_PATH.format(SUFFIX)
EMBEDDINGS_V2_PATH = cfg.EMBEDDINGS_V2_PATH.format(SUFFIX)
VOCABULARY_PATH = cfg.VOCABULARY_PATH.format(SUFFIX)
TOKENS_PATH = cfg.TOKENS_PATH.format(SUFFIX)
LABELS_PATH = cfg.LABELS_PATH.format(SUFFIX)
RNN_MODEL_PATH = cfg.RNN_MODEL_PATH.format(SUFFIX)
LSTM_MODEL_PATH = cfg.LSTM_MODEL_PATH.format(SUFFIX)
LABEL_ENCODER_PATH = cfg.LABEL_ENCODER_PATH.format(SUFFIX)

## Process glove embeddings
---

In [54]:
# open the glove embeddings file and read
with open(GLOVE_VECTOR_PATH, "rt") as f:
    emb = f.readlines()

len(emb)

400000

In [56]:
# emb

In [264]:
emb[0]

'the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n'

In [265]:
vocabulary, embeddings = {}, {}

for i, item in tqdm(enumerate(emb)):
    _word, _embedding = item.split()[0], item.split()[1:]
    vocabulary[_word] = i
    embeddings[i] = np.array(_embedding, dtype=np.float32)

vocabulary["<unk>"] = max(vocabulary.values()) + 1
embeddings[vocabulary["<unk>"]] = np.mean(np.array(list(embeddings.values()), dtype=np.float32), axis=0)

vocabulary["<pad>"] = vocabulary["<unk>"] + 1
embeddings[vocabulary["<pad>"]] = np.ones(50, dtype=np.float32)

400000it [00:05, 73619.40it/s]


In [266]:
# embeddings = np.array(embeddings, dtype=np.float32)
# embeddings.shape

In [267]:
# vocabulary = ["<pad>", "<unk>"] + vocabulary

In [268]:
# embeddings = np.vstack([np.ones(50, dtype=np.float32), np.mean(embeddings, axis=0), embeddings])
# print(len(vocabulary), embeddings.shape)

In [269]:
# EMBEDDINGS["<pad>"] = np.ones(50, dtype=np.float32)
# EMBEDDINGS["<unk>"] = np.mean(list(EMBEDDINGS.values()), axis=0)

In [270]:
utils.save_file(EMBEDDINGS_PATH, embeddings)
utils.save_file(VOCABULARY_PATH, vocabulary)
# utils.save_file(EMBEDDINGS_V2_PATH, EMBEDDINGS)

In [271]:
SAMPLE_SIZE = 500000

chunks = pd.read_csv(DATA_S3_PATH, chunksize=SAMPLE_SIZE, low_memory=False)
type(chunks)

pandas.io.parsers.readers.TextFileReader

In [272]:
all_primary_labels = set()
sample_data_list = []

for chunk in tqdm(chunks):
    all_primary_labels = all_primary_labels.union(set(chunk[label].unique().tolist()))
    sample_data_list.append(chunk.sample(frac=0.2))

sample_data = pd.concat(sample_data_list)
len(all_primary_labels), sample_data.shape

4it [00:57, 14.30s/it]


(16, (325646, 28))

In [273]:
sample_data.dropna(subset=[TEXT_COLUMN], inplace=True)

In [274]:
sample_data.shape

(325646, 28)

In [275]:
sample_data[label].nunique()

15

In [276]:
label_encoder = LabelEncoder()
label_encoder.fit(list(all_primary_labels))
labels = label_encoder.transform(sample_data[label])

In [277]:
labels[0]

np.int64(11)

In [278]:
label_encoder.classes_

array(['aeps', 'auto-debit', 'cash-pickup', 'cheque', 'credit-card',
       'debit-card', 'imps', 'nan', 'net-banking', 'netbanking',
       'online-transfer', 'other', 'pos', 'pos-machine', 'unknown', 'upi'],
      dtype='<U32')

In [279]:
utils.save_file(LABELS_PATH, labels)
utils.save_file(LABEL_ENCODER_PATH, label_encoder)

In [280]:
input_text = sample_data[TEXT_COLUMN]
input_text = [i.lower() for i in tqdm(input_text)]
input_text = [re.sub(r"[^\w\d'\s]+", " ", i) for i in tqdm(input_text)]
input_text = [re.sub("\d+", "", i) for i in tqdm(input_text)]
input_text = [re.sub(r'[x]{2,}', "", i) for i in tqdm(input_text)]
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

100%|██████████| 325646/325646 [00:00<00:00, 1976774.57it/s]
100%|██████████| 325646/325646 [00:01<00:00, 174293.41it/s]
100%|██████████| 325646/325646 [00:01<00:00, 302666.38it/s]
100%|██████████| 325646/325646 [00:00<00:00, 648592.36it/s]
100%|██████████| 325646/325646 [00:01<00:00, 236353.18it/s]


In [281]:
tokens = [word_tokenize(t) for t in tqdm(input_text)]

100%|██████████| 325646/325646 [00:26<00:00, 12394.84it/s]


In [282]:
tokens = [i[:PADDING_LENGTH] if len(i) > PADDING_LENGTH - 1 else ['<pad>'] * (PADDING_LENGTH - len(i)) + i for i in tqdm(tokens)]

100%|██████████| 325646/325646 [00:03<00:00, 85158.20it/s] 


In [283]:
# def token_index(tokens, vocabulary, missing=0):
#     """
#     :param tokens: List of word tokens
#     :param vocabulary: All words in the embeddings
#     :param missing: Token for words not present in the vocabulary
#     :return: List of integers representing the word tokens
#     """
#     idx_token = []
#     for text in tqdm(tokens):
#         idx_text = []
#         for token in text:
#             try:
#                 idx_text.append(vocabulary.index(token))
#             except:
#                 idx_text.append(missing)
#         idx_token.append(idx_text)
#     return idx_token

In [284]:
# missing_index = vocabulary.index('<unk>')
# tokens = token_index(tokens, vocabulary, missing_index)

In [285]:
r(prep)
tokens = prep.token_index(tokens, vocabulary)

100%|██████████| 325646/325646 [00:05<00:00, 58418.17it/s] 


In [286]:
len(tokens)

325646

In [287]:
len(tokens[0])

50

In [288]:
# vocabulary[tokens[0][0]]

In [289]:
r(utils)
utils.save_file(TOKENS_PATH, tokens)

In [290]:
r(utils)
tokens = utils.load_file(TOKENS_PATH)
labels = utils.load_file(LABELS_PATH)
embeddings = utils.load_file(EMBEDDINGS_PATH)
label_encoder = utils.load_file(LABEL_ENCODER_PATH)
num_classes = len(label_encoder.classes_)

In [291]:
X_train, X_test, y_train, y_test = train_test_split(tokens, labels, test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25)

In [292]:
r(creator)
train_dataset = creator.TextDataset(X_train, embeddings, y_train)
valid_dataset = creator.TextDataset(X_valid, embeddings, y_valid)
test_dataset = creator.TextDataset(X_test, embeddings, y_test)

In [293]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [294]:
# model = modeller.RNNNetwork(INPUT_SIZE, HIDDEN_SIZE, num_classes)
# if torch.cuda.is_available():
#     model = model.cuda()

# criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# modeller.train(
#     train_loader=train_loader,
#     valid_loader=valid_loader,
#     model=model,
#     criterion=criterion,
#     optimizer=optimizer,
#     device=device,
#     num_epochs=NUM_EPOCHS,
#     model_path=RNN_MODEL_PATH
# )

In [113]:
r(modeller)
model = modeller.LSTMNetwork(INPUT_SIZE, HIDDEN_SIZE, num_classes)
if torch.cuda.is_available():
    model = model.cuda()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  0%|          | 0/18 [4:31:43<?, ?it/s]


In [117]:
type(criterion)

torch.nn.modules.loss.CrossEntropyLoss

In [296]:
r(modeller)
modeller.train(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device, 
    num_epochs=NUM_EPOCHS,
    model_path=LSTM_MODEL_PATH
)

Epoch 1 of 50


100%|██████████| 12211/12211 [01:42<00:00, 119.71it/s]
100%|██████████| 4071/4071 [00:14<00:00, 274.02it/s]


Train Loss: 0.5335872929630814, Validation Loss: 0.2760382591680633
Best Validation Loss: 0.2760382591680633
Epoch 2 of 50


100%|██████████| 12211/12211 [01:40<00:00, 122.10it/s]
100%|██████████| 4071/4071 [00:14<00:00, 271.72it/s]


Train Loss: 0.21238894085763993, Validation Loss: 0.17143234221328318
Best Validation Loss: 0.17143234221328318
Epoch 3 of 50


100%|██████████| 12211/12211 [01:45<00:00, 115.48it/s]
100%|██████████| 4071/4071 [00:13<00:00, 292.98it/s]


Train Loss: 0.14143542513125765, Validation Loss: 0.1469635749123652
Best Validation Loss: 0.1469635749123652
Epoch 4 of 50


100%|██████████| 12211/12211 [01:36<00:00, 125.89it/s]
100%|██████████| 4071/4071 [00:13<00:00, 292.88it/s]


Train Loss: 0.10795529203356104, Validation Loss: 0.09891105922500874
Best Validation Loss: 0.09891105922500874
Epoch 5 of 50


100%|██████████| 12211/12211 [01:37<00:00, 125.84it/s]
100%|██████████| 4071/4071 [00:14<00:00, 282.48it/s]


Train Loss: 0.08983607938595987, Validation Loss: 0.08390426088976224
Best Validation Loss: 0.08390426088976224
Epoch 6 of 50


100%|██████████| 12211/12211 [01:36<00:00, 126.41it/s]
100%|██████████| 4071/4071 [00:14<00:00, 289.71it/s]


Train Loss: 0.07746460141609351, Validation Loss: 0.0778018577930041
Best Validation Loss: 0.0778018577930041
Epoch 7 of 50


100%|██████████| 12211/12211 [01:34<00:00, 129.11it/s]
100%|██████████| 4071/4071 [00:13<00:00, 297.81it/s]


Train Loss: 0.06750505242992898, Validation Loss: 0.0766873842112304
Best Validation Loss: 0.0766873842112304
Epoch 8 of 50


100%|██████████| 12211/12211 [01:37<00:00, 125.53it/s]
100%|██████████| 4071/4071 [00:13<00:00, 296.42it/s]


Train Loss: 0.061071080403831385, Validation Loss: 0.06471120108023526
Best Validation Loss: 0.06471120108023526
Epoch 9 of 50


100%|██████████| 12211/12211 [01:34<00:00, 129.10it/s]
100%|██████████| 4071/4071 [00:13<00:00, 302.39it/s]


Train Loss: 0.05523650924295041, Validation Loss: 0.05645346983857622
Best Validation Loss: 0.05645346983857622
Epoch 10 of 50


100%|██████████| 12211/12211 [01:33<00:00, 129.93it/s]
100%|██████████| 4071/4071 [00:15<00:00, 258.51it/s]


Train Loss: 0.050517499935472894, Validation Loss: 0.05314093404873891
Best Validation Loss: 0.05314093404873891
Epoch 11 of 50


100%|██████████| 12211/12211 [01:43<00:00, 118.02it/s]
100%|██████████| 4071/4071 [00:13<00:00, 308.04it/s]


Train Loss: 0.04691877668019233, Validation Loss: 0.05022121923714277
Best Validation Loss: 0.05022121923714277
Epoch 12 of 50


100%|██████████| 12211/12211 [01:33<00:00, 131.13it/s]
100%|██████████| 4071/4071 [00:14<00:00, 285.80it/s]


Train Loss: 0.043680064698554476, Validation Loss: 0.0459023918668411
Best Validation Loss: 0.0459023918668411
Epoch 13 of 50


100%|██████████| 12211/12211 [01:33<00:00, 131.30it/s]
100%|██████████| 4071/4071 [00:14<00:00, 278.61it/s]


Train Loss: 0.040517188124642375, Validation Loss: 0.04576312401367706
Best Validation Loss: 0.04576312401367706
Epoch 14 of 50


100%|██████████| 12211/12211 [01:39<00:00, 123.16it/s]
100%|██████████| 4071/4071 [00:12<00:00, 323.83it/s]


Train Loss: 0.037944737842617426, Validation Loss: 0.04352183268712392
Best Validation Loss: 0.04352183268712392
Epoch 15 of 50


100%|██████████| 12211/12211 [01:27<00:00, 139.98it/s]
100%|██████████| 4071/4071 [00:13<00:00, 311.56it/s]


Train Loss: 0.03571404031130348, Validation Loss: 0.04048473517567679
Best Validation Loss: 0.04048473517567679
Epoch 16 of 50


100%|██████████| 12211/12211 [01:24<00:00, 144.68it/s]
100%|██████████| 4071/4071 [00:13<00:00, 305.55it/s]


Train Loss: 0.034142722734742253, Validation Loss: 0.043012255518346615
Best Validation Loss: 0.04048473517567679
Epoch 17 of 50


100%|██████████| 12211/12211 [01:29<00:00, 136.70it/s]
100%|██████████| 4071/4071 [00:12<00:00, 337.39it/s]


Train Loss: 0.032353984793658155, Validation Loss: 0.039101863567137955
Best Validation Loss: 0.039101863567137955
Epoch 18 of 50


100%|██████████| 12211/12211 [01:24<00:00, 144.55it/s]
100%|██████████| 4071/4071 [00:12<00:00, 332.72it/s]


Train Loss: 0.030685686616132322, Validation Loss: 0.03725795186964271
Best Validation Loss: 0.03725795186964271
Epoch 19 of 50


100%|██████████| 12211/12211 [01:22<00:00, 147.92it/s]
100%|██████████| 4071/4071 [00:12<00:00, 330.90it/s]


Train Loss: 0.029434109797984904, Validation Loss: 0.03739811693436233
Best Validation Loss: 0.03725795186964271
Epoch 20 of 50


100%|██████████| 12211/12211 [01:27<00:00, 140.32it/s]
100%|██████████| 4071/4071 [00:13<00:00, 311.93it/s]


Train Loss: 0.028247022056045297, Validation Loss: 0.03795774249661284
Best Validation Loss: 0.03725795186964271
Epoch 21 of 50


100%|██████████| 12211/12211 [01:25<00:00, 142.48it/s]
100%|██████████| 4071/4071 [00:12<00:00, 333.11it/s]


Train Loss: 0.026870114374206912, Validation Loss: 0.03748325455469378
Best Validation Loss: 0.03725795186964271
Epoch 22 of 50


100%|██████████| 12211/12211 [01:23<00:00, 145.69it/s]
100%|██████████| 4071/4071 [00:12<00:00, 327.57it/s]


Train Loss: 0.026225268351504138, Validation Loss: 0.03499671280318807
Best Validation Loss: 0.03499671280318807
Epoch 23 of 50


100%|██████████| 12211/12211 [01:30<00:00, 135.09it/s]
100%|██████████| 4071/4071 [00:11<00:00, 340.65it/s]


Train Loss: 0.02457222778933488, Validation Loss: 0.03328360697138424
Best Validation Loss: 0.03328360697138424
Epoch 24 of 50


100%|██████████| 12211/12211 [01:22<00:00, 148.31it/s]
100%|██████████| 4071/4071 [00:13<00:00, 295.47it/s]


Train Loss: 0.022885863858402255, Validation Loss: 0.03427615483586725
Best Validation Loss: 0.03328360697138424
Epoch 25 of 50


100%|██████████| 12211/12211 [01:22<00:00, 147.37it/s]
100%|██████████| 4071/4071 [00:13<00:00, 307.12it/s]


Train Loss: 0.02227537760211944, Validation Loss: 0.03877328186069244
Best Validation Loss: 0.03328360697138424
Epoch 26 of 50


100%|██████████| 12211/12211 [01:30<00:00, 135.66it/s]
100%|██████████| 4071/4071 [00:12<00:00, 330.38it/s]


Train Loss: 0.021742840844179285, Validation Loss: 0.03837892903152301
Best Validation Loss: 0.03328360697138424
Epoch 27 of 50


100%|██████████| 12211/12211 [01:27<00:00, 139.35it/s]
100%|██████████| 4071/4071 [00:12<00:00, 325.46it/s]


Train Loss: 0.02058540249453419, Validation Loss: 0.034360068910339284
Best Validation Loss: 0.03328360697138424
Epoch 28 of 50


100%|██████████| 12211/12211 [01:23<00:00, 146.37it/s]
100%|██████████| 4071/4071 [00:12<00:00, 313.24it/s]


Train Loss: 0.020021201182460324, Validation Loss: 0.043431834542606795
Best Validation Loss: 0.03328360697138424
Epoch 29 of 50


100%|██████████| 12211/12211 [01:22<00:00, 147.14it/s]
100%|██████████| 4071/4071 [00:11<00:00, 341.35it/s]


Train Loss: 0.018639735906788128, Validation Loss: 0.0354288227310625
Best Validation Loss: 0.03328360697138424
Epoch 30 of 50


100%|██████████| 12211/12211 [01:25<00:00, 142.66it/s]
100%|██████████| 4071/4071 [00:12<00:00, 315.88it/s]


Train Loss: 0.01800189258038065, Validation Loss: 0.0340469064987475
Best Validation Loss: 0.03328360697138424
Epoch 31 of 50


100%|██████████| 12211/12211 [01:21<00:00, 150.04it/s]
100%|██████████| 4071/4071 [00:12<00:00, 337.90it/s]


Train Loss: 0.01722376467244611, Validation Loss: 0.034535516624026256
Best Validation Loss: 0.03328360697138424
Epoch 32 of 50


100%|██████████| 12211/12211 [01:23<00:00, 145.74it/s]
100%|██████████| 4071/4071 [00:12<00:00, 321.67it/s]


Train Loss: 0.01696233781115374, Validation Loss: 0.03020243835408858
Best Validation Loss: 0.03020243835408858
Epoch 33 of 50


100%|██████████| 12211/12211 [01:27<00:00, 139.41it/s]
100%|██████████| 4071/4071 [00:11<00:00, 341.63it/s]


Train Loss: 0.015715328926343242, Validation Loss: 0.03015086593042648
Best Validation Loss: 0.03015086593042648
Epoch 34 of 50


100%|██████████| 12211/12211 [02:15<00:00, 90.31it/s] 
100%|██████████| 4071/4071 [00:17<00:00, 226.43it/s]


Train Loss: 0.01592235774849997, Validation Loss: 0.03364165863443967
Best Validation Loss: 0.03015086593042648
Epoch 35 of 50


100%|██████████| 12211/12211 [02:16<00:00, 89.60it/s] 
100%|██████████| 4071/4071 [00:17<00:00, 230.82it/s]


Train Loss: 0.014969081042684169, Validation Loss: 0.029824369601622228
Best Validation Loss: 0.029824369601622228
Epoch 36 of 50


100%|██████████| 12211/12211 [01:50<00:00, 110.34it/s]
100%|██████████| 4071/4071 [00:17<00:00, 233.07it/s]


Train Loss: 0.014846290658338517, Validation Loss: 0.029389289093409512
Best Validation Loss: 0.029389289093409512
Epoch 37 of 50


100%|██████████| 12211/12211 [02:02<00:00, 99.30it/s] 
100%|██████████| 4071/4071 [00:17<00:00, 237.05it/s]


Train Loss: 0.014655243623619257, Validation Loss: 0.031540311519740974
Best Validation Loss: 0.029389289093409512
Epoch 38 of 50


100%|██████████| 12211/12211 [01:52<00:00, 108.59it/s]
100%|██████████| 4071/4071 [00:15<00:00, 262.07it/s]


Train Loss: 0.013744908851697548, Validation Loss: 0.028736742617540174
Best Validation Loss: 0.028736742617540174
Epoch 39 of 50


100%|██████████| 12211/12211 [01:53<00:00, 107.74it/s]
100%|██████████| 4071/4071 [00:15<00:00, 260.65it/s]


Train Loss: 0.01356321175558361, Validation Loss: 0.028793729088322242
Best Validation Loss: 0.028736742617540174
Epoch 40 of 50


100%|██████████| 12211/12211 [01:54<00:00, 106.82it/s]
100%|██████████| 4071/4071 [00:16<00:00, 251.02it/s]


Train Loss: 0.012976330550930221, Validation Loss: 0.02909985611421343
Best Validation Loss: 0.028736742617540174
Epoch 41 of 50


100%|██████████| 12211/12211 [01:50<00:00, 110.40it/s]
100%|██████████| 4071/4071 [00:15<00:00, 255.93it/s]


Train Loss: 0.012808166261613012, Validation Loss: 0.028884550833689952
Best Validation Loss: 0.028736742617540174
Epoch 42 of 50


100%|██████████| 12211/12211 [01:48<00:00, 112.33it/s]
100%|██████████| 4071/4071 [00:17<00:00, 239.36it/s]


Train Loss: 0.012419917209202032, Validation Loss: 0.029563046435005236
Best Validation Loss: 0.028736742617540174
Epoch 43 of 50


100%|██████████| 12211/12211 [01:50<00:00, 110.02it/s]
100%|██████████| 4071/4071 [00:15<00:00, 257.82it/s]


Train Loss: 0.012006118945537617, Validation Loss: 0.02893095054837497
Best Validation Loss: 0.028736742617540174
Epoch 44 of 50


100%|██████████| 12211/12211 [01:50<00:00, 110.32it/s]
100%|██████████| 4071/4071 [00:16<00:00, 243.57it/s]


Train Loss: 0.011385954638238858, Validation Loss: 0.029446805069122527
Best Validation Loss: 0.028736742617540174
Epoch 45 of 50


100%|██████████| 12211/12211 [01:51<00:00, 110.00it/s]
100%|██████████| 4071/4071 [00:16<00:00, 248.87it/s]


Train Loss: 0.010993294597863742, Validation Loss: 0.03011222234201439
Best Validation Loss: 0.028736742617540174
Epoch 46 of 50


100%|██████████| 12211/12211 [01:49<00:00, 112.01it/s]
100%|██████████| 4071/4071 [00:15<00:00, 259.11it/s]


Train Loss: 0.010875856920528913, Validation Loss: 0.030993710080763334
Best Validation Loss: 0.028736742617540174
Epoch 47 of 50


100%|██████████| 12211/12211 [01:47<00:00, 114.11it/s]
100%|██████████| 4071/4071 [00:16<00:00, 248.41it/s]


Train Loss: 0.010567817865961763, Validation Loss: 0.02866113228846888
Best Validation Loss: 0.02866113228846888
Epoch 48 of 50


100%|██████████| 12211/12211 [01:52<00:00, 108.86it/s]
100%|██████████| 4071/4071 [00:15<00:00, 258.47it/s]


Train Loss: 0.010303393428187901, Validation Loss: 0.029266787578731274
Best Validation Loss: 0.02866113228846888
Epoch 49 of 50


100%|██████████| 12211/12211 [01:47<00:00, 113.66it/s]
100%|██████████| 4071/4071 [00:16<00:00, 253.71it/s]


Train Loss: 0.01024128270721679, Validation Loss: 0.028731678649099916
Best Validation Loss: 0.02866113228846888
Epoch 50 of 50


100%|██████████| 12211/12211 [01:50<00:00, 110.72it/s]
100%|██████████| 4071/4071 [00:16<00:00, 241.00it/s]

Train Loss: 0.009757814201929884, Validation Loss: 0.030296439191271514
Best Validation Loss: 0.02866113228846888





In [297]:
r(modeller)
modeller.test(
    test_loader=test_loader,
    model=model,
    criterion=criterion,
    device=device
)

100%|██████████| 4071/4071 [00:20<00:00, 202.58it/s]

Test Loss: 0.030199647056001783, Test Accuracy: 0.9934444853844264





In [298]:
sample_data.columns

Index(['subusername', 'created_at', 'smstype', 'accounttype', 'pan', 'pantype',
       'smssubtype', 'transactiontype', 'transactionchannel', 'smsbody',
       'smsinboxdate', 'templatehash', 'hash', 'user_hash', 'sendername',
       'senderaddress', 'servicename', 'servicetype', 'totalamountdue',
       'minamountdue', 'outstandingamount', 'amount', 'balance', 'date',
       'availablelimit', 'pos', 'duedate', 'rn'],
      dtype='object')

In [57]:
TEST_SMS = 'Dear 100134275715, your passbook balance against GJAHD001035900A0013723 is Rs. ERROR:ORA-28002: the password will expire within 6 days8814758/-. Contribution of Rs. 59465/- for due month 082019 has been received.'
TEST_SMS

'Dear 100134275715, your passbook balance against GJAHD001035900A0013723 is Rs. ERROR:ORA-28002: the password will expire within 6 days8814758/-. Contribution of Rs. 59465/- for due month 082019 has been received.'

In [58]:
input_text = re.sub(r"[^\w\d'\s]+", " ", TEST_SMS)
input_text = re.sub("\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)
tokens = word_tokenize(input_text)

In [59]:
tokens

['Dear',
 'your',
 'passbook',
 'balance',
 'against',
 'GJAHDA',
 'is',
 'Rs',
 'ERROR',
 'ORA',
 'the',
 'password',
 'will',
 'expire',
 'within',
 'days',
 'Contribution',
 'of',
 'Rs',
 'for',
 'due',
 'month',
 'has',
 'been',
 'received']

In [60]:
tokens = tokens[:PADDING_LENGTH] if len(tokens) > PADDING_LENGTH - 1 else ['<pad>'] * (PADDING_LENGTH - len(tokens)) + tokens

In [61]:
OUTPUT_LABEL_ENCODER_PATH = 'Output/label_encoder_smstype20241108.pkl'

In [62]:
label_encoder = utils.load_file(OUTPUT_LABEL_ENCODER_PATH)
num_classes = len(label_encoder.classes_)

In [63]:
num_classes

8

In [64]:
OUTPUT_MODEL_PATH = 'Output/model_lstm_smstype20241108.pth'

In [76]:
model = modeller.LSTMNetwork(cfg.INPUT_SIZE, cfg.HIDDEN_SIZE, num_classes)
if torch.cuda.is_available():
    model = model.cuda()

model.load_state_dict(torch.load(OUTPUT_MODEL_PATH))

  model.load_state_dict(torch.load(OUTPUT_MODEL_PATH))


<All keys matched successfully>

In [None]:
ls

Engine.py                 README.MD                 config.py
[34mInput[m[m/                    RNN.ipynb                 predict.py
LSTMClassification.ipynb  [34mSource[m[m/                   processing.py
[34mOutput[m[m/                   [34m__pycache__[m[m/              requirements.txt


In [77]:
OUTPUT_VOCABULARY_PATH = 'Output/vocabulary_smstype20241108.pkl'
OUTPUT_EMBEDDINGS_PATH = 'Output/embeddings_v2_smstype20241108.pkl'

In [78]:
vocabulary = utils.load_file(OUTPUT_VOCABULARY_PATH)
embeddings = utils.load_file(OUTPUT_EMBEDDINGS_PATH)

In [79]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [80]:
tokens

['<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 'Dear',
 'your',
 'passbook',
 'balance',
 'against',
 'GJAHDA',
 'is',
 'Rs',
 'ERROR',
 'ORA',
 'the',
 'password',
 'will',
 'expire',
 'within',
 'days',
 'Contribution',
 'of',
 'Rs',
 'for',
 'due',
 'month',
 'has',
 'been',
 'received']

In [81]:
token_emb = np.array([embeddings.get(token, embeddings.get('<unk>')) for token in tokens], dtype=np.float32)
token_emb

array([[ 1.       ,  1.       ,  1.       , ...,  1.       ,  1.       ,
         1.       ],
       [ 1.       ,  1.       ,  1.       , ...,  1.       ,  1.       ,
         1.       ],
       [ 1.       ,  1.       ,  1.       , ...,  1.       ,  1.       ,
         1.       ],
       ...,
       [ 0.54822  ,  0.038847 ,  0.10127  , ...,  0.26588  , -0.40267  ,
        -0.17111  ],
       [ 0.92884  , -0.72457  ,  0.068095 , ...,  0.047085 , -0.32297  ,
        -0.64192  ],
       [-0.054145 ,  0.7298   ,  0.0016229, ...,  0.64339  , -0.26776  ,
         0.13484  ]], dtype=float32)

In [82]:
inp = torch.from_numpy(token_emb)
inp = inp.to(device)

In [83]:
inp = torch.unsqueeze(inp, 0)
out = torch.squeeze(model(inp))

In [84]:
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")

Predicted  Class: bank-notification
