In [1]:
# import the required libraries
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# define configuration file paths
lr = 0.0001
input_size = 50
num_epochs = 50
hidden_size = 50
label_col = "Product"
tokens_path = "Output/tokens.pkl"
labels_path = "Output/labels.pkl"
data_path = "Input/complaints.csv"
rnn_model_path = "Output/model_rnn.pth"
lstm_model_path = "Output/model_lstm.pth"
vocabulary_path = "Output/vocabulary.pkl"
embeddings_path = "Output/embeddings.pkl"
glove_vector_path = "Input/glove.6B.50d.txt"
text_col_name = "Consumer complaint narrative"
label_encoder_path = "Output/label_encoder.pkl"
product_map = {'Vehicle loan or lease': 'vehicle_loan',
               'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
               'Credit card or prepaid card': 'card',
               'Money transfer, virtual currency, or money service': 'money_transfer',
               'virtual currency': 'money_transfer',
               'Mortgage': 'mortgage',
               'Payday loan, title loan, or personal loan': 'loan',
               'Debt collection': 'debt_collection',
               'Checking or savings account': 'savings_account',
               'Credit card': 'card',
               'Bank account or service': 'savings_account',
               'Credit reporting': 'credit_report',
               'Prepaid card': 'card',
               'Payday loan': 'loan',
               'Other financial service': 'others',
               'Virtual currency': 'money_transfer',
               'Student loan': 'loan',
               'Consumer Loan': 'loan',
               'Money transfers': 'money_transfer'}

In [4]:
# define function for saving a file
def save_file(name, obj):
    """
    Function to save an object as pickle file
    """
    with open(name, 'wb') as f:
        pickle.dump(obj, f)

# define function for loading a file
def load_file(name):
    """
    Function to load a pickle object
    """
    return pickle.load(open(name, "rb"))

## Process glove embeddings
---

In [5]:
# open the glove embeddings file and read
with open(glove_vector_path, "rt") as f:
    emb = f.readlines()

### 400000 unique words are there in the embeddings

In [6]:
# length of embeddings
len(emb)

400000

### Check the first record

In [7]:
# check first record
emb[0]

'the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n'

In [8]:
# split the first record and check for vocabulary
emb[0].split()[0]

'the'

In [9]:
# split the first record and check for embeddings
emb[0].split()[1:]

['0.418',
 '0.24968',
 '-0.41242',
 '0.1217',
 '0.34527',
 '-0.044457',
 '-0.49688',
 '-0.17862',
 '-0.00066023',
 '-0.6566',
 '0.27843',
 '-0.14767',
 '-0.55677',
 '0.14658',
 '-0.0095095',
 '0.011658',
 '0.10204',
 '-0.12792',
 '-0.8443',
 '-0.12181',
 '-0.016801',
 '-0.33279',
 '-0.1552',
 '-0.23131',
 '-0.19181',
 '-1.8823',
 '-0.76746',
 '0.099051',
 '-0.42125',
 '-0.19526',
 '4.0071',
 '-0.18594',
 '-0.52287',
 '-0.31681',
 '0.00059213',
 '0.0074449',
 '0.17778',
 '-0.15897',
 '0.012041',
 '-0.054223',
 '-0.29871',
 '-0.15749',
 '-0.34758',
 '-0.045637',
 '-0.44251',
 '0.18785',
 '0.0027849',
 '-0.18411',
 '-0.11514',
 '-0.78581']

### Separate embeddings and vocabulary

In [10]:
vocabulary, embeddings = [], []

for item in emb:
    vocabulary.append(item.split()[0])
    embeddings.append(item.split()[1:])

### Convert embeddings to numpy float array

In [11]:
embeddings = np.array(embeddings, dtype=np.float32)

In [12]:
embeddings.shape

(400000, 50)

### Add embeddings for padding and unknown items

In [13]:
vocabulary[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [14]:
vocabulary = ["<pad>", "<unk>"] + vocabulary

In [15]:
embeddings = np.vstack([np.ones(50, dtype=np.float32), np.mean(embeddings, axis=0),
                            embeddings])

In [16]:
print(len(vocabulary), embeddings.shape)

400002 (400002, 50)


### Save embeddings and vocabulary

In [17]:
save_file(embeddings_path, embeddings)
save_file(vocabulary_path, vocabulary)

## Process text data
---

### Read the data file

In [18]:
data = pd.read_csv(data_path)

### Drop rows where the text column is empty

In [19]:
data.dropna(subset=[text_col_name], inplace=True)

### Replace duplicate labels

In [20]:
data.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257
7,2019-07-08,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,Hello This complaint is against the three cred...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,109XX,,Consent provided,Web,2019-07-08,Closed with explanation,Yes,,3299394
8,2020-06-10,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,I am a victim of Identity Theft & currently ha...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,MT,,Servicemember,Consent provided,Web,2020-06-10,Closed with explanation,Yes,,3692762
10,2019-07-03,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,Two accounts are still on my credit history af...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,328XX,,Consent provided,Web,2019-07-03,Closed with non-monetary relief,Yes,,3294745
13,2019-03-21,"Credit reporting, credit repair services, or o...",Other personal consumer report,Identity theft protection or other monitoring ...,Received unwanted marketing or advertising,Receiving daily telephone call ( s ) from XXXX...,Company has responded to the consumer and the ...,"NRA Group, LLC",MA,,,Consent provided,Web,2019-03-27,Closed with explanation,Yes,,3186954


In [21]:
data[label_col].value_counts(dropna=False)

Product
Credit reporting, credit repair services, or other personal consumer reports    316465
Debt collection                                                                 157381
Mortgage                                                                         84461
Credit card or prepaid card                                                      60754
Checking or savings account                                                      37389
Credit reporting                                                                 31588
Student loan                                                                     29094
Credit card                                                                      18838
Money transfer, virtual currency, or money service                               18583
Vehicle loan or lease                                                            15034
Bank account or service                                                          14885
Payday loan, title loan, or persona

In [22]:
data.replace({label_col: product_map}, inplace=True)

In [23]:
data[label_col].value_counts(dropna=False)

Product
credit_report      348053
debt_collection    157381
mortgage            84461
card                81042
savings_account     52274
loan                50710
money_transfer      20096
vehicle_loan        15034
others                292
Name: count, dtype: int64

### Encode the label column and save the encoder and encoded labels

In [24]:
label_encoder = LabelEncoder()
label_encoder.fit(data[label_col])
labels = label_encoder.transform(data[label_col])

In [25]:
labels[0]

np.int64(8)

In [26]:
label_encoder.classes_

array(['card', 'credit_report', 'debt_collection', 'loan',
       'money_transfer', 'mortgage', 'others', 'savings_account',
       'vehicle_loan'], dtype=object)

In [27]:
data[label_col]

1             vehicle_loan
7            credit_report
8            credit_report
10           credit_report
13           credit_report
                ...       
2326240               card
2326241    debt_collection
2326242           mortgage
2326243      credit_report
2326244      credit_report
Name: Product, Length: 809343, dtype: object

In [28]:
save_file(labels_path, labels)
save_file(label_encoder_path, label_encoder)

### Process the text column

In [29]:
input_text = data[text_col_name]

### Convert text to lower case

In [30]:
input_text = [i.lower() for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:01<00:00, 630184.03it/s]


### Remove punctuations except apostrophe

In [31]:
input_text = [re.sub(r"[^\w\d'\s]+", " ", i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:17<00:00, 47316.18it/s]


### Remove digits

In [32]:
input_text = [re.sub("\d+", "", i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:11<00:00, 71491.47it/s]


### Remove more than one consecutive instance of 'x'

In [33]:
input_text = [re.sub(r'[x]{2,}', "", i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:07<00:00, 111013.65it/s]


### Replace multiple spaces with single space

In [34]:
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

100%|██████████| 809343/809343 [00:18<00:00, 44019.91it/s]


### Tokenize the text

In [35]:
tokens = [word_tokenize(t) for t in tqdm(input_text)]

100%|██████████| 809343/809343 [04:43<00:00, 2858.91it/s] 


### Take the first 20 tokens in each complaint text

In [36]:
tokens = [i[:20] if len(i) > 19 else ['<pad>'] * (20 - len(i)) + i for i in tqdm(tokens)]

100%|██████████| 809343/809343 [01:01<00:00, 13171.74it/s] 


In [1]:
# tokens[:2]

### Convert tokens to integer indices from vocabulary

In [38]:
def token_index(tokens, vocabulary, missing='<unk>'):
    """
    :param tokens: List of word tokens
    :param vocabulary: All words in the embeddings
    :param missing: Token for words not present in the vocabulary
    :return: List of integers representing the word tokens
    """
    idx_token = []
    for text in tqdm(tokens):
        idx_text = []
        for token in text:
            if token in vocabulary:
                idx_text.append(vocabulary.index(token))
            else:
                idx_text.append(vocabulary.index(missing))
        idx_token.append(idx_text)
    return idx_token

In [39]:
tokens = token_index(tokens, vocabulary)

 52%|█████▏    | 422204/809343 [38:01<23:32, 274.16it/s]IOStream.flush timed out
 99%|█████████▊| 798677/809343 [1:08:42<00:59, 179.51it/s]IOStream.flush timed out
100%|██████████| 809343/809343 [1:09:37<00:00, 193.75it/s]


In [40]:
len(tokens)

809343

In [41]:
tokens[0]

[43,
 5909,
 3660,
 15,
 187,
 51,
 2333,
 563,
 15,
 3115,
 447,
 6,
 136,
 68,
 5,
 163,
 12,
 9,
 638,
 568]

In [42]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,2019-11-01,vehicle_loan,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257
7,2019-07-08,credit_report,Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,Hello This complaint is against the three cred...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,109XX,,Consent provided,Web,2019-07-08,Closed with explanation,Yes,,3299394
8,2020-06-10,credit_report,Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,I am a victim of Identity Theft & currently ha...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,MT,,Servicemember,Consent provided,Web,2020-06-10,Closed with explanation,Yes,,3692762
10,2019-07-03,credit_report,Credit reporting,Incorrect information on your report,Account information incorrect,Two accounts are still on my credit history af...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,328XX,,Consent provided,Web,2019-07-03,Closed with non-monetary relief,Yes,,3294745
13,2019-03-21,credit_report,Other personal consumer report,Identity theft protection or other monitoring ...,Received unwanted marketing or advertising,Receiving daily telephone call ( s ) from XXXX...,Company has responded to the consumer and the ...,"NRA Group, LLC",MA,,,Consent provided,Web,2019-03-27,Closed with explanation,Yes,,3186954


In [43]:
vocabulary[tokens[0][0]]

'i'

### Save the tokens

In [44]:
save_file(tokens_path, tokens)

## Create PyTorch Dataset
---

In [45]:
class TextDataset(torch.utils.data.Dataset):

    def __init__(self, tokens, embeddings, labels):
        """
        :param tokens: List of word tokens
        :param embeddings: Word embeddings (from glove)
        :param labels: List of labels
        """
        self.tokens = tokens
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.labels[idx], self.embeddings[self.tokens[idx], :]

## Create Models
---

### RNN Model

In [46]:
class RNNNetwork(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_classes):
        """
        :param input_size: Size of embedding
        :param hidden_size: Hidden vector size
        :param num_classes: Number of classes in the dataset
        """
        super(RNNNetwork, self).__init__()
        # RNN Layer
        self.rnn = torch.nn.RNN(input_size=input_size,
                                hidden_size=hidden_size,
                                batch_first=True)
        # Linear Layer
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_data):
        _, hidden = self.rnn(input_data)
        output = self.linear(hidden)
        return output

### LSTM Model

In [47]:
class LSTMNetwork(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_classes):
        """
        :param input_size: Size of embedding
        :param hidden_size: Hidden vector size
        :param num_classes: Number of classes in the dataset
        """
        super(LSTMNetwork, self).__init__()
        # LSTM Layer
        self.rnn = torch.nn.LSTM(input_size=input_size,
                                 hidden_size=hidden_size,
                                 batch_first=True)
        # Linear Layer
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_data):
        _, (hidden, _) = self.rnn(input_data)
        output = self.linear(hidden[-1])
        return output

### Define train function

In [49]:
def train(train_loader, valid_loader, model, criterion, optimizer, device,
          num_epochs, model_path):
    """
    Function to train the model
    :param train_loader: Data loader for train dataset
    :param valid_loader: Data loader for validation dataset
    :param model: Model object
    :param criterion: Loss function
    :param optimizer: Optimizer
    :param device: CUDA or CPU
    :param num_epochs: Number of epochs
    :param model_path: Path to save the model
    """
    best_loss = 1e8
    for i in range(num_epochs):
        print(f"Epoch {i+1} of {num_epochs}")
        valid_loss, train_loss = [], []
        model.train()
        # Train loop
        for batch_labels, batch_data in tqdm(train_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_labels = batch_labels.type(torch.LongTensor)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            train_loss.append(loss.item())
            optimizer.zero_grad()
            # Backward pass
            loss.backward()
            # Gradient update step
            optimizer.step()
        model.eval()
        # Validation loop
        for batch_labels, batch_data in tqdm(valid_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_labels = batch_labels.type(torch.LongTensor)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            valid_loss.append(loss.item())
        t_loss = np.mean(train_loss)
        v_loss = np.mean(valid_loss)
        print(f"Train Loss: {t_loss}, Validation Loss: {v_loss}")
        if v_loss < best_loss:
            best_loss = v_loss
            # Save model if validation loss improves
            torch.save(model.state_dict(), model_path)
        print(f"Best Validation Loss: {best_loss}")

### Define test function

In [50]:
def test(test_loader, model, criterion, device):
    """
    Function to test the model
    :param test_loader: Data loader for test dataset
    :param model: Model object
    :param criterion: Loss function
    :param device: CUDA or CPU
    """
    model.eval()
    test_loss = []
    test_accu = []
    for batch_labels, batch_data in tqdm(test_loader):
        # Move data to device
        batch_labels = batch_labels.to(device)
        batch_labels = batch_labels.type(torch.LongTensor)
        batch_data = batch_data.to(device)
        # Forward pass
        batch_output = model(batch_data)
        batch_output = torch.squeeze(batch_output)
        # Calculate loss
        loss = criterion(batch_output, batch_labels)
        test_loss.append(loss.item())
        batch_preds = torch.argmax(batch_output, axis=1)
        # Move predictions to CPU
        if torch.cuda.is_available():
            batch_labels = batch_labels.cpu()
            batch_preds = batch_preds.cpu()
        # Compute accuracy
        test_accu.append(accuracy_score(batch_labels.detach().numpy(),
                                        batch_preds.detach().numpy()))
    test_loss = np.mean(test_loss)
    test_accu = np.mean(test_accu)
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accu}")

## Train RNN Model
---

### Load the files

In [51]:
tokens = load_file(tokens_path)
labels = load_file(labels_path)
embeddings = load_file(embeddings_path)
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

### Split data into train, validation and test sets

In [52]:
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                    test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.25)

### Create PyTorch datasets

In [53]:
train_dataset = TextDataset(X_train, embeddings, y_train)
valid_dataset = TextDataset(X_valid, embeddings, y_valid)
test_dataset = TextDataset(X_test, embeddings, y_test)

### Create data loaders

In [54]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16,
                                           shuffle=True, drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

### Create model object

In [55]:
model = RNNNetwork(input_size, hidden_size, num_classes)

### Move the model to GPU if available

In [57]:
if torch.cuda.is_available():
    model = model.cuda()

### Define loss function and optimizer

In [58]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Training loop

In [59]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, rnn_model_path)

Epoch 1 of 50


100%|██████████| 30350/30350 [01:00<00:00, 498.73it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1127.44it/s]


Train Loss: 1.305462280321357, Validation Loss: 1.12854190928858
Best Validation Loss: 1.12854190928858
Epoch 2 of 50


100%|██████████| 30350/30350 [01:18<00:00, 388.82it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1174.88it/s]


Train Loss: 1.0633383750738776, Validation Loss: 0.9980560378487713
Best Validation Loss: 0.9980560378487713
Epoch 3 of 50


100%|██████████| 30350/30350 [01:08<00:00, 444.72it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1258.69it/s]


Train Loss: 0.9712646309552712, Validation Loss: 0.9311715503967691
Best Validation Loss: 0.9311715503967691
Epoch 4 of 50


100%|██████████| 30350/30350 [01:06<00:00, 454.52it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1144.07it/s]


Train Loss: 0.9317542730107339, Validation Loss: 0.912754468000476
Best Validation Loss: 0.912754468000476
Epoch 5 of 50


100%|██████████| 30350/30350 [01:03<00:00, 481.40it/s]
100%|██████████| 10117/10117 [00:11<00:00, 882.25it/s]


Train Loss: 0.9078958927691474, Validation Loss: 0.8907457286615528
Best Validation Loss: 0.8907457286615528
Epoch 6 of 50


100%|██████████| 30350/30350 [01:05<00:00, 460.30it/s]
100%|██████████| 10117/10117 [00:10<00:00, 964.82it/s]


Train Loss: 0.8897850047928301, Validation Loss: 0.8869621217916567
Best Validation Loss: 0.8869621217916567
Epoch 7 of 50


100%|██████████| 30350/30350 [01:14<00:00, 408.92it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1150.35it/s]


Train Loss: 0.8762083641480182, Validation Loss: 0.8631876677133307
Best Validation Loss: 0.8631876677133307
Epoch 8 of 50


100%|██████████| 30350/30350 [01:06<00:00, 456.80it/s]
100%|██████████| 10117/10117 [00:09<00:00, 1120.85it/s]


Train Loss: 0.865526166506222, Validation Loss: 0.8502649450765788
Best Validation Loss: 0.8502649450765788
Epoch 9 of 50


100%|██████████| 30350/30350 [01:04<00:00, 473.00it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1222.17it/s]


Train Loss: 0.8566508811432408, Validation Loss: 0.8635556861865753
Best Validation Loss: 0.8502649450765788
Epoch 10 of 50


100%|██████████| 30350/30350 [01:03<00:00, 479.69it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1300.95it/s]


Train Loss: 0.8488770872018188, Validation Loss: 0.8394032278641936
Best Validation Loss: 0.8394032278641936
Epoch 11 of 50


100%|██████████| 30350/30350 [01:13<00:00, 412.29it/s]
100%|██████████| 10117/10117 [00:09<00:00, 1026.91it/s]


Train Loss: 0.8419865683995225, Validation Loss: 0.8581462083650434
Best Validation Loss: 0.8394032278641936
Epoch 12 of 50


100%|██████████| 30350/30350 [01:07<00:00, 450.13it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1210.73it/s]


Train Loss: 0.8368503605030612, Validation Loss: 0.8366958250641363
Best Validation Loss: 0.8366958250641363
Epoch 13 of 50


100%|██████████| 30350/30350 [01:04<00:00, 472.67it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1602.06it/s]


Train Loss: 0.8316204664918795, Validation Loss: 0.8316167837456775
Best Validation Loss: 0.8316167837456775
Epoch 14 of 50


100%|██████████| 30350/30350 [00:50<00:00, 602.81it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1394.85it/s]


Train Loss: 0.8280047915159084, Validation Loss: 0.8191775587827683
Best Validation Loss: 0.8191775587827683
Epoch 15 of 50


100%|██████████| 30350/30350 [00:49<00:00, 614.61it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1450.36it/s]


Train Loss: 0.8232801456557958, Validation Loss: 0.8203334696822879
Best Validation Loss: 0.8191775587827683
Epoch 16 of 50


100%|██████████| 30350/30350 [00:51<00:00, 586.22it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1389.55it/s]


Train Loss: 0.8201424082460671, Validation Loss: 0.82231493917416
Best Validation Loss: 0.8191775587827683
Epoch 17 of 50


100%|██████████| 30350/30350 [01:02<00:00, 483.77it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1244.22it/s]


Train Loss: 0.8165044872013505, Validation Loss: 0.8167568667817283
Best Validation Loss: 0.8167568667817283
Epoch 18 of 50


100%|██████████| 30350/30350 [01:01<00:00, 494.17it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1383.71it/s]


Train Loss: 0.8138511769657669, Validation Loss: 0.8220901922167697
Best Validation Loss: 0.8167568667817283
Epoch 19 of 50


100%|██████████| 30350/30350 [00:50<00:00, 600.85it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1593.14it/s]


Train Loss: 0.8112909333007733, Validation Loss: 0.8133508419641761
Best Validation Loss: 0.8133508419641761
Epoch 20 of 50


100%|██████████| 30350/30350 [00:52<00:00, 573.99it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1652.01it/s]


Train Loss: 0.8088636774718271, Validation Loss: 0.8135339990574887
Best Validation Loss: 0.8133508419641761
Epoch 21 of 50


100%|██████████| 30350/30350 [00:49<00:00, 614.20it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1682.13it/s]


Train Loss: 0.8061224449726659, Validation Loss: 0.8023605816205452
Best Validation Loss: 0.8023605816205452
Epoch 22 of 50


100%|██████████| 30350/30350 [00:50<00:00, 599.91it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1345.12it/s]


Train Loss: 0.8044222050554178, Validation Loss: 0.7975513533757846
Best Validation Loss: 0.7975513533757846
Epoch 23 of 50


100%|██████████| 30350/30350 [00:53<00:00, 572.37it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1495.98it/s]


Train Loss: 0.802466705017025, Validation Loss: 0.8099861002252945
Best Validation Loss: 0.7975513533757846
Epoch 24 of 50


100%|██████████| 30350/30350 [00:49<00:00, 609.86it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1522.98it/s]


Train Loss: 0.8005807955911662, Validation Loss: 0.8042048770671801
Best Validation Loss: 0.7975513533757846
Epoch 25 of 50


100%|██████████| 30350/30350 [00:48<00:00, 629.30it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1506.64it/s]


Train Loss: 0.7991783129577287, Validation Loss: 0.8082107726830297
Best Validation Loss: 0.7975513533757846
Epoch 26 of 50


100%|██████████| 30350/30350 [00:44<00:00, 675.12it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1597.20it/s]


Train Loss: 0.797542612376873, Validation Loss: 0.7997976637050381
Best Validation Loss: 0.7975513533757846
Epoch 27 of 50


100%|██████████| 30350/30350 [00:47<00:00, 643.97it/s]
100%|██████████| 10117/10117 [00:05<00:00, 1687.97it/s]


Train Loss: 0.7959651133585015, Validation Loss: 0.8242966694702293
Best Validation Loss: 0.7975513533757846
Epoch 28 of 50


100%|██████████| 30350/30350 [00:48<00:00, 630.10it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1517.31it/s]


Train Loss: 0.7948503118871267, Validation Loss: 0.7944183549406006
Best Validation Loss: 0.7944183549406006
Epoch 29 of 50


100%|██████████| 30350/30350 [00:49<00:00, 609.17it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1611.82it/s]


Train Loss: 0.7933396024604311, Validation Loss: 0.7943687401465233
Best Validation Loss: 0.7943687401465233
Epoch 30 of 50


100%|██████████| 30350/30350 [00:47<00:00, 640.94it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1607.21it/s]


Train Loss: 0.7923236874578125, Validation Loss: 0.8037680365101756
Best Validation Loss: 0.7943687401465233
Epoch 31 of 50


100%|██████████| 30350/30350 [00:46<00:00, 648.29it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1626.51it/s]


Train Loss: 0.7912860986784321, Validation Loss: 0.8062297424550606
Best Validation Loss: 0.7943687401465233
Epoch 32 of 50


100%|██████████| 30350/30350 [00:47<00:00, 633.16it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1662.75it/s]


Train Loss: 0.7896141278247857, Validation Loss: 0.8177321894379534
Best Validation Loss: 0.7943687401465233
Epoch 33 of 50


100%|██████████| 30350/30350 [00:53<00:00, 564.07it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1358.68it/s]


Train Loss: 0.7888575865944488, Validation Loss: 0.7920780800230668
Best Validation Loss: 0.7920780800230668
Epoch 34 of 50


100%|██████████| 30350/30350 [00:48<00:00, 628.63it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1653.78it/s]


Train Loss: 0.7878595198364391, Validation Loss: 0.7818968307568973
Best Validation Loss: 0.7818968307568973
Epoch 35 of 50


100%|██████████| 30350/30350 [00:46<00:00, 653.17it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1587.95it/s]


Train Loss: 0.7866633743644251, Validation Loss: 0.7824568486610184
Best Validation Loss: 0.7818968307568973
Epoch 36 of 50


100%|██████████| 30350/30350 [00:46<00:00, 648.85it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1638.99it/s]


Train Loss: 0.7862598809950925, Validation Loss: 0.7852414665844786
Best Validation Loss: 0.7818968307568973
Epoch 37 of 50


100%|██████████| 30350/30350 [00:47<00:00, 641.39it/s]
100%|██████████| 10117/10117 [00:07<00:00, 1333.70it/s]


Train Loss: 0.7851409801173053, Validation Loss: 0.7908566079439471
Best Validation Loss: 0.7818968307568973
Epoch 38 of 50


100%|██████████| 30350/30350 [00:47<00:00, 644.58it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1585.23it/s]


Train Loss: 0.7842497016894562, Validation Loss: 0.7848858094868175
Best Validation Loss: 0.7818968307568973
Epoch 39 of 50


100%|██████████| 30350/30350 [00:48<00:00, 619.93it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1472.59it/s]


Train Loss: 0.7834718816299022, Validation Loss: 0.785648777098527
Best Validation Loss: 0.7818968307568973
Epoch 40 of 50


100%|██████████| 30350/30350 [00:47<00:00, 632.31it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1561.52it/s]


Train Loss: 0.7830285620164046, Validation Loss: 0.789573162922254
Best Validation Loss: 0.7818968307568973
Epoch 41 of 50


100%|██████████| 30350/30350 [00:45<00:00, 663.99it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1521.90it/s]


Train Loss: 0.7818130693571179, Validation Loss: 0.7865121438841557
Best Validation Loss: 0.7818968307568973
Epoch 42 of 50


100%|██████████| 30350/30350 [00:47<00:00, 638.55it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1581.20it/s]


Train Loss: 0.7812294029276297, Validation Loss: 0.7834890100873364
Best Validation Loss: 0.7818968307568973
Epoch 43 of 50


100%|██████████| 30350/30350 [00:46<00:00, 650.35it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1613.61it/s]


Train Loss: 0.7800910723494148, Validation Loss: 0.7874385960804411
Best Validation Loss: 0.7818968307568973
Epoch 44 of 50


100%|██████████| 30350/30350 [00:47<00:00, 635.68it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1505.76it/s]


Train Loss: 0.7799775541039825, Validation Loss: 0.7782703645529572
Best Validation Loss: 0.7782703645529572
Epoch 45 of 50


100%|██████████| 30350/30350 [00:48<00:00, 627.07it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1533.50it/s]


Train Loss: 0.7793747964440009, Validation Loss: 0.7881875202859522
Best Validation Loss: 0.7782703645529572
Epoch 46 of 50


100%|██████████| 30350/30350 [00:49<00:00, 614.75it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1147.71it/s]


Train Loss: 0.7784560581693146, Validation Loss: 0.7922045113532642
Best Validation Loss: 0.7782703645529572
Epoch 47 of 50


100%|██████████| 30350/30350 [00:50<00:00, 606.12it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1541.03it/s]


Train Loss: 0.7778393973046807, Validation Loss: 0.7960779434700569
Best Validation Loss: 0.7782703645529572
Epoch 48 of 50


100%|██████████| 30350/30350 [00:49<00:00, 615.21it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1514.70it/s]


Train Loss: 0.7771851942836747, Validation Loss: 0.789217439401971
Best Validation Loss: 0.7782703645529572
Epoch 49 of 50


100%|██████████| 30350/30350 [00:48<00:00, 624.87it/s]
100%|██████████| 10117/10117 [00:08<00:00, 1212.90it/s]


Train Loss: 0.7768552448243344, Validation Loss: 0.7854628233310673
Best Validation Loss: 0.7782703645529572
Epoch 50 of 50


100%|██████████| 30350/30350 [00:47<00:00, 640.64it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1606.90it/s]

Train Loss: 0.7761956553298618, Validation Loss: 0.7895209233247917
Best Validation Loss: 0.7782703645529572





## Train LSTM Model
---

In [60]:
model = LSTMNetwork(input_size, hidden_size, num_classes)

In [61]:
if torch.cuda.is_available():
    model = model.cuda()

In [62]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [63]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, lstm_model_path)

Epoch 1 of 50


100%|██████████| 30350/30350 [01:58<00:00, 255.51it/s]
100%|██████████| 10117/10117 [00:15<00:00, 668.89it/s]


Train Loss: 1.080295099545841, Validation Loss: 0.8992703016628281
Best Validation Loss: 0.8992703016628281
Epoch 2 of 50


100%|██████████| 30350/30350 [01:44<00:00, 290.49it/s]
100%|██████████| 10117/10117 [00:14<00:00, 716.79it/s]


Train Loss: 0.8606255145996956, Validation Loss: 0.8418276267975136
Best Validation Loss: 0.8418276267975136
Epoch 3 of 50


100%|██████████| 30350/30350 [01:55<00:00, 262.04it/s]
100%|██████████| 10117/10117 [00:15<00:00, 671.93it/s]


Train Loss: 0.8105015787253957, Validation Loss: 0.7948699417342967
Best Validation Loss: 0.7948699417342967
Epoch 4 of 50


100%|██████████| 30350/30350 [01:48<00:00, 279.67it/s]
100%|██████████| 10117/10117 [00:14<00:00, 702.01it/s]


Train Loss: 0.7838519688841735, Validation Loss: 0.7778982427098233
Best Validation Loss: 0.7778982427098233
Epoch 5 of 50


100%|██████████| 30350/30350 [01:49<00:00, 277.65it/s]
100%|██████████| 10117/10117 [00:14<00:00, 712.49it/s]


Train Loss: 0.7654530245397099, Validation Loss: 0.7637947960609891
Best Validation Loss: 0.7637947960609891
Epoch 6 of 50


100%|██████████| 30350/30350 [01:47<00:00, 282.57it/s]
100%|██████████| 10117/10117 [00:15<00:00, 671.03it/s]


Train Loss: 0.7520337783860709, Validation Loss: 0.753666719377742
Best Validation Loss: 0.753666719377742
Epoch 7 of 50


100%|██████████| 30350/30350 [01:47<00:00, 282.35it/s]
100%|██████████| 10117/10117 [00:14<00:00, 678.27it/s]


Train Loss: 0.7414386615395251, Validation Loss: 0.7445820605466462
Best Validation Loss: 0.7445820605466462
Epoch 8 of 50


100%|██████████| 30350/30350 [01:49<00:00, 277.23it/s]
100%|██████████| 10117/10117 [00:14<00:00, 689.94it/s]


Train Loss: 0.7326010665908775, Validation Loss: 0.7321363875852857
Best Validation Loss: 0.7321363875852857
Epoch 9 of 50


100%|██████████| 30350/30350 [01:49<00:00, 276.13it/s]
100%|██████████| 10117/10117 [00:17<00:00, 585.02it/s]


Train Loss: 0.7250196981039727, Validation Loss: 0.7269639821465901
Best Validation Loss: 0.7269639821465901
Epoch 10 of 50


100%|██████████| 30350/30350 [01:47<00:00, 283.41it/s]
100%|██████████| 10117/10117 [00:20<00:00, 503.59it/s]


Train Loss: 0.718625591872306, Validation Loss: 0.7217105512005652
Best Validation Loss: 0.7217105512005652
Epoch 11 of 50


100%|██████████| 30350/30350 [01:51<00:00, 271.90it/s]
100%|██████████| 10117/10117 [00:13<00:00, 724.11it/s]


Train Loss: 0.7128806014603258, Validation Loss: 0.7215211406457657
Best Validation Loss: 0.7215211406457657
Epoch 12 of 50


100%|██████████| 30350/30350 [01:57<00:00, 258.09it/s]
100%|██████████| 10117/10117 [00:14<00:00, 701.91it/s]


Train Loss: 0.7083694055696099, Validation Loss: 0.7142648222314295
Best Validation Loss: 0.7142648222314295
Epoch 13 of 50


100%|██████████| 30350/30350 [01:44<00:00, 290.36it/s]
100%|██████████| 10117/10117 [00:15<00:00, 657.20it/s]


Train Loss: 0.7039533184316543, Validation Loss: 0.7108634748908799
Best Validation Loss: 0.7108634748908799
Epoch 14 of 50


100%|██████████| 30350/30350 [01:50<00:00, 275.83it/s]
100%|██████████| 10117/10117 [00:22<00:00, 454.51it/s]


Train Loss: 0.7000626367126795, Validation Loss: 0.7162537941604402
Best Validation Loss: 0.7108634748908799
Epoch 15 of 50


100%|██████████| 30350/30350 [01:58<00:00, 255.31it/s]
100%|██████████| 10117/10117 [00:15<00:00, 672.37it/s]


Train Loss: 0.6966368327858126, Validation Loss: 0.7045715729129448
Best Validation Loss: 0.7045715729129448
Epoch 16 of 50


100%|██████████| 30350/30350 [01:54<00:00, 264.03it/s]
100%|██████████| 10117/10117 [00:15<00:00, 662.68it/s]


Train Loss: 0.6932389963220627, Validation Loss: 0.7018126357430359
Best Validation Loss: 0.7018126357430359
Epoch 17 of 50


100%|██████████| 30350/30350 [01:52<00:00, 269.16it/s]
100%|██████████| 10117/10117 [00:15<00:00, 661.45it/s]


Train Loss: 0.6900495241908894, Validation Loss: 0.7075287903296072
Best Validation Loss: 0.7018126357430359
Epoch 18 of 50


100%|██████████| 30350/30350 [01:53<00:00, 267.48it/s]
100%|██████████| 10117/10117 [00:14<00:00, 693.65it/s]


Train Loss: 0.6874848237081356, Validation Loss: 0.6978744672790127
Best Validation Loss: 0.6978744672790127
Epoch 19 of 50


100%|██████████| 30350/30350 [02:00<00:00, 251.34it/s]
100%|██████████| 10117/10117 [00:15<00:00, 647.99it/s]


Train Loss: 0.6847978833251475, Validation Loss: 0.6992575633478127
Best Validation Loss: 0.6978744672790127
Epoch 20 of 50


100%|██████████| 30350/30350 [01:49<00:00, 276.03it/s]
100%|██████████| 10117/10117 [00:15<00:00, 662.95it/s]


Train Loss: 0.6824158798649146, Validation Loss: 0.69511058771635
Best Validation Loss: 0.69511058771635
Epoch 21 of 50


100%|██████████| 30350/30350 [01:48<00:00, 280.70it/s]
100%|██████████| 10117/10117 [00:15<00:00, 656.02it/s]


Train Loss: 0.6800410522942782, Validation Loss: 0.6992430727823198
Best Validation Loss: 0.69511058771635
Epoch 22 of 50


100%|██████████| 30350/30350 [01:55<00:00, 262.29it/s]
100%|██████████| 10117/10117 [00:14<00:00, 720.94it/s]


Train Loss: 0.677893554907556, Validation Loss: 0.694997925156306
Best Validation Loss: 0.694997925156306
Epoch 23 of 50


100%|██████████| 30350/30350 [01:48<00:00, 278.79it/s]
100%|██████████| 10117/10117 [00:15<00:00, 669.48it/s]


Train Loss: 0.6760322362707415, Validation Loss: 0.6949617177074201
Best Validation Loss: 0.6949617177074201
Epoch 24 of 50


100%|██████████| 30350/30350 [02:00<00:00, 251.09it/s]
100%|██████████| 10117/10117 [00:16<00:00, 605.38it/s]


Train Loss: 0.6740228308249983, Validation Loss: 0.6924516911520557
Best Validation Loss: 0.6924516911520557
Epoch 25 of 50


100%|██████████| 30350/30350 [01:58<00:00, 255.48it/s]
100%|██████████| 10117/10117 [00:15<00:00, 633.44it/s]


Train Loss: 0.6721270323820916, Validation Loss: 0.6940533012752654
Best Validation Loss: 0.6924516911520557
Epoch 26 of 50


100%|██████████| 30350/30350 [01:50<00:00, 275.06it/s]
100%|██████████| 10117/10117 [00:16<00:00, 629.51it/s]


Train Loss: 0.6704298829511912, Validation Loss: 0.6889626992516771
Best Validation Loss: 0.6889626992516771
Epoch 27 of 50


100%|██████████| 30350/30350 [01:58<00:00, 257.08it/s]
100%|██████████| 10117/10117 [00:14<00:00, 693.12it/s]


Train Loss: 0.6688768414728331, Validation Loss: 0.691005371643542
Best Validation Loss: 0.6889626992516771
Epoch 28 of 50


100%|██████████| 30350/30350 [01:47<00:00, 282.95it/s]
100%|██████████| 10117/10117 [00:15<00:00, 646.77it/s]


Train Loss: 0.6674871538228415, Validation Loss: 0.6866893923302065
Best Validation Loss: 0.6866893923302065
Epoch 29 of 50


100%|██████████| 30350/30350 [01:51<00:00, 271.71it/s]
100%|██████████| 10117/10117 [00:14<00:00, 701.18it/s]


Train Loss: 0.6657862885355262, Validation Loss: 0.6852634041570588
Best Validation Loss: 0.6852634041570588
Epoch 30 of 50


100%|██████████| 30350/30350 [01:42<00:00, 295.92it/s]
100%|██████████| 10117/10117 [00:14<00:00, 690.30it/s]


Train Loss: 0.6643395832751738, Validation Loss: 0.6889564336945037
Best Validation Loss: 0.6852634041570588
Epoch 31 of 50


100%|██████████| 30350/30350 [01:56<00:00, 260.55it/s]
100%|██████████| 10117/10117 [00:15<00:00, 662.32it/s]


Train Loss: 0.6629917586290414, Validation Loss: 0.6908648633283507
Best Validation Loss: 0.6852634041570588
Epoch 32 of 50


100%|██████████| 30350/30350 [01:47<00:00, 282.19it/s]
100%|██████████| 10117/10117 [00:16<00:00, 624.24it/s]


Train Loss: 0.6618764971726907, Validation Loss: 0.6869341177289413
Best Validation Loss: 0.6852634041570588
Epoch 33 of 50


100%|██████████| 30350/30350 [02:00<00:00, 252.68it/s]
100%|██████████| 10117/10117 [00:17<00:00, 588.83it/s]


Train Loss: 0.6605769626455016, Validation Loss: 0.6852569230518711
Best Validation Loss: 0.6852569230518711
Epoch 34 of 50


100%|██████████| 30350/30350 [01:35<00:00, 316.59it/s]
100%|██████████| 10117/10117 [00:12<00:00, 778.95it/s]


Train Loss: 0.6595391395167621, Validation Loss: 0.6829724178635769
Best Validation Loss: 0.6829724178635769
Epoch 35 of 50


100%|██████████| 30350/30350 [01:32<00:00, 327.49it/s]
100%|██████████| 10117/10117 [00:14<00:00, 702.58it/s]


Train Loss: 0.6583480152705012, Validation Loss: 0.6890958193986002
Best Validation Loss: 0.6829724178635769
Epoch 36 of 50


100%|██████████| 30350/30350 [01:39<00:00, 304.90it/s]
100%|██████████| 10117/10117 [00:14<00:00, 703.36it/s]


Train Loss: 0.6572570344475536, Validation Loss: 0.6815019263606528
Best Validation Loss: 0.6815019263606528
Epoch 37 of 50


100%|██████████| 30350/30350 [01:32<00:00, 327.06it/s]
100%|██████████| 10117/10117 [00:12<00:00, 822.73it/s]


Train Loss: 0.6561707115366049, Validation Loss: 0.6861804363818476
Best Validation Loss: 0.6815019263606528
Epoch 38 of 50


100%|██████████| 30350/30350 [01:30<00:00, 333.77it/s]
100%|██████████| 10117/10117 [00:12<00:00, 782.22it/s]


Train Loss: 0.6551692431146368, Validation Loss: 0.6813174233487134
Best Validation Loss: 0.6813174233487134
Epoch 39 of 50


100%|██████████| 30350/30350 [01:32<00:00, 328.76it/s]
100%|██████████| 10117/10117 [00:12<00:00, 815.83it/s]


Train Loss: 0.654143845541326, Validation Loss: 0.6807664826926954
Best Validation Loss: 0.6807664826926954
Epoch 40 of 50


100%|██████████| 30350/30350 [01:32<00:00, 328.72it/s]
100%|██████████| 10117/10117 [00:12<00:00, 795.76it/s]


Train Loss: 0.653442825090134, Validation Loss: 0.6813282321225729
Best Validation Loss: 0.6807664826926954
Epoch 41 of 50


100%|██████████| 30350/30350 [01:33<00:00, 325.43it/s]
100%|██████████| 10117/10117 [00:12<00:00, 798.52it/s]


Train Loss: 0.6523578157560682, Validation Loss: 0.6797220424146088
Best Validation Loss: 0.6797220424146088
Epoch 42 of 50


100%|██████████| 30350/30350 [01:35<00:00, 316.60it/s]
100%|██████████| 10117/10117 [00:12<00:00, 807.70it/s]


Train Loss: 0.6514294478172035, Validation Loss: 0.6820602599298329
Best Validation Loss: 0.6797220424146088
Epoch 43 of 50


100%|██████████| 30350/30350 [01:33<00:00, 323.73it/s]
100%|██████████| 10117/10117 [00:12<00:00, 824.67it/s]


Train Loss: 0.6505952295684294, Validation Loss: 0.6796445548924708
Best Validation Loss: 0.6796445548924708
Epoch 44 of 50


100%|██████████| 30350/30350 [01:31<00:00, 331.47it/s]
100%|██████████| 10117/10117 [00:14<00:00, 707.80it/s]


Train Loss: 0.6497122811092773, Validation Loss: 0.6804861786355612
Best Validation Loss: 0.6796445548924708
Epoch 45 of 50


100%|██████████| 30350/30350 [01:36<00:00, 314.58it/s]
100%|██████████| 10117/10117 [00:12<00:00, 816.63it/s]


Train Loss: 0.6488690782572822, Validation Loss: 0.6827470049666353
Best Validation Loss: 0.6796445548924708
Epoch 46 of 50


100%|██████████| 30350/30350 [01:56<00:00, 259.93it/s]
100%|██████████| 10117/10117 [00:13<00:00, 732.59it/s]


Train Loss: 0.6481051689162973, Validation Loss: 0.6779381349296912
Best Validation Loss: 0.6779381349296912
Epoch 47 of 50


100%|██████████| 30350/30350 [01:39<00:00, 306.38it/s]
100%|██████████| 10117/10117 [00:13<00:00, 764.27it/s]


Train Loss: 0.6474209205841851, Validation Loss: 0.6791270566439398
Best Validation Loss: 0.6779381349296912
Epoch 48 of 50


100%|██████████| 30350/30350 [01:34<00:00, 319.50it/s]
100%|██████████| 10117/10117 [00:13<00:00, 775.77it/s]


Train Loss: 0.6468479179420326, Validation Loss: 0.6793247973374242
Best Validation Loss: 0.6779381349296912
Epoch 49 of 50


100%|██████████| 30350/30350 [01:47<00:00, 282.20it/s]
100%|██████████| 10117/10117 [00:14<00:00, 693.36it/s]


Train Loss: 0.6459965191730582, Validation Loss: 0.6807471138376474
Best Validation Loss: 0.6779381349296912
Epoch 50 of 50


100%|██████████| 30350/30350 [01:49<00:00, 276.58it/s]
100%|██████████| 10117/10117 [00:15<00:00, 657.32it/s]

Train Loss: 0.6454481828606747, Validation Loss: 0.6788011151430835
Best Validation Loss: 0.6779381349296912





In [64]:
test(test_loader, model, criterion, device)

100%|██████████| 10117/10117 [00:19<00:00, 530.93it/s]

Test Loss: 0.6795250973896664, Test Accuracy: 0.770104679100676





## Predict on new text
---

In [65]:
input_text = '''I am a victim of Identity Theft & currently have an Experian account that 
I can view my Experian Credit Report and getting notified when there is activity on 
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9 
hours on the phone with Experian. Every time I call I get transferred repeatedly and 
then my last transfer and automated message states to press 1 and leave a message and 
someone would call me. Every time I press 1 I get an automatic message stating than you 
before I even leave a message and get disconnected. I call Experian again, explain what 
is happening and the process begins again with the same end result. I was trying to have 
this issue attended and resolved informally but I give up after 9 hours. There are hard 
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall 
and I respectfully request that Experian remove the hard hit inquiries immediately just 
like they've done in the past when I was able to speak to a live Experian representative 
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX 
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX 
XX/XX/XXXX'''

### Process input text

In [66]:
from nltk.corpus import stopwords
input_text = input_text.lower()
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
input_text = re.sub("\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)
print(input_text)
tokens = word_tokenize(input_text)
print(tokens)

i am a victim of identity theft currently have an experian account that 
i can view my experian credit report and getting notified when there is activity on 
my experian credit report for the past days i've spent a total of approximately 
hours on the phone with experian every time i call i get transferred repeatedly and 
then my last transfer and automated message states to press and leave a message and 
someone would call me every time i press i get an automatic message stating than you 
before i even leave a message and get disconnected i call experian again explain what 
is happening and the process begins again with the same end result i was trying to have 
this issue attended and resolved informally but i give up after hours there are hard 
hit inquiries on my experian credit report that are fraud i didn't authorize or recall 
and i respectfully request that experian remove the hard hit inquiries immediately just 
like they've done in the past when i was able to speak to a live e

### Add padding if the length of tokens is less than 20

In [67]:
tokens = ['<pad>']*(20-len(tokens))+tokens

### Tokenize the input text

In [68]:
idx_token = []
for token in tokens:
    if token in vocabulary:
        idx_token.append(vocabulary.index(token))
    else:
        idx_token.append(vocabulary.index('<unk>'))

### Get embeddings for tokens

In [69]:
token_emb = embeddings[idx_token,:]

### Convert to torch tensor

In [70]:
inp = torch.from_numpy(token_emb)

### Move the tensor to GPU if available

In [71]:
inp = inp.to(device)

### Create a batch of one record

In [72]:
inp = torch.unsqueeze(inp, 0)

### Load label encoder

In [73]:
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

In [74]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### RNN prediction

In [75]:
# Create model object
model = RNNNetwork(input_size, hidden_size, num_classes)

# Load trained weights
model.load_state_dict(torch.load(rnn_model_path))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    
# Forward pass
out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")

Predicted  Class: credit_report


  model.load_state_dict(torch.load(rnn_model_path))


### LSTM prediction

In [76]:
# Create model object
model = LSTMNetwork(input_size, hidden_size, num_classes)

# Load trained weights
model.load_state_dict(torch.load(lstm_model_path))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    
# Forward pass
out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")

Predicted  Class: credit_report


  model.load_state_dict(torch.load(lstm_model_path))
