In [None]:
# Importing Libraries
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# ---- Training hyperparameters ----
num_epochs = 50  # Number of training epochs
lr = 0.0005  # Learning rate passed to the optimizer
seq_len = 20  # Number of tokens kept per complaint (input sequence length)
vec_len = 50  # Length of each word vector fed to the attention model

# ---- Dataset columns ----
text_col_name = "Consumer complaint narrative"  # Column holding the complaint text
label_col = "Product"  # Column holding the product label

# ---- Input files ----
data_path = "Input/complaints.csv"  # Complaints dataset (CSV)
glove_vector_path = "Input/glove.6B.50d.txt"  # GloVe word vectors file

# ---- Files written by this notebook ----
vocabulary_path = "Output/vocabulary.pkl"  # Pickled vocabulary list
embeddings_path = "Output/embeddings.pkl"  # Pickled embedding matrix
tokens_path = "Output/tokens.pkl"  # Pickled token indices
labels_path = "Output/labels.pkl"  # Pickled encoded labels
label_encoder_path = "Output/label_encoder.pkl"  # Pickled label encoder
model_path = "Output/attention.pth"  # Trained attention model weights

# ---- Product name -> short label, grouped by target label ----
product_map = {
    # card
    'Credit card or prepaid card': 'card',
    'Credit card': 'card',
    'Prepaid card': 'card',
    # credit_report
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
    'Credit reporting': 'credit_report',
    # debt_collection
    'Debt collection': 'debt_collection',
    # loan
    'Payday loan, title loan, or personal loan': 'loan',
    'Payday loan': 'loan',
    'Student loan': 'loan',
    'Consumer Loan': 'loan',
    # money_transfer
    'Money transfer, virtual currency, or money service': 'money_transfer',
    'Money transfers': 'money_transfer',
    'Virtual currency': 'money_transfer',
    'virtual currency': 'money_transfer',
    # mortgage
    'Mortgage': 'mortgage',
    # others
    'Other financial service': 'others',
    # savings_account
    'Checking or savings account': 'savings_account',
    'Bank account or service': 'savings_account',
    # vehicle_loan
    'Vehicle loan or lease': 'vehicle_loan',
}


In [None]:
def save_file(name, obj):
    """
    Pickle ``obj`` and write it to the file at path ``name``.

    :param name: Destination file path; an existing file is overwritten.
    :param obj: Any picklable Python object.
    """
    with open(name, 'wb') as handle:
        handle.write(pickle.dumps(obj))


def load_file(name):
    """
    Load and return the object stored in a pickle file.

    :param name: Path of the pickle file to read.

    :return: The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle even if unpickling fails.
    with open(name, "rb") as f:
        return pickle.load(f)

## Process GloVe embeddings
---

In [None]:
# Read every line of the GloVe word vectors file at 'glove_vector_path'.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open(glove_vector_path, "rt", encoding="utf-8") as f:
    emb = f.readlines()


In [None]:
# Parallel lists: vocabulary[k] is the word whose vector values are embeddings[k]
vocabulary, embeddings = [], []

for item in emb:
    # Each line is "<word> <v1> <v2> ..."; split on whitespace once
    fields = item.split()

    # First field is the word, the remaining fields are its embedding values
    vocabulary.append(fields[0])
    embeddings.append(fields[1:])


In [None]:
# Convert the parsed 'embeddings' list (string values) into a float32 NumPy array
embeddings = np.array(embeddings, dtype=np.float32)


In [None]:
# Special tokens: '<pad>' fills short sequences, '<unk>' stands in for
# words that are missing from the vocabulary
special_tokens = ["<pad>", "<unk>"]

# Prepend the special tokens to 'vocabulary'. The guard keeps this cell
# idempotent: re-running it must not prepend the tokens a second time,
# which would shift every word index.
if vocabulary[:len(special_tokens)] != special_tokens:
    vocabulary = special_tokens + vocabulary


In [None]:
# Row of ones, sized from the embedding matrix itself rather than a
# hard-coded width so other GloVe dimensions work too
ones_row = np.ones(embeddings.shape[1], dtype=np.float32)

# Column-wise mean of all word embeddings
mean_embedding = np.mean(embeddings, axis=0)

# Stack the ones row, the mean row, and the original embeddings vertically.
# The two extra leading rows line up with the two special tokens that are
# prepended to the vocabulary ('<pad>' -> ones, '<unk>' -> mean).
combined_embeddings = np.vstack([ones_row, mean_embedding, embeddings])


In [None]:
# The vocabulary has the special tokens prepended, so the matrix that is
# saved must be the one with the matching extra rows; otherwise every
# vocabulary index would select the wrong embedding row.
assert combined_embeddings.shape[0] == len(vocabulary), (
    "embedding rows and vocabulary entries are out of sync"
)

# Save the embedding matrix that includes the special-token rows
save_file(embeddings_path, combined_embeddings)

# Save the 'vocabulary' data
save_file(vocabulary_path, vocabulary)


## Process text data
---

In [None]:
# Load the complaints dataset from the CSV file at 'data_path'
data = pd.read_csv(data_path)

In [None]:
# Drop (in place) the rows whose complaint text is missing
data.dropna(subset=[text_col_name], inplace=True)

In [None]:
# Replace (in place) the values of the 'label_col' column with the shorter
# labels defined in 'product_map'; values without a mapping are left as-is
data.replace({label_col: product_map}, inplace=True)


---

### Encode labels

In [None]:
# Fit a label encoder on the 'label_col' values and, in the same step,
# transform them into numerical labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data[label_col])


In [None]:
# Persist the encoded labels and the fitted encoder (needed later to map
# predicted class ids back to label names)
save_file(labels_path, labels)
save_file(label_encoder_path, label_encoder)

---

### Process the text column

In [None]:
# Collect the complaint texts from the 'text_col_name' column into a plain list
input_text = list(data[text_col_name])


In [None]:
# Display the number of complaint texts to be processed
len(input_text)

### Convert text to lower case

In [None]:
# Lowercase every complaint text (tqdm shows a progress bar)
input_text = [text.lower() for text in tqdm(input_text)]


### Remove punctuation except apostrophes

In [None]:
# Replace each run of characters that are not word characters, digits,
# apostrophes or whitespace with a single space (tqdm shows a progress bar)
input_text = [re.sub(r"[^\w\d'\s]+", " ", text) for text in tqdm(input_text)]


### Remove digits

In [None]:
# Remove all digits from each element of 'input_text' (tqdm shows a progress bar).
# The pattern is a raw string so that '\d' reaches the regex engine as written
# instead of being treated as an (invalid) string escape sequence.
input_text = [re.sub(r"\d+", "", i) for i in tqdm(input_text)]


### Remove more than one consecutive instance of 'x'

In [None]:
# Delete every run of two or more consecutive 'x' characters from each text
# (tqdm shows a progress bar)
input_text = [re.sub(r'[x]{2,}', "", text) for text in tqdm(input_text)]


### Replace multiple spaces with a single space

In [None]:
# Collapse each run of spaces into one space (tqdm shows a progress bar)
input_text = [re.sub(' +', ' ', text) for text in tqdm(input_text)]


### Tokenize the text

In [None]:
# Split every complaint text into word tokens (tqdm shows a progress bar)
tokens = [word_tokenize(text) for text in tqdm(input_text)]


### Take the first 20 tokens in each complaint text

In [None]:
# Make every token list exactly 'seq_len' items long (tqdm shows a progress bar):
#   - longer lists are truncated to their first 'seq_len' tokens
#   - shorter lists are left-padded with '<pad>'
# 'seq_len' from the configuration cell is used instead of a hard-coded
# length so the data always matches the model's input sequence length.
tokens = [
    text[:seq_len] if len(text) >= seq_len
    else ['<pad>'] * (seq_len - len(text)) + text
    for text in tqdm(tokens)
]


---

### Convert tokens to integer indices from vocabulary

In [None]:
def token_index(tokens, vocabulary, missing='<unk>'):
    """
    Convert lists of word tokens to lists of integer indices into a vocabulary.

    :param tokens: List of texts, each text being a list of word tokens.
    :param vocabulary: List of all words in the embeddings; a word's index
        is its position in this list (first occurrence if repeated).
    :param missing: Token to use for words not present in the vocabulary (default is '<unk>').

    :return: List of lists of integers, one inner list per text.

    :raises ValueError: If a token is absent from the vocabulary and
        'missing' is absent from it as well.
    """

    # Build a word -> index table once. Looking each token up in the list
    # directly ('in' / '.index') scans the whole vocabulary per token.
    # setdefault keeps the first position of a repeated word, matching list.index.
    lookup = {}
    for position, word in enumerate(vocabulary):
        lookup.setdefault(word, position)

    # Index used for out-of-vocabulary tokens (None if 'missing' is not in the vocabulary)
    missing_idx = lookup.get(missing)

    # Integer representation of every text
    idx_token = []

    # Iterate through each text in 'tokens' (tqdm shows a progress bar)
    for text in tqdm(tokens):
        idx_text = []
        for token in text:
            idx = lookup.get(token, missing_idx)
            if idx is None:
                # Same exception type that list.index raises for an absent item
                raise ValueError(f"{missing!r} is not in vocabulary")
            idx_text.append(idx)
        idx_token.append(idx_text)

    return idx_token


In [None]:
# Replace every word token with its integer index in 'vocabulary'
tokens = token_index(tokens, vocabulary)

### Save the tokens

In [None]:
# Persist the token indices to 'tokens_path'
save_file(tokens_path, tokens)

---

## Create attention model

In [None]:
class AttentionModel(nn.Module):
    """
    A neural network model that applies an attention mechanism to input data.

    Each sequence position is scored with a learned weight vector, the scores
    are normalised over the sequence with a softmax, and the attention-weighted
    sum of the inputs is passed to a linear classifier.

    :param vec_len: Length of input vectors (without the extra leading column).
    :param seq_len: Length of input sequences (stored for reference).
    :param n_classes: Number of output classes.
    """

    def __init__(self, vec_len, seq_len, n_classes):
        super().__init__()

        # Keep the configured vector and sequence lengths
        self.vec_len = vec_len
        self.seq_len = seq_len

        # Attention weights of shape (vec_len + 1, 1): the first entry starts
        # at zero, the rest are random normal values scaled by 1/sqrt(vec_len).
        # nn.Parameter already marks the tensor as trainable.
        initial_weights = torch.cat([
            torch.zeros(1, 1),
            torch.randn(vec_len, 1) / torch.sqrt(torch.tensor(float(vec_len))),
        ])
        self.attn_weights = nn.Parameter(initial_weights)

        # Activation applied to the raw attention scores
        self.activation = nn.Tanh()

        # Softmax over the sequence dimension (dim=1)
        self.softmax = nn.Softmax(dim=1)

        # Linear layer for the final output
        self.linear = nn.Linear(vec_len + 1, n_classes)

    def forward(self, input_data):
        """
        Forward pass of the attention model.

        :param input_data: Input data (shape: batch_size x sequence length x (vec_len + 1)).

        :return: Model output (shape: batch_size x n_classes).
        """

        # One score per sequence position: (batch, seq, vec_len + 1) @ (vec_len + 1, 1)
        hidden = self.activation(torch.matmul(input_data, self.attn_weights))

        # Normalise the scores over the sequence; shape (batch, seq, 1)
        attn = self.softmax(hidden)

        # Broadcasting the trailing dimension of size 1 applies each position's
        # weight to all of its features, without tying the computation to a
        # fixed sequence length.
        attn_output = torch.sum(input_data * attn, dim=1)

        # Pass the attention-weighted sum through the linear layer
        return self.linear(attn_output)


## Create PyTorch dataset
---

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """
    PyTorch dataset that turns token indices into embedded inputs.

    :param tokens: Per-sample lists of row indices into 'embeddings'.
    :param embeddings: 2-D array of word embeddings, indexed by row.
    :param labels: Per-sample labels, aligned with 'tokens'.
    """

    def __init__(self, tokens, embeddings, labels):
        """
        Store the token indices, the embedding matrix and the labels.

        :param tokens: Per-sample lists of row indices into 'embeddings'.
        :param embeddings: 2-D array of word embeddings, indexed by row.
        :param labels: Per-sample labels, aligned with 'tokens'.
        """
        self.labels = labels
        self.embeddings = embeddings
        self.tokens = tokens

    def __len__(self):
        """
        :return: Number of samples, i.e. the number of token lists.
        """
        return len(self.tokens)

    def __getitem__(self, idx):
        """
        Build the model input for one sample.

        :param idx: Index of the sample to retrieve.

        :return: Tuple ``(label, input)`` where ``input`` holds one row per
            token: a leading 1 followed by that token's embedding.
        """
        # Look up the embedding row of every token in the sample
        rows = self.embeddings[self.tokens[idx], :]
        emb = torch.tensor(rows)

        # Column of ones that is placed in front of the embeddings
        ones_col = torch.ones(emb.shape[0], 1)
        input_ = torch.cat((ones_col, emb), dim=1)

        label = torch.tensor(self.labels[idx])
        return label, input_


### Function to train the model

In [None]:
def train(train_loader, valid_loader, model, criterion, optimizer, 
          device, num_epochs, model_path):
    """
    Train a deep learning model and keep the weights with the best validation loss.

    After every epoch the mean training and validation losses are printed,
    and the model's state dict is saved to 'model_path' whenever the
    validation loss improves on the best value seen so far.

    :param train_loader: Data loader for the training dataset; yields (labels, data) batches.
    :param valid_loader: Data loader for the validation dataset; yields (labels, data) batches.
    :param model: The neural network model to be trained.
    :param criterion: Loss function to compute training and validation loss.
    :param optimizer: Optimizer for updating model parameters.
    :param device: Device for training (e.g., CUDA or CPU).
    :param num_epochs: Number of training epochs.
    :param model_path: Path to save the trained model.
    """
    best_loss = float("inf")

    # Loop over the specified number of training epochs
    for i in range(num_epochs):
        print(f"Epoch {i+1} of {num_epochs}")
        valid_loss, train_loss = [], []

        # Set the model to training mode
        model.train()

        # Training loop
        for batch_labels, batch_data in tqdm(train_loader):
            # Move data to the specified device (e.g., GPU)
            batch_labels = batch_labels.to(device)
            batch_data = batch_data.to(device)

            # Forward pass. Reshape to (batch, n_classes) explicitly: a plain
            # squeeze would also drop the batch dimension of a batch of size 1.
            batch_output = model(batch_data)
            batch_output = batch_output.reshape(batch_labels.shape[0], -1)

            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            train_loss.append(loss.item())

            # Zero the gradients
            optimizer.zero_grad()

            # Backward pass and gradient update
            loss.backward()
            optimizer.step()

        # Set the model to evaluation mode
        model.eval()

        # Validation loop; no gradients are needed, so skip graph construction
        with torch.no_grad():
            for batch_labels, batch_data in tqdm(valid_loader):
                # Move data to the specified device (e.g., GPU)
                batch_labels = batch_labels.to(device)
                batch_data = batch_data.to(device)

                # Forward pass, shaped to (batch, n_classes) as in training
                batch_output = model(batch_data)
                batch_output = batch_output.reshape(batch_labels.shape[0], -1)

                # Calculate loss
                loss = criterion(batch_output, batch_labels)
                valid_loss.append(loss.item())

        # Compute and print average training and validation loss
        t_loss = np.mean(train_loss)
        v_loss = np.mean(valid_loss)
        print(f"Train Loss: {t_loss}, Validation Loss: {v_loss}")

        if v_loss < best_loss:
            best_loss = v_loss

            # Save the model if validation loss improves
            torch.save(model.state_dict(), model_path)

        print(f"Best Validation Loss: {best_loss}")


### Function to test the model

In [None]:
def test(test_loader, model, criterion, device):
    """
    Test a trained deep learning model on a test dataset.

    Prints the test loss and accuracy, both averaged over samples so that a
    smaller final batch does not carry the same weight as a full batch.

    :param test_loader: Data loader for the test dataset; yields (labels, data) batches.
    :param model: The trained neural network model to be tested.
    :param criterion: Loss function to compute test loss. It is assumed to
        return the mean loss of a batch (the default reduction of
        CrossEntropyLoss) -- confirm if a different reduction is used.
    :param device: Device for testing (e.g., CUDA or CPU).
    """
    # Set the model to evaluation mode
    model.eval()

    # Running totals over all samples
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Evaluation needs no gradients
    with torch.no_grad():
        for batch_labels, batch_data in tqdm(test_loader):
            # Move data to the specified device (e.g., GPU)
            batch_labels = batch_labels.to(device)
            batch_data = batch_data.to(device)
            batch_size = batch_labels.shape[0]

            # Forward pass. Reshape to (batch, n_classes) explicitly: a plain
            # squeeze would also drop the batch dimension of a batch of size 1.
            batch_output = model(batch_data).reshape(batch_size, -1)

            # Accumulate the batch loss weighted by the batch size
            loss = criterion(batch_output, batch_labels)
            total_loss += loss.item() * batch_size

            # Count correct predictions; .item() returns a Python number
            # regardless of which device the tensors live on
            batch_preds = torch.argmax(batch_output, dim=1)
            total_correct += (batch_preds == batch_labels).sum().item()
            total_samples += batch_size

    # Compute and print the average test loss and accuracy
    if total_samples:
        test_loss = total_loss / total_samples
        test_accu = total_correct / total_samples
    else:
        # Empty loader: nothing to average
        test_loss = test_accu = float("nan")
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accu}")


## Train attention model
---

### Load the files

In [None]:
# Reload the artifacts written by the preprocessing sections

# Vocabulary and the embedding matrix
vocabulary = load_file(vocabulary_path)
embeddings = load_file(embeddings_path)

# Token indices and their encoded labels
tokens = load_file(tokens_path)
labels = load_file(labels_path)

# Label encoder; the number of classes is the number of labels it knows
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)


### Split data into train, validation and test sets

In [None]:
# Split the data into training and testing sets, reserving 20% for testing.
# A fixed random_state makes the split reproducible, so the saved model can
# later be evaluated on the same held-out samples.
X_train, X_test, y_train, y_test = train_test_split(
    tokens, labels, test_size=0.2, random_state=42
)

# Further split the training data, reserving 25% of it for validation
# (i.e. 60% / 20% / 20% of the full data for train / validation / test)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)


### Create PyTorch datasets

In [None]:
# Build the training, validation and test datasets; each pairs one split's
# token indices and labels with the shared 'embeddings' matrix
train_dataset, valid_dataset, test_dataset = (
    TextDataset(split_tokens, embeddings, split_labels)
    for split_tokens, split_labels in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)


### Create data loaders

In [None]:
# Training loader: batches of 16, reshuffled every epoch, with the last
# incomplete batch dropped
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)

# Validation loader: batches of 16, in dataset order (the last batch may be smaller)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)

# Test loader: batches of 16, in dataset order (the last batch may be smaller)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)


### Create model object

In [None]:
# Pick the computing device: the first GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# Create the attention model for vectors of length 'vec_len', sequences of
# length 'seq_len' and 'num_classes' output classes
model = AttentionModel(vec_len, seq_len, num_classes)


### Move the model to GPU if available

In [None]:
# Move the model to the device selected above (the GPU if available,
# otherwise the CPU), so that the model and the batches moved with
# '.to(device)' during training always end up on the same device
model = model.to(device)


### Define loss function and optimizer

In [None]:
# Loss criterion for multi-class classification (CrossEntropyLoss)
criterion = torch.nn.CrossEntropyLoss()

# Adam optimizer over all model parameters, using the learning rate 'lr'
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


### Training loop

In [None]:
# Train the model for 'num_epochs' epochs; the weights with the best
# validation loss are saved to 'model_path'
train(train_loader, valid_loader, model, criterion, optimizer, device, num_epochs, model_path)


### Test the model 

In [None]:
# Evaluate the model on the test data loader and print its loss and accuracy
test(test_loader, model, criterion, device)


## Predict on new text
---

In [None]:
# Sample complaint narrative used to demonstrate prediction on new text
input_text = '''I am a victim of Identity Theft & currently have an Experian account that 
I can view my Experian Credit Report and getting notified when there is activity on 
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9 
hours on the phone with Experian. Every time I call I get transferred repeatedly and 
then my last transfer and automated message states to press 1 and leave a message and 
someone would call me. Every time I press 1 I get an automatic message stating than you 
before I even leave a message and get disconnected. I call Experian again, explain what 
is happening and the process begins again with the same end result. I was trying to have 
this issue attended and resolved informally but I give up after 9 hours. There are hard 
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall 
and I respectfully request that Experian remove the hard hit inquiries immediately just 
like they've done in the past when I was able to speak to a live Experian representative 
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX 
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX 
XX/XX/XXXX'''

### Process input text

In [None]:
# Apply the same cleaning steps that were used on the training text

# Convert input text to lowercase
input_text = input_text.lower()

# Replace non-alphanumeric characters (excluding single quotes and whitespace) with spaces
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)

# Remove all digits from the text. The pattern is a raw string so that '\d'
# is not treated as an (invalid) string escape sequence.
input_text = re.sub(r"\d+", "", input_text)

# Remove consecutive occurrences of 'x' (two or more) from the text
input_text = re.sub(r'[x]{2,}', "", input_text)

# Replace multiple consecutive spaces with a single space
input_text = re.sub(' +', ' ', input_text)

# Tokenize the preprocessed input text
tokens = word_tokenize(input_text)


In [None]:
# Make 'tokens' exactly 'seq_len' items long, the same way the training data
# was prepared: keep only the first 'seq_len' tokens, then left-pad with
# '<pad>' if the text is shorter than that
tokens = tokens[:seq_len]
tokens = ['<pad>'] * (seq_len - len(tokens)) + tokens


In [None]:
# Convert 'tokens' into a list of integers by looking up each token in the
# 'vocabulary'; tokens that are not found map to the index of '<unk>'.
# The helper used for the training data is reused (it works on a list of
# texts, hence the wrapping list) so both paths share one implementation.
idx_token = token_index([tokens], vocabulary)[0]


In [None]:
# Select the rows of the 'embeddings' matrix given by the 'idx_token' indices
token_emb = embeddings[idx_token, :]

# Keep at most the first 'seq_len' rows
token_emb = token_emb[:seq_len, :]

# Convert the 'token_emb' NumPy array to a PyTorch tensor
inp = torch.from_numpy(token_emb)


In [None]:
# Put a column of ones in front of the embeddings (concatenation along dim 1),
# as TextDataset does for the training samples
inp = torch.cat((torch.ones(inp.shape[0], 1), inp), dim=1)


In [None]:
# Pick the computing device: the first GPU when CUDA is available, else the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() 
                      else "cpu")

In [None]:
# Move the 'inp' tensor to the selected device (e.g., GPU)
inp = inp.to(device)

# Add a leading batch dimension of size 1
inp = torch.unsqueeze(inp, 0)


In [None]:
# Load the fitted label encoder saved during preprocessing
label_encoder = load_file(label_encoder_path)

# The number of classes is the number of labels the encoder knows
num_classes = len(label_encoder.classes_)


In [None]:
# Create an instance of the 'AttentionModel' with specified parameters
model = AttentionModel(vec_len, seq_len, num_classes)

# Load the trained model weights from 'model_path'. map_location lets
# weights that were saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

# Move the model to the same device as the input tensor
model = model.to(device)

# Switch to evaluation mode for inference
model.eval()

# Perform a forward pass without tracking gradients
with torch.no_grad():
    out = torch.squeeze(model(inp))

# The predicted class is the one with the highest output score;
# .item() turns the index tensor into a plain Python int
prediction = label_encoder.classes_[torch.argmax(out).item()]

# Print the predicted class
print(f"Predicted Class: {prediction}")


---