In [None]:
# Importing Libraries
import os
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from nltk.tokenize import word_tokenize

In [None]:
import nltk
# word_tokenize needs the Punkt sentence tokenizer data. Newer NLTK releases
# load the 'punkt_tab' resource, older ones the pickled 'punkt' resource, so
# both are requested; a resource unknown to the installed version is reported
# and skipped.
nltk.download('punkt')
nltk.download('punkt_tab')

---

### Functions to save and load files

In [None]:
def save_file(name, obj):
    """
    Serialize ``obj`` with pickle and write it to the file at path ``name``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(name, 'wb') as handle:
        handle.write(pickle.dumps(obj))


def load_file(name):
    """
    Load and return the object stored in the pickle file at path ``name``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    unpickling finishes or fails, instead of being left open until garbage
    collection.

    NOTE: unpickling can execute arbitrary code. Only load files that this
    project wrote itself or that come from a trusted source.
    """
    with open(name, "rb") as f:
        return pickle.load(f)

---

# Data Processing

In [None]:
# File locations and the text column used by the data-processing cells below
tokens_path = "Output/tokens.pkl"  # destination of the pickled token lists
file_path = "Input/complaints.csv"  # CSV file read into the `data` DataFrame
col_name = "Consumer complaint narrative"  # column that is cleaned and tokenized

In [None]:
# Load the CSV at file_path into a DataFrame
data = pd.read_csv(file_path)

In [None]:
# (rows, columns) of the data as loaded, before missing narratives are dropped
data.shape

---

### Drop missing values

In [None]:
# Keep only the rows whose narrative column (col_name) is present
data = data.dropna(subset=[col_name])


In [None]:
# (rows, columns) after rows with a missing narrative were removed
data.shape

In [None]:
# Select the narrative column (a pandas Series) as the input to the text-cleaning steps
input_text = data[col_name]


---

### Convert text to lower case

In [None]:
# Lowercase every narrative (tqdm shows progress); the result is a plain list of strings
input_text = [narrative.lower() for narrative in tqdm(input_text)]


### Remove punctuation except apostrophes

In [None]:
# Replace each run of characters that are not word characters, apostrophes or
# whitespace with a single space (tqdm shows progress)
input_text = [re.sub(r"[^\w\d'\s]+", " ", narrative) for narrative in tqdm(input_text)]


### Remove digits

In [None]:
# Delete every run of digits from each narrative (tqdm shows progress).
# The pattern is a raw string: in a normal string literal "\d" is an invalid
# escape sequence, which newer Python versions warn about.
input_text = [re.sub(r"\d+", "", i) for i in tqdm(input_text)]


### Remove 'xxxx' in text

In [None]:
# Delete every run of two or more consecutive 'x' characters, wherever it
# occurs in the text (tqdm shows progress)
input_text = [re.sub(r'[x]{2,}', "", narrative) for narrative in tqdm(input_text)]


### Remove additional spaces

In [None]:
# Collapse each run of space characters into a single space (tqdm shows progress)
input_text = [re.sub(' +', ' ', narrative) for narrative in tqdm(input_text)]


### Tokenize the text

In [None]:
# Tokenize only the first 100 cleaned narratives with NLTK's word_tokenize;
# `tokens` becomes a list with one token list per narrative (tqdm shows progress)
tokens = [word_tokenize(narrative) for narrative in tqdm(input_text[:100])]


### Save tokens

In [None]:
# Pickle the token lists to tokens_path
save_file(tokens_path, tokens)

---

# Data loader

In [None]:
k = 10  # number of negative samples drawn for each positive (center, context) pair
t = 1e-5  # numerator of the sqrt(t / count) weights used to sample negative words
context_window = 5  # number of tokens taken on each side of the center word

In [None]:
# Import necessary libraries and modules
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from Source.utils import save_file

# Define the SkipGramDataset class
class SkipGramDataset(torch.utils.data.Dataset):
    """
    Map-style dataset of skip-gram training pairs with on-the-fly negative sampling.

    Construction counts the tokens in ``input_data``, builds every
    (center word, context word) pair inside the context window, assigns each
    word an index by descending frequency, derives the negative-sampling
    distribution and pickles the two vocabulary mappings into ``out_path``.

    Attributes set by ``__init__``:
        k: number of negative samples returned with every item.
        context_window: number of tokens considered on each side of a center word.
        vocab_count: number of distinct tokens in the corpus.
        samples: DataFrame with columns "center_word" and "context_word".
        word2idx / idx2word: word <-> index mappings (index 0 = most frequent word).
        sampling_prob: array of negative-sampling probabilities, index-aligned
            with the vocabulary and proportional to sqrt(t / count).
    """

    def __init__(self, input_data, context_window=5, out_path="Output", t=1e-5, k=10):
        """
        Build the vocabulary, the positive pairs and the sampling distribution.

        Args:
            input_data: iterable of token lists, one list per document.
            context_window: tokens taken on each side of the center word.
            out_path: directory that receives word2idx.pkl and idx2word.pkl.
            t: numerator of the sqrt(t / count) sampling weights.
            k: number of negative samples per item.
        """
        self.k = k
        self.context_window = context_window

        # Token frequencies over the whole corpus
        print("Counting word tokens...")
        counter = Counter(token for document in tqdm(input_data) for token in document)
        self.vocab_count = len(counter)
        print(f"Unique words in the corpus: {self.vocab_count}")

        # Positive (center, context) pairs
        print("Creating data samples...")
        self.samples = self.positive_samples(input_data)

        # Index words by descending frequency
        print("Generating vocabulary...")
        ranked = counter.most_common(len(counter))
        self.word2idx = {word: rank for rank, (word, _) in enumerate(ranked)}
        self.idx2word = {rank: word for rank, (word, _) in enumerate(ranked)}

        # Weight each word by sqrt(t / count), then normalize to sum to 1
        print("Calculating sampling probabilities...")
        counts = np.array([count for _, count in ranked])
        unnormalized = np.sqrt(t / counts)
        self.sampling_prob = unnormalized / np.sum(unnormalized)

        # Persist the vocabulary mappings
        print("Saving files...")
        self.save_files(out_path)

    def __len__(self):
        """Return the number of positive (center, context) pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Return the training example stored at row ``idx``.

        Returns:
            A tuple ``(center, candidates)``: ``center`` is a scalar tensor with
            the center-word index; ``candidates`` is a 1-D tensor of length
            ``k + 1`` whose first entry is the true context-word index, followed
            by ``k`` freshly drawn negative-sample indices.
        """
        # Negatives are drawn first, on every access
        negatives = self.negative_samples()
        center_token = self.samples.loc[idx, "center_word"]
        context_token = self.samples.loc[idx, "context_word"]
        center_word = self.word2idx[center_token]
        context_word = self.word2idx[context_token]
        candidates = [context_word] + negatives
        return torch.tensor(center_word), torch.tensor(candidates)

    def positive_samples(self, input_data):
        """
        Build one row per (center word, context word) pair.

        Each document is padded with ``None`` on both sides so that every token
        has a full window; the padding entries are dropped again after the
        context lists have been exploded into individual rows.
        """
        width = self.context_window
        pairs = []
        for document in tqdm(input_data):
            padded = [None] * width + document + [None] * width
            for position, center in enumerate(document):
                left = padded[position:position + width]
                right = padded[position + width + 1:position + 2 * width + 1]
                pairs.append((center, left + right))
        frame = pd.DataFrame(pairs, columns=["center_word", "context_word"])
        return frame.explode("context_word").dropna().reset_index(drop=True)

    def negative_samples(self):
        """Draw ``k`` word indices (with replacement) according to ``sampling_prob``."""
        candidates = np.arange(self.vocab_count)
        drawn = np.random.choice(candidates, self.k, p=self.sampling_prob)
        return list(drawn)

    def save_files(self, out_path="Output"):
        """Pickle ``word2idx`` and ``idx2word`` into the directory ``out_path``."""
        for filename, mapping in (("word2idx.pkl", self.word2idx),
                                  ("idx2word.pkl", self.idx2word)):
            save_file(os.path.join(out_path, filename), mapping)


---

# Skip-Gram Model

In [None]:
# Number of dimensions of each word vector learned by the SkipGram model
embedding_size = 64

In [None]:
# Import necessary libraries and modules
import torch
import torch.nn as nn

# Define the SkipGram class
class SkipGram(nn.Module):
    """
    Skip-gram model that scores candidate context words for a center word.

    The model holds an input embedding table (``embeddings``) and an output
    weight matrix (``weights``) of shape ``(embedding_size, vocab_len)``.
    ``weights`` is registered as an ``nn.Parameter`` so that it is returned by
    ``model.parameters()`` (and therefore updated by the optimizer) and is
    moved together with the module by ``model.to(device)``.
    """

    def __init__(self, vocab_len, embedding_size=64):
        """
        Args:
            vocab_len: number of words in the vocabulary.
            embedding_size: dimensionality of each word vector.
        """
        super(SkipGram, self).__init__()

        # Input (center-word) embedding table
        self.embeddings = nn.Embedding(vocab_len, embedding_size)

        # Output (context-word) weight matrix, initialised from a standard normal
        self.weights = nn.Parameter(torch.empty(embedding_size, vocab_len))
        nn.init.normal_(self.weights)

        # Log-sigmoid applied to the center/candidate dot products
        self.out = nn.LogSigmoid()

    def forward(self, center_word, context_words):
        """
        Score every candidate context word against its center word.

        Args:
            center_word: integer tensor of shape ``(batch,)`` with center-word indices.
            context_words: integer tensor of shape ``(batch, n_candidates)`` with
                candidate indices.

        Returns:
            A tuple ``(log_scores, true_y)``: ``log_scores`` has shape
            ``(batch, n_candidates)`` and holds the log-sigmoid of each dot
            product; ``true_y`` is an all-zero int64 tensor of shape
            ``(batch,)`` on the same device as ``log_scores``.

        NOTE(review): with all-zero targets, a criterion such as ``nn.NLLLoss``
        only reads column 0 of ``log_scores``; the remaining candidate columns
        then do not contribute to the loss. Confirm this is intended.
        """
        embeddings_ = self.embeddings(center_word)          # (batch, emb)
        weights_ = self.weights[:, context_words]           # (emb, batch, n_candidates)
        output = torch.einsum('bi,ibo->bo', embeddings_, weights_)
        # Create the targets on the output's device so the loss works on GPU too
        true_y = torch.zeros(output.shape[0], dtype=torch.int64, device=output.device)
        return self.out(output), true_y

    def save_files(self, out_path="Output"):
        """Pickle the embedding layer (emb.pkl) and output weights (weights.pkl) into ``out_path``."""
        save_file(os.path.join(out_path, "emb.pkl"), self.embeddings)
        save_file(os.path.join(out_path, "weights.pkl"), self.weights)


# Training

In [None]:
# Number of negative samples per positive pair
# (repeats the value set in the "Data loader" section; keep the two in sync)
k = 10

# Learning rate for model training
lr = 0.01

# Number of training epochs
num_epochs = 2

# Number of samples per training batch
batch_size = 128

# Tokens taken on each side of the center word
# (repeats the value set in the "Data loader" section; keep the two in sync)
context_window = 5

# Directory passed to SkipGramDataset for its pickled output files
out_path = "Output"


In [None]:
import torch

# Train on the first CUDA GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
def train_sg(dataloader, model, criterion, optimizer, device, num_epochs, max_patience=5):
    """
    Train a skip-gram model and save its files when training ends.

    Args:
        dataloader: iterable yielding ``(center_word, context_words)`` tensor batches.
        model: module whose forward returns ``(output, true_y)`` and which
            provides a ``save_files()`` method.
        criterion: loss callable invoked as ``criterion(output, true_y)``.
        optimizer: optimizer over the model's parameters.
        device: device on which the model and the batches are placed.
        num_epochs: maximum number of passes over ``dataloader``.
        max_patience: number of consecutive epochs without an improvement of
            the mean epoch loss after which training stops early.
    """
    # Keep the model on the same device as the batches moved below
    model.to(device)

    # Set the model in training mode
    model.train()

    # Track the best mean epoch loss and the epochs elapsed since it improved
    best_loss = float("inf")
    patience = 0

    for i in range(num_epochs):
        batch_losses = []
        print(f"Epoch {i+1} of {num_epochs}")

        for center_word, context_words in tqdm(dataloader):
            center_word = center_word.to(device)
            context_words = context_words.to(device)

            # Forward pass: compute model output and loss
            output, true_y = model(center_word, context_words)
            # Targets must live on the same device as the output
            loss = criterion(output, true_y.to(output.device))

            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        # Average loss over the batches of this epoch
        epoch_loss = np.mean(batch_losses)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience = 0
        else:
            patience += 1

        print(f"Loss: {epoch_loss}")

        # Actually stop once the loss has not improved for max_patience epochs
        if patience >= max_patience:
            print("Early stopping...")
            break

    # Save model files after training (also after an early stop)
    model.save_files()


In [None]:
# Build the skip-gram dataset from the token lists
dataset = SkipGramDataset(input_data=tokens,  # One token list per narrative
                          context_window=context_window,  # Tokens taken on each side of the center word
                          out_path=out_path,  # Directory that receives word2idx.pkl and idx2word.pkl
                          t=t,  # Numerator of the sqrt(t / count) negative-sampling weights
                          k=k)  # Number of negative samples per item


In [None]:
# Wrap the dataset in a DataLoader; each batch is unpacked by train_sg as (center_word, context_words)
dataloader = torch.utils.data.DataLoader(dataset,  # The dataset to load
                                         batch_size=batch_size,  # Samples per batch
                                         shuffle=True,  # Reshuffle the samples every epoch
                                         drop_last=True)  # Drop the last batch if it's incomplete


In [None]:
# Create the SkipGram model; the vocabulary size is the dataset's count of unique tokens
model = SkipGram(dataset.vocab_count, embedding_size=embedding_size)


In [None]:
# Initialize the loss criterion
# NOTE(review): SkipGram.forward returns all-zero targets, so NLLLoss only reads
# column 0 of the output (the true context word); the negative-sample columns
# do not enter the loss. Confirm this is the intended objective.
criterion = nn.NLLLoss()

# Initialize the optimizer with the model parameters and the configured learning
# rate `lr` (previously the value was hard-coded here and `lr` was ignored)
optimizer = optim.Adam(model.parameters(), lr=lr)


In [None]:
# Train the Skip-gram model for up to num_epochs epochs; train_sg calls model.save_files() when it finishes
train_sg(dataloader, model, criterion, optimizer, device, num_epochs)


---

# Using embeddings to get word vectors

In [None]:
# Load the word-to-index dictionary pickled by SkipGramDataset.save_files
word2idx = load_file("Output/word2idx.pkl")


In [None]:
# Vocabulary index assigned to the token "payments"
word2idx["payments"]

In [None]:
# Load the pickled embedding layer written by SkipGram.save_files
embeddings = load_file("Output/emb.pkl")

In [None]:
# Fetch the learned vector for "payments" through its vocabulary index instead
# of a hard-coded id, which goes stale whenever the vocabulary is rebuilt
embeddings(torch.tensor(word2idx["payments"]))

---