# NLP: Text Classification

In [4]:
# import the required libraries
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
# define configuration file paths
lr = 0.0001
input_size = 50
num_epochs = 50
hidden_size = 50
label_col = "products"
tokens_path = "Output/tokens.pkl"
labels_path = "Output/labels.pkl"
data_path = "Input/ecommerceDataset.csv"
rnn_model_path = "Output/model_rnn.pth"
lstm_model_path = "Output/model_lstm.pth"
vocabulary_path = "Output/vocabulary.pkl"
embeddings_path = "Output/embeddings.pkl"
glove_vector_path = "Input/glove.6B.50d.txt"
text_col_name = "product_decsription"
label_encoder_path = "Output/label_encoder.pkl"

In [8]:
# define function for saving a file
def save_file(name, obj):
    """
    Function to save an object as pickle file
    """
    with open(name, 'wb') as f:
        pickle.dump(obj, f)

# define function for loading a file
def load_file(name):
    """
    Function to load a pickle object
    """
    return pickle.load(open(name, "rb"))

## Process glove embeddings

In [9]:
# open the glove embeddings file and read
with open(glove_vector_path, "rt") as f:
    emb = f.readlines()

In [10]:
# length of embeddings
len(emb)

400000

## Check the first record

In [11]:
# check first record
emb[0]

'the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581\n'

In [12]:
# split the first record and check for vocabulary
emb[0].split()[0]

'the'

In [13]:
# split the first record and check for embeddings
emb[0].split()[1:]

['0.418',
 '0.24968',
 '-0.41242',
 '0.1217',
 '0.34527',
 '-0.044457',
 '-0.49688',
 '-0.17862',
 '-0.00066023',
 '-0.6566',
 '0.27843',
 '-0.14767',
 '-0.55677',
 '0.14658',
 '-0.0095095',
 '0.011658',
 '0.10204',
 '-0.12792',
 '-0.8443',
 '-0.12181',
 '-0.016801',
 '-0.33279',
 '-0.1552',
 '-0.23131',
 '-0.19181',
 '-1.8823',
 '-0.76746',
 '0.099051',
 '-0.42125',
 '-0.19526',
 '4.0071',
 '-0.18594',
 '-0.52287',
 '-0.31681',
 '0.00059213',
 '0.0074449',
 '0.17778',
 '-0.15897',
 '0.012041',
 '-0.054223',
 '-0.29871',
 '-0.15749',
 '-0.34758',
 '-0.045637',
 '-0.44251',
 '0.18785',
 '0.0027849',
 '-0.18411',
 '-0.11514',
 '-0.78581']

## Separate embeddings and vocabulary

In [14]:
vocabulary, embeddings = [], []

for item in emb:
    vocabulary.append(item.split()[0])
    embeddings.append(item.split()[1:])

### Convert embeddings to numpy float array

In [15]:
embeddings = np.array(embeddings, dtype=np.float32)

In [16]:
embeddings.shape

(400000, 50)

### Add embeddings for padding and unknown items

In [17]:
vocabulary[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

In [18]:
vocabulary = ["<pad>", "<unk>"] + vocabulary

In [19]:
embeddings = np.vstack([np.ones(50, dtype=np.float32), np.mean(embeddings, axis=0),
                            embeddings])

In [20]:
print(len(vocabulary), embeddings.shape)

400002 (400002, 50)


### Save embeddings and vocabulary

In [21]:
save_file(embeddings_path, embeddings)
save_file(vocabulary_path, vocabulary)

## Process text data

### Read the data file

In [22]:
data = pd.read_csv(data_path, header=None)

In [23]:
data.set_axis(['products','product_decsription'], axis="columns", inplace = True)

In [24]:
data

Unnamed: 0,products,product_decsription
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou..."


In [25]:
data.products.value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: products, dtype: int64

### Drop rows where the text column is empty

In [26]:
data.dropna(subset=[text_col_name], inplace=True)

### Encode the label column and save the encoder and encoded labels

In [27]:
label_encoder = LabelEncoder()
label_encoder.fit(data[label_col])
labels = label_encoder.transform(data[label_col])

In [28]:
labels[0]

3

In [29]:
label_encoder.classes_

array(['Books', 'Clothing & Accessories', 'Electronics', 'Household'],
      dtype=object)

In [30]:
data[label_col]

0          Household
1          Household
2          Household
3          Household
4          Household
            ...     
50420    Electronics
50421    Electronics
50422    Electronics
50423    Electronics
50424    Electronics
Name: products, Length: 50424, dtype: object

In [31]:
save_file(labels_path, labels)
save_file(label_encoder_path, label_encoder)

### Process the text column

In [32]:
input_text = data[text_col_name]

### Convert text to lower case

In [33]:
input_text = [i.lower() for i in tqdm(input_text)]

100%|█████████████████████████████████| 50424/50424 [00:00<00:00, 213242.39it/s]


### Remove punctuations except apostrophe

In [34]:
input_text = [re.sub(r"[^\w\d'\s]+", " ", i) for i in tqdm(input_text)]

100%|██████████████████████████████████| 50424/50424 [00:01<00:00, 39648.23it/s]


### Remove digits

In [35]:
input_text = [re.sub("\d+", "", i) for i in tqdm(input_text)]

100%|██████████████████████████████████| 50424/50424 [00:00<00:00, 73058.11it/s]


### Remove more than one consecutive instance of 'x'

In [36]:
input_text = [re.sub(r'[x]{2,}', "", i) for i in tqdm(input_text)]

100%|█████████████████████████████████| 50424/50424 [00:00<00:00, 113562.20it/s]


### Replace multiple spaces with single space

In [37]:
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

100%|██████████████████████████████████| 50424/50424 [00:01<00:00, 31445.91it/s]


### Tokenize the text

In [38]:
tokens = [word_tokenize(t) for t in tqdm(input_text)]

100%|███████████████████████████████████| 50424/50424 [00:21<00:00, 2368.00it/s]


### Take the first 20 tokens in each complaint text

In [39]:
tokens = [i[:20] if len(i) > 19 else ['<pad>'] * (20 - len(i)) + i for i in tqdm(tokens)]

100%|█████████████████████████████████| 50424/50424 [00:00<00:00, 611383.89it/s]


### Convert tokens to integer indices from vocabulary

In [40]:
def token_index(tokens, vocabulary, missing='<unk>'):
    """
    :param tokens: List of word tokens
    :param vocabulary: All words in the embeddings
    :param missing: Token for words not present in the vocabulary
    :return: List of integers representing the word tokens
    """
    idx_token = []
    for text in tqdm(tokens):
        idx_text = []
        for token in text:
            if token in vocabulary:
                idx_text.append(vocabulary.index(token))
            else:
                idx_text.append(vocabulary.index(missing))
        idx_token.append(idx_text)
    return idx_token

In [41]:
tokens = token_index(tokens, vocabulary)

100%|█████████████████████████████████████| 50424/50424 [28:19<00:00, 29.67it/s]


In [42]:
len(tokens)

50424

In [43]:
tokens[0]

[1309,
 1315,
 1254,
 11878,
 1017,
 5664,
 25892,
 285,
 22359,
 762,
 10472,
 1587,
 6523,
 210,
 5,
 3714,
 118,
 62,
 8,
 11862]

In [122]:
data.head()

Unnamed: 0,products,product_decsription
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [45]:
vocabulary[tokens[0][0]]

'paper'

### Save the tokens

In [46]:
save_file(tokens_path, tokens)

## Create PyTorch Dataset

In [47]:
class TextDataset(torch.utils.data.Dataset):

    def __init__(self, tokens, embeddings, labels):
        """
        :param tokens: List of word tokens
        :param embeddings: Word embeddings (from glove)
        :param labels: List of labels
        """
        self.tokens = tokens
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.labels[idx], self.embeddings[self.tokens[idx], :]

## Create Models

### RNN Model

In [48]:
class RNNNetwork(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_classes):
        """
        :param input_size: Size of embedding
        :param hidden_size: Hidden vector size
        :param num_classes: Number of classes in the dataset
        """
        super(RNNNetwork, self).__init__()
        # RNN Layer
        self.rnn = torch.nn.RNN(input_size=input_size,
                                hidden_size=hidden_size,
                                batch_first=True)
        # Linear Layer
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_data):
        _, hidden = self.rnn(input_data)
        output = self.linear(hidden)
        return output

### LSTM Model

In [49]:
class LSTMNetwork(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_classes):
        """
        :param input_size: Size of embedding
        :param hidden_size: Hidden vector size
        :param num_classes: Number of classes in the dataset
        """
        super(LSTMNetwork, self).__init__()
        # LSTM Layer
        self.rnn = torch.nn.LSTM(input_size=input_size,
                                 hidden_size=hidden_size,
                                 batch_first=True)
        # Linear Layer
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_data):
        _, (hidden, _) = self.rnn(input_data)
        output = self.linear(hidden[-1])
        return output

### Define train function

In [50]:
def train(train_loader, valid_loader, model, criterion, optimizer, device,
          num_epochs, model_path):
    """
    Function to train the model
    :param train_loader: Data loader for train dataset
    :param valid_loader: Data loader for validation dataset
    :param model: Model object
    :param criterion: Loss function
    :param optimizer: Optimizer
    :param device: CUDA or CPU
    :param num_epochs: Number of epochs
    :param model_path: Path to save the model
    """
    best_loss = 1e8
    for i in range(num_epochs):
        print(f"Epoch {i+1} of {num_epochs}")
        valid_loss, train_loss = [], []
        model.train()
        # Train loop
        for batch_labels, batch_data in tqdm(train_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_labels = batch_labels.type(torch.LongTensor)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            train_loss.append(loss.item())
            optimizer.zero_grad()
            # Backward pass
            loss.backward()
            # Gradient update step
            optimizer.step()
        model.eval()
        # Validation loop
        for batch_labels, batch_data in tqdm(valid_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_labels = batch_labels.type(torch.LongTensor)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            batch_output = torch.squeeze(batch_output)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            valid_loss.append(loss.item())
        t_loss = np.mean(train_loss)
        v_loss = np.mean(valid_loss)
        print(f"Train Loss: {t_loss}, Validation Loss: {v_loss}")
        if v_loss < best_loss:
            best_loss = v_loss
            # Save model if validation loss improves
            torch.save(model.state_dict(), model_path)
        print(f"Best Validation Loss: {best_loss}")

### Define test function

In [51]:
def test(test_loader, model, criterion, device):
    """
    Function to test the model
    :param test_loader: Data loader for test dataset
    :param model: Model object
    :param criterion: Loss function
    :param device: CUDA or CPU
    """
    model.eval()
    test_loss = []
    test_accu = []
    for batch_labels, batch_data in tqdm(test_loader):
        # Move data to device
        batch_labels = batch_labels.to(device)
        batch_labels = batch_labels.type(torch.LongTensor)
        batch_data = batch_data.to(device)
        # Forward pass
        batch_output = model(batch_data)
        batch_output = torch.squeeze(batch_output)
        # Calculate loss
        loss = criterion(batch_output, batch_labels)
        test_loss.append(loss.item())
        batch_preds = torch.argmax(batch_output, axis=1)
        # Move predictions to CPU
        if torch.cuda.is_available():
            batch_labels = batch_labels.cpu()
            batch_preds = batch_preds.cpu()
        # Compute accuracy
        test_accu.append(accuracy_score(batch_labels.detach().numpy(),
                                        batch_preds.detach().numpy()))
    test_loss = np.mean(test_loss)
    test_accu = np.mean(test_accu)
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accu}")

### Train RNN Model

### Load the files

In [52]:
tokens = load_file(tokens_path)
labels = load_file(labels_path)
embeddings = load_file(embeddings_path)
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

### Split data into train, validation and test sets

In [53]:
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                    test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.25)

### Create PyTorch datasets

In [93]:
train_dataset = TextDataset(X_train, embeddings, y_train)
valid_dataset = TextDataset(X_valid, embeddings, y_valid)
test_dataset = TextDataset(X_test, embeddings, y_test)

### Create data loaders¶

In [92]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16,
                                           shuffle=True, drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

### Create model object

In [56]:
model = RNNNetwork(input_size, hidden_size, num_classes)

### Move the model to GPU if available

In [57]:
if torch.cuda.is_available():
    model = model.cuda()

### Define loss function and optimizer

In [67]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

RNNNetwork(
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=4, bias=True)
)

### Training loop

In [105]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, rnn_model_path)

### Train LSTM Model

In [100]:
model = LSTMNetwork(input_size, hidden_size, num_classes).to(device)

In [101]:
if torch.cuda.is_available():
    model = model.cuda()

In [102]:
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [106]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, lstm_model_path)

In [None]:
test(test_loader, model, criterion, device)

### Predict on new text

In [107]:
input_text = '''ART DIOR | Dancing Village Girls 
| Canvas Wall Art | Unframed Canvas Art Print | 18 inch x 46 inch |'''

### Process input text

In [108]:
input_text = input_text.lower()
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
input_text = re.sub("\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)
tokens = word_tokenize(input_text)

### Add padding if the length of tokens is less than 20

In [109]:
tokens = ['<pad>']*(20-len(tokens))+tokens

### Tokenize the input text

In [110]:
idx_token = []
for token in tokens:
    if token in vocabulary:
        idx_token.append(vocabulary.index(token))
    else:
        idx_token.append(vocabulary.index('<unk>'))

### Get embeddings for tokens

In [111]:
token_emb = embeddings[idx_token,:]

### Convert to torch tensor

In [112]:
inp = torch.from_numpy(token_emb)

### Move the tensor to GPU if available

In [113]:
inp = inp.to(device)

### Create a batch of one record

In [114]:
inp = torch.unsqueeze(inp, 0)

### Load label encoder

In [115]:
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

In [116]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### RNN prediction

In [119]:
# Create model object
model = RNNNetwork(input_size, hidden_size, num_classes)

# Load trained weights
model.load_state_dict(torch.load(rnn_model_path))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    
# Forward pass
out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")

### LSTM prediction

In [120]:
# Create model object
model = LSTMNetwork(input_size, hidden_size, num_classes)

# Load trained weights
model.load_state_dict(torch.load(lstm_model_path))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    
# Forward pass
out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted  Class: {prediction}")