###  text classification model 

 model used to automatically categorize text documents into predefined classes or categories based on their content. The goal of text classification is to assign one or more labels to each document, indicating its category or class.

 Example - Transformers sentiment-analysis models

In [1]:
# Importing necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read input dataset and in this dataset label represents ->
# 1-> positive comment
# 0-> negitive comment
# 2-> neutral comment
import json
with open("./support_files/text_classification_dataset.json") as f:
    initial_data = json.load(f)

# Seprating texts and labels
texts = [data["text"] for data in initial_data]
labels = [data["label"] for data in initial_data]

print("Texts : ",texts[:3], "\nLabels : ",labels[:3])

Texts :  ['This movie is great!', "I didn't like this film.", 'The acting was superb.'] 
Labels :  [1, 0, 1]


In [10]:
# Creating Tokenizations and vocabulary
# We can use transformers AutoTokenizer to create tokens instead as well!
word_to_idx = {}
idx_to_word = {}
idx = 0
for text in texts:
    for word in text.split():
        if word not in word_to_idx:
            word_to_idx[word] = idx
            idx_to_word[idx] = word
            idx += 1

# Convert texts to sequences of word indices
sequences = [[word_to_idx[word] for word in text.split()] for text in texts]
print("Tokens : ",sequences[:3])

# Padding sequences to the same length to help with efficient training {Added 0 to eralize the length}
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = [seq + [0] * (max_seq_length - len(seq)) for seq in sequences]
print("Max Length : ",max_seq_length,"\nPadding Example : ",padded_sequences[:3])

# Convert data to PyTorch tensors
# Tensors are multi-dimensional arrays used to represent data in deep learning models. 
# They have a rank, shape, and data type, and support various mathematical operations. 
# Tensors are fundamental to building and training neural networks, 
# serving as the main data structure for input data and model parameters.
X = torch.tensor(padded_sequences, dtype=torch.long)
y = torch.tensor(labels, dtype=torch.long)
print("X : ",X[:3],"\ny : ",y[:3])

# Creating a simple dataset class
# This is torch dataset which we will use to train torch model
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Split data into train and test sets using sklearn train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# now we will create data loaders and datasets
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Model definition
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2):
        super(TextClassifier, self).__init__()
        # This is first layer which is creating embedings, 
        # We can use sentence_transformers and directly pass embedings to prevent overhead
        self.embedding = nn.Embedding(vocab_size, embedding_dim) 
        # this is 2nd layer which is LSTM (Long Short-Term Memory) based on RNN
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout) 
        # this is fully connected layer it performs a linear transformation on the input data.
        # It consists of weights and biases that are learned during the training process.
        # The linear layer projects the output of the LSTM layer to the desired output dimension (output_dim), which represents the number of classes in the classification task.
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Dropout is a regularization technique used to prevent overfitting in neural networks by randomly dropping (zeroing out) a proportion of input units during training.
        self.dropout = nn.Dropout(dropout)

    # In PyTorch neural network modules, 
    # the forward method defines the computation performed when the module is called with input data. 
    # It outlines how the input data flows through the layers of the network to produce the output.
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Take the output from the last timestep
        output = output[:, -1, :]
        output = self.dropout(output)
        output = self.fc(output)
        return output
    
TextClassifier

Tokens :  [[0, 1, 2, 3], [4, 5, 6, 7, 8], [9, 10, 11, 12]]
Max Length :  10 
Padding Example :  [[0, 1, 2, 3, 0, 0, 0, 0, 0, 0], [4, 5, 6, 7, 8, 0, 0, 0, 0, 0], [9, 10, 11, 12, 0, 0, 0, 0, 0, 0]]
X :  tensor([[ 0,  1,  2,  3,  0,  0,  0,  0,  0,  0],
        [ 4,  5,  6,  7,  8,  0,  0,  0,  0,  0],
        [ 9, 10, 11, 12,  0,  0,  0,  0,  0,  0]]) 
y :  tensor([1, 0, 1])


__main__.TextClassifier

In [22]:
# Model parameters
vocab_size = len(word_to_idx)
embedding_dim = 100 # Vectors dimenssions
hidden_dim = 128
output_dim = 3  # Three classes: positive 1, negative 0, neutral 2
num_layers = 2  # Increase number of LSTM layers
dropout = 0.5  # Add dropout for regularization

# Instantiate the model
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=num_layers, dropout=dropout)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
# An epoch is a single iteration through the entire training dataset during neural network training. 
# It involves forward and backward passes, where the model makes predictions, computes the loss, and updates parameters. 
# Multiple epochs are typically used to ensure the model learns from the entire dataset multiple times for better convergence and performance.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    confusion_matrix = np.zeros((output_dim, output_dim), dtype=int)
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
            for t, p in zip(target.view(-1), predicted.view(-1)):
                confusion_matrix[t.long(), p.long()] += 1

    print(f"Test Accuracy: {100 * correct / total:.2f}%")
    # print("Confusion Matrix:\n", confusion_matrix)
    print()


Epoch [1/10], Loss: 1.0219
Test Accuracy: 28.57%

Epoch [2/10], Loss: 0.8568
Test Accuracy: 28.57%

Epoch [3/10], Loss: 0.8280
Test Accuracy: 28.57%

Epoch [4/10], Loss: 0.7503
Test Accuracy: 28.57%

Epoch [5/10], Loss: 0.6591
Test Accuracy: 42.86%

Epoch [6/10], Loss: 0.5694
Test Accuracy: 42.86%

Epoch [7/10], Loss: 0.4380
Test Accuracy: 42.86%

Epoch [8/10], Loss: 0.1532
Test Accuracy: 57.14%

Epoch [9/10], Loss: 0.0672
Test Accuracy: 57.14%

Epoch [10/10], Loss: 0.0507
Test Accuracy: 28.57%



In [23]:
model.eval()
# inference
# to use the trained model
# For example, to predict the sentiment of a new text
def predict_sentiment(text):
    # Preprocess the text (tokenization, padding, etc.)
    sequence = [word_to_idx.get(word, 0) for word in text.split()]  # Assign index 0 for OOV words
    padded_sequence = sequence + [0] * (max_seq_length - len(sequence))  # Pad sequence
    input_tensor = torch.tensor([padded_sequence], dtype=torch.long)

    # Pass the input tensor through the model
    with torch.no_grad():
        output = model(input_tensor)

    # Get predicted label
    _, predicted = torch.max(output, 1)
    predicted_label = "Positive" if predicted.item() else ("Neutral" if predicted.item()==2 else "Negative")
    return predicted_label

# Example usage
new_text = "It was good."
predicted_sentiment = predict_sentiment(new_text)
print(predicted_sentiment, ": ", new_text)

new_text = "It was a terrible experience."
predicted_sentiment = predict_sentiment(new_text)
print(predicted_sentiment, ": ", new_text)

Positive :  It was good.
Negative :  It was a terrible experience.


In [24]:
model_path = "trained_data/base_text_classifier_model.pth"

# Save the model state dict
torch.save(model.state_dict(), model_path)

# Instantiate the model
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=num_layers, dropout=dropout)

# Load the saved model state dict
model.load_state_dict(torch.load(model_path))
model.eval()  # Set the model to evaluation mode to text it later


TextClassifier(
  (embedding): Embedding(103, 100)
  (rnn): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### Skipping Embeddings?

We can skip embedding layer in model by passing embeddings in our model directly



In [7]:
# We can create a model with no embeding step and use sentence transformer to help us creating embeding instead
from sentence_transformers import SentenceTransformer

emb_model = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", device="cuda")
print(emb_model)
sentence_embeddings = emb_model.encode(texts, batch_size=8)
print("Sample : ",sentence_embeddings[0][:3])
print("Embedding size : ",sentence_embeddings.shape[1])

# Create tensor and datasets
# We can use previous TextDataset(...)
X = torch.tensor(sentence_embeddings, dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.long)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create data loaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Sample :  [-0.06804912 -0.0057736  -0.00555508]
Embedding size :  384


In [8]:
# Model definition without embedding layer
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.2):
        super(TextClassifier, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        output, _ = self.rnn(x.unsqueeze(1))  # Add an extra dimension for sequence length
        # Take the output from the last timestep
        output = output[:, -1, :]
        output = self.dropout(output)
        output = self.fc(output)
        return output
    

# Model parameters
input_dim = sentence_embeddings.shape[1]
hidden_dim = 128
output_dim = 3  # Three classes: positive, negative, neutral
num_layers = 2  # Increase number of LSTM layers
dropout = 0.5  # Add dropout for regularization

# Instantiate the model
model = TextClassifier(input_dim, hidden_dim, output_dim, num_layers=num_layers, dropout=dropout)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    confusion_matrix = np.zeros((output_dim, output_dim), dtype=int)
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
            for t, p in zip(target.view(-1), predicted.view(-1)):
                confusion_matrix[t.long(), p.long()] += 1

    print(f"Test Accuracy: {100 * correct / total:.2f}%")
    # print("Confusion Matrix:\n", confusion_matrix)
    print()

Epoch [1/5], Loss: 1.0604
Test Accuracy: 57.14%

Epoch [2/5], Loss: 1.0396
Test Accuracy: 57.14%

Epoch [3/5], Loss: 0.9936
Test Accuracy: 57.14%

Epoch [4/5], Loss: 0.9557
Test Accuracy: 42.86%

Epoch [5/5], Loss: 0.8775
Test Accuracy: 42.86%



In [9]:
# Define the inference function
def predict_sentiment(text):
    # Encode input text
    input_embedding = emb_model.encode([text])
    input_tensor = torch.tensor(input_embedding, dtype=torch.float32)

    # Forward pass through the model
    output = model(input_tensor)

    # Get predicted class
    _, predicted_class = torch.max(output, 1)

    return predicted_class.item()

# Example usage
new_text = "It is a good movie"
predicted_sentiment = predict_sentiment(new_text)
print("Predicted sentiment:", predicted_sentiment)

Predicted sentiment: 1
