In [4]:
!pip install torch torchtext scikit-learn pandas numpy -q

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict
from tqdm import tqdm

# Load the dataset
data = pd.read_csv(r'D:\projects\python\sentiment-analysis\sentiment analysis\cleaned_imdb_reviews.csv')  # Replace with your dataset path

reviews = data['review'].values
sentiments = data['sentiment'].values
print(set(sentiments))
# Load GloVe embeddings
embedding_dim = 100
glove_path = r'D:\projects\python\sentiment-analysis\sentiment analysis\GloVe\glove.6B\glove.6B.100d.txt'  # Replace with your GloVe file path
embeddings_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors.")

# Tokenize text and create vocabulary
word_to_index = defaultdict(lambda: len(word_to_index))  # Assign unique index to each word
word_to_index['<PAD>'] = 0  # Padding token
word_to_index['<UNK>'] = 1  # Unknown token

tokenized_reviews = []
for review in reviews:
    tokens = review.split()  # Simple whitespace tokenization
    tokenized_reviews.append([word_to_index[token] for token in tokens])

# Prepare embedding matrix
vocab_size = len(word_to_index)
embedding_matrix = torch.zeros((vocab_size, embedding_dim))

for word, index in word_to_index.items():
    if word in embeddings_index:
        embedding_matrix[index] = torch.tensor(embeddings_index[word], dtype=torch.float)
    else:
        embedding_matrix[index] = torch.randn(embedding_dim)  # Random initialization for unknown words

print("Embedding matrix prepared.")

# Pad sequences
max_sequence_length = 100
padded_reviews = pad_sequence(
    [torch.tensor(seq[:max_sequence_length]) for seq in tokenized_reviews],
    batch_first=True,
    padding_value=word_to_index['<PAD>']
)

print(f"Padded reviews shape: {padded_reviews.shape}")


{'negative', 'positive'}


Loading GloVe: 400000it [00:07, 56528.05it/s]


Loaded 400000 word vectors.
Embedding matrix prepared.
Padded reviews shape: torch.Size([50000, 100])


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMWithAttention(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout=0.5):
        super(LSTMWithAttention, self).__init__()

        # Embedding Layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(input_size=embedding_matrix.size(1),
                            hidden_size=hidden_dim,
                            num_layers=1,
                            bidirectional=True,
                            batch_first=True)

        # Attention Layer
        self.attn = nn.Linear(hidden_dim * 2, 1)

        # Dropout Layer
        self.dropout = nn.Dropout(dropout)

        # Fully Connected Layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def attention(self, lstm_out):
        """
        Attention mechanism to compute attention scores and weight LSTM outputs.
        """
        attn_weights = F.softmax(self.attn(lstm_out), dim=1)
        # Apply attention weights to LSTM outputs
        attn_output = torch.sum(attn_weights * lstm_out, dim=1)
        return attn_output

    def forward(self, x):
        # Get the embedded representation
        embedded = self.embedding(x)

        # Pass through LSTM layer
        lstm_out, (h_n, c_n) = self.lstm(embedded)

        # Apply attention mechanism
        attn_out = self.attention(lstm_out)

        # Pass through dropout
        out = self.dropout(attn_out)

        # Pass through fully connected layer for classification
        output = self.fc(out)

        return output


In [16]:
# Hyperparameters
embedding_dim = 100  # GloVe embedding dimension
hidden_dim = 128  # LSTM hidden size
output_dim = 2  # Sentiment (positive or negative)
dropout = 0.5  # Dropout rate
batch_size = 64  # Batch size
lr = 1e-3  # Learning rate

# Initialize model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Training loop
model = LSTMWithAttention(embedding_matrix, hidden_dim=128, output_dim=2)  # Example values
model.to(device)


LSTMWithAttention(
  (embedding): Embedding(438731, 100)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (attn): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [10]:
from torch.utils.data import DataLoader, TensorDataset

# Assuming the sentiment values are strings like 'positive' and 'negative'
sentiment_map = {'positive': 1, 'negative': 0}
sentiments = data['sentiment'].map(sentiment_map).values

# Convert data to tensors
padded_reviews_tensor = padded_reviews  # Already padded
sentiments_tensor = torch.tensor(sentiments, dtype=torch.long)  # Sentiment labels

# Create a TensorDataset and DataLoader
dataset = TensorDataset(padded_reviews_tensor, sentiments_tensor)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [11]:
import torch.optim as optim
# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # For binary classification
optimizer = optim.Adam(model.parameters(), lr=lr)


In [17]:
# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU

        optimizer.zero_grad()  # Zero gradients

        # Forward pass
        outputs = model(inputs)

        # Calculate loss
        loss = criterion(outputs, labels)
        loss.backward()  # Backpropagation

        # Update weights
        optimizer.step()

        running_loss += loss.item()

        # Compute accuracy
        _, preds = torch.max(outputs, dim=1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    accuracy = correct_preds / total_preds * 100
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/5, Loss: 0.6937, Accuracy: 50.12%
Epoch 2/5, Loss: 0.6936, Accuracy: 50.29%


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())  # Move predictions to CPU and store
        all_labels.extend(labels.cpu().numpy())  # Move labels to CPU and store

# Generate classification report
report = classification_report(all_labels, all_preds, target_names=["Negative", "Positive"])
print(report)


In [None]:
# Save the model's state dict (recommended)
torch.save(model.state_dict(), 'lstm_attention_model.pth')

# Save the entire model
torch.save(model, 'lstm_attention_full_model.pth')


In [None]:
# Initialize the model again with the same architecture
model = LSTMWithAttention(embedding_matrix, hidden_dim=128, output_dim=2)  # Define your model architecture

# Load the state dict into the model
model.load_state_dict(torch.load('lstm_attention_model.pth'))
model.to(device)  # Move to GPU if necessary



In [None]:
def preprocess_review(review, word_to_index, max_length=100):
    # Tokenize the review
    tokens = review.split()  # Simple whitespace tokenization
    tokenized = [word_to_index[token] if token in word_to_index else word_to_index['<UNK>'] for token in tokens]

    # Pad the review
    padded_review = torch.tensor(tokenized[:max_length])  # Ensure review is no longer than max_length
    padded_review = F.pad(padded_review, (0, max_length - len(padded_review)), value=word_to_index['<PAD>'])

    return padded_review.unsqueeze(0)  # Add batch dimension

# Example custom review
custom_review = "the movie was not good"
processed_review = preprocess_review(custom_review, word_to_index)

# Inference on the custom review
model.eval()
processed_review = processed_review.to(device)
with torch.no_grad():
    output = model(processed_review)
    predicted_class = torch.argmax(output, dim=1)
    print("Predicted Class:", "Positive" if predicted_class.item() == 1 else "Negative")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_attention(idx, model, review_tensor):
    # Get the LSTM outputs
    embedded = model.embedding(review_tensor)
    lstm_out, (h_n, c_n) = model.lstm(embedded)

    # Get the attention weights
    attn_weights = F.softmax(model.attn(lstm_out), dim=1)
    attn_weights = attn_weights.squeeze().detach().cpu().numpy()  # Detach and move to CPU before converting to numpy

    # Get the tokenized words (for visualization)
    review_tokens = [
        list(word_to_index.keys())[list(word_to_index.values()).index(int(word))]
        for word in review_tensor.squeeze().cpu().numpy()
    ]

    # Add padding token representation explicitly for display purposes
    review_tokens = [
        token if token != '<PAD>' else f'<PAD> (index {word_to_index["<PAD>"]})'
        for token in review_tokens
    ]

    # Print each token and its corresponding attention weight
    for token, attn in zip(review_tokens, attn_weights):
        print(f"Token: {token}, Attention Weight: {attn}")

    # Plotting the attention weights
    plt.figure(figsize=(10, 8))
    plt.barh(review_tokens, attn_weights)
    plt.xlabel('Attention Weight')
    plt.title(f'Attention Weights for Review {idx}')
    plt.show()



In [None]:
# Visualize attention for the custom review
visualize_attention(0, model, processed_review)