In [16]:
import numpy as np
import torch
import torch.nn as nn
from collections import Counter
from itertools import chain
import torch.optim as optim
import os

In [2]:
# Step 1: Load GloVe Embeddings
def load_glove_embeddings(file_path, embedding_dim):
    """
    Load GloVe embeddings from the file into a dictionary.
    """
    embedding_dict = {}
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # The word
            vector = np.array(values[1:], dtype="float32")  # The embedding vector
            embedding_dict[word] = vector
    print(f"Loaded {len(embedding_dict)} word vectors.")
    return embedding_dict


In [3]:
# Step 3: Create the Embedding Matrix
def create_embedding_matrix(vocab, glove_embeddings, embedding_dim):
    """
    Create an embedding matrix where each row corresponds to a token in the vocabulary.
    """
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize matrix with zeros

    for word, idx in vocab.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            # Initialize randomly for missing words
            embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)

    return embedding_matrix

In [4]:
# Path to GloVe file and embedding dimensions
glove_file_path = "glove.6B.100d.txt"
embedding_dim = 100

In [5]:
# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path, embedding_dim)


Loaded 400001 word vectors.


In [6]:
# Step 2: Build Vocabulary
# Example tokenized dataset (replace with your dataset tokens)
vocab_file = 'imdb.vocab'
with open(vocab_file, 'r') as f:
    vocab_words = f.read().splitlines()

tokenized_sentences = vocab_words
tokenized_sentences.append('<UNK>')  # Add <UNK> token for unknown words
tokenized_sentences.append('<PAD>')  # Add <PAD> token to pad sequences
# Create vocabulary
vocab_size = len(vocab_words)
# tokens = list(chain(*tokenized_sentences))
vocab_counter = Counter(tokenized_sentences)


In [7]:
# Assign an index to each word in the vocabulary
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.most_common())}
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 89529


In [8]:
# Create embedding matrix
embedding_matrix = create_embedding_matrix(vocab, glove_embeddings, embedding_dim)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (89529, 100)


In [9]:
# Step 4: Load Embedding Matrix into PyTorch Embedding Layer
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

# Define the embedding layer
embedding_layer = nn.Embedding(len(vocab), embedding_dim)
embedding_layer.weight.data.copy_(embedding_tensor)  # Load pre-trained weights

tensor([[-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        [-7.1953e-02,  2.3127e-01,  2.3731e-02,  ..., -7.1895e-01,
          8.6894e-01,  1.9539e-01],
        [-2.7086e-01,  4.4006e-02, -2.0260e-02,  ..., -4.9230e-01,
          6.3687e-01,  2.3642e-01],
        ...,
        [ 8.9225e-03, -7.5127e-03,  3.3671e-03,  ..., -8.5873e-03,
         -2.2376e-03, -7.4692e-03],
        [ 5.6891e-03,  1.4857e-03, -3.0688e-03,  ...,  5.3542e-03,
         -8.7294e-04,  7.9123e-04],
        [ 1.5660e-03, -7.9837e-03, -4.0311e-03,  ..., -4.8636e-03,
          7.7952e-03,  8.9897e-03]])

In [10]:
# Optionally freeze the embedding layer
embedding_layer.weight.requires_grad = False
print("Embedding layer is ready.")

# Step 5: Test the Embedding Layer
example_sentence = ["this", "movie", "is", "great"]  # Example input sentence
token_indices = [vocab.get(token, vocab["<UNK>"]) for token in example_sentence]


Embedding layer is ready.


In [11]:
# Convert token indices to a PyTorch tensor
input_tensor = torch.tensor(token_indices).unsqueeze(0)  # Add batch dimension

# Pass through the embedding layer
output_embeddings = embedding_layer(input_tensor)
print(f"Input indices: {token_indices}")
print(f"Output embeddings shape: {output_embeddings.shape}")


Input indices: [9, 15, 5, 83]
Output embeddings shape: torch.Size([1, 4, 100])


In [12]:


# Output embedding for visualization
print("Output embeddings for the example sentence:")
print(output_embeddings.shape)


Output embeddings for the example sentence:
torch.Size([1, 4, 100])


In [None]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix=None, freeze_embeddings=True):
        super(SentimentLSTM, self).__init__()
        
        # Step 1. Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float32))  # Load pre-trained embeddings
            self.embedding.weight.requires_grad = not freeze_embeddings  # Freeze or allow fine-tuning
        
        # Step 2. LSTM Layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        
        # Step 3. Fully Connected Layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        # Step 4. Sigmoid Activation for Binary Classification
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # Pass input through embedding layer
        embedded = self.embedding(x)
        
        # Pass embeddings through the LSTM layer
        lstm_out, (hidden, cell) = self.lstm(embedded)
        
        # Take the output from the last hidden state
        final_output = self.fc(hidden[-1])
        
        # Apply sigmoid activation
        output = self.sigmoid(final_output)
        return output

In [14]:
vocab_size = len(vocab)
embedding_dim = 100  # Same as GloVe
hidden_dim = 128
output_dim = 1
learning_rate = 0.001
num_epochs = 5  # Number of epochs

# Initialize the LSTM model
model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix, freeze_embeddings=True)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [15]:

# Loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
batched_comments = []
pos = os.listdir('train/pos')
neg = os.listdir('train/neg')
pos = list(filter(lambda x: '._' not in x, pos))
neg = list(filter(lambda x: '._' not in x, neg))


18750

In [None]:
# Training Loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0

    for comments, labels in batched_comments:  # Assuming batched_comments is a DataLoader
        # Move data to device
        comments = comments.to(device)  # Shape: [batch_size, sequence_length]
        labels = labels.to(device)  # Shape: [batch_size]

        # Forward pass
        predictions = model(comments).squeeze(1)  # Shape: [batch_size]
        
        # Compute loss
        loss = criterion(predictions, labels)
        
        # Backward pass
        optimizer.zero_grad()  # Reset gradients
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model weights

        # Accumulate loss for the epoch
        epoch_loss += loss.item()
    
    # Print epoch summary
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(batched_comments)}")
