In [26]:
import numpy as np
import torch
import torch.nn as nn
from collections import Counter
from itertools import chain
import torch.optim as optim
import os
import random
random.seed(42)

In [2]:
# Step 1: Load GloVe Embeddings
def load_glove_embeddings(file_path, embedding_dim):
    """
    Load GloVe embeddings from the file into a dictionary.
    """
    embedding_dict = {}
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # The word
            vector = np.array(values[1:], dtype="float32")  # The embedding vector
            embedding_dict[word] = vector
    print(f"Loaded {len(embedding_dict)} word vectors.")
    return embedding_dict


In [54]:
# Tokenize comments
def tokenize_comments(comments, vocab):
    tokenized_comments = []
    for comment in comments:
        tokens = comment.split()  # Simple whitespace tokenizer
        token_indices = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
        tokenized_comments.append(token_indices)
    return tokenized_comments

# Pad sequences to a fixed length
def pad_sequences(sequences, max_len, pad_value):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_len:
            seq = seq + [pad_value] * (max_len - len(seq))
        else:
            seq = seq[:max_len]
        padded_sequences.append(seq)
    return padded_sequences

In [3]:
# Step 3: Create the Embedding Matrix
def create_embedding_matrix(vocab, glove_embeddings, embedding_dim):
    """
    Create an embedding matrix where each row corresponds to a token in the vocabulary.
    """
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize matrix with zeros

    for word, idx in vocab.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            # Initialize randomly for missing words
            embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)

    return embedding_matrix

In [4]:
# Path to GloVe file and embedding dimensions
glove_file_path = "glove.6B.100d.txt"
embedding_dim = 100

In [5]:
# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path, embedding_dim)


Loaded 400001 word vectors.


In [6]:
# Step 2: Build Vocabulary
# Example tokenized dataset (replace with your dataset tokens)
vocab_file = 'imdb.vocab'
with open(vocab_file, 'r') as f:
    vocab_words = f.read().splitlines()

tokenized_sentences = vocab_words
tokenized_sentences.append('<UNK>')  # Add <UNK> token for unknown words
tokenized_sentences.append('<PAD>')  # Add <PAD> token to pad sequences
# Create vocabulary
vocab_size = len(vocab_words)
# tokens = list(chain(*tokenized_sentences))
vocab_counter = Counter(tokenized_sentences)


In [7]:
# Assign an index to each word in the vocabulary
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.most_common())}
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 89529


In [8]:
# Create embedding matrix
embedding_matrix = create_embedding_matrix(vocab, glove_embeddings, embedding_dim)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (89529, 100)


In [9]:
# Step 4: Load Embedding Matrix into PyTorch Embedding Layer
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

# Define the embedding layer
embedding_layer = nn.Embedding(len(vocab), embedding_dim)
embedding_layer.weight.data.copy_(embedding_tensor)  # Load pre-trained weights

tensor([[-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        [-7.1953e-02,  2.3127e-01,  2.3731e-02,  ..., -7.1895e-01,
          8.6894e-01,  1.9539e-01],
        [-2.7086e-01,  4.4006e-02, -2.0260e-02,  ..., -4.9230e-01,
          6.3687e-01,  2.3642e-01],
        ...,
        [ 8.9225e-03, -7.5127e-03,  3.3671e-03,  ..., -8.5873e-03,
         -2.2376e-03, -7.4692e-03],
        [ 5.6891e-03,  1.4857e-03, -3.0688e-03,  ...,  5.3542e-03,
         -8.7294e-04,  7.9123e-04],
        [ 1.5660e-03, -7.9837e-03, -4.0311e-03,  ..., -4.8636e-03,
          7.7952e-03,  8.9897e-03]])

In [10]:
# Optionally freeze the embedding layer
embedding_layer.weight.requires_grad = False
print("Embedding layer is ready.")

# Step 5: Test the Embedding Layer
example_sentence = ["this", "movie", "is", "great"]  # Example input sentence
token_indices = [vocab.get(token, vocab["<UNK>"]) for token in example_sentence]


Embedding layer is ready.


In [11]:
# Convert token indices to a PyTorch tensor
input_tensor = torch.tensor(token_indices).unsqueeze(0)  # Add batch dimension

# Pass through the embedding layer
output_embeddings = embedding_layer(input_tensor)
print(f"Input indices: {token_indices}")
print(f"Output embeddings shape: {output_embeddings.shape}")


Input indices: [9, 15, 5, 83]
Output embeddings shape: torch.Size([1, 4, 100])


In [12]:


# Output embedding for visualization
print("Output embeddings for the example sentence:")
print(output_embeddings.shape)


Output embeddings for the example sentence:
torch.Size([1, 4, 100])


In [None]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix=None, freeze_embeddings=True):
        super(SentimentLSTM, self).__init__()
        
        # Step 1. Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float32))  # Load pre-trained embeddings
            self.embedding.weight.requires_grad = not freeze_embeddings  # Freeze or allow fine-tuning
        
        # Step 2. LSTM Layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        
        # Step 3. Fully Connected Layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        # Step 4. Sigmoid Activation for Binary Classification
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # Pass input through embedding layer
        embedded = self.embedding(x)
        
        # Pass embeddings through the LSTM layer
        lstm_out, (hidden, cell) = self.lstm(embedded)
        
        # Take the output from the last hidden state
        final_output = self.fc(hidden[-1])
        
        # Apply sigmoid activation
        output = self.sigmoid(final_output)
        return output

In [14]:
vocab_size = len(vocab)
embedding_dim = 100  # Same as GloVe
hidden_dim = 128
output_dim = 1
learning_rate = 0.001
num_epochs = 5  # Number of epochs

# Initialize the LSTM model
model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix, freeze_embeddings=True)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [15]:

# Loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
batched_comments = []
pos = os.listdir('train/pos')
neg = os.listdir('train/neg')
pos = list(filter(lambda x: '._' not in x, pos))
neg = list(filter(lambda x: '._' not in x, neg))


In [25]:
# Read the comments from files
pos_comments = [open(f'train/pos/{f}', 'r').read() for f in pos]
neg_comments = [open(f'train/neg/{f}', 'r').read() for f in neg]

# Label the comments
pos_labels = [1] * len(pos_comments)
neg_labels = [0] * len(neg_comments)

# Combine and shuffle the comments and labels
comments = pos_comments + neg_comments
labels = pos_labels + neg_labels

In [27]:
combined = list(zip(comments, labels))
random.shuffle(combined)
comments, labels = zip(*combined)

In [30]:
# Create batches of size 50
batch_size = 50
batched_comments = [(comments[i:i + batch_size], labels[i:i + batch_size]) for i in range(0, len(comments), batch_size)]
len(batched_comments)

750

In [31]:
train_size = 0.8
test_size = 0.2
train_len = train_size * len(batched_comments)
test_len = len(batched_comments) - train_len
train_comments = batched_comments[:int(train_len)]
val_comments = labels[:int(train_len)]

test_comments = batched_comments[int(train_len):]
test_labels = labels[int(train_len):]

In [55]:
# Training Loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # Loop through batches
    for comments, labels in train_comments:  # Assuming train_comments is a list of (comments, labels) batches
        # Move data to device
        comments_padded = pad_sequences(tokenize_comments(comments, vocab), max_len=200, pad_value=vocab["<PAD>"])
        comments_tokenized = tokenize_comments(comments_padded, vocab)
        
        comments = torch.tensor(comments_tokenized, dtype=torch.long).to(device)  # Shape: [batch_size, sequence_length]
        labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Shape: [batch_size]

        # Forward pass
        predictions = model(comments).squeeze(1)  # Shape: [batch_size]

        # Compute loss
        loss = criterion(predictions, labels)

        # Backward pass
        optimizer.zero_grad()  # Reset gradients
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model weights

        # Accumulate loss and accuracy
        epoch_loss += loss.item()
        preds = (predictions >= 0.5).float()  # Convert probabilities to binary labels
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

    # Print epoch summary
    accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_comments):.4f}, Accuracy: {accuracy:.4f}")


AttributeError: 'list' object has no attribute 'split'

"I do not like Himesh Reshamiya. I do not like his singing too. But his songs are a craze in India, especially among commoners. Now when he ventured to become an actor \x96 that was a big joke! What guts he has to reap as much as he can in his prime time. I did never want to see this movie. But one thing changed it. The movie becoming a super-duper hit! After 2 weeks, Aap Ka Saroor has raked box office collection of 14 crores \x96 compared to Apne that has collected 7 crores in the same 2 weeks. If I can sit through Apne and Rajnikant's absurd Sivaji \x96 I should give this movie also a try to understand what stuff this movie has got that made it such a big hit? The story is about the real life singer Himesh Reshamiya (HR) who has gone to Germany for a concert and falls in love with Riya (Hansika Motwani). A German lawyer Ruby (Mallika Sherawat) loves Himesh. Now Himesh is arrested for a murder. The mission of Himesh (in last 40 minutes) after he runs away from jail is to prove himself

In [38]:
len(train_comments)

600

In [45]:
res = tokenize_comments(comments, vocab)

In [None]:
len(res[0])

TypeError: object of type 'int' has no len()

In [49]:
type(res[0])

list

89527