In [28]:
import numpy as np
import torch
import torch.nn as nn
from collections import Counter
from itertools import chain

In [29]:
# Step 1: Load GloVe Embeddings
def load_glove_embeddings(file_path, embedding_dim):
    """
    Load GloVe embeddings from the file into a dictionary.
    """
    embedding_dict = {}
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # The word
            vector = np.array(values[1:], dtype="float32")  # The embedding vector
            embedding_dict[word] = vector
    print(f"Loaded {len(embedding_dict)} word vectors.")
    return embedding_dict


In [30]:
# Step 3: Create the Embedding Matrix
def create_embedding_matrix(vocab, glove_embeddings, embedding_dim):
    """
    Create an embedding matrix where each row corresponds to a token in the vocabulary.
    """
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize matrix with zeros

    for word, idx in vocab.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            # Initialize randomly for missing words
            embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)

    return embedding_matrix

In [53]:
# Path to GloVe file and embedding dimensions
glove_file_path = "glove.6B.100d.txt"
embedding_dim = 100

In [54]:
# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path, embedding_dim)


Loaded 400001 word vectors.


In [55]:
# Step 2: Build Vocabulary
# Example tokenized dataset (replace with your dataset tokens)
vocab_file = 'imdb.vocab'
with open(vocab_file, 'r') as f:
    vocab_words = f.read().splitlines()

vocab_size = len(vocab_words)
tokenized_sentences = vocab_words
# Create vocabulary
# tokens = list(chain(*tokenized_sentences))
vocab_counter = Counter(tokenized_sentences)


In [56]:
# Assign an index to each word in the vocabulary
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.most_common())}
vocab["<PAD>"] = 0  # Add special token for padding
vocab["<UNK>"] = len(vocab)  # Add special token for unknown words
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 89529


In [57]:
# Create embedding matrix
embedding_matrix = create_embedding_matrix(vocab, glove_embeddings, embedding_dim)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (89529, 100)


In [58]:
# Step 4: Load Embedding Matrix into PyTorch Embedding Layer
embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

# Define the embedding layer
embedding_layer = nn.Embedding(len(vocab), embedding_dim)
embedding_layer.weight.data.copy_(embedding_tensor)  # Load pre-trained weights

tensor([[ 5.5091e-03, -4.2487e-03,  1.6969e-03,  ..., -9.4443e-03,
          1.5800e-03,  1.9134e-03],
        [-7.1953e-02,  2.3127e-01,  2.3731e-02,  ..., -7.1895e-01,
          8.6894e-01,  1.9539e-01],
        [-2.7086e-01,  4.4006e-02, -2.0260e-02,  ..., -4.9230e-01,
          6.3687e-01,  2.3642e-01],
        ...,
        [-4.3755e-03,  5.8909e-03, -1.3712e-03,  ...,  3.8811e-03,
         -6.1648e-04,  2.6436e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-6.4111e-03, -9.2641e-03,  9.5760e-03,  ..., -6.4316e-03,
          4.8961e-03, -1.9350e-03]])

In [59]:
# Optionally freeze the embedding layer
embedding_layer.weight.requires_grad = False
print("Embedding layer is ready.")

# Step 5: Test the Embedding Layer
example_sentence = ["this", "movie", "is", "great"]  # Example input sentence
token_indices = [vocab.get(token, vocab["<UNK>"]) for token in example_sentence]


Embedding layer is ready.


In [60]:
# Convert token indices to a PyTorch tensor
input_tensor = torch.tensor(token_indices).unsqueeze(0)  # Add batch dimension

# Pass through the embedding layer
output_embeddings = embedding_layer(input_tensor)
print(f"Input indices: {token_indices}")
print(f"Output embeddings shape: {output_embeddings.shape}")


Input indices: [9, 15, 5, 83]
Output embeddings shape: torch.Size([1, 4, 100])


In [61]:


# Output embedding for visualization
print("Output embeddings for the example sentence:")
print(output_embeddings.shape)


Output embeddings for the example sentence:
torch.Size([1, 4, 100])


In [62]:
embedding_matrix.shape

(89529, 100)

In [63]:
embedding_tensor.shape

torch.Size([89529, 100])

In [65]:
token_indices

[9, 15, 5, 83]

In [49]:
x = Counter(tokenized_sentences)


In [52]:
len(x.most_common())

89527