In [2]:
import sys
# Print the path of the Python interpreter backing this kernel.
# NOTE(review): this is expected to be the "rnn" environment — confirm the
# printed path matches the environment you intend to use.
print(sys.executable)

/opt/miniconda3/envs/aipnd/bin/python


In [3]:
import torch
import nltk
# Fetch each required NLTK resource in turn; nltk.download reports packages
# that are already up to date instead of failing.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/chaklader/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/chaklader/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Pangram used as the running example text.
sample_text = "The quick brown fox jumps over the lazy dog."
# Naive tokenization: lowercase, then split on whitespace. Punctuation stays
# attached to its word, so the last token is "dog." (with the period).
tokens = sample_text.lower().split()

### GloVe

##### Word Embeddings Explained

Word embeddings like GloVe are dense vector representations of words where:

- Each word is mapped to a fixed-length vector of real numbers
- The vectors capture semantic relationships between words
- Words with similar meanings have vectors that are close in the vector space
- The vector dimensions implicitly represent different semantic aspects of words

GloVe (Global Vectors for Word Representation) is trained specifically to capture global word-word co-occurrence statistics from a corpus. The resulting embeddings have several useful properties:

- Words that appear in similar contexts have similar embeddings
- Vector arithmetic works meaningfully: e.g., vector("king") - vector("man") + vector("woman") ≈ vector("queen")
- The distance between word vectors correlates with semantic similarity

The file naming convention `glove.6B.50d.txt` indicates:
- `6B`: Trained on 6 billion tokens
- `50d`: Each word is represented by a 50-dimensional vector

These pre-trained embeddings allow you to convert text data into numerical representations that machine learning models can process while preserving semantic relationships between words.

##### How These Components Work Together

In a typical NLP pipeline:

1. The `preprocess_text` function would clean and tokenize raw text
2. The tokens would be converted to embeddings using the loaded embedding dictionary
3. These embeddings would then be fed into a machine learning model

For example, after preprocessing a sentence, you might average the embeddings of all its words to get a sentence representation, or you might create sequences of embeddings to feed into an LSTM or other neural network.

This approach is fundamental to many NLP tasks like sentiment analysis, text classification, and question answering.

In [8]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess_text(text: str) -> list:
    """Turn raw text into a list of lowercase, stopword-free tokens.

    The text is lowercased, a fixed set of punctuation characters is deleted
    (removed outright, not replaced by spaces), the remainder is tokenized
    with NLTK's ``word_tokenize``, and English stopwords are dropped.

    Args:
        text (str): Raw input text

    Returns:
        list: Tokens that survived stopword filtering, in their original order
    """
    # Translation table whose third argument lists the characters to delete
    strip_punctuation = str.maketrans('', '', '.,;:!?-"\'()[]{}')
    cleaned = text.lower().translate(strip_punctuation)

    # Set membership keeps the per-token stopword check cheap
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(cleaned):
        if token not in english_stopwords:
            kept.append(token)
    return kept

def load_glove_model(file) -> dict:
    """Load pre-trained GloVe word embeddings from a text file.

    Each non-empty line is expected to hold a word followed by its vector
    components, all separated by whitespace. Blank lines are skipped.

    Args:
        file (str): Path to the GloVe embeddings file

    Returns:
        dict: Dictionary mapping each word to its embedding vector
            (a 1-D ``np.float64`` array)

    Raises:
        OSError: If the file cannot be opened
        ValueError: If a vector component cannot be parsed as a float
    """
    # Maps "word" -> embedding vector
    glove_model = {}

    # Decode as UTF-8 explicitly: without it the platform's locale encoding is
    # used, which can fail or mis-decode non-ASCII vocabulary entries.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse
                continue
            word = split_line[0]
            glove_model[word] = np.array(split_line[1:], dtype=np.float64)

    return glove_model

# Read the 50-dimensional GloVe vectors into a word -> vector lookup table
embedding_dict = load_glove_model("data/glove.6B.50d.txt")

# Spot-check a single word: fetch its vector and show it under a caption
hello_embedding = embedding_dict['hello']
print("Embedding for 'hello':", hello_embedding, sep="\n")

# Report the vector length; the "50d" file should yield 50
print(f"Embedding dimension: {hello_embedding.shape[0]}")


Embedding for 'hello':
[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]
Embedding dimension: 50


In [None]:
# DEMONSTRATION: Working with word embeddings
# -----------------------------------------------------------------------------

def get_sentence_embedding(text: str, embedding_dict: dict) -> np.ndarray:
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        text (str): Sentence (or any text) to embed
        embedding_dict (dict): Mapping from word to embedding vector

    Returns:
        np.ndarray: Element-wise mean of the vectors of all preprocessed tokens
            found in ``embedding_dict``; an all-zero vector of the embedding
            dimension when none of the tokens are found
    """
    # Keep only the vectors of tokens the vocabulary knows about
    known_vectors = [
        embedding_dict[token]
        for token in preprocess_text(text)
        if token in embedding_dict
    ]

    if known_vectors:
        # Collapse the per-word vectors into one sentence-level vector
        return np.mean(known_vectors, axis=0)

    # Nothing to average: read the dimensionality off an arbitrary entry
    # and fall back to a zero vector of that size
    any_vector = next(iter(embedding_dict.values()))
    return np.zeros(any_vector.shape[0])

def find_similar_words(word: str, embedding_dict: dict, n: int = 5) -> list:
    """Find the n words most similar to the given word based on cosine similarity.

    Args:
        word (str): Target word to find similar words for
        embedding_dict (dict): Dictionary of word embeddings
        n (int): Number of similar words to return; values <= 0 yield an empty list

    Returns:
        list: List of tuples (word, similarity_score), highest similarity first.
            If ``word`` is not in ``embedding_dict``, the single placeholder entry
            ("Word not found in vocabulary", 0) is returned. If the target vector
            has zero norm, cosine similarity is undefined and the list is empty.
            Candidate words whose vector has zero norm are left out for the
            same reason.
    """
    # Check if word exists in the embedding dictionary
    if word not in embedding_dict:
        return [("Word not found in vocabulary", 0)]

    # Get the embedding for the target word
    word_embedding = embedding_dict[word]

    # The target norm is the same for every comparison, so compute it once
    word_norm = np.linalg.norm(word_embedding)
    if word_norm == 0:
        # Dividing by a zero norm would give NaN for every candidate
        return []

    # Calculate similarity with all other words in the dictionary
    similarities = []
    for other_word, other_embedding in embedding_dict.items():
        # Skip the same word
        if other_word == word:
            continue

        other_norm = np.linalg.norm(other_embedding)
        if other_norm == 0:
            # A NaN score would make the sort order below unreliable
            continue

        similarity = np.dot(word_embedding, other_embedding) / (word_norm * other_norm)
        similarities.append((other_word, similarity))

    # Sort by similarity (highest first) and take top n; clamp n so a negative
    # value cannot slice from the end of the list
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:max(n, 0)]

# Demonstrate embedding an entire sentence
sample_sentence = "The quick brown fox jumps over the lazy dog"
print("\n--- Sentence Embedding Example ---")

sentence_embedding = get_sentence_embedding(sample_sentence, embedding_dict)
print(f"Original sentence: '{sample_sentence}'")
print(f"Preprocessed tokens: {preprocess_text(sample_sentence)}")
print(f"Sentence embedding shape: {sentence_embedding.shape}")
print(f"First 5 values of sentence embedding: {sentence_embedding[:5]}")

# Demonstrate finding similar words
target_word = "king"
print(f"\n--- Finding words similar to '{target_word}' ---")

similar_words = find_similar_words(target_word, embedding_dict)
print("Most similar words (with similarity scores):")

for word, score in similar_words:
    print(f"{word}: {score:.4f}")

# Demonstrate vector arithmetic (king - man + woman ≈ queen)
if all(word in embedding_dict for word in ["king", "man", "woman"]):
    print("\n--- Word Vector Arithmetic Example ---")
    result_vector = embedding_dict["king"] - embedding_dict["man"] + embedding_dict["woman"]

    # The query vector's norm does not change inside the loop, so compute it
    # once instead of once per vocabulary word
    result_norm = np.linalg.norm(result_vector)

    # Find the closest word to this result vector
    closest_word = None
    highest_similarity = -1

    for word, embedding in embedding_dict.items():
        # Skip the words used in the equation; they would otherwise tend to win
        if word in ["king", "man", "woman"]:
            continue

        # Calculate cosine similarity between the query vector and this word
        similarity = np.dot(result_vector, embedding) / (result_norm * np.linalg.norm(embedding))

        if similarity > highest_similarity:
            highest_similarity = similarity
            closest_word = word

    print(f"king - man + woman ≈ {closest_word} (similarity: {highest_similarity:.4f})")


--- Sentence Embedding Example ---
Original sentence: 'The quick brown fox jumps over the lazy dog'
Preprocessed tokens: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Sentence embedding shape: (50,)
First 5 values of sentence embedding: [-0.15505333 -0.18144967 -0.12989    -0.17379167  0.29983667]

--- Finding words similar to 'king' ---
Most similar words (with similarity scores):
prince: 0.8236
queen: 0.7839
ii: 0.7746
emperor: 0.7736
son: 0.7667

--- Word Vector Arithmetic Example ---
king - man + woman ≈ queen (similarity: 0.8610)


In [9]:
# Build the embedding "matrix" for sample_text: a list holding one vector (row)
# per preprocessed token, in token order
sample_tokens = preprocess_text(sample_text)
sample_embedding_matrix = [embedding_dict[sample_token] for sample_token in sample_tokens]

# Sanity check: exactly one embedding vector per token
assert len(sample_embedding_matrix) == len(sample_tokens)

# Show the third token next to its embedding vector
print(sample_tokens[2])
print(sample_embedding_matrix[2])

fox
[ 0.44206   0.059552  0.15861   0.92777   0.1876    0.24256  -1.593
 -0.79847  -0.34099  -0.24021  -0.32756   0.43639  -0.11057   0.50472
  0.43853   0.19738  -0.1498   -0.046979 -0.83286   0.39878   0.062174
  0.28803   0.79134   0.31798  -0.21933  -1.1015   -0.080309  0.39122
  0.19503  -0.5936    1.7921    0.3826   -0.30509  -0.58686  -0.76935
 -0.61914  -0.61771  -0.68484  -0.67919  -0.74626  -0.036646  0.78251
 -1.0072   -0.59057  -0.7849   -0.39113  -0.49727  -0.4283   -0.15204
  1.5064  ]
