# Lab 06: N-gram Language Models & Word Embeddings

- **Libraries:** NLTK, NumPy, Gensim, Matplotlib, scikit-learn
- **Datasets:** Reuters Corpus, Brown Corpus, Text8

## Part 1: N-gram Language Model

### 1.1. Introduction to N-gram Language Models

In [1]:
# !pip install nltk numpy gensim matplotlib scikit-learn

Collecting nltk
  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl.metadata
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/8e/ba/80fc0b1e3cb2fd5c6143f00f42eb67762aa043eaa05ca924ecc3222a7849/numpy-2.4.1-cp311-cp311-macosx_14_0_arm64.whl.metadata
  Using cached numpy-2.4.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/38/7c/18d40f341276a7461962512ca1fb716d5982db57615dfa272f651ecb96d7/gensim-4.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading gensim-4.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting matplotlib
  Obtaining dependency information for matplotlib from https://files.pythonhosted.org/pac

In [None]:
import nltk

from collections import Counter, defaultdict
from nltk import word_tokenize
from nltk.corpus import brown, reuters
import numpy as np
import re
import math
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.manifold import TSNE

In [None]:
# Download NLTK data packages by name. Of these, only the Brown corpus is
# read by the cells visible in this notebook (via brown.sents()).
nltk.download('reuters')
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## Part 1: N-gram Language Model

### 1.2 Exercise 1: Build N-gram Counter

In [None]:
class NgramCounter:
    """Collects n-gram and context frequencies from tokenized sentences."""

    def __init__(self, n=2):
        self.n = n
        self.ngram_counts = Counter()    # full n-gram tuple -> frequency
        self.context_counts = Counter()  # leading (n-1)-tuple -> frequency
        self.vocab = set()               # every token seen in final position

    def get_ngrams(self, tokens):
        """
        Return the n-grams of one sentence as a list of tuples.

        The token list is padded with (n-1) '<s>' markers on the left and a
        single '</s>' marker on the right before the sliding windows are cut.
        """
        order = self.n
        padded = ['<s>'] * (order - 1) + tokens + ['</s>']
        window_count = len(padded) - order + 1
        return [tuple(padded[start:start + order]) for start in range(window_count)]

    def train(self, sentences):
        """
        Accumulate counts from a list of tokenized sentences.

        Updates ngram_counts, context_counts and vocab in place, then prints
        a short summary of what was seen.
        """
        for tokens in sentences:
            grams = self.get_ngrams(tokens)
            self.ngram_counts.update(grams)
            # The context of an n-gram is everything except its last token
            self.context_counts.update(gram[:-1] for gram in grams)
            # The vocabulary holds the predicted (final) token of each n-gram
            self.vocab.update(gram[-1] for gram in grams)

        total_ngrams = sum(self.ngram_counts.values())
        print(f"Trained on {len(sentences)} sentences")
        print(f"Vocabulary size: {len(self.vocab)}")
        print(f"Total {self.n}-grams: {total_ngrams}")

# Sanity check: count bigrams on a two-sentence toy corpus
toy_sentences = [['i', 'am', 'sam'], ['sam', 'i', 'am']]
counter = NgramCounter(n=2)
counter.train(toy_sentences)

bigram, history = ('i', 'am'), ('i',)
print(f"\nngram_counts[('i', 'am')]: {counter.ngram_counts[bigram]}")
print(f"context_counts[('i',)]: {counter.context_counts[history]}")

### 1.3 Exercise 2: Calculate N-gram Probabilities

#### Example: Test NgramLanguageModel with different smoothing methods

In [None]:
class NgramLanguageModel(NgramCounter):
    """N-gram language model with optional additive smoothing.

    Probabilities are estimated from the ngram_counts / context_counts
    gathered by NgramCounter.train().
    """

    # Fallback probability returned (or used as a floor before taking a
    # logarithm) whenever an estimate would be undefined or exactly zero.
    MIN_PROB = 1e-10

    def probability(self, word, context, smoothing='none', k=1):
        """
        Calculate P(word | context) with optional smoothing.

        Args:
            word: candidate next token
            context: tuple of the (n-1) preceding tokens
            smoothing: 'none' (maximum likelihood), 'add-k', or 'laplace'
                (add-k with k forced to 1)
            k: pseudo-count used by 'add-k'

        Returns:
            The estimated probability as a float. MIN_PROB is returned when
            the estimate's denominator is zero (unseen context without
            smoothing, or add-k with nothing to divide by).

        Raises:
            ValueError: if smoothing is not one of the supported names
        """
        ngram_count = self.ngram_counts.get(context + (word,), 0)
        context_count = self.context_counts.get(context, 0)

        if smoothing == 'none':
            # MLE: P(word|context) = C(context, word) / C(context)
            if context_count == 0:
                return self.MIN_PROB
            return ngram_count / context_count

        if smoothing in ('add-k', 'laplace'):
            # Add-k: P(word|context) = (C(context, word) + k) / (C(context) + k*V)
            if smoothing == 'laplace':
                k = 1
            denominator = context_count + k * len(self.vocab)
            if denominator == 0:
                # e.g. k=0 with an unseen context: same fallback as the MLE case
                return self.MIN_PROB
            return (ngram_count + k) / denominator

        raise ValueError(f"Unknown smoothing method: {smoothing}")

    def sentence_probability(self, sentence, log=True, smoothing='add-k', k=1):
        """
        Calculate the probability of an entire tokenized sentence.

        Args:
            sentence: list of tokens (padding is added internally)
            log: return the natural-log probability when True, otherwise the
                plain product of the n-gram probabilities
            smoothing: smoothing method passed to probability()
            k: smoothing parameter passed to probability()

        Returns:
            float log-probability or probability
        """
        total_log = 0
        product = 1.0
        for ngram in self.get_ngrams(sentence):
            prob = self.probability(ngram[-1], ngram[:-1], smoothing, k)
            if log:
                # Floor instead of adding an epsilon: non-zero probabilities
                # are left untouched, zeros no longer break math.log.
                total_log += math.log(max(prob, self.MIN_PROB))
            else:
                product *= prob
        return total_log if log else product

    def perplexity(self, test_sentences, smoothing='add-k', k=1):
        """
        Calculate perplexity on a test set: exp(-1/N * sum(log P)).

        N counts every token of every sentence plus one '</s>' per sentence.

        Args:
            test_sentences: list of tokenized sentences
            smoothing: smoothing method passed to sentence_probability()
            k: smoothing parameter passed to sentence_probability()

        Returns:
            float perplexity (lower is better)

        Raises:
            ValueError: if test_sentences is empty, since perplexity is then
                undefined
        """
        total_log_prob = 0
        total_words = 0

        for sentence in test_sentences:
            total_log_prob += self.sentence_probability(
                sentence, log=True, smoothing=smoothing, k=k
            )
            total_words += len(sentence) + 1  # +1 for </s>

        if total_words == 0:
            raise ValueError("Cannot compute perplexity of an empty test set")

        return math.exp(-total_log_prob / total_words)

In [None]:
# Fit a bigram model on a three-sentence toy corpus
train_sents = [
    ['i', 'am', 'sam'],
    ['sam', 'i', 'am'],
    'i do not like green eggs and ham'.split(),
]
lm = NgramLanguageModel(n=2)
lm.train(train_sents)

# Compare the estimate of one bigram under each smoothing scheme
banner = "=" * 60
print("\n" + banner)
print("Testing Probability Calculations")
print(banner)

context, word = ('i',), 'am'
history = context[0]

# Unsmoothed maximum-likelihood estimate
prob_none = lm.probability(word, context, smoothing='none')
print(f"\nP('{word}' | '{history}') [no smoothing] = {prob_none:.4f}")

# Add-one (Laplace) estimate
prob_laplace = lm.probability(word, context, smoothing='laplace')
print(f"P('{word}' | '{history}') [Laplace] = {prob_laplace:.4f}")

# Add-k estimate with a smaller pseudo-count
prob_addk = lm.probability(word, context, smoothing='add-k', k=0.5)
print(f"P('{word}' | '{history}') [Add-0.5] = {prob_addk:.4f}")

# A word that never occurs in the toy corpus still gets some mass
unseen_word = 'python'
prob_unseen = lm.probability(unseen_word, context, smoothing='laplace')
print(f"\nP('{unseen_word}' | '{history}') [Laplace] = {prob_unseen:.6f}")

# Log-probability of a full sentence
test_sent = ['i', 'am', 'sam']
log_prob = lm.sentence_probability(test_sent, log=True, smoothing='laplace')
joined = ' '.join(test_sent)
print(f"\nLog probability of '{joined}': {log_prob:.4f}")

### 1.4. Training on Real Corpus

In [None]:
# Load the Brown corpus as a sequence of tokenized sentences.
# Lowercasing and truncation happen later, in preprocess_sentences().
brown_sents = brown.sents()

# Normalise casing and cap the corpus size to keep training fast
def preprocess_sentences(sents, max_sents=10000):
    """Return the first max_sents sentences with every token lowercased."""
    return [
        [token.lower() for token in sentence]
        for sentence in sents[:max_sents]
    ]

corpus = preprocess_sentences(brown_sents, max_sents=10000)

# Hold out the final 10% of the sentences for evaluation
split_idx = int(len(corpus) * 0.9)
train_corpus, test_corpus = corpus[:split_idx], corpus[split_idx:]

print(f"✓ Loaded {len(corpus):,} sentences")
print(f"  Training: {len(train_corpus):,} sentences")
print(f"  Testing: {len(test_corpus):,} sentences")

# Preview the first three training sentences, cut to ten tokens each
print("\nSample sentences:")
for i, sent in enumerate(train_corpus[:3]):
    preview = ' '.join(sent[:10])
    suffix = '...' if len(sent) > 10 else ''
    print(f"  [{i+1}] {preview}{suffix}")

In [None]:
# Fit unigram, bigram and trigram models on the training split
print("Training N-gram Language Models")

models = {}

for n in (1, 2, 3):
    print(f"\nTraining {n}-gram model...")
    models[n] = model = NgramLanguageModel(n=n)
    model.train(train_corpus)

    # Report the ten most frequent n-grams of this order
    print(f"\n  Top 10 {n}-grams:")
    for ngram, count in model.ngram_counts.most_common(10):
        ngram_str = ' '.join(ngram)
        print(f"    {ngram_str:30} : {count:,}")

### 1.5. Text Generation & Autocomplete

In [None]:
class TextGenerator(NgramLanguageModel):
    """Text generation and next-word suggestion using an N-gram language model."""

    def _context_for(self, tokens):
        """Return the (n-1)-token context at the end of tokens, left-padded with '<s>'."""
        width = self.n - 1
        if width <= 0:
            # A unigram model conditions on nothing. Slicing with [-0:] would
            # return the whole sequence, so this case is handled explicitly.
            return ()
        tail = list(tokens[-width:])
        return tuple(['<s>'] * (width - len(tail)) + tail)

    def predict_next_word(self, context, top_k=5, smoothing='add-k', k=1,
                          include_end=False):
        """
        Predict top-k most likely next words given context.

        Args:
            context: tuple of (n-1) previous words
            top_k: number of predictions to return
            smoothing: smoothing method
            k: smoothing parameter
            include_end: when True, the end-of-sentence marker '</s>' may be
                returned as a candidate (used by generate_text so a sentence
                can finish); '<s>' is never returned

        Returns:
            list of (word, probability) tuples, most probable first
        """
        skipped = {'<s>'} if include_end else {'<s>', '</s>'}

        # Score every remaining vocabulary word under this context
        word_probs = [
            (word, self.probability(word, context, smoothing, k))
            for word in self.vocab
            if word not in skipped
        ]

        word_probs.sort(key=lambda pair: pair[1], reverse=True)
        return word_probs[:top_k]

    def generate_text(self, start_words=None, max_length=20, smoothing='add-k', k=1):
        """
        Generate text using the language model.

        At each step the ten most probable continuations are sampled in
        proportion to their probability. Generation stops after max_length
        words or as soon as the end-of-sentence marker is drawn.

        Args:
            start_words: list of starting words (optional)
            max_length: maximum number of words to generate
            smoothing: smoothing method
            k: smoothing parameter

        Returns:
            generated text as string (start markers removed)
        """
        generated = ['<s>'] * (self.n - 1)
        if start_words is not None:
            generated = generated + list(start_words)

        for _ in range(max_length):
            context = self._context_for(generated)

            predictions = self.predict_next_word(
                context, top_k=10, smoothing=smoothing, k=k, include_end=True
            )
            if not predictions:
                break

            words, probs = zip(*predictions)
            probs = np.array(probs, dtype=float)
            total = probs.sum()
            if total <= 0:
                # Nothing to sample from (all candidates have zero mass)
                break

            # Sample an index rather than the word itself so that a plain
            # Python str (not a numpy string) is appended to the output.
            choice = np.random.choice(len(words), p=probs / total)
            next_word = words[choice]

            # Stop if we generate end token
            if next_word == '</s>':
                break

            generated.append(next_word)

        return ' '.join(w for w in generated if w != '<s>')

    def autocomplete(self, partial_text, top_k=5, smoothing='add-k', k=1):
        """
        Autocomplete: suggest next words given partial text.

        The text is lowercased and split on whitespace; the last (n-1) tokens
        form the context, padded with '<s>' when the text is too short.

        Args:
            partial_text: string of partial text
            top_k: number of suggestions
            smoothing: smoothing method
            k: smoothing parameter

        Returns:
            list of (word, probability) tuples
        """
        tokens = partial_text.lower().split()
        context = self._context_for(tokens)
        return self.predict_next_word(context, top_k=top_k, smoothing=smoothing, k=k)

#### Example: Text Generation and Autocomplete

In [None]:
# Train a dedicated bigram generator on the training split
generator = TextGenerator(n=2)
generator.train(train_corpus)

print("TEXT GENERATION")

# Each scenario: (header line, seed words or None, maximum words to generate)
generation_runs = [
    ("\n1. Generate text from scratch:", None, 15),
    ("\n2. Generate text starting with 'the':", ['the'], 12),
    ("\n3. Generate text starting with 'he was':", ['he', 'was'], 12),
]

for header, seed_words, length in generation_runs:
    print(header)
    for i in range(3):
        text = generator.generate_text(start_words=seed_words, max_length=length)
        print(f"  [{i+1}] {text}")

print("AUTOCOMPLETE / NEXT WORD PREDICTION")

# Prompts whose next word the model should suggest
test_phrases = ["the", "he was", "in the", "to be", "it is"]

for phrase in test_phrases:
    predictions = generator.autocomplete(phrase, top_k=5)
    print(f"\n'{phrase}' ->")
    for word, prob in predictions:
        print(f"  {word:<15} (p = {prob:.4f})")

### 1.6. Model Evaluation with Perplexity

In [None]:
# Score every trained model on the held-out sentences
print("PERPLEXITY EVALUATION")

# (smoothing name, k) pairs evaluated for each model order
smoothing_settings = [('laplace', 1), ('add-k', 0.5)]
results = []

for n in [1, 2, 3]:
    model = models[n]
    print(f"\n{n}-gram Model:")

    for smoothing, k_value in smoothing_settings:
        perplexity = model.perplexity(test_corpus, smoothing=smoothing, k=k_value)
        print(f"  {smoothing:<15} (k={k_value}): Perplexity = {perplexity:.2f}")
        results.append(
            dict(n=n, smoothing=smoothing, k=k_value, perplexity=perplexity)
        )

# One summary row per (model order, smoothing) combination
print("SUMMARY")
print(f"\n{'Model':<20} {'Smoothing':<15} {'Perplexity':<15}")
for r in results:
    model_name = f"{r['n']}-gram"
    print(f"{model_name:<20} {r['smoothing']:<15} {r['perplexity']:<15.2f}")

In [None]:
# Visualize perplexity comparison: a line plot per smoothing method and a
# bar chart with one bar per evaluated configuration.
import pandas as pd

# One row per entry of `results` (columns: n, smoothing, k, perplexity)
df = pd.DataFrame(results)

plt.figure(figsize=(12, 5))

# Plot 1: Perplexity by n-gram size, one line per smoothing method
plt.subplot(1, 2, 1)
for smoothing in df['smoothing'].unique():
    subset = df[df['smoothing'] == smoothing]
    plt.plot(subset['n'], subset['perplexity'], marker='o', label=smoothing, linewidth=2)

plt.xlabel('N-gram Size', fontsize=12)
plt.ylabel('Perplexity', fontsize=12)
plt.title('Perplexity vs N-gram Size', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks([1, 2, 3])

# Plot 2: Bar chart comparison, one bar per row of df
plt.subplot(1, 2, 2)
x = np.arange(len(df))
# Six colours are listed; the slice below uses as many as there are rows
colors = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12', '#9b59b6', '#1abc9c']
bars = plt.bar(x, df['perplexity'], color=colors[:len(df)])

plt.xlabel('Model Configuration', fontsize=12)
plt.ylabel('Perplexity', fontsize=12)
plt.title('Perplexity Comparison', fontsize=14, fontweight='bold')
plt.xticks(x, [f"{r['n']}-gram\n{r['smoothing']}" for r in results], rotation=0, fontsize=9)
plt.grid(True, alpha=0.3, axis='y')

# Add value labels on bars (centred horizontally, sitting on top of each bar)
for bar, val in zip(bars, df['perplexity']):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{val:.1f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

# Show best model: the row with the lowest perplexity
best_idx = df['perplexity'].idxmin()
best_model = df.loc[best_idx]
print(f"\n Best Model: {best_model['n']}-gram with {best_model['smoothing']} smoothing")
print(f"   Perplexity: {best_model['perplexity']:.2f}")

## Part 2: Word Embeddings (Word2Vec)

### 2.1. Introduction to Word Embeddings

**Word2Vec Architecture:**
1. **CBOW (Continuous Bag of Words):** Predict target word from context
2. **Skip-gram:** Predict context words from target word

**Applications:**
- Word similarity
- Analogies (king - man + woman ≈ queen)
- Document classification
- Semantic search

### 2.2. Training Word2Vec Model

In [None]:
# Prepare corpus for Word2Vec: the first 20,000 Brown sentences, lowercased
print("Preparing corpus for Word2Vec training...")
w2v_corpus = preprocess_sentences(brown_sents, max_sents=20000)

print(f"Corpus size: {len(w2v_corpus):,} sentences")
print(f"Sample: {' '.join(w2v_corpus[0][:15])}...\n")

# Train two Word2Vec models on the same corpus; they differ only in `sg`
print("TRAINING WORD2VEC MODEL")

# Skip-gram model
print("\n[1/2] Training Skip-gram model...")
sg_model = Word2Vec(
    sentences=w2v_corpus,
    vector_size=100,      # Dimensionality of embeddings
    window=5,             # Context window size
    min_count=5,          # Ignore words with freq < 5
    workers=4,            # Number of CPU cores
    sg=1,                 # 1 = skip-gram, 0 = CBOW
    epochs=10
)

print(f" Trained on {len(w2v_corpus):,} sentences")
print(f"  Vocabulary size: {len(sg_model.wv):,} words")
print(f"  Vector dimensionality: {sg_model.wv.vector_size}")

# CBOW model (same hyperparameters as above except sg=0)
print("\n[2/2] Training CBOW model...")
cbow_model = Word2Vec(
    sentences=w2v_corpus,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,                 # CBOW
    epochs=10
)

print(f" Trained on {len(w2v_corpus):,} sentences")
print(f" Vocabulary size: {len(cbow_model.wv):,} words")
print(f" Vector dimensionality: {cbow_model.wv.vector_size}")

### 2.3. Loading Pre-trained Embeddings

Use a pre-trained Word2Vec model (Google News) or GloVe vectors. The cell below loads the 50-dimensional GloVe model through gensim's downloader.

In [None]:
# Load pre-trained embeddings (this may take a while on first run)
print("LOADING PRE-TRAINED EMBEDDINGS")

# Informational listing only; just the 50-dim GloVe model is loaded below
print("\nAvailable pre-trained models:")
print("  - glove-wiki-gigaword-50 (66 MB)")
print("  - glove-wiki-gigaword-100 (128 MB)")
print("  - word2vec-google-news-300 (1.6 GB)")

try:
    print("\nLoading GloVe embeddings (50-dim)...")
    print("(This may take a minute on first download...)")
    glove_model = api.load("glove-wiki-gigaword-50")
    
    print(f"  Loaded GloVe model")
    print(f"  Vocabulary size: {len(glove_model):,} words")
    print(f"  Vector dimensionality: {glove_model.vector_size}")
    
    # Show sample vectors (first three components of each word present)
    print("\nSample word vectors:")
    for word in ['king', 'queen', 'computer', 'python']:
        if word in glove_model:
            vector = glove_model[word]
            print(f"  '{word}': [{vector[0]:.3f}, {vector[1]:.3f}, {vector[2]:.3f}, ...]")
            
except Exception as e:
    # Best-effort fallback: on any failure, reuse the vectors of the
    # skip-gram model trained earlier so the following cells still run.
    print(f"Could not load pre-trained model: {e}")
    print("Will use trained model instead.")
    glove_model = sg_model.wv

### 2.4. Word Similarity and Analogies

Explore semantic relationships captured by word embeddings.

In [None]:
# Nearest neighbours and pairwise cosine similarity
print("WORD SIMILARITY")

# Embeddings queried below (GloVe, or the locally trained fallback)
model = glove_model

test_words = ['king', 'queen', 'man', 'woman', 'computer', 'dog', 'cat']

print("\nFinding similar words:\n")
for word in test_words:
    if word not in model:
        print(f"'{word}' → (not in vocabulary)")
        continue
    neighbours = model.most_similar(word, topn=5)
    formatted = ", ".join(f"{w} ({s:.3f})" for w, s in neighbours)
    print(f"'{word}' → ", end="")
    print(formatted)

# Cosine similarity for hand-picked word pairs
print("PAIRWISE SIMILARITY (Cosine)")

word_pairs = [
    ('king', 'queen'),
    ('man', 'woman'),
    ('dog', 'cat'),
    ('computer', 'keyboard'),
    ('happy', 'sad'),
    ('good', 'bad'),
    ('king', 'computer'),  # Unrelated
]

print(f"\n{'Word 1':<15} {'Word 2':<15} {'Similarity':<15}")
for w1, w2 in word_pairs:
    if w1 not in model or w2 not in model:
        print(f"{w1:<15} {w2:<15} (not in vocab)")
        continue
    similarity = model.similarity(w1, w2)
    print(f"{w1:<15} {w2:<15} {similarity:.4f}")

In [None]:
# Word Analogies
print("WORD ANALOGIES")
print("\nFormat: A is to B as C is to ?")
print("Computed as: vec(B) - vec(A) + vec(C) ≈ vec(?)\n")

# Each tuple is (A, B, C), read as "A is to B as C is to ?".
# The trailing comment shows the vector arithmetic and the hoped-for answer.
analogies = [
    ('king', 'man', 'queen'),        # man - king + queen ≈ woman
    ('man', 'woman', 'king'),        # woman - man + king ≈ queen
    ('paris', 'france', 'london'),   # france - paris + london ≈ england
    ('good', 'better', 'bad'),       # better - good + bad ≈ worse
    ('walk', 'walked', 'go'),        # walked - walk + go ≈ went
]

for a, b, c in analogies:
    # Printed equation matches what is actually computed: b - a + c
    equation = f"{b} - {a} + {c} ≈"

    if not all(w in model for w in (a, b, c)):
        print(f"{equation} (words not in vocabulary)")
        print()
        continue

    try:
        result = model.most_similar(positive=[b, c], negative=[a], topn=3)
    except Exception as exc:
        # Stay best-effort (keep going with the next analogy) but report the
        # cause, and no longer swallow KeyboardInterrupt / SystemExit.
        print(f"{equation} (error computing: {exc})")
        print()
        continue

    print(equation)
    for word, score in result:
        print(f"  {word:<15} (similarity: {score:.4f})")
    print()

### 2.5. Visualization with t-SNE

Visualize word embeddings in 2D space using t-SNE dimensionality reduction.

In [None]:
# Visualize word embeddings with t-SNE: project a hand-picked word list to 2D
# and draw a labelled scatter plot.
print("Creating t-SNE visualization...")

# Select words to visualize
words_to_plot = [
    # Animals
    'dog', 'cat', 'lion', 'tiger', 'elephant',
    # Countries
    'america', 'china', 'france', 'germany', 'japan',
    # Royalty
    'king', 'queen', 'prince', 'princess',
    # Family
    'man', 'woman', 'boy', 'girl', 'father', 'mother',
    # Colors
    'red', 'blue', 'green', 'yellow',
    # Tech
    'computer', 'software', 'internet', 'program'
]

# Filter words that exist in model
words_in_vocab = [w for w in words_to_plot if w in model]
print(f"Plotting {len(words_in_vocab)} words...")

# Get word vectors: one row per word, in the same order as words_in_vocab
word_vectors = np.array([model[w] for w in words_in_vocab])

# Apply t-SNE. The perplexity is capped at (number of words - 1).
# NOTE(review): with fewer than two words in vocabulary the cap becomes <= 0;
# presumably t-SNE rejects that — confirm and guard if the fallback model is used.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(words_in_vocab)-1))
word_vectors_2d = tsne.fit_transform(word_vectors)

# Create plot
plt.figure(figsize=(14, 10))
plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 1], alpha=0.6, s=100)

# Add labels: row i of word_vectors_2d belongs to words_in_vocab[i]
for i, word in enumerate(words_in_vocab):
    plt.annotate(word, 
                xy=(word_vectors_2d[i, 0], word_vectors_2d[i, 1]),
                xytext=(5, 2),
                textcoords='offset points',
                ha='left',
                fontsize=10,
                weight='bold')

plt.title('Word Embeddings Visualization (t-SNE)', fontsize=16, fontweight='bold')
plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### 2.6. Applications

Practical applications of word embeddings.

In [None]:
# Application 1: Sentence Embeddings (Simple Averaging)
print("APPLICATION 1: SENTENCE SIMILARITY")

def get_sentence_vector(sentence, model):
    """
    Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is lowercased and split on whitespace. Words missing from
    model are skipped; if none remain, a zero vector of length
    model.vector_size is returned.
    """
    known = [model[token] for token in sentence.lower().split() if token in model]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between v1 and v2 (0.0 if either has zero length)."""
    inner = np.dot(v1, v2)
    length_1 = np.linalg.norm(v1)
    length_2 = np.linalg.norm(v2)

    # A zero-length vector has no direction, so similarity is defined as 0
    if not (length_1 and length_2):
        return 0.0

    return inner / (length_1 * length_2)

# Sentences to compare pairwise
sentences = [
    "The cat sits on the mat",
    "A dog lies on the carpet",
    "I love programming in Python",
    "She enjoys coding with Python",
    "The weather is sunny today",
]

print("\nCalculating sentence similarities...\n")

# One averaged embedding per sentence, in the same order as `sentences`
sent_vectors = [get_sentence_vector(s, model) for s in sentences]

# Table of similarities for every unordered pair of sentences
print(f"{'Sentence 1':<40} {'Sentence 2':<40} {'Similarity':<10}")
print("-"*90)

for i, first in enumerate(sentences):
    for j in range(i + 1, len(sentences)):
        second = sentences[j]
        sim = cosine_similarity(sent_vectors[i], sent_vectors[j])
        # Shorten anything longer than the 40-character column
        s1_short = first[:37] + "..." if len(first) > 40 else first
        s2_short = second[:37] + "..." if len(second) > 40 else second
        print(f"{s1_short:<40} {s2_short:<40} {sim:.4f}")

In [None]:
# Application 2: Word Clustering
print("APPLICATION 2: WORD CLUSTERING")

# Semantic categories: legend label -> (marker colour, member words)
categories = {
    'Animals': ('red', ['dog', 'cat', 'lion', 'tiger', 'elephant', 'bird']),
    'Food': ('green', ['apple', 'banana', 'orange', 'bread', 'cheese']),
    'Colors': ('blue', ['red', 'blue', 'green', 'yellow', 'black']),
    'Numbers': ('orange', ['one', 'two', 'three', 'four', 'five']),
}

# Flat lookup word -> marker colour, in category order
word_colour = {
    member: colour
    for colour, members in categories.values()
    for member in members
}

# Keep only the words the embedding model knows
cluster_words = [w for w in word_colour if w in model]

if len(cluster_words) <= 10:
    print("Not enough words in vocabulary for clustering")
else:
    print(f"\nClustering {len(cluster_words)} words into semantic groups...")

    # One embedding row per word, aligned with cluster_words
    word_vecs = np.array([model[w] for w in cluster_words])

    # Project to 2D with t-SNE
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(5, len(cluster_words)-1))
    word_vecs_2d = tsne.fit_transform(word_vecs)

    plt.figure(figsize=(12, 8))

    # Colour each point by its category ('gray' for anything uncategorised)
    colors_map = [word_colour.get(word, 'gray') for word in cluster_words]

    scatter = plt.scatter(word_vecs_2d[:, 0], word_vecs_2d[:, 1],
                          c=colors_map, alpha=0.6, s=150)

    # Label every point with its word
    for i, word in enumerate(cluster_words):
        plt.annotate(word,
                     xy=(word_vecs_2d[i, 0], word_vecs_2d[i, 1]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='left',
                     fontsize=11,
                     weight='bold')

    # Legend with one patch per category
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor=colour, label=label)
        for label, (colour, _) in categories.items()
    ]
    plt.legend(handles=legend_elements, loc='best')

    plt.title('Word Clustering by Semantic Category', fontsize=16, fontweight='bold')
    plt.xlabel('t-SNE Dimension 1', fontsize=12)
    plt.ylabel('t-SNE Dimension 2', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()