
Text representation or Feature extraction is the process of converting raw text into a numerical format that a computer can understand and process.

1. One-hot encoding
It is a simple text representation technique that converts categorical data, like words, into a numerical format. It creates a binary vector for each word in a vocabulary, with a length equal to the size of the vocabulary.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: two short sentences that share most of their words
corpus = ["The cat sat on the mat", "The dog sat on the log"]

# Create a CountVectorizer instance
# The 'binary=True' argument records presence/absence (0 or 1) instead of counts,
# so each document row is a "multi-hot" vector over the vocabulary rather than
# a strict one-hot vector per word.
vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary from the corpus and encode every document in one step
X = vectorizer.fit_transform(corpus)

# Get the feature names (the vocabulary); column k of X corresponds to vocabulary[k]
vocabulary = vectorizer.get_feature_names_out()

# Print the vocabulary
print("Vocabulary:", vocabulary)

# Print the encoded vectors, converted to a dense array so they are readable
print("One-Hot Encoded Vectors:")
print(X.toarray())

Vocabulary: ['cat' 'dog' 'log' 'mat' 'on' 'sat' 'the']
One-Hot Encoded Vectors:
[[1 0 0 1 1 1 1]
 [0 1 1 0 1 1 1]]


2. Bag of Words [BoW]
This model is a simple text representation technique that represents a document as an unordered collection of words, or a “bag.” It completely ignores grammar and word order but keeps track of word frequencies.

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Example emails: one spam-like message and one ordinary message
corpus = [
    "Free money, claim your prize now!",
    "Please confirm your meeting attendance."
]

print("=== 1. Basic Bag of Words (Unigrams) ===")
# Create a CountVectorizer instance for unigrams (single words).
# binary=False keeps per-document counts rather than presence flags;
# lowercase=True folds case before tokenizing.
vectorizer_unigram = CountVectorizer(binary=False, lowercase=True)

# Learn the vocabulary from the corpus and transform the text
X_unigram = vectorizer_unigram.fit_transform(corpus)

# Print the vocabulary (feature names); punctuation does not appear in it
print("Vocabulary:", vectorizer_unigram.get_feature_names_out())

# Print the BoW matrix (as a dense array for readability)
print("BoW Vectors (Unigrams):")
print(X_unigram.toarray())

print("\n" + "="*50)
print("=== 2. Bag of Bigrams (2-grams) ===")
# Create vectorizer for bigrams only: ngram_range=(2, 2) sets both the
# minimum and maximum n-gram length to 2
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), lowercase=True)
X_bigram = vectorizer_bigram.fit_transform(corpus)

print("Bigram Vocabulary:", vectorizer_bigram.get_feature_names_out())
print("BoW Vectors (Bigrams):")
print(X_bigram.toarray())

print("\n" + "="*50)
print("=== 3. Bag of Trigrams (3-grams) ===")
# Create vectorizer for trigrams only (three consecutive words per feature)
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3), lowercase=True)
X_trigram = vectorizer_trigram.fit_transform(corpus)

print("Trigram Vocabulary:", vectorizer_trigram.get_feature_names_out())
print("BoW Vectors (Trigrams):")
print(X_trigram.toarray())

print("\n" + "="*50)
print("=== 4. Combined Unigrams + Bigrams ===")
# Create vectorizer for both unigrams and bigrams: ngram_range=(1, 2)
# yields one feature per single word and one per adjacent word pair
vectorizer_combined = CountVectorizer(ngram_range=(1, 2), lowercase=True)
X_combined = vectorizer_combined.fit_transform(corpus)

print("Combined Vocabulary (Unigrams + Bigrams):")
print(vectorizer_combined.get_feature_names_out())
print("BoW Vectors (Unigrams + Bigrams):")
print(X_combined.toarray())

# Summary section: restates the two example emails and what each n-gram size
# captures. The bullet characters were mis-encoded ("‚Ä¢"); they are restored
# to the intended "•" so the printed list renders correctly.
print("\n" + "="*50)
print("=== 5. Analysis of N-grams ===")
print("Email 1: 'Free money, claim your prize now!'")
print("Email 2: 'Please confirm your meeting attendance.'")
print()
print("Key Insights:")
print("• Unigrams: Capture individual words")
print("• Bigrams: Capture word pairs like 'free money', 'your prize'")
print("• Trigrams: Capture longer phrases like 'claim your prize'")
print("• Combined: Provides both word-level and phrase-level features")
print("• Higher n-grams capture more context but increase feature dimensionality")

=== 1. Basic Bag of Words (Unigrams) ===
Vocabulary: ['attendance' 'claim' 'confirm' 'free' 'meeting' 'money' 'now' 'please'
 'prize' 'your']
BoW Vectors (Unigrams):
[[0 1 0 1 0 1 1 0 1 1]
 [1 0 1 0 1 0 0 1 0 1]]

=== 2. Bag of Bigrams (2-grams) ===
Bigram Vocabulary: ['claim your' 'confirm your' 'free money' 'meeting attendance'
 'money claim' 'please confirm' 'prize now' 'your meeting' 'your prize']
BoW Vectors (Bigrams):
[[1 0 1 0 1 0 1 0 1]
 [0 1 0 1 0 1 0 1 0]]

=== 3. Bag of Trigrams (3-grams) ===
Trigram Vocabulary: ['claim your prize' 'confirm your meeting' 'free money claim'
 'money claim your' 'please confirm your' 'your meeting attendance'
 'your prize now']
BoW Vectors (Trigrams):
[[1 0 1 1 0 0 1]
 [0 1 0 0 1 1 0]]

=== 4. Combined Unigrams + Bigrams ===
Combined Vocabulary (Unigrams + Bigrams):
['attendance' 'claim' 'claim your' 'confirm' 'confirm your' 'free'
 'free money' 'meeting' 'meeting attendance' 'money' 'money claim' 'now'
 'please' 'please confirm' 'prize' 'prize

In [3]:
# Practical N-grams Example: Sentiment Analysis Context
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Five short movie reviews with mixed sentiment, including negations
# ("not good") that single-word features cannot distinguish.
reviews = [
    "This movie is really good and entertaining",
    "The movie is not good at all",
    "Really bad movie, not entertaining",
    "This is a really bad experience",
    "Good movie, really entertaining and good"
]

print("=== N-grams for Sentiment Analysis ===")
print("Reviews:")
for number, text in enumerate(reviews, start=1):
    print(f"{number}. {text}")

print("\n" + "="*60)

# Each n-gram range is paired (by position) with a human-readable label
n_gram_ranges = [(1, 1), (2, 2), (3, 3), (1, 2), (1, 3)]
range_names = ["Unigrams", "Bigrams", "Trigrams", "Uni+Bigrams", "Uni+Bi+Trigrams"]

# The row labels are identical for every table, so build them once
review_labels = [f"Review {i+1}" for i in range(len(reviews))]

for name, ngram_range in zip(range_names, n_gram_ranges):
    print(f"\n=== {name} {ngram_range} ===")

    vectorizer = CountVectorizer(ngram_range=ngram_range, lowercase=True)
    X = vectorizer.fit_transform(reviews)
    feature_names = vectorizer.get_feature_names_out()

    print(f"Number of features: {len(feature_names)}")
    print(f"Features: {list(feature_names)}")
    print(f"Matrix shape: {X.shape}")

    # Dense conversion is only reasonable because the example is tiny.
    # Wide matrices are cut down to their first five columns for display.
    shown_values = X.toarray()
    shown_columns = feature_names
    if len(feature_names) > 15:
        print("Matrix too large to display - showing first 5 features only")
        shown_values = shown_values[:, :5]
        shown_columns = feature_names[:5]
    else:
        print("Feature Matrix:")

    df = pd.DataFrame(shown_values, columns=shown_columns, index=review_labels)
    print(df)

# Closing summary of what each n-gram size contributes to sentiment features.
# The mis-encoded bullet sequence ("‚Ä¢") is restored to "•".
print("\n" + "="*60)
print("=== Key Observations ===")
print("• Unigrams: 'good', 'bad', 'really' - basic sentiment words")
print("• Bigrams: 'really good', 'really bad', 'not good' - stronger sentiment context")
print("• Trigrams: 'is really good', 'not good at' - even more context")
print("• Trade-off: More n-grams = more features but better context capture")


=== N-grams for Sentiment Analysis ===
Reviews:
1. This movie is really good and entertaining
2. The movie is not good at all
3. Really bad movie, not entertaining
4. This is a really bad experience
5. Good movie, really entertaining and good


=== Unigrams (1, 1) ===
Number of features: 13
Features: ['all', 'and', 'at', 'bad', 'entertaining', 'experience', 'good', 'is', 'movie', 'not', 'really', 'the', 'this']
Matrix shape: (5, 13)
Feature Matrix:
          all  and  at  bad  entertaining  experience  good  is  movie  not  \
Review 1    0    1   0    0             1           0     1   1      1    0   
Review 2    1    0   1    0             0           0     1   1      1    1   
Review 3    0    0   0    1             1           0     0   0      1    1   
Review 4    0    0   0    1             0           1     0   1      0    0   
Review 5    0    1   0    0             1           0     2   0      1    0   

          really  the  this  
Review 1       1    0     1  
Review 2    

3. TF-IDF (Term Frequency-Inverse Document Frequency)
TF-IDF is a numerical statistic used to reflect how important a word is to a document in a corpus.

Term Frequency (TF) measures how often a word appears in a specific document. The more frequent the word, the higher its TF score.



In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two documents, each dominated by one repeated language name, so the
# per-document term-frequency contrast is easy to see in the scores.
corpus = [
    "Python is a programming language. Python is easy to learn. Python Python Python is popular.",  # Document 1: "Python" repeated
    "Java is also a programming language. Java developers use Java for enterprise applications."     # Document 2: "Java" repeated
]

# Fit a TF-IDF model on the corpus and encode both documents
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Column k of X corresponds to feature_names[k]
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:", feature_names)

# Dense copy for printing and row-wise inspection (fine for a toy corpus)
print("\nTF-IDF Vectors:")
tfidf_matrix = X.toarray()
print(tfidf_matrix)

print("\n" + "="*60)
print("=== DETAILED TF-IDF ANALYSIS ===")

import pandas as pd

# Row labels shared by the score table and the per-document ranking below
tfidf_doc_labels = ['Document 1 (Python-focused)', 'Document 2 (Java-focused)']

df = pd.DataFrame(tfidf_matrix, columns=feature_names, index=tfidf_doc_labels)

print("\nTF-IDF Score Matrix:")
print(df.round(4))

# Rank each document's terms by score and list up to five non-zero ones
print("\n=== TOP WORDS BY TF-IDF SCORE ===")
for doc_name, doc_scores in zip(tfidf_doc_labels, tfidf_matrix):
    print(f"\n{doc_name}:")
    ranked = sorted(zip(feature_names, doc_scores),
                    key=lambda pair: pair[1],
                    reverse=True)

    for word, score in ranked[:5]:
        if score > 0:
            print(f"  {word}: {score:.4f}")

# Raw counts from CountVectorizer, for comparison with the weighted scores
print("\n" + "="*60)
print("=== RAW TERM FREQUENCIES (TF) ===")

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
tf_matrix = count_vectorizer.fit_transform(corpus).toarray()

tf_df = pd.DataFrame(tf_matrix,
                     columns=count_vectorizer.get_feature_names_out(),
                     index=['Document 1', 'Document 2'])

print("Term Frequency Matrix (raw counts):")
print(tf_df)

# Summary of the TF-IDF results. The mis-encoded sequences are restored to the
# intended characters: "‚Ä¢" -> "•", "‚Üí" -> "→", "√ó" -> "×".
print("\n=== KEY OBSERVATIONS ===")
print("• 'Python' appears 5 times in Document 1 → High TF score")
print("• 'Java' appears 3 times in Document 2 → High TF score")
print("• 'programming' and 'language' appear in both → Lower IDF, moderate TF-IDF")
print("• Words like 'easy', 'learn', 'enterprise' are unique → Higher IDF, higher TF-IDF")
print("• TF-IDF = TF × IDF (balances frequency with rarity)")

# The fitted vectorizer exposes one IDF weight per vocabulary entry, in the
# same order as feature_names, so the two can be walked in lockstep.
print("\nIDF Scores (Inverse Document Frequency):")
for name, idf_weight in zip(feature_names, vectorizer.idf_):
    print(f"{name}: {idf_weight:.4f}")

Vocabulary: ['also' 'applications' 'developers' 'easy' 'enterprise' 'for' 'is' 'java'
 'language' 'learn' 'popular' 'programming' 'python' 'to' 'use']

TF-IDF Vectors:
[[0.         0.         0.         0.17008209 0.         0.
  0.36304442 0.         0.12101481 0.17008209 0.17008209 0.12101481
  0.85041044 0.17008209 0.        ]
 [0.24604336 0.24604336 0.24604336 0.         0.24604336 0.24604336
  0.17506188 0.73813008 0.17506188 0.         0.         0.17506188
  0.         0.         0.24604336]]

=== DETAILED TF-IDF ANALYSIS ===

TF-IDF Score Matrix:
                              also  applications  developers    easy  \
Document 1 (Python-focused)  0.000         0.000       0.000  0.1701   
Document 2 (Java-focused)    0.246         0.246       0.246  0.0000   

                             enterprise    for      is    java  language  \
Document 1 (Python-focused)       0.000  0.000  0.3630  0.0000    0.1210   
Document 2 (Java-focused)         0.246  0.246  0.1751  0.7381    0.17

4. Word Embedding
Word Embedding is a text representation technique where words or phrases from the
 vocabulary are mapped to vectors of real numbers.

In [6]:
from gensim.models import Word2Vec

# Sample corpus (tokenized sentences): Word2Vec takes a list of token lists
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "walked", "on", "the", "street"],
    ["a", "car", "drove", "by", "a", "truck"],
    ["man", "is", "the", "king", "of", "jungle"],
    ["woman", "is", "the", "queen", "of", "a", "family"]
]

# Train the Word2Vec model
# 'sg' is not passed, so the library default applies: sg=0, i.e. CBOW
# (pass sg=1 to train Skip-gram instead)
model = Word2Vec(sentences=corpus,
                 vector_size=100,  # Dimensionality of the word vectors
                 window=5,         # Maximum distance between the current and predicted word
                 min_count=1,      # Ignores all words with total frequency lower than this
                 workers=4)        # Number of worker threads used for training

# Get the vector for a word
print("Vector for 'cat':\n", model.wv['cat'])

# Find the most similar words (a corpus this small gives near-random neighbours)
print("\nWords most similar to 'cat':\n", model.wv.most_similar('cat'))

Vector for 'cat':
 [ 0.00257251  0.00084847 -0.00253914  0.00935892  0.00275784  0.00409409
 -0.00118331  0.00090606  0.00662316 -0.00072743  0.00334267 -0.00067134
  0.00524796  0.00363926  0.002584   -0.0053113  -0.00470893  0.00430647
 -0.0059082  -0.00018227 -0.00063462  0.00349116 -0.00844191  0.00881516
 -0.00145111 -0.00533294  0.00405283 -0.00193385 -0.0077646  -0.00449672
 -0.00038841 -0.00894825  0.00057069  0.00244194 -0.00322519  0.00257062
  0.00248097  0.00998819  0.00142857  0.0020191   0.00277751 -0.0020782
 -0.0086982   0.00802494 -0.00197519 -0.0096929  -0.00654969 -0.00394582
  0.00395376  0.00504065  0.00608667 -0.00677156  0.00069044 -0.00277415
 -0.0052109   0.0069812   0.00395213 -0.00310513 -0.00827734 -0.00514148
 -0.00064909  0.007812    0.00604447 -0.00845231 -0.009565    0.00713558
 -0.00232571 -0.00369028  0.00574776 -0.00584365  0.00509312 -0.00024008
 -0.00687449 -0.0003305   0.00635958  0.00929556  0.00221949  0.00505199
 -0.00497511 -0.00079866 -0.00531

### Word2Vec Code Explanation

The following code demonstrates how to train and use a Word2Vec model using the Gensim library. Word2Vec creates dense vector representations of words by learning from their context in sentences.

In [7]:
# Let's break down the Word2Vec code step by step
from gensim.models import Word2Vec

print("=== STEP 1: DATA PREPARATION ===")
# Sample corpus (tokenized sentences)
# Important: Word2Vec requires PRE-TOKENIZED text (list of lists)
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],        # Sentence 1: tokenized into words
    ["the", "dog", "walked", "on", "the", "street"],  # Sentence 2: tokenized into words
    ["a", "car", "drove", "by", "a", "truck"],        # Sentence 3: tokenized into words
    ["man", "is", "the", "king", "of", "jungle"],     # Sentence 4: tokenized into words
    ["woman", "is", "the", "queen", "of", "a", "family"] # Sentence 5: tokenized into words
]

print("Training corpus (each sentence is a list of words):")
for i, sentence in enumerate(corpus, 1):
    print(f"  Sentence {i}: {sentence}")

# Distinct tokens across all sentences; a set comprehension avoids building a
# throwaway list just to de-duplicate it.
unique_words = {word for sentence in corpus for word in sentence}

print(f"\nTotal sentences: {len(corpus)}")
print(f"Total unique words: {len(unique_words)}")

print("\n" + "="*60)
print("=== STEP 2: MODEL TRAINING ===")

# Train the Word2Vec model with detailed explanation of each parameter
model = Word2Vec(
    sentences=corpus,        # Input: list of tokenized sentences
    vector_size=100,        # Output: each word becomes a 100-dimensional vector
    window=5,               # Context: look at 5 words before and after target word
    min_count=1,            # Vocabulary: include words that appear at least 1 time
    workers=4,              # Performance: use 4 worker threads for training
    sg=0                    # Architecture: sg=0 for CBOW, sg=1 for Skip-gram
)

print("Model training completed!")
print(f"Vocabulary size: {len(model.wv.key_to_index)}")
print(f"Vector dimensions: {model.wv.vector_size}")

print("\n" + "="*60)
print("=== STEP 3: EXPLORING WORD VECTORS ===")

# Get the vector for a specific word
word = 'cat'
vector = model.wv[word]
print(f"Vector for '{word}' (first 10 dimensions):")
print(f"Shape: {vector.shape}")
print(f"Values: {vector[:10]}")  # Show only first 10 dimensions for readability

print("\n" + "="*60)
print("=== STEP 4: FINDING SIMILAR WORDS ===")

# Find words most similar to the query word. Only the lookup sits inside the
# try, and only KeyError (word missing from the vocabulary) is handled, so
# unrelated failures and Ctrl-C are no longer silently swallowed.
try:
    similar_words = model.wv.most_similar(word, topn=3)
except KeyError:
    print("Not enough data to compute similarities (need larger corpus)")
else:
    print(f"Words most similar to '{word}':")
    # A separate loop name keeps `word` bound to the query word afterwards
    for similar_word, similarity in similar_words:
        print(f"  {similar_word}: {similarity:.4f}")

# Explanatory summary of the training parameters used in this walkthrough.
# Mis-encoded sequences are restored: "‚Ä¢" -> "•", "‚Üí" -> "→", "‚â•" -> "≥".
print("\n" + "="*60)
print("=== STEP 5: UNDERSTANDING THE PARAMETERS ===")
print(" KEY PARAMETERS EXPLAINED:")
print("• vector_size=100: Each word → 100-dimensional dense vector")
print("• window=5: Uses 5 words before + 5 words after for context")
print("• min_count=1: Include words appearing ≥1 times (use higher for large corpora)")
print("• workers=4: Parallel processing using 4 CPU cores")
print("• sg=0: CBOW architecture (sg=1 would be Skip-gram)")

print("\n WHAT HAPPENS DURING TRAINING:")
print("1. Model sees: ['the', 'cat', 'sat', 'on', 'the']")
print("2. For word 'cat', context window includes: ['the', 'sat', 'on', 'the']")
print("3. Neural network learns: 'cat' should be similar to words in similar contexts")
print("4. Result: Words with similar contexts get similar vectors")

print("\n SEMANTIC RELATIONSHIPS CAPTURED:")
print("• 'cat' and 'dog' should be similar (both animals)")
print("• 'king' and 'queen' should be similar (both royalty)")
print("• 'sat' and 'walked' should be similar (both actions)")

=== STEP 1: DATA PREPARATION ===
Training corpus (each sentence is a list of words):
  Sentence 1: ['the', 'cat', 'sat', 'on', 'the', 'mat']
  Sentence 2: ['the', 'dog', 'walked', 'on', 'the', 'street']
  Sentence 3: ['a', 'car', 'drove', 'by', 'a', 'truck']
  Sentence 4: ['man', 'is', 'the', 'king', 'of', 'jungle']
  Sentence 5: ['woman', 'is', 'the', 'queen', 'of', 'a', 'family']

Total sentences: 5
Total unique words: 21

=== STEP 2: MODEL TRAINING ===
Model training completed!
Vocabulary size: 21
Vector dimensions: 100

=== STEP 3: EXPLORING WORD VECTORS ===
Vector for 'cat' (first 10 dimensions):
Shape: (100,)
Values: [ 0.00257251  0.00084847 -0.00253914  0.00935892  0.00275784  0.00409409
 -0.00118331  0.00090606  0.00662316 -0.00072743]

=== STEP 4: FINDING SIMILAR WORDS ===
Words most similar to 'cat':
  of: 0.1487
  truck: 0.1121
  walked: 0.1075

=== STEP 5: UNDERSTANDING THE PARAMETERS ===
 KEY PARAMETERS EXPLAINED:
• vector_size=100: Each word → 100-dimensional dense ve

### Key Differences: CBOW vs Skip-gram

**CBOW (Continuous Bag of Words) - sg=0:**
- **Input**: Context words → **Output**: Target word
- **Example**: Given ["the", "sat", "on", "the"] → Predict "cat"
- **Speed**: Faster training
- **Best for**: Frequent words, smaller datasets

**Skip-gram - sg=1:**
- **Input**: Target word → **Output**: Context words  
- **Example**: Given "cat" → Predict ["the", "sat", "on", "the"]
- **Quality**: Better word representations
- **Best for**: Rare words, larger datasets

In [None]:
# CBOW (Continuous Bag of Words) Example Program
from gensim.models import Word2Vec
import numpy as np

print("=== CBOW ARCHITECTURE DEMONSTRATION ===")
# The arrow was mis-encoded ("‚Üí"); restored to "→"
print("CBOW: Context Words → Target Word")
print()

# Larger corpus for better CBOW demonstration
sentences = [
    ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
    ["a", "brown", "fox", "runs", "quickly", "through", "the", "forest"],
    ["the", "lazy", "dog", "sleeps", "under", "the", "warm", "sun"],
    ["quick", "animals", "jump", "over", "obstacles", "in", "the", "wild"],
    ["brown", "animals", "run", "fast", "through", "green", "forests"],
    ["lazy", "pets", "sleep", "peacefully", "in", "warm", "houses"],
    ["the", "dog", "and", "fox", "are", "both", "smart", "animals"],
    ["quick", "brown", "foxes", "are", "clever", "forest", "animals"]
]

# Show only a short preview; the same count feeds the "... and N more" line
# so the two cannot drift apart.
preview_count = 3
print("Training corpus:")
for i, sentence in enumerate(sentences[:preview_count], 1):
    print(f"  {i}. {' '.join(sentence)}")
print(f"  ... and {len(sentences)-preview_count} more sentences")

print("\n" + "="*60)
print("=== TRAINING CBOW MODEL ===")

# Train CBOW model (sg=0)
cbow_model = Word2Vec(
    sentences=sentences,
    vector_size=50,      # Smaller for demonstration
    window=3,            # Context window of 3 words on each side
    min_count=1,         # Include all words
    workers=1,           # Single thread removes thread-scheduling nondeterminism
                         # NOTE(review): gensim documents that fully reproducible
                         # runs also need a fixed seed and PYTHONHASHSEED - confirm
    sg=0,                # CBOW architecture
    epochs=100           # More training epochs
)

print("CBOW Model trained successfully!")
print(f"Vocabulary size: {len(cbow_model.wv.key_to_index)}")
print(f"Vector dimensions: {cbow_model.wv.vector_size}")

print("\n" + "="*60)
print("=== HOW CBOW WORKS: STEP BY STEP ===")

# Demonstrate CBOW concept with a specific example
target_word = "fox"
example_sentence = ["the", "quick", "brown", "fox", "jumps", "over", "the"]
window_size = 3

print(f"Example sentence: {' '.join(example_sentence)}")
print(f"Target word: '{target_word}'")
print(f"Window size: {window_size}")

# Find the target word position (first occurrence)
target_index = example_sentence.index(target_word)
print(f"Target word position: {target_index}")

# Extract context words: up to `window_size` tokens on each side of the
# target, clamped to the sentence boundaries, with the target itself left out.
start_idx = max(0, target_index - window_size)
end_idx = min(len(example_sentence), target_index + window_size + 1)
context_words = (example_sentence[start_idx:target_index]
                 + example_sentence[target_index + 1:end_idx])

print(f"Context words: {context_words}")
# The arrow was mis-encoded ("‚Üí"); restored to "→"
print(f"CBOW Learning: {context_words} → '{target_word}'")

print("\n" + "="*60)
print("=== WORD SIMILARITIES (CBOW) ===")

# Print the three nearest neighbours of each probe word that is in the
# model's vocabulary; words missing from the vocabulary are skipped silently.
test_words = ['fox', 'dog', 'brown', 'quick']
for word in test_words:
    if word not in cbow_model.wv:
        continue
    # Only the lookup is guarded, and only KeyError (unknown word) is treated
    # as "not enough data" - a bare `except:` would also hide Ctrl-C and bugs.
    try:
        similar = cbow_model.wv.most_similar(word, topn=3)
    except KeyError:
        print(f"\nNot enough data for '{word}' similarities")
        continue
    print(f"\nWords similar to '{word}':")
    for sim_word, score in similar:
        print(f"  {sim_word}: {score:.4f}")

print("\n" + "="*60)
print("=== CBOW VS SKIP-GRAM COMPARISON ===")

# Train Skip-gram model for comparison: identical hyper-parameters to the
# CBOW model, differing only in sg=1
print("Training Skip-gram model for comparison...")
skipgram_model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,
    sg=1,                # Skip-gram architecture
    epochs=100
)

print("\nComparing models on word 'fox':")
if 'fox' in cbow_model.wv and 'fox' in skipgram_model.wv:
    # Guard only the two lookups and catch only KeyError (unknown word); the
    # printing happens in the else-branch so its errors are not masked.
    try:
        cbow_similar = cbow_model.wv.most_similar('fox', topn=2)
        skipgram_similar = skipgram_model.wv.most_similar('fox', topn=2)
    except KeyError:
        print("Not enough data for comparison")
    else:
        print("CBOW similar words:")
        for word, score in cbow_similar:
            print(f"  {word}: {score:.4f}")

        print("Skip-gram similar words:")
        for word, score in skipgram_similar:
            print(f"  {word}: {score:.4f}")

# Closing summary of the CBOW architecture. Mis-encoded sequences restored:
# "‚Ä¢" -> "•", "üéØ" -> "🎯", "‚ö°" -> "⚡", "üîç" -> "🔍".
print("\n" + "="*60)
print("=== KEY CBOW INSIGHTS ===")
print("🎯 CBOW Architecture:")
print("• Input: Context words ['the', 'quick', 'brown', 'jumps']")
print("• Output: Target word 'fox'")
print("• Learning: Predicts center word from surrounding context")
print()
print("⚡ CBOW Advantages:")
print("• Faster training than Skip-gram")
print("• Better for frequent words")
print("• Good performance with smaller datasets")
print()
print("🔍 CBOW Process:")
print("1. Take context words around target")
print("2. Average their vectors")
print("3. Predict the target word")
print("4. Update weights based on prediction accuracy")

GloVe (Global Vectors for Word Representation)
GloVe (Global Vectors for Word Representation) is an unsupervised learning model 
that generates word embeddings by combining the advantages of 2 major embedding
 approaches: 
 global matrix factorization (like Latent Semantic Analysis) and
 local context window methods (like Word2Vec).

In [None]:
# Alternative GloVe implementation using available libraries
# Since the glove package has build issues, we'll demonstrate GloVe concepts

import numpy as np
from collections import Counter, defaultdict
from gensim.models import Word2Vec

print("=== GloVe Concept Demonstration ===")
print("Note: This demonstrates GloVe concepts using available libraries")
print()

# Sample corpus for demonstration: five pre-tokenized sentences (lists of
# lowercase tokens) that reuse a few words ('the', 'cat', 'dog', 'mouse')
# so that co-occurrence counts are non-trivial.
sentences = [
    ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    ['a', 'dog', 'ran', 'down', 'the', 'street'],
    ['the', 'cat', 'was', 'chasing', 'a', 'mouse'],
    ['the', 'dog', 'barked', 'at', 'the', 'cat'],
    ['a', 'mouse', 'ran', 'from', 'the', 'cat']
]

# 1. Build co-occurrence matrix (core concept of GloVe)
def build_cooccurrence_matrix(sentences, window_size=2):
    """Count words and distance-weighted co-occurrences over tokenized sentences.

    Args:
        sentences: iterable of token sequences (each indexable, with a length).
        window_size: how many positions to the left and to the right of a
            word count as its context.

    Returns:
        A ``(word_counts, cooccurrence)`` pair: ``word_counts`` is a Counter
        of raw token frequencies; ``cooccurrence`` is a defaultdict mapping
        each word to a Counter of context words, where every pairing adds
        ``1 / distance`` (adjacent words add 1.0, words two apart add 0.5).
        Pairs are recorded from both sides, so the table is symmetric.
    """
    word_counts = Counter()
    cooccurrence = defaultdict(Counter)

    for tokens in sentences:
        sentence_length = len(tokens)
        for position, center in enumerate(tokens):
            word_counts[center] += 1

            # Clamp the window to the sentence boundaries
            first = max(0, position - window_size)
            stop = min(sentence_length, position + window_size + 1)

            for neighbour in range(first, stop):
                if neighbour == position:
                    continue  # a word is not its own context
                # Nearer neighbours contribute more than distant ones
                cooccurrence[center][tokens[neighbour]] += 1.0 / abs(position - neighbour)

    return word_counts, cooccurrence

# Build the co-occurrence matrix
word_counts, cooccurrence = build_cooccurrence_matrix(sentences, window_size=2)

print("Word frequencies:")
for word, count in sorted(word_counts.items()):
    print(f"  {word}: {count}")

print(f"\nCo-occurrence matrix built with {len(cooccurrence)} unique words")

# Show co-occurrence for 'cat', strongest context words first
print("\nWords that co-occur with 'cat':")
if 'cat' in cooccurrence:
    for word, weight in sorted(cooccurrence['cat'].items(), key=lambda x: x[1], reverse=True):
        print(f"  {word}: {weight:.2f}")

# 2. For comparison, let's also show Word2Vec results on the same data
print("\n=== Word2Vec Comparison ===")
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, workers=1)

print("Words most similar to 'cat' (Word2Vec):")
# Only the lookup is guarded, and only KeyError (word not in the vocabulary)
# is handled; a bare `except:` would also swallow Ctrl-C and real bugs.
try:
    similar_words = model.wv.most_similar('cat', topn=3)
except KeyError:
    print("  Not enough data for similarity calculation")
else:
    for word, similarity in similar_words:
        print(f"  {word}: {similarity:.3f}")

# 3. Show the concept of GloVe vs Word2Vec
print("\n=== Key Differences ===")
print("GloVe:")
print("  - Uses global co-occurrence statistics")
print("  - Builds explicit co-occurrence matrix")
print("  - Combines global matrix factorization with local context")

print("\nWord2Vec:")
print("  - Uses local context windows")
print("  - Predicts words from context (or vice versa)")
print("  - No explicit global co-occurrence matrix")

print("\nNote: To use actual GloVe embeddings, you can download pre-trained vectors")
print("from Stanford's GloVe website and load them with gensim.models.KeyedVectors")

4. ELMo
ELMo was one of the first major breakthroughs in contextualized embeddings. It uses a bidirectional LSTM (Long Short-Term Memory) network to create a word’s vector representation. Unlike static embeddings, an ELMo vector for a word is a function of the entire sentence it appears in.

In [None]:
# ELMo (Embeddings from Language Models) Example Program
import numpy as np

print("=== ELMo CONTEXTUALIZED EMBEDDINGS DEMONSTRATION ===")
print("ELMo: Context-dependent word representations")
print()

# Probe for the two optional ELMo back ends. Each probe sets a flag that
# selects a branch further down; a missing package is reported, not fatal.
# The status markers were mis-encoded ("‚úÖ" / "‚ùå"); restored to ✅ / ❌.
try:
    # Try importing TensorFlow Hub (most common ELMo implementation)
    import tensorflow_hub as hub
    import tensorflow as tf
    elmo_available = True
    print("✅ TensorFlow Hub available for ELMo")
except ImportError:
    elmo_available = False
    print("❌ TensorFlow Hub not available")

try:
    # Alternative: allennlp ELMo
    from allennlp.modules.elmo import Elmo, batch_to_ids
    allennlp_available = True
    print("✅ AllenNLP ELMo available")
except ImportError:
    allennlp_available = False
    print("❌ AllenNLP not available")

print()

# Pick one presentation branch based on the availability flags set earlier.
# Neither branch downloads or runs a model: each only prints where the
# pre-trained weights live and what ELMo would do with the example sentences.
if elmo_available:
    print("=== TENSORFLOW HUB ELMo IMPLEMENTATION ===")
    try:
        # Location of the pre-trained ELMo model on TensorFlow Hub
        print("Loading ELMo model from TensorFlow Hub...")
        elmo_url = "https://tfhub.dev/google/elmo/3"

        # This is a simplified example - actual implementation would require download
        print("Note: This requires internet connection and model download")
        print("Model URL:", elmo_url)

        # Sample sentences showing context dependency: 'bank' and 'bat' each
        # appear twice with different senses
        sentences = [
            ["The", "bank", "was", "built", "near", "the", "river"],
            ["I", "need", "to", "go", "to", "the", "bank", "for", "money"],
            ["The", "bat", "flew", "out", "of", "the", "cave"],
            ["He", "hit", "the", "ball", "with", "a", "wooden", "bat"]
        ]

        print("\nExample sentences showing context dependency:")
        for i, sentence in enumerate(sentences, 1):
            print(f"{i}. {' '.join(sentence)}")

        # Bullets were mis-encoded ("‚Ä¢"); restored to "•"
        print("\nELMo would generate different vectors for:")
        print("• 'bank' in sentence 1 (river bank) vs sentence 2 (financial bank)")
        print("• 'bat' in sentence 3 (animal) vs sentence 4 (sports equipment)")

    except Exception as e:
        print(f"Error loading TensorFlow Hub ELMo: {e}")

elif allennlp_available:
    print("=== ALLENNLP ELMo IMPLEMENTATION ===")
    try:
        # AllenNLP ELMo implementation: published option/weight file locations
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

        print("AllenNLP ELMo configuration:")
        print(f"Options: {options_file}")
        print(f"Weights: {weight_file}")
        print("Note: Actual usage requires downloading these files")

    except Exception as e:
        print(f"Error with AllenNLP ELMo: {e}")

else:
    print("=== ELMo CONCEPT DEMONSTRATION ===")
    print("Since ELMo packages aren't available, let's demonstrate the concept")

print("\n" + "="*60)
print("=== ELMo CONCEPT: CONTEXTUALIZED EMBEDDINGS ===")

# Demonstrate ELMo concept with examples.
# Mis-encoded sequences restored: "üîç" -> "🔍", "‚Üí" -> "→".
print("🔍 KEY CONCEPT: Same word, different meanings based on context")
print()

# Each entry pairs an ambiguous word with two sentences and, in the same
# order, the sense the word takes in each sentence.
examples = [
    {
        "word": "bank",
        "sentences": [
            "The bank of the river was muddy",
            "I deposited money at the bank"
        ],
        "meanings": ["geographical feature", "financial institution"]
    },
    {
        "word": "bat",
        "sentences": [
            "The bat flew through the night",
            "He swung the bat at the ball"
        ],
        "meanings": ["flying mammal", "sports equipment"]
    },
    {
        "word": "rock",
        "sentences": [
            "The ship hit a large rock",
            "Let's rock and roll tonight"
        ],
        "meanings": ["stone/mineral", "music/movement"]
    }
]

for example in examples:
    word = example["word"]
    sentences = example["sentences"]
    meanings = example["meanings"]

    print(f"Word: '{word}'")
    # 1-based numbering straight from enumerate instead of i+1 arithmetic
    for number, (sentence, meaning) in enumerate(zip(sentences, meanings), start=1):
        print(f"  {number}. \"{sentence}\"")
        print(f"     → Meaning: {meaning}")
        print(f"     → ELMo generates DIFFERENT vectors for '{word}' here")
    print()

# Walk through how a bidirectional LSTM sees one sentence.
# Mis-encoded sequences restored: "üß†" -> "🧠", "üìä" -> "📊",
# "‚Üí" -> "→", "‚Üê" -> "←".
print("=== HOW ELMo WORKS ===")
print("🧠 ARCHITECTURE:")
print("1. Bidirectional LSTM processes text in both directions")
print("2. Forward LSTM: reads left → right")
print("3. Backward LSTM: reads right ← left")
print("4. Combines both directions for context-aware representation")
print()

print("📊 ELMo PROCESS:")
sentence_example = "The bank was built near the river"
words = sentence_example.split()

print(f"Example: \"{sentence_example}\"")
print()
print("Forward LSTM (left to right):")
# At step k the forward pass has seen the first k words
for step, word in enumerate(words, start=1):
    context = " ".join(words[:step])
    print(f"  Step {step}: {context} → processes '{word}'")

print()
print("Backward LSTM (right to left):")
# At step k the backward pass has seen the last k words
for step, word in enumerate(reversed(words), start=1):
    context = " ".join(words[len(words) - step:])
    print(f"  Step {step}: {context} ← processes '{word}'")

print()
print("Final representation for 'bank':")
print("  = combination of forward + backward representations")
print("  = captures that 'bank' is near 'river' (geographical context)")

print("\n" + "="*60)
print("=== ELMo VS STATIC EMBEDDINGS ===")

# First row is the table header; the remaining rows are
# [aspect, static-embedding property, ELMo property].
# The multiplication sign was mis-encoded ("√ó"); restored to "×".
comparison_data = [
    ["Aspect", "Static Embeddings (Word2Vec/GloVe)", "ELMo"],
    ["Context", "Same vector regardless of context", "Different vector per context"],
    ["Polysemy", "One vector for all meanings", "Multiple vectors for different meanings"],
    ["Example", "'bank' always same vector", "'bank' different in financial vs river context"],
    ["Architecture", "Single embedding layer", "Bidirectional LSTM"],
    ["Computation", "Fast lookup", "Requires forward pass through network"],
    ["Memory", "Small (vocabulary × dimensions)", "Large (full neural network)"]
]

# Every row uses the same column layout; the header is followed by a rule
for row in comparison_data:
    print(f"{row[0]:<12} | {row[1]:<35} | {row[2]}")
    if row[0] == "Aspect":  # Header
        print("-" * 85)

print("\n" + "=" * 60)
print("=== PRACTICAL ELMo USAGE ===")
print("💻 INSTALLATION:")
print("pip install tensorflow-hub  # For TensorFlow Hub ELMo")
print("pip install allennlp        # For AllenNLP ELMo")
print()

print("🔧 BASIC USAGE TEMPLATE:")
# Printed verbatim as a copy-paste starting point; nothing here is executed.
# The TF Hub ELMo module is in TF1 Hub format, so under `hub.load` it is
# invoked through its "default" signature rather than called directly, and
# AllenNLP's Elmo returns a dict whose "elmo_representations" holds the tensors.
elmo_usage_template = """
# TensorFlow Hub approach
import tensorflow as tf
import tensorflow_hub as hub
elmo = hub.load("https://tfhub.dev/google/elmo/3")
embeddings = elmo.signatures["default"](
    tf.constant(["Hello world", "How are you"]))["elmo"]

# AllenNLP approach
from allennlp.modules.elmo import Elmo, batch_to_ids
elmo = Elmo(options_file, weight_file, 2, dropout=0)
character_ids = batch_to_ids(sentences)  # sentences: list of token lists
embeddings = elmo(character_ids)["elmo_representations"]
"""
print(elmo_usage_template)

print("🎯 APPLICATIONS:")
# Kept as data and printed in a loop, matching how the Transformer section
# further down lists its applications.
elmo_applications = [
    "Named Entity Recognition (NER)",
    "Part-of-Speech Tagging",
    "Sentiment Analysis",
    "Question Answering",
    "Any task requiring context understanding",
]

for task in elmo_applications:
    print(f"• {task}")

print("\n" + "=" * 60)
print("=== ELMo ADVANTAGES & LIMITATIONS ===")

print("✅ ADVANTAGES:")
elmo_advantages = [
    "Handles polysemy (multiple meanings)",
    "Context-dependent representations",
    "Improves downstream task performance",
    "Works with out-of-vocabulary words",
]
for point in elmo_advantages:
    print(f"• {point}")
print()

print("⚠️ LIMITATIONS:")
elmo_limitations = [
    "Computationally expensive",
    "Requires large memory",
    "Slower than static embeddings",
    "Superseded by Transformer models (BERT, GPT)",
]
for point in elmo_limitations:
    print(f"• {point}")

# Plain strings: nothing is interpolated, so no f-prefix is needed.
print("\n🚀 EVOLUTION:")
print("Word2Vec/GloVe → ELMo → BERT/GPT → Modern Transformers")

5. Transformer-Based Models: These models are the current state of the art for text representation. They rely on the Transformer architecture and its self-attention mechanism, which can weigh the importance of different words in a sentence when encoding a specific word.

In [None]:
# BERT and Transformer-Based Models Example Program
import numpy as np

print("=== BERT & TRANSFORMER-BASED MODELS DEMONSTRATION ===")
print("BERT: Bidirectional Encoder Representations from Transformers")
print()

# Probe for the optional dependencies. The rest of the cell branches on
# `transformers_available`, so a missing package produces a hint instead of
# a traceback. Only the imports sit inside `try`; the success message lives
# in `else` so an unrelated error there could never be mistaken for a
# missing package.
try:
    from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
    import torch
except ImportError:
    transformers_available = False
    print("❌ Transformers library not available")
    print("   Install with: pip install transformers torch")
else:
    transformers_available = True
    print("✅ Transformers library available")

print()

if transformers_available:
    print("=== BERT MODEL IMPLEMENTATION ===")
    # The broad `except Exception` below is deliberate: this is the outer
    # boundary of a demo, and any failure while loading or running the model
    # is reported to the reader instead of aborting the rest of the cell.
    try:
        # Load pre-trained BERT model and tokenizer
        print("Loading BERT-base-uncased model...")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        print("✅ BERT model loaded successfully!")
        print(f"Model: {model.config.name_or_path}")
        print(f"Hidden size: {model.config.hidden_size}")
        print(f"Number of layers: {model.config.num_hidden_layers}")
        print(f"Number of attention heads: {model.config.num_attention_heads}")

        # Example sentences showing context dependency
        # (only the first two, the 'bank' pair, are tokenized below).
        sentences = [
            "The bank near the river is beautiful",
            "I need to visit the bank for a loan",
            "The bat flew out of the cave",
            "He hit a home run with the bat"
        ]

        print("\n=== BERT TOKENIZATION ===")
        for i, sentence in enumerate(sentences[:2], 1):
            print(f"Sentence {i}: \"{sentence}\"")

            # Tokenize into WordPiece tokens (no special tokens)
            tokens = tokenizer.tokenize(sentence)
            print(f"  Tokens: {tokens}")

            # Convert to input IDs, including [CLS] / [SEP]
            input_ids = tokenizer.encode(sentence, add_special_tokens=True)
            print(f"  Input IDs: {input_ids}")

            # Decode back to text
            decoded = tokenizer.decode(input_ids)
            print(f"  Decoded: \"{decoded}\"")
            print()

        print("=== BERT EMBEDDINGS ===")
        # Get embeddings for a sentence
        sentence = "The bank near the river is beautiful"
        print(f"Analyzing: \"{sentence}\"")

        # Tokenize and prepare inputs
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

        # Inference only: no gradients are needed to read out embeddings.
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state

        print(f"Embedding shape: {embeddings.shape}")
        print(f"Sequence length: {embeddings.shape[1]} tokens")
        print(f"Hidden dimension: {embeddings.shape[2]}")

        # Show embeddings for the word "bank". The token list is read back
        # from the encoded ids themselves (special tokens included), so its
        # positions line up with `embeddings` without a hand-written offset
        # for [CLS].
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
        if 'bank' in tokens:
            bank_idx = tokens.index('bank')
            bank_embedding = embeddings[0, bank_idx, :]
            print("\nEmbedding for 'bank' (first 10 dimensions):")
            print(f"  {bank_embedding[:10].numpy()}")

    except Exception as e:
        print(f"Error with BERT implementation: {e}")
        print("Note: BERT models are large and require significant memory")

else:
    print("=== TRANSFORMER CONCEPT DEMONSTRATION ===")
    print("Since transformers library isn't available, let's demonstrate concepts")

print("\n" + "=" * 60)
print("=== TRANSFORMER ARCHITECTURE EXPLAINED ===")

print("🏗️ KEY COMPONENTS:")
# Numbering is generated, so adding or reordering entries cannot leave the
# printed list mis-numbered.
transformer_components = [
    "Self-Attention Mechanism",
    "Multi-Head Attention",
    "Position Encodings",
    "Feed-Forward Networks",
    "Layer Normalization",
    "Residual Connections",
]
for number, component in enumerate(transformer_components, start=1):
    print(f"{number}. {component}")
print()

print("🔍 SELF-ATTENTION CONCEPT:")
sentence = "The cat sat on the mat"
words = sentence.split()

print(f'Example: "{sentence}"')
print("Self-attention asks: How much should each word attend to every other word?")
print()

# Simulate attention weights (simplified)
print("Attention matrix (simplified concept):")
print("     ", " ".join(f"{word:>6}" for word in words))

# Mock attention weights for demonstration (hand-written, not model output).
# Row = the word doing the attending, column = the word attended to;
# each row sums to 1.
attention_weights = [
    [0.8, 0.1, 0.05, 0.02, 0.02, 0.01],  # The
    [0.2, 0.6, 0.1, 0.05, 0.03, 0.02],   # cat
    [0.1, 0.3, 0.4, 0.1, 0.05, 0.05],    # sat
    [0.05, 0.1, 0.2, 0.5, 0.1, 0.05],    # on
    [0.1, 0.05, 0.05, 0.1, 0.6, 0.1],    # the
    [0.05, 0.2, 0.1, 0.1, 0.1, 0.45]     # mat
]

for i, word in enumerate(words):
    weights_str = " ".join(f"{w:>6.2f}" for w in attention_weights[i])
    print(f"{word:>4}: {weights_str}")

# Derive the illustrative pair from the matrix itself so the explanation can
# never disagree with the numbers printed above. Self-attention entries
# (the diagonal) are skipped; tuples compare by weight first.
off_diagonal = [
    (weight, query, key)
    for q_idx, (query, row_weights) in enumerate(zip(words, attention_weights))
    for k_idx, (key, weight) in enumerate(zip(words, row_weights))
    if q_idx != k_idx
]
top_weight, top_query, top_key = max(off_diagonal)

print()
print(
    "High values = strong attention "
    f"(e.g., '{top_query}' attends to '{top_key}' with weight {top_weight:.2f})"
)
print("Low values = weak attention")

print("\n" + "=" * 60)
print("=== BERT SPECIAL FEATURES ===")

print("🎯 BIDIRECTIONAL CONTEXT:")
print("Unlike ELMo, BERT sees the ENTIRE sentence at once")
print("Example: 'The [MASK] sat on the mat'")
print("• BERT can use both 'The' and 'sat on the mat' to predict [MASK]")
print("• This bidirectional understanding is BERT's key innovation")
print()

print("🔤 SPECIAL TOKENS:")
# (token, description) pairs, printed right-aligned in a 6-character column.
special_tokens = [
    ("[CLS]", "Classification token - represents entire sentence"),
    ("[SEP]", "Separator token - separates sentences"),
    ("[MASK]", "Mask token - used for masked language modeling"),
    ("[PAD]", "Padding token - used to make sequences same length"),
    ("[UNK]", "Unknown token - for words not in vocabulary"),
]

for token, description in special_tokens:
    print(f"  {token:>6}: {description}")

print("\n" + "=" * 60)
print("=== TRANSFORMER VS PREVIOUS MODELS ===")

# Row 0 is the header; the remaining rows compare one feature across the
# three model families.
comparison = [
    ["Feature", "Word2Vec/GloVe", "ELMo", "BERT/Transformers"],
    ["Context", "None", "Sequential", "Bidirectional"],
    ["Architecture", "Shallow", "LSTM", "Self-Attention"],
    ["Training", "Word prediction", "Language modeling", "Masked LM + NSP"],
    ["Parallelization", "Not applicable", "Sequential", "Fully parallel"],
    ["Context window", "Fixed", "Sequential", "Full sequence"],
    ["Performance", "Good", "Better", "State-of-the-art"]
]

# One shared layout for all rows; a 70-character rule follows the header.
row_format = "{:<15} | {:<15} | {:<12} | {}"
for row in comparison:
    print(row_format.format(*row))
    if row[0] == "Feature":
        print("-" * 70)

print("\n" + "=" * 60)
print("=== BERT PRE-TRAINING TASKS ===")

# `examples` is reused for both tasks; after this section it holds the NSP
# lines.
print("🎭 MASKED LANGUAGE MODELING (MLM):")
examples = [
    "Original: The cat sat on the mat",
    "Masked:   The [MASK] sat on the mat",
    "Task:     Predict 'cat' using bidirectional context",
]
for example in examples:
    print(f"  {example}")

print()
print("🔗 NEXT SENTENCE PREDICTION (NSP):")
examples = [
    "Sentence A: The cat sat on the mat",
    "Sentence B: It was very comfortable",
    "Task:       Predict if B follows A (IsNext vs NotNext)",
]
for example in examples:
    print(f"  {example}")

print("\n" + "=" * 60)
print("=== POPULAR TRANSFORMER MODELS ===")

# (name, one-line description) pairs.
models = [
    ("BERT", "Bidirectional encoder, great for understanding"),
    ("GPT", "Autoregressive decoder, great for generation"),
    ("T5", "Text-to-text transformer, versatile"),
    ("RoBERTa", "Robustly optimized BERT, improved training"),
    ("ELECTRA", "Efficient pre-training, faster than BERT"),
    ("DistilBERT", "Distilled BERT, smaller and faster"),
    ("ALBERT", "A Lite BERT, parameter sharing"),
    ("DeBERTa", "Decoding-enhanced BERT with disentangled attention"),
]

# The loop variable is deliberately NOT called `model`: at module level that
# would overwrite the BertModel instance loaded earlier in this cell with a
# plain string.
for model_label, description in models:
    print(f"  {model_label:<12}: {description}")

print("\n" + "=" * 60)
print("=== PRACTICAL TRANSFORMER USAGE ===")

print("💻 INSTALLATION:")
print("pip install transformers torch")
print("pip install transformers[torch]  # With PyTorch")
print()

print("🔧 BASIC USAGE TEMPLATE:")
# Printed verbatim as a copy-paste starting point; nothing here is executed.
transformer_usage_template = """
from transformers import AutoTokenizer, AutoModel

# Load any transformer model
model_name = 'bert-base-uncased'  # or 'gpt2', 'roberta-base', etc.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize and encode
text = "Hello, world!"
inputs = tokenizer(text, return_tensors='pt')

# Get embeddings
outputs = model(**inputs)
embeddings = outputs.last_hidden_state
"""
print(transformer_usage_template)

print("🎯 APPLICATIONS:")
# Typical downstream tasks, printed as an indented bullet list.
applications = [
    "Text Classification (sentiment, spam detection)",
    "Named Entity Recognition (NER)",
    "Question Answering",
    "Text Summarization",
    "Machine Translation",
    "Text Generation",
    "Semantic Search",
    "Language Understanding",
]

for app in applications:
    print(f"  • {app}")

print("\n" + "=" * 60)
print("=== TRANSFORMER ADVANTAGES & LIMITATIONS ===")

print("✅ ADVANTAGES:")
advantages = [
    "State-of-the-art performance on most NLP tasks",
    "Bidirectional context understanding",
    "Transfer learning capabilities",
    "Parallel processing during training",
    "Rich pre-trained models available",
    "Fine-tuning for specific tasks",
]

for adv in advantages:
    print(f"  • {adv}")

print()
print("⚠️ LIMITATIONS:")
limitations = [
    "Very large model sizes (millions/billions of parameters)",
    "High computational requirements",
    "Significant memory usage",
    "Long training times",
    "Quadratic complexity with sequence length",
    "Black box nature (interpretability challenges)",
]

for lim in limitations:
    print(f"  • {lim}")

# Plain strings throughout: nothing is interpolated, so no f-prefix is needed.
print("\n🚀 EVOLUTION TIMELINE:")
# Release years corrected: PaLM, GPT-3.5 and ChatGPT appeared in 2022 and
# GPT-4 in 2023, so 2021 now lists models actually published that year.
timeline = [
    "2017: Transformer architecture introduced",
    "2018: BERT revolutionizes NLP",
    "2019: GPT-2, RoBERTa, ALBERT",
    "2020: GPT-3, T5, DeBERTa",
    "2021: Codex, Gopher",
    "2022: PaLM, GPT-3.5, ChatGPT",
    "2023+: GPT-4 and even larger, more efficient models",
]

for milestone in timeline:
    print(f"  {milestone}")

print("\n💡 KEY INSIGHT:")
print("Transformers revolutionized NLP by enabling bidirectional context")
print("understanding and massive scale pre-training with transfer learning!")