# Assignment 2: Bag-of-Words, TF-IDF, and Word2Vec

This notebook demonstrates:
- **Bag-of-Words**: Count occurrence, Normalized count occurrence
- **TF-IDF**: Term Frequency-Inverse Document Frequency
- **Word2Vec**: Word embeddings

In [9]:
# Import required libraries
import nltk
import numpy as np
from collections import Counter

# Download the NLTK data packages this notebook asks for
# (presumably required by nltk.word_tokenize, used in the Word2Vec section).
for resource_name in ("punkt", "punkt_tab"):
    nltk.download(resource_name)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Sample Corpus

In [10]:
# Toy corpus: five short sentences, one document per list entry.
corpus = [
    "Natural language processing is a field of artificial intelligence.",
    "Machine learning and deep learning are subfields of artificial intelligence.",
    "Text processing involves tokenization and stemming.",
    "Word embeddings capture semantic meaning of words.",
    "NLP applications include sentiment analysis and machine translation."
]

# Echo every document with a 1-based label so later outputs can be matched to it.
print("CORPUS:")
for doc_number, sentence in enumerate(corpus, start=1):
    print(f"  Doc {doc_number}: {sentence}")

CORPUS:
  Doc 1: Natural language processing is a field of artificial intelligence.
  Doc 2: Machine learning and deep learning are subfields of artificial intelligence.
  Doc 3: Text processing involves tokenization and stemming.
  Doc 4: Word embeddings capture semantic meaning of words.
  Doc 5: NLP applications include sentiment analysis and machine translation.


## 1. Bag-of-Words: Count Occurrence

In [11]:
# Fit a default CountVectorizer on the corpus to get the document-term counts.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(corpus)

# Pull out the learned feature names and a dense array of the counts
# before printing, so the reporting section below is print-only.
vocabulary = count_vectorizer.get_feature_names_out()
count_array = count_matrix.toarray()

print("Vocabulary:")
print(f"  {list(vocabulary)}")

print("\nCount Occurrence Matrix:")
print(f"  Shape: {count_array.shape}")
print("\n  Document-Term Matrix:")
# One row per document, labelled from 1.
for doc_number, term_counts in enumerate(count_array, start=1):
    print(f"  Doc {doc_number}: {term_counts}")

Vocabulary:
  ['analysis', 'and', 'applications', 'are', 'artificial', 'capture', 'deep', 'embeddings', 'field', 'include', 'intelligence', 'involves', 'is', 'language', 'learning', 'machine', 'meaning', 'natural', 'nlp', 'of', 'processing', 'semantic', 'sentiment', 'stemming', 'subfields', 'text', 'tokenization', 'translation', 'word', 'words']

Count Occurrence Matrix:
  Shape: (5, 30)

  Document-Term Matrix:
  Doc 1: [0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0]
  Doc 2: [0 1 0 1 1 0 1 0 0 0 1 0 0 0 2 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0]
  Doc 3: [0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0]
  Doc 4: [0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1]
  Doc 5: [1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0]


## 2. Bag-of-Words: Normalized Count Occurrence

In [12]:
# Normalize by dividing each count by the total count in that document,
# turning raw counts into per-document relative frequencies.
doc_totals = count_array.sum(axis=1, keepdims=True)

# A document with no counted tokens has a total of 0; a plain `/` would give
# 0/0 = NaN and a RuntimeWarning for that row. Only divide where the total is
# non-zero and leave the remaining rows at the 0.0 they were initialised with.
normalized_count = np.divide(
    count_array,
    doc_totals,
    out=np.zeros(count_array.shape, dtype=float),
    where=doc_totals != 0,
)

print("Normalized Count Occurrence Matrix:")
print(f"  Shape: {normalized_count.shape}")
print("\n  Document-Term Matrix (Normalized):")
# Round only for display; normalized_count itself keeps full precision.
for i, row in enumerate(normalized_count):
    print(f"  Doc {i+1}: {np.round(row, 3)}")

Normalized Count Occurrence Matrix:
  Shape: (5, 30)

  Document-Term Matrix (Normalized):
  Doc 1: [0.    0.    0.    0.    0.125 0.    0.    0.    0.125 0.    0.125 0.
 0.125 0.125 0.    0.    0.    0.125 0.    0.125 0.125 0.    0.    0.
 0.    0.    0.    0.    0.    0.   ]
  Doc 2: [0.  0.1 0.  0.1 0.1 0.  0.1 0.  0.  0.  0.1 0.  0.  0.  0.2 0.1 0.  0.
 0.  0.1 0.  0.  0.  0.  0.1 0.  0.  0.  0.  0. ]
  Doc 3: [0.    0.167 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.167
 0.    0.    0.    0.    0.    0.    0.    0.    0.167 0.    0.    0.167
 0.    0.167 0.167 0.    0.    0.   ]
  Doc 4: [0.    0.    0.    0.    0.    0.143 0.    0.143 0.    0.    0.    0.
 0.    0.    0.    0.    0.143 0.    0.    0.143 0.    0.143 0.    0.
 0.    0.    0.    0.    0.143 0.143]
  Doc 5: [0.125 0.125 0.125 0.    0.    0.    0.    0.    0.    0.125 0.    0.
 0.    0.    0.    0.125 0.    0.    0.125 0.    0.    0.    0.125 0.
 0.    0.    0.    0.125 0.    0.   ]


## 3. TF-IDF (Term Frequency - Inverse Document Frequency)

In [13]:
# Fit a default TfidfVectorizer on the corpus to get TF-IDF weighted features.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

print("Vocabulary:")
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print(f"  {list(tfidf_vocabulary)}")

print("\nTF-IDF Matrix:")
# Dense copy of the matrix so rows can be printed and sorted directly.
tfidf_array = tfidf_matrix.toarray()
print(f"  Shape: {tfidf_array.shape}")
print("\n  Document-Term Matrix (TF-IDF scores):")
for i, row in enumerate(tfidf_array):
    print(f"  Doc {i+1}: {np.round(row, 3)}")

# Show top TF-IDF terms per document
print("\nTop 3 TF-IDF Terms per Document:")
for i, row in enumerate(tfidf_array):
    # argsort is ascending: take the last three indices and reverse them so the
    # highest score comes first.
    top_indices = row.argsort()[-3:][::-1]
    # Zero-score terms are dropped, so a document may list fewer than three.
    # The score is cast to a builtin float because round() on a NumPy scalar
    # returns a NumPy scalar, which NumPy >= 2 prints as `np.float64(0.395)`
    # inside the tuple; the cast keeps the printed list readable.
    top_terms = [
        (tfidf_vocabulary[idx], float(round(row[idx], 3)))
        for idx in top_indices
        if row[idx] > 0
    ]
    print(f"  Doc {i+1}: {top_terms}")

Vocabulary:
  ['analysis', 'and', 'applications', 'are', 'artificial', 'capture', 'deep', 'embeddings', 'field', 'include', 'intelligence', 'involves', 'is', 'language', 'learning', 'machine', 'meaning', 'natural', 'nlp', 'of', 'processing', 'semantic', 'sentiment', 'stemming', 'subfields', 'text', 'tokenization', 'translation', 'word', 'words']

TF-IDF Matrix:
  Shape: (5, 30)

  Document-Term Matrix (TF-IDF scores):
  Doc 1: [0.    0.    0.    0.    0.319 0.    0.    0.    0.395 0.    0.319 0.
 0.395 0.395 0.    0.    0.    0.395 0.    0.265 0.319 0.    0.    0.
 0.    0.    0.    0.    0.    0.   ]
  Doc 2: [0.    0.213 0.    0.319 0.257 0.    0.319 0.    0.    0.    0.257 0.
 0.    0.    0.637 0.257 0.    0.    0.    0.213 0.    0.    0.    0.
 0.319 0.    0.    0.    0.    0.   ]
  Doc 3: [0.    0.297 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.443
 0.    0.    0.    0.    0.    0.    0.    0.    0.357 0.    0.    0.443
 0.    0.443 0.443 0.    0.    0.   ]
  Doc 4: [0

## 4. Word2Vec Embeddings

In [None]:
# Tokenize the corpus for Word2Vec
# Each document is lower-cased and split with NLTK's word tokenizer. The
# recorded output shows punctuation such as '.' kept as its own token, so it
# becomes part of the Word2Vec vocabulary.
tokenized_corpus = [nltk.word_tokenize(doc.lower()) for doc in corpus]

print("Tokenized Corpus:")
for i, tokens in enumerate(tokenized_corpus):
    print(f"  Doc {i+1}: {tokens}")

# Train Word2Vec model
# Training is stochastic. The seed is stated explicitly and training is limited
# to a single worker so that re-running the cell yields the same embeddings;
# with several workers, thread scheduling changes the order of updates.
# NOTE(review): per the gensim docs, reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,  # Embedding dimension
    window=5,         # Context window
    min_count=1,      # Minimum word count
    sg=0,             # 0=CBOW, 1=Skip-gram
    epochs=100,
    seed=1,           # gensim's default seed, made explicit
    workers=1         # Single thread for deterministic training
)

print("\n Word2Vec Model Info:")
print(f"  Vocabulary Size: {len(word2vec_model.wv)}")
print(f"  Embedding Dimension: {word2vec_model.wv.vector_size}")

# Show embeddings for some words
sample_words = ["learning", "intelligence", "processing", "words"]
print("\nSample Word Embeddings (first 10 dimensions):")
for word in sample_words:
    # Skip words that are not in the trained vocabulary instead of raising.
    if word in word2vec_model.wv:
        embedding = word2vec_model.wv[word][:10]
        print(f"  '{word}': {np.round(embedding, 3)}")

# Find similar words
print("\nMost Similar Words:")
# (query word, number of neighbours to report)
similar_words_list = [("learning", 3), ("intelligence", 3), ("processing", 3)]
for word, topn in similar_words_list:
    if word in word2vec_model.wv:
        similar = word2vec_model.wv.most_similar(word, topn=topn)
        print(f"  Similar to '{word}': {similar}")

# Save Word2Vec model
# Written relative to the current working directory.
word2vec_model.save("word2vec_model.model")
print("\nWord2Vec model saved as 'word2vec_model.model'")

Tokenized Corpus:
  Doc 1: ['natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '.']
  Doc 2: ['machine', 'learning', 'and', 'deep', 'learning', 'are', 'subfields', 'of', 'artificial', 'intelligence', '.']
  Doc 3: ['text', 'processing', 'involves', 'tokenization', 'and', 'stemming', '.']
  Doc 4: ['word', 'embeddings', 'capture', 'semantic', 'meaning', 'of', 'words', '.']
  Doc 5: ['nlp', 'applications', 'include', 'sentiment', 'analysis', 'and', 'machine', 'translation', '.']

Word2Vec Model Info:
  Vocabulary Size: 32
  Embedding Dimension: 100

Sample Word Embeddings (first 10 dimensions):
  'learning': [-0.008  0.01   0.    -0.001  0.004 -0.005  0.003  0.008  0.005 -0.008]
  'intelligence': [-0.009  0.002 -0.    -0.009 -0.01  -0.002  0.005  0.004 -0.007 -0.008]
  'processing': [ 0.008 -0.004  0.009  0.009 -0.005 -0.     0.005 -0.003 -0.006 -0.007]
  'words': [ 0.008  0.009  0.001 -0.008  0.008 -0.004  0.006  0.005  0.009 -0.01 ]

Most Simil