In [None]:
# Import libraries
import spacy
from gensim import corpora
import numpy as np

In [2]:
# Two short example sentences sharing the prefix "cats and dogs are";
# `documents` collects them in order for the cells that iterate over both.
doc1 = "cats and dogs are not allowed"
doc2 = "cats and dogs are antagonistic"
documents = [doc1, doc2]
for label, text in zip(("Document 1:", "Document 2:"), documents):
    print(label, text)

Document 1: cats and dogs are not allowed
Document 2: cats and dogs are antagonistic


In [None]:
# ============ BAG OF WORDS (BoW) MODEL ============

# Tokenization: lowercase each document, then split on whitespace so every
# word becomes its own list element.
doc1_tokens, doc2_tokens = (text.lower().split() for text in (doc1, doc2))
print("Tokenized Document 1:", doc1_tokens)
print("Tokenized Document 2:", doc2_tokens)

In [None]:
# Vocabulary = every distinct token seen in either document, sorted so that
# each word has a stable position in the feature vectors built from it.
vocabulary = sorted(set(doc1_tokens).union(doc2_tokens))
print("\nVocabulary (all unique words):", vocabulary)
print("Vocabulary size:", len(vocabulary))

In [None]:
# Create feature vectors manually - count word occurrences in each document
# Each position in vector corresponds to a word in vocabulary
def create_bow_vector(tokens, vocab):
    """Count how often each vocabulary word occurs in ``tokens``.

    Args:
        tokens: Iterable of hashable tokens (typically lowercase strings).
        vocab: Ordered sequence of vocabulary words; position ``i`` of the
            result holds the count for ``vocab[i]``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that are not in
        ``vocab`` are ignored.
    """
    # Build the word -> position lookup once, so each token costs a single
    # hash lookup instead of two linear scans (``in`` plus ``list.index``).
    # ``setdefault`` keeps the first position when ``vocab`` contains
    # duplicates, which is the position ``list.index`` would have returned.
    position_of = {}
    for position, word in enumerate(vocab):
        position_of.setdefault(word, position)

    vector = [0] * len(vocab)
    for token in tokens:
        position = position_of.get(token)
        if position is not None:
            vector[position] += 1
    return vector

# Vectorize both documents against the shared vocabulary.
vector1, vector2 = (
    create_bow_vector(tokens, vocabulary)
    for tokens in (doc1_tokens, doc2_tokens)
)

print("\nFeature Vector for Document 1:")
print(vector1)
print("\nFeature Vector for Document 2:")
print(vector2)

# Pair each vocabulary word with its count and list only the words that
# actually occur in Document 1.
print("\nWord-to-count mapping for Doc1:")
for word, count in zip(vocabulary, vector1):
    if count <= 0:
        continue
    print(f"  {word}: {count}")

In [None]:
# ============ GENSIM BoW MODEL ============

# A Gensim Dictionary assigns an integer ID to every unique token found in
# the tokenized documents it is given.
tokenized_docs = [doc1_tokens, doc2_tokens]
dictionary = corpora.Dictionary(documents=tokenized_docs)

print("\nGensim Dictionary (word -> ID mapping):")
print(dictionary.token2id)
print(f"\nDictionary size: {len(dictionary)}")

In [None]:
# doc2bow turns a token list into a sparse bag of words: a list of
# (word_id, frequency) pairs using the IDs assigned by `dictionary`.
bow_doc1, bow_doc2 = map(dictionary.doc2bow, (doc1_tokens, doc2_tokens))

print("\ndoc2bow for Document 1 (word_id, count):")
print(bow_doc1)
print("\ndoc2bow for Document 2 (word_id, count):")
print(bow_doc2)

# Translate the IDs back into words so the pairs are human-readable.
print("\nDecoded BoW for Document 1:")
for word_id, freq in bow_doc1:
    decoded_word = dictionary[word_id]
    print(f"  {decoded_word}: {freq}")

In [None]:
# ============ N-GRAMS MODEL ============

from collections import defaultdict

# Generate n-grams - consecutive sequences of n words
def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens as a tuple, in order.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of words).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``len(tokens) - n + 1`` tuples, or an empty list when
        ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check, ``n == 0``
            yields a list of empty tuples and a negative ``n`` yields slices
            that are not n-grams at all.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

# Bigrams (2-grams): every pair of adjacent words in each document
bigrams_doc1, bigrams_doc2 = (
    generate_ngrams(tokens, 2) for tokens in (doc1_tokens, doc2_tokens)
)

print("\n========== BIGRAMS (2-grams) ==========")
print("Document 1 bigrams:", bigrams_doc1)
print("Document 2 bigrams:", bigrams_doc2)

In [None]:
# Trigrams (3-grams): every run of three adjacent words in each document
trigrams_doc1, trigrams_doc2 = (
    generate_ngrams(tokens, 3) for tokens in (doc1_tokens, doc2_tokens)
)

print("\n========== TRIGRAMS (3-grams) ==========")
print("Document 1 trigrams:", trigrams_doc1)
print("Document 2 trigrams:", trigrams_doc2)

In [None]:
# ============ NEXT WORD PREDICTION USING N-GRAMS ============

# Build n-gram language model - stores frequency of each n-gram
def build_ngram_model(documents, n):
    """Tally how many times each n-gram appears across ``documents``.

    Args:
        documents: Iterable of raw text strings; each one is lowercased and
            split on whitespace before n-grams are extracted.
        n: Number of words per n-gram, passed through to ``generate_ngrams``.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram tuple to its total count.
    """
    counts = defaultdict(int)
    for text in documents:
        for gram in generate_ngrams(text.lower().split(), n):
            counts[gram] += 1
    return counts

# Build conditional probability model for next word prediction
def build_next_word_model(documents, n):
    """Build a lookup from an (n-1)-word context to counts of the word after it.

    Args:
        documents: Iterable of raw text strings; each one is lowercased and
            split on whitespace before n-grams are extracted.
        n: N-gram size. The first ``n - 1`` words of every n-gram form the
            context and the last word is the observed follower.

    Returns:
        A nested ``defaultdict``: ``model[context_tuple][next_word]`` is the
        number of times ``next_word`` followed ``context_tuple``.
    """
    transitions = defaultdict(lambda: defaultdict(int))
    for text in documents:
        words = text.lower().split()
        for gram in generate_ngrams(words, n):
            # Everything but the last word is the context; the last word is
            # what was observed to follow it.
            prefix, follower = gram[:-1], gram[-1]
            transitions[prefix][follower] += 1
    return transitions

# Fit the next-word model on word pairs (n=2), so every context key is a
# one-word tuple.
bigram_model = build_next_word_model(documents, 2)

print("\n========== NEXT WORD PREDICTION MODEL ==========")
print("\nBigram Model (context -> next_word: count):")
for context in bigram_model:
    next_words = bigram_model[context]
    print(f"\nAfter '{' '.join(context)}':")
    for word, count in next_words.items():
        print(f"  -> '{word}': {count} time(s)")

In [None]:
# Predict next word given context - returns most likely next word
def predict_next_word(context, model):
    """Return the most frequent follower of ``context`` according to ``model``.

    Args:
        context: Space-separated context words. The string is lowercased and
            split on whitespace, so it must contain exactly as many words as
            the model's context tuples for a match to be possible.
        model: Mapping of context tuple -> mapping of next word -> count, as
            returned by ``build_next_word_model``.

    Returns:
        ``(word, count)`` for the follower with the highest count (the first
        one in iteration order on ties), or
        ``(None, "Context not found in model")`` when the context is unknown
        or has no recorded followers.
    """
    context_tuple = tuple(context.lower().split())

    # The truthiness test also rejects a context whose follower mapping is
    # empty. Such entries appear when a nested-defaultdict model was indexed
    # with a missing key, and max() over an empty mapping raises ValueError.
    next_words = model.get(context_tuple)
    if not next_words:
        return None, "Context not found in model"

    # Highest count wins.
    predicted_word = max(next_words, key=next_words.get)
    return predicted_word, next_words[predicted_word]

# Test predictions
print("\n========== NEXT WORD PREDICTIONS ==========")

# bigram_model is built with n=2, so its keys are ONE-word contexts.
# Two-word prompts such as "cats and" can never match a key and would all
# report "Context not found in model".
test_contexts = ["cats", "and", "dogs", "are"]

for context in test_contexts:
    predicted, info = predict_next_word(context, bigram_model)
    if predicted:
        print(f"\nContext: '{context}'")
        print(f"Predicted next word: '{predicted}' (appeared {info} time(s))")
    else:
        # On failure `info` carries the explanatory message instead of a count.
        print(f"\nContext: '{context}' - {info}")

In [None]:
# Build trigram model for more context-aware predictions
trigram_model = build_next_word_model(documents, 3)

print("\n========== TRIGRAM PREDICTIONS (2-word context) ==========")
print("\nTrigram Model:")
for context, next_words in trigram_model.items():
    print(f"\nAfter '{' '.join(context)}':")
    for word, count in next_words.items():
        print(f"  -> '{word}': {count} time(s)")

# Test with 2-word contexts. trigram_model is built with n=3, so its keys are
# the TWO words preceding the predicted one; three-word prompts never match.
test_contexts_trigram = ["cats and", "and dogs", "dogs are"]

print("\nPredictions:")
for context in test_contexts_trigram:
    predicted, info = predict_next_word(context, trigram_model)
    if predicted:
        print(f"\nContext: '{context}'")
        print(f"Predicted next word: '{predicted}'")
    else:
        # Report misses instead of silently printing nothing, matching the
        # bigram prediction cell.
        print(f"\nContext: '{context}' - {info}")

In [None]:
# ============ USING SPACY FOR BoW ============

# Load the small English spaCy pipeline and use it as the tokenizer
nlp = spacy.load("en_core_web_sm")

# Run both documents through the pipeline
doc1_spacy, doc2_spacy = nlp(doc1), nlp(doc2)

# Keep the lowercased text of every token that is not punctuation.
# Note: stopwords are NOT removed here; only `is_punct` is checked.
doc1_tokens_spacy = [tok.text.lower() for tok in doc1_spacy if not tok.is_punct]
doc2_tokens_spacy = [tok.text.lower() for tok in doc2_spacy if not tok.is_punct]

print("\n========== SPACY PROCESSING ==========")
print("\nSpacy tokens Doc1:", doc1_tokens_spacy)
print("Spacy tokens Doc2:", doc2_tokens_spacy)

# Sorted set of all spaCy tokens across both documents
vocab_spacy = sorted(set(doc1_tokens_spacy).union(doc2_tokens_spacy))
print("\nSpacy Vocabulary:", vocab_spacy)

# Count-vectorize the spaCy tokens against that vocabulary
vector1_spacy, vector2_spacy = (
    create_bow_vector(tokens, vocab_spacy)
    for tokens in (doc1_tokens_spacy, doc2_tokens_spacy)
)

print("\nSpacy BoW Vector Doc1:", vector1_spacy)
print("Spacy BoW Vector Doc2:", vector2_spacy)

In [None]:
# ============ WORD2VEC DEMONSTRATION ============

from gensim.models import Word2Vec

# Prepare training data - list of tokenized sentences
training_data = [doc1_tokens, doc2_tokens]

# Train Word2Vec model - creates dense vector representations for words
# vector_size: dimension of word vectors
# window: maximum distance between current and predicted word
# min_count: ignores words with frequency less than this
# sg: 0 for CBOW, 1 for Skip-gram
# seed / workers: a fixed seed plus a single worker thread keeps training
#   deterministic within one interpreter session (multi-threaded training
#   introduces ordering jitter). Per the gensim docs, reproducibility across
#   interpreter launches additionally requires setting PYTHONHASHSEED.
word2vec_model = Word2Vec(
    sentences=training_data,
    vector_size=10,
    window=2,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)

print("\n========== WORD2VEC ==========")
print("\nWord2Vec Vocabulary:", list(word2vec_model.wv.index_to_key))

# Get vector representation for a word
print("\nVector for 'cats':", word2vec_model.wv['cats'])
print("Vector for 'dogs':", word2vec_model.wv['dogs'])

# Report the embedding width. Similarity queries are skipped on purpose:
# with only two tiny sentences the results would not be meaningful.
print("\nVector dimensions:", word2vec_model.wv.vector_size)

In [None]:
# ============ DOC2VEC DEMONSTRATION ============

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Prepare tagged documents - each document needs a unique tag/ID
tagged_docs = [
    TaggedDocument(words=doc1_tokens, tags=['DOC1']),
    TaggedDocument(words=doc2_tokens, tags=['DOC2'])
]

# Train Doc2Vec model - creates vector representations for entire documents
# vector_size: dimension of document vectors
# min_count: ignores words with frequency less than this
# epochs: number of training iterations
# seed / workers: a fixed seed plus a single worker thread keeps training
#   deterministic within one interpreter session, so the similarity printed
#   below does not drift between runs. Per the gensim docs, reproducibility
#   across interpreter launches additionally requires setting PYTHONHASHSEED.
doc2vec_model = Doc2Vec(
    tagged_docs,
    vector_size=10,
    min_count=1,
    epochs=40,
    seed=1,
    workers=1,
)

print("\n========== DOC2VEC ==========")
print("\nDoc2Vec Vocabulary:", list(doc2vec_model.wv.index_to_key))

# Get vector representation for documents
doc1_vector = doc2vec_model.dv['DOC1']
doc2_vector = doc2vec_model.dv['DOC2']

print("\nDocument 1 vector:", doc1_vector)
print("Document 2 vector:", doc2_vector)

# Calculate similarity between documents using cosine similarity:
# dot product of the two vectors divided by the product of their lengths
from numpy import dot
from numpy.linalg import norm

cosine_sim = dot(doc1_vector, doc2_vector) / (norm(doc1_vector) * norm(doc2_vector))
print(f"\nCosine similarity between documents: {cosine_sim:.4f}")

In [None]:
# ============ SUMMARY COMPARISON ============

# Each technique's recap is emitted by one print call; sep="\n" puts every
# argument on its own line.
print("\n========== SUMMARY ==========")
print(
    "\n1. BAG OF WORDS (BoW):",
    "   - Represents documents as word frequency vectors",
    "   - Ignores word order and context",
    "   - Simple and fast",
    f"   - Doc1 vector: {vector1}",
    f"   - Doc2 vector: {vector2}",
    sep="\n",
)

print(
    "\n2. N-GRAMS:",
    "   - Captures word sequences and local context",
    "   - Bigrams capture 2-word patterns",
    "   - Trigrams capture 3-word patterns",
    f"   - Doc1 bigrams: {bigrams_doc1}",
    "   - Enables next word prediction",
    sep="\n",
)

print(
    "\n3. NEXT WORD PREDICTION:",
    "   - Uses n-gram frequencies to predict next word",
    "   - Example: 'cats and' -> 'dogs' (from training data)",
    sep="\n",
)

print(
    "\n4. WORD2VEC:",
    "   - Dense vector representation of words",
    "   - Captures semantic relationships",
    f"   - Vector size: {word2vec_model.wv.vector_size}",
    sep="\n",
)

print(
    "\n5. DOC2VEC:",
    "   - Vector representation of entire documents",
    "   - Useful for document similarity",
    f"   - Document similarity: {cosine_sim:.4f}",
    sep="\n",
)