In [14]:
# Import libraries
import spacy
from gensim import corpora
import numpy as np

In [15]:
# Sample documents
doc1 = "cats and dogs are not allowed"
doc2 = "cats and dogs are antagonistic"
documents = [doc1, doc2]
print("Document 1:", doc1)
print("Document 2:", doc2)

Document 1: cats and dogs are not allowed
Document 2: cats and dogs are antagonistic


In [16]:
# ============ BAG OF WORDS (BoW) MODEL ============

# Tokenize documents - break text into individual words
doc1_tokens = doc1.lower().split()
doc2_tokens = doc2.lower().split()
print("Tokenized Document 1:", doc1_tokens)
print("Tokenized Document 2:", doc2_tokens)

Tokenized Document 1: ['cats', 'and', 'dogs', 'are', 'not', 'allowed']
Tokenized Document 2: ['cats', 'and', 'dogs', 'are', 'antagonistic']


In [17]:
# Create vocabulary - unique words across all documents
vocabulary = sorted(set(doc1_tokens + doc2_tokens))
print("\nVocabulary (all unique words):", vocabulary)
print("Vocabulary size:", len(vocabulary))


Vocabulary (all unique words): ['allowed', 'and', 'antagonistic', 'are', 'cats', 'dogs', 'not']
Vocabulary size: 7


In [18]:
# Create feature vectors manually - count word occurrences in each document
# Each position in vector corresponds to a word in vocabulary
def create_bow_vector(tokens, vocab):
    # Initialize vector with zeros for each word in vocabulary
    vector = [0] * len(vocab)
    for token in tokens:
        if token in vocab:
            idx = vocab.index(token)
            vector[idx] += 1
    return vector

vector1 = create_bow_vector(doc1_tokens, vocabulary)
vector2 = create_bow_vector(doc2_tokens, vocabulary)

print("\nFeature Vector for Document 1:")
print(vector1)
print("\nFeature Vector for Document 2:")
print(vector2)

# Display word-to-count mapping
print("\nWord-to-count mapping for Doc1:")
for word, count in zip(vocabulary, vector1):
    if count > 0:
        print(f"  {word}: {count}")


Feature Vector for Document 1:
[1, 1, 0, 1, 1, 1, 1]

Feature Vector for Document 2:
[0, 1, 1, 1, 1, 1, 0]

Word-to-count mapping for Doc1:
  allowed: 1
  and: 1
  are: 1
  cats: 1
  dogs: 1
  not: 1


In [19]:
# ============ GENSIM BoW MODEL ============

# Create Gensim Dictionary - maps each unique word to an ID
tokenized_docs = [doc1_tokens, doc2_tokens]
dictionary = corpora.Dictionary(tokenized_docs)

print("\nGensim Dictionary (word -> ID mapping):")
print(dictionary.token2id)
print(f"\nDictionary size: {len(dictionary)}")


Gensim Dictionary (word -> ID mapping):
{'allowed': 0, 'and': 1, 'are': 2, 'cats': 3, 'dogs': 4, 'not': 5, 'antagonistic': 6}

Dictionary size: 7


In [22]:
# doc2bow - converts document to bag of words representation (word_id, frequency)
bow_doc1 = dictionary.doc2bow(doc1_tokens)
bow_doc2 = dictionary.doc2bow(doc2_tokens)

print("\ndoc2bow for Document 1 (word_id, count):")
print(bow_doc1)
print("\ndoc2bow for Document 2 (word_id, count):")
print(bow_doc2)

# Gensim Function 2: doc2idx - map tokens to dictionary IDs (-1 for OOV)
doc1_ids = dictionary.doc2idx(doc1_tokens)
doc2_ids = dictionary.doc2idx(doc2_tokens)
print("\ndoc2idx for Document 1:", doc1_ids)
print("doc2idx for Document 2:", doc2_ids)

# Gensim Function 3: add_documents - update dictionary with new text
extra_docs = [["cats", "and", "birds", "coexist"]]
dictionary_updated = corpora.Dictionary(tokenized_docs)
dictionary_updated.add_documents(extra_docs)
print("\nUpdated dictionary size after add_documents:", len(dictionary_updated))
print("Updated token2id:", dictionary_updated.token2id)

# Decode to show which words the IDs represent
print("\nDecoded BoW for Document 1:")
for word_id, freq in bow_doc1:
    print(f"  {dictionary[word_id]}: {freq}")


doc2bow for Document 1 (word_id, count):
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]

doc2bow for Document 2 (word_id, count):
[(1, 1), (2, 1), (3, 1), (4, 1), (6, 1)]

doc2idx for Document 1: [3, 1, 4, 2, 5, 0]
doc2idx for Document 2: [3, 1, 4, 2, 6]

Updated dictionary size after add_documents: 9
Updated token2id: {'allowed': 0, 'and': 1, 'are': 2, 'cats': 3, 'dogs': 4, 'not': 5, 'antagonistic': 6, 'birds': 7, 'coexist': 8}

Decoded BoW for Document 1:
  allowed: 1
  and: 1
  are: 1
  cats: 1
  dogs: 1
  not: 1


In [21]:
# ============ N-GRAMS MODEL ============

from collections import defaultdict

# Generate n-grams - consecutive sequences of n words
def generate_ngrams(tokens, n):
    # Returns list of n-grams as tuples
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(tuple(tokens[i:i+n]))
    return ngrams

# Bigrams (2-grams)
bigrams_doc1 = generate_ngrams(doc1_tokens, 2)
bigrams_doc2 = generate_ngrams(doc2_tokens, 2)

print("\n========== BIGRAMS (2-grams) ==========")
print("Document 1 bigrams:", bigrams_doc1)
print("Document 2 bigrams:", bigrams_doc2)


Document 1 bigrams: [('cats', 'and'), ('and', 'dogs'), ('dogs', 'are'), ('are', 'not'), ('not', 'allowed')]
Document 2 bigrams: [('cats', 'and'), ('and', 'dogs'), ('dogs', 'are'), ('are', 'antagonistic')]


In [23]:
# Trigrams (3-grams)
trigrams_doc1 = generate_ngrams(doc1_tokens, 3)
trigrams_doc2 = generate_ngrams(doc2_tokens, 3)

print("\n========== TRIGRAMS (3-grams) ==========")
print("Document 1 trigrams:", trigrams_doc1)
print("Document 2 trigrams:", trigrams_doc2)


Document 1 trigrams: [('cats', 'and', 'dogs'), ('and', 'dogs', 'are'), ('dogs', 'are', 'not'), ('are', 'not', 'allowed')]
Document 2 trigrams: [('cats', 'and', 'dogs'), ('and', 'dogs', 'are'), ('dogs', 'are', 'antagonistic')]


In [24]:
# ============ NEXT WORD PREDICTION USING N-GRAMS ============

# Build n-gram language model - stores frequency of each n-gram
def build_ngram_model(documents, n):
    model = defaultdict(int)
    for doc in documents:
        tokens = doc.lower().split()
        ngrams = generate_ngrams(tokens, n)
        for ngram in ngrams:
            model[ngram] += 1
    return model

# Build conditional probability model for next word prediction
def build_next_word_model(documents, n):
    next_word_model = defaultdict(lambda: defaultdict(int))

    for doc in documents:
        tokens = doc.lower().split()
        ngrams = generate_ngrams(tokens, n)

        for ngram in ngrams:
            context = ngram[:-1]
            next_word = ngram[-1]
            next_word_model[context][next_word] += 1

    return next_word_model

# Convert counts to conditional probabilities P(next_word | context)
def build_next_word_probabilities(next_word_model):
    prob_model = {}
    for context, next_words in next_word_model.items():
        total = sum(next_words.values())
        prob_model[context] = {word: count / total for word, count in next_words.items()}
    return prob_model

# Build bigram model and probabilities
bigram_model = build_next_word_model(documents, 2)
bigram_prob_model = build_next_word_probabilities(bigram_model)

print("\n========== NEXT WORD PREDICTION MODEL ==========")
print("\nBigram Model (context -> next_word: count):")
for context, next_words in bigram_model.items():
    print(f"\nAfter '{' '.join(context)}':")
    for word, count in next_words.items():
        print(f"  -> '{word}': {count} time(s)")

print("\n========== BIGRAM CONDITIONAL PROBABILITIES ==========")
for context, probs in bigram_prob_model.items():
    print(f"\nP(next_word | {' '.join(context)}):")
    for word, prob in probs.items():
        print(f"  P({word} | {' '.join(context)}) = {prob:.3f}")



Bigram Model (context -> next_word: count):

After 'cats':
  -> 'and': 2 time(s)

After 'and':
  -> 'dogs': 2 time(s)

After 'dogs':
  -> 'are': 2 time(s)

After 'are':
  -> 'not': 1 time(s)
  -> 'antagonistic': 1 time(s)

After 'not':
  -> 'allowed': 1 time(s)


P(next_word | cats):
  P(and | cats) = 1.000

P(next_word | and):
  P(dogs | and) = 1.000

P(next_word | dogs):
  P(are | dogs) = 1.000

P(next_word | are):
  P(not | are) = 0.500
  P(antagonistic | are) = 0.500

P(next_word | not):
  P(allowed | not) = 1.000


In [26]:
# Predict next word given context - returns most likely next word and probability
def predict_next_word(context, count_model, prob_model):
    context_tuple = tuple(context.lower().split())

    if context_tuple not in count_model:
        return None, None, "Context not found in model"

    next_words = count_model[context_tuple]
    predicted_word = max(next_words, key=next_words.get)
    count = next_words[predicted_word]
    probability = prob_model[context_tuple][predicted_word]

    return predicted_word, probability, count

print("\n========== NEXT WORD PREDICTIONS ==========")

# For bigram model, context length must be 1 word
test_contexts = ["cats", "and", "dogs", "are"]

for context in test_contexts:
    predicted, prob, count = predict_next_word(context, bigram_model, bigram_prob_model)
    if predicted:
        print(f"\nContext: '{context}'")
        print(f"Predicted next word: '{predicted}'")
        print(f"Count: {count}, Probability: {prob:.3f}")
    else:
        print(f"\nContext: '{context}' - {count}")



Context: 'cats'
Predicted next word: 'and'
Count: 2, Probability: 1.000

Context: 'and'
Predicted next word: 'dogs'
Count: 2, Probability: 1.000

Context: 'dogs'
Predicted next word: 'are'
Count: 2, Probability: 1.000

Context: 'are'
Predicted next word: 'not'
Count: 1, Probability: 0.500


In [27]:
# Build trigram model for more context-aware predictions
trigram_model = build_next_word_model(documents, 3)
trigram_prob_model = build_next_word_probabilities(trigram_model)

print("\n========== TRIGRAM PREDICTIONS (2-word context) ==========")
print("\nTrigram Model:")
for context, next_words in trigram_model.items():
    print(f"\nAfter '{' '.join(context)}':")
    for word, count in next_words.items():
        print(f"  -> '{word}': {count} time(s)")

print("\nTrigram Conditional Probabilities:")
for context, probs in trigram_prob_model.items():
    for word, prob in probs.items():
        print(f"P({word} | {' '.join(context)}) = {prob:.3f}")

# Test with 3-word contexts for trigram model
test_contexts_trigram = ["cats and", "and dogs", "dogs are"]

print("\nPredictions:")
for context in test_contexts_trigram:
    predicted, prob, count = predict_next_word(context, trigram_model, trigram_prob_model)
    if predicted:
        print(f"\nContext: '{context}'")
        print(f"Predicted next word: '{predicted}'")
        print(f"Count: {count}, Probability: {prob:.3f}")
    else:
        print(f"\nContext: '{context}' - {count}")



Trigram Model:

After 'cats and':
  -> 'dogs': 2 time(s)

After 'and dogs':
  -> 'are': 2 time(s)

After 'dogs are':
  -> 'not': 1 time(s)
  -> 'antagonistic': 1 time(s)

After 'are not':
  -> 'allowed': 1 time(s)

Trigram Conditional Probabilities:
P(dogs | cats and) = 1.000
P(are | and dogs) = 1.000
P(not | dogs are) = 0.500
P(antagonistic | dogs are) = 0.500
P(allowed | are not) = 1.000

Predictions:

Context: 'cats and'
Predicted next word: 'dogs'
Count: 2, Probability: 1.000

Context: 'and dogs'
Predicted next word: 'are'
Count: 2, Probability: 1.000

Context: 'dogs are'
Predicted next word: 'not'
Count: 1, Probability: 0.500


In [32]:
# ============ USING SPACY FOR BoW ============

# Load spaCy model for text processing (with graceful fallback)
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    try:
        from spacy.cli import download
        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
    except Exception:
        nlp = spacy.blank("en")

# Process documents with spaCy
doc1_spacy = nlp(doc1)
doc2_spacy = nlp(doc2)

# spaCy Function 1: Tokenization
doc1_tokens_spacy = [token.text.lower() for token in doc1_spacy if not token.is_punct]
doc2_tokens_spacy = [token.text.lower() for token in doc2_spacy if not token.is_punct]

print("\n========== SPACY PROCESSING ==========")
print("\nSpacy tokens Doc1:", doc1_tokens_spacy)
print("Spacy tokens Doc2:", doc2_tokens_spacy)

# spaCy Function 2: Lemmatization (token.lemma_)
lemmas_doc1 = [token.lemma_ if token.lemma_ else token.text.lower() for token in doc1_spacy if not token.is_punct]
lemmas_doc2 = [token.lemma_ if token.lemma_ else token.text.lower() for token in doc2_spacy if not token.is_punct]
print("\nSpacy lemmas Doc1:", lemmas_doc1)
print("Spacy lemmas Doc2:", lemmas_doc2)

# spaCy Function 3: POS tagging (token.pos_)
pos_doc1 = [(token.text, token.pos_ if token.pos_ else 'N/A') for token in doc1_spacy if not token.is_punct]
pos_doc2 = [(token.text, token.pos_ if token.pos_ else 'N/A') for token in doc2_spacy if not token.is_punct]
print("\nSpacy POS tags Doc1:", pos_doc1)
print("Spacy POS tags Doc2:", pos_doc2)

# Create vocabulary from spaCy tokens
vocab_spacy = sorted(set(doc1_tokens_spacy + doc2_tokens_spacy))
print("\nSpacy Vocabulary:", vocab_spacy)

# Create BoW vectors
vector1_spacy = create_bow_vector(doc1_tokens_spacy, vocab_spacy)
vector2_spacy = create_bow_vector(doc2_tokens_spacy, vocab_spacy)

print("\nSpacy BoW Vector Doc1:", vector1_spacy)
print("Spacy BoW Vector Doc2:", vector2_spacy)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Spacy tokens Doc1: ['cats', 'and', 'dogs', 'are', 'not', 'allowed']
Spacy tokens Doc2: ['cats', 'and', 'dogs', 'are', 'antagonistic']

Spacy lemmas Doc1: ['cat', 'and', 'dog', 'be', 'not', 'allow']
Spacy lemmas Doc2: ['cat', 'and', 'dog', 'be', 'antagonistic']

Spacy POS tags Doc1: [('cats', 'NOUN'), ('and', 'CCONJ'), ('dogs', 'NOUN'), ('are', 'AUX'), ('not', 'PART'), ('allowed', 'VERB')]
Spacy POS tags Doc2: [('cats', 'NOUN'), ('and', 'CCONJ'), ('dogs', 'NOUN'), ('are', 'AUX'), ('antagonistic', 'ADJ')]

Spacy Vocabulary: ['allowed', 'and', 'antagonistic', 'are', 'cats', 'dogs', 'not']

Spacy BoW Vector Doc1: [1, 1, 0, 1, 1, 1, 

In [29]:
# ============ WORD2VEC DEMONSTRATION ============

from gensim.models import Word2Vec

# Prepare training data - list of tokenized sentences
training_data = [doc1_tokens, doc2_tokens]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=training_data, vector_size=10, window=2, min_count=1, sg=0)

print("\n========== WORD2VEC ==========")
print("\nWord2Vec Vocabulary:", list(word2vec_model.wv.index_to_key))

# Get vector representation for words
print("\nVector for 'cats':", word2vec_model.wv['cats'])
print("Vector for 'dogs':", word2vec_model.wv['dogs'])

# Additional Gensim functions on Word2Vec vectors
print("\nSimilarity(cats, dogs):", round(word2vec_model.wv.similarity('cats', 'dogs'), 4))
print("Most similar to 'cats':", word2vec_model.wv.most_similar('cats', topn=3))

print("\nVector dimensions:", word2vec_model.wv.vector_size)



Word2Vec Vocabulary: ['are', 'dogs', 'and', 'cats', 'antagonistic', 'allowed', 'not']

Vector for 'cats': [-0.07511582 -0.00930042  0.09538119 -0.07319167 -0.02333769 -0.01937741
  0.08077437 -0.05930896  0.00045162 -0.04753734]
Vector for 'dogs': [ 0.07380552 -0.01533481 -0.0453664   0.06554095 -0.0486019  -0.0181603
  0.02876598  0.00991878 -0.08285265 -0.09448878]

Similarity(cats, dogs): -0.2113
Most similar to 'cats': [('are', 0.10494355112314224), ('allowed', 0.09267307072877884), ('and', -0.1055101752281189)]

Vector dimensions: 10


In [30]:
# ============ DOC2VEC DEMONSTRATION ============

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Prepare tagged documents - each document needs a unique tag/ID
tagged_docs = [
    TaggedDocument(words=doc1_tokens, tags=['DOC1']),
    TaggedDocument(words=doc2_tokens, tags=['DOC2'])
]

# Train Doc2Vec model - creates vector representations for entire documents
# vector_size: dimension of document vectors
# min_count: ignores words with frequency less than this
# epochs: number of training iterations
doc2vec_model = Doc2Vec(tagged_docs, vector_size=10, min_count=1, epochs=40)

print("\n========== DOC2VEC ==========")
print("\nDoc2Vec Vocabulary:", list(doc2vec_model.wv.index_to_key))

# Get vector representation for documents
doc1_vector = doc2vec_model.dv['DOC1']
doc2_vector = doc2vec_model.dv['DOC2']

print("\nDocument 1 vector:", doc1_vector)
print("Document 2 vector:", doc2_vector)

# Calculate similarity between documents using cosine similarity
from numpy import dot
from numpy.linalg import norm

cosine_sim = dot(doc1_vector, doc2_vector) / (norm(doc1_vector) * norm(doc2_vector))
print(f"\nCosine similarity between documents: {cosine_sim:.4f}")



Doc2Vec Vocabulary: ['are', 'dogs', 'and', 'cats', 'antagonistic', 'allowed', 'not']

Document 1 vector: [-0.05262421 -0.06005096 -0.09982104  0.08610007  0.0357652   0.00221071
 -0.09950028 -0.05158884 -0.09844406  0.02011195]
Document 2 vector: [ 0.02822869  0.04650216 -0.04359442 -0.03130666 -0.03087194 -0.08773335
  0.02148127  0.09267059 -0.09591305 -0.03467903]

Cosine similarity between documents: -0.0520


In [31]:
# ============ SUMMARY COMPARISON ============

print("\n========== SUMMARY ==========")
print("\n1. BAG OF WORDS (BoW):")
print("   - Represents documents as word frequency vectors")
print("   - Includes manual BoW and Gensim Dictionary-based BoW")
print(f"   - Doc1 vector: {vector1}")
print(f"   - Doc2 vector: {vector2}")

print("\n2. DICTIONARY + DOC2BOW (GENSIM):")
print("   - Dictionary created using corpora.Dictionary")
print("   - doc2bow converts text to (word_id, frequency)")
print(f"   - Dictionary mapping: {dictionary.token2id}")
print(f"   - Doc1 doc2bow: {bow_doc1}")

print("\n3. GENSIM FUNCTIONS DEMONSTRATED (>=3):")
print("   - dictionary.doc2bow()")
print("   - dictionary.doc2idx()")
print("   - dictionary.add_documents()")
print("   - word2vec_model.wv.similarity(), wv.most_similar()")

print("\n4. SPACY FUNCTIONS DEMONSTRATED (>=3):")
print("   - Tokenization (token.text)")
print("   - Lemmatization (token.lemma_)")
print("   - POS tagging (token.pos_)")

print("\n5. N-GRAMS WITH PROBABILITIES:")
print("   - Bigrams and trigrams generated")
print("   - Conditional probabilities computed: P(next_word | context)")
print("   - Used for next-word prediction")

print("\n6. WORD2VEC AND DOC2VEC:")
print("   - Word2Vec creates dense word embeddings")
print("   - Doc2Vec creates document embeddings")
print(f"   - Word2Vec vector size: {word2vec_model.wv.vector_size}")
print(f"   - Document similarity: {cosine_sim:.4f}")



1. BAG OF WORDS (BoW):
   - Represents documents as word frequency vectors
   - Includes manual BoW and Gensim Dictionary-based BoW
   - Doc1 vector: [1, 1, 0, 1, 1, 1, 1]
   - Doc2 vector: [0, 1, 1, 1, 1, 1, 0]

2. DICTIONARY + DOC2BOW (GENSIM):
   - Dictionary created using corpora.Dictionary
   - doc2bow converts text to (word_id, frequency)
   - Dictionary mapping: {'allowed': 0, 'and': 1, 'are': 2, 'cats': 3, 'dogs': 4, 'not': 5, 'antagonistic': 6}
   - Doc1 doc2bow: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]

3. GENSIM FUNCTIONS DEMONSTRATED (>=3):
   - dictionary.doc2bow()
   - dictionary.doc2idx()
   - dictionary.add_documents()
   - word2vec_model.wv.similarity(), wv.most_similar()

4. SPACY FUNCTIONS DEMONSTRATED (>=3):
   - Tokenization (token.text)
   - Lemmatization (token.lemma_)
   - POS tagging (token.pos_)

5. N-GRAMS WITH PROBABILITIES:
   - Bigrams and trigrams generated
   - Conditional probabilities computed: P(next_word | context)
   - Used for next-word p