In [1]:
import numpy as np
import pandas as pd
from collections import Counter

# 1. Corpus and Documents
documents = [
    "people watch campusx",
    "campusx watch campusx",
    "people write comment",
    "campusx write comment",
]

# List every document under a 1-based label (D1, D2, ...).
print("Documents:")
for label_number, text in enumerate(documents, start=1):
    print(f"D{label_number}: {text}")

# 2. The corpus is all documents concatenated, separated by single spaces.
corpus = " ".join(documents)
print("\nCorpus:", corpus, sep="\n")

# 3. Create vocabulary (unique words)
def create_vocabulary(corpus):
    """Return the distinct whitespace-separated tokens of ``corpus``, sorted.

    Args:
        corpus: Text whose tokens are obtained with ``str.split()``.

    Returns:
        A list of unique tokens in ascending (lexicographic) order.
    """
    unique_tokens = set(corpus.split())
    return sorted(unique_tokens)

# Derive the sorted vocabulary from the corpus; V (its size) is the length
# of every one-hot vector built from this vocabulary.
vocabulary = create_vocabulary(corpus)
V = len(vocabulary)

# Report the vocabulary and its size.
print("\nVocabulary:", vocabulary)
print(f"Vocabulary Size (V): {V}")

# 4. One-Hot Encoding Vectors
def create_one_hot_encoding(word, vocabulary):
    """Return the one-hot vector of ``word`` with respect to ``vocabulary``.

    Args:
        word: Token to encode.
        vocabulary: Ordered sequence of known tokens; a token's position is
            the index of its hot component.

    Returns:
        A list of ``len(vocabulary)`` integers. It contains a single 1 at the
        position of ``word``, or is all zeros when ``word`` is not in the
        vocabulary.
    """
    encoding = [0] * len(vocabulary)

    # Out-of-vocabulary words keep the all-zero vector.
    if word not in vocabulary:
        return encoding

    encoding[vocabulary.index(word)] = 1
    return encoding

# Generate one-hot encodings for each word in vocabulary
# Map every vocabulary word to its one-hot vector (insertion order follows
# the vocabulary order).
one_hot_encodings = {
    term: create_one_hot_encoding(term, vocabulary) for term in vocabulary
}

# Display the word -> vector table.
print("\nOne-Hot Encoding Vectors:")
for word, vector in one_hot_encodings.items():
    print(f"{word}: {vector}")

# 5. Document Vector Representation
def encode_document(doc, vocabulary):
    """Encode a document as a matrix with one one-hot row per known word.

    Args:
        doc: Text to encode; tokens are obtained with ``str.split()``.
        vocabulary: Ordered sequence of known (hashable) tokens; a token's
            position is the column of its hot component.

    Returns:
        An integer ``numpy.ndarray`` of shape
        ``(number_of_known_tokens, len(vocabulary))``. Tokens that are not in
        the vocabulary are skipped. The result is always 2-D, so a document
        without any known token yields shape ``(0, len(vocabulary))`` rather
        than a 1-D empty array.
    """
    # Build the word -> column lookup once instead of scanning the vocabulary
    # for every token. setdefault keeps the first position if a word repeats,
    # matching list.index semantics.
    column_of = {}
    for position, term in enumerate(vocabulary):
        column_of.setdefault(term, position)

    hot_columns = [column_of[token] for token in doc.split() if token in column_of]

    # Allocate with an explicit 2-D shape so the empty case keeps the
    # vocabulary dimension and an integer dtype.
    encoded = np.zeros((len(hot_columns), len(vocabulary)), dtype=int)
    for row, column in enumerate(hot_columns):
        encoded[row, column] = 1
    return encoded

# Encode each document
# Encode all documents first, then report each matrix with its shape.
encoded_documents = [encode_document(text, vocabulary) for text in documents]

for label_number, encoded_doc in enumerate(encoded_documents, start=1):
    print(f"\nDocument D{label_number} One-Hot Encoding:")
    print(encoded_doc)
    print(f"Shape: {encoded_doc.shape}")

# 6. Demonstrate OOV Handling
def handle_oov_document(doc, vocabulary):
    """Split a document's tokens into known and out-of-vocabulary words.

    Args:
        doc: Text to inspect; tokens are obtained with ``str.split()``.
        vocabulary: Collection of known tokens.

    Returns:
        A ``(known_words, unknown_words)`` tuple of lists. Each list keeps
        the tokens in their original document order, duplicates included.
    """
    tokens = doc.split()
    in_vocabulary = [token for token in tokens if token in vocabulary]
    out_of_vocabulary = [token for token in tokens if token not in vocabulary]
    return in_vocabulary, out_of_vocabulary

# Example of handling OOV words
print("\n7. Handling Out-of-Vocabulary (OOV) Words:")

# Partition the tokens of a fresh document into known / unknown words.
new_doc = "hello campusx peoples"
known, unknown = handle_oov_document(new_doc, vocabulary)

for report_line in (
    f"New Document: '{new_doc}'",
    f"Known Words: {known}",
    f"Unknown Words (OOV): {unknown}",
):
    print(report_line)

# Encode the same document; the heading below states that only known words
# contribute rows.
encoded_new_doc = encode_document(new_doc, vocabulary)
print(
    "\nEncoded New Document (only known words are encoded):",
    encoded_new_doc,
    f"Shape: {encoded_new_doc.shape}",
    sep="\n",
)

# 8. Demonstrate Orthogonality - Calculate distances between word vectors
print("\n8. Demonstrating lack of semantic relationships:")

# A three-word toy vocabulary; each word gets a vector with a single 1 at
# its own position in the list.
simple_vocabulary = ["walk", "run", "shoe"]
simple_encodings = {
    term: [1 if slot == position else 0 for slot in range(len(simple_vocabulary))]
    for position, term in enumerate(simple_vocabulary)
}

print("Simple One-Hot Encodings:")
for word, vector in simple_encodings.items():
    print(f"{word}: {vector}")

# Calculate Euclidean distances between vectors
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two numeric sequences.

    Components are paired with ``zip``, so if the sequences differ in length
    the extra trailing components of the longer one are ignored.

    Args:
        vec1: First sequence of numbers.
        vec2: Second sequence of numbers.

    Returns:
        The square root (via ``numpy.sqrt``) of the summed squared
        component-wise differences.
    """
    squared_total = 0
    for left, right in zip(vec1, vec2):
        squared_total += (left - right) ** 2
    return np.sqrt(squared_total)

print("\nEuclidean Distances:")
# Report the distance for every ordered pair of distinct words.
for word1 in simple_vocabulary:
    first_vector = simple_encodings[word1]
    for word2 in simple_vocabulary:
        if word1 == word2:
            continue  # a word is not compared with itself
        dist = euclidean_distance(first_vector, simple_encodings[word2])
        print(f"Distance between '{word1}' and '{word2}': {dist:.2f}")

Documents:
D1: people watch campusx
D2: campusx watch campusx
D3: people write comment
D4: campusx write comment

Corpus:
people watch campusx campusx watch campusx people write comment campusx write comment

Vocabulary: ['campusx', 'comment', 'people', 'watch', 'write']
Vocabulary Size (V): 5

One-Hot Encoding Vectors:
campusx: [1, 0, 0, 0, 0]
comment: [0, 1, 0, 0, 0]
people: [0, 0, 1, 0, 0]
watch: [0, 0, 0, 1, 0]
write: [0, 0, 0, 0, 1]

Document D1 One-Hot Encoding:
[[0 0 1 0 0]
 [0 0 0 1 0]
 [1 0 0 0 0]]
Shape: (3, 5)

Document D2 One-Hot Encoding:
[[1 0 0 0 0]
 [0 0 0 1 0]
 [1 0 0 0 0]]
Shape: (3, 5)

Document D3 One-Hot Encoding:
[[0 0 1 0 0]
 [0 0 0 0 1]
 [0 1 0 0 0]]
Shape: (3, 5)

Document D4 One-Hot Encoding:
[[1 0 0 0 0]
 [0 0 0 0 1]
 [0 1 0 0 0]]
Shape: (3, 5)

7. Handling Out-of-Vocabulary (OOV) Words:
New Document: 'hello campusx peoples'
Known Words: ['campusx']
Unknown Words (OOV): ['hello', 'peoples']

Encoded New Document (only known words are encoded):
[[1 0 0 0 0]]
S