### One Hot Encoding:

In [1]:
# Step 1: Define the documents
# Toy corpus: four short sentences whose words are separated by single spaces.
d1, d2, d3, d4 = (
    "people watch campusx",
    "campusx watch campusx",
    "people write comment",
    "campusx write comment",
)

# Keep the documents in a list so later steps can iterate over them in order.
documents = [d1, d2, d3, d4]

# Print a header line followed by one "d<n>: <text>" line per document (1-based).
print(
    "Documents:",
    *(f"d{i}: {doc}" for i, doc in enumerate(documents, 1)),
    sep="\n",
)

Documents:
d1: people watch campusx
d2: campusx watch campusx
d3: people write comment
d4: campusx write comment


In [2]:
# Step 2: Build Vocabulary (Unique Words)

# Collect every distinct whitespace-separated token across all documents,
# then sort the result so the vocabulary order is the same on every run.
vocab = sorted({word for doc in documents for word in doc.split()})
print("Vocabulary:", vocab)


Vocabulary: ['campusx', 'comment', 'people', 'watch', 'write']


In [3]:
# Step 3: Create One Hot Encoding Manually

import numpy as np
import pandas as pd

# Build a binary document-term matrix: one row per document, one column per
# vocabulary word. A cell is set to 1 when the word occurs in the document at
# least once; repeated occurrences just re-assign the same 1.
one_hot_matrix = np.zeros((len(documents), len(vocab)))

# Map each word to its column once, so the fill loop does a dictionary lookup
# instead of a linear `vocab.index()` scan for every token.
word_to_col = {word: col for col, word in enumerate(vocab)}

# Fill matrix
for i, doc in enumerate(documents):
    for word in doc.split():
        col = word_to_col.get(word)
        if col is not None:  # ignore tokens that are not in the vocabulary
            one_hot_matrix[i, col] = 1

# Convert to DataFrame for better visualization.
# Row labels (d1, d2, ...) are derived from the number of documents rather than
# a fixed range, so the cell keeps working when the corpus changes size.
one_hot_df = pd.DataFrame(
    one_hot_matrix,
    columns=vocab,
    index=[f"d{i}" for i in range(1, len(documents) + 1)],
)

one_hot_df


Unnamed: 0,campusx,comment,people,watch,write
d1,1.0,0.0,1.0,1.0,0.0
d2,1.0,0.0,0.0,1.0,0.0
d3,0.0,1.0,1.0,0.0,1.0
d4,1.0,1.0,0.0,0.0,1.0


### One Hot Encoding with sklearn:

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Convert documents into list of words
tokenized_docs = [doc.split() for doc in documents]

# Unique words across all documents, sorted; these are the values the encoder is fitted on
all_words = sorted({word for doc in tokenized_docs for word in doc})

encoder = OneHotEncoder(sparse_output=False)

# Fit on vocabulary: reshape to a single-column 2D array (one word per row)
encoder.fit(np.array(all_words).reshape(-1, 1))

# Transform each document
encoded_docs = []

for doc in tokenized_docs:
    # Encode every token of the document: one encoded row per token
    encoded = encoder.transform(np.array(doc).reshape(-1, 1))
    doc_vector = encoded.sum(axis=0)  # Sum to get document-level encoding
    doc_vector = np.where(doc_vector > 0, 1, 0)  # Binary
    encoded_docs.append(doc_vector)

# Row labels (d1, d2, ...) are derived from the number of documents rather than
# a fixed range, so the cell keeps working when the corpus changes size.
one_hot_sklearn = pd.DataFrame(
    encoded_docs,
    columns=encoder.categories_[0],
    index=[f"d{i}" for i in range(1, len(tokenized_docs) + 1)],
)

one_hot_sklearn

Unnamed: 0,campusx,comment,people,watch,write
d1,1,0,1,1,0
d2,1,0,0,1,0
d3,0,1,1,0,1
d4,1,1,0,0,1
