**One-Hot Encoding of Words**

In [1]:
# Import numpy for array handling
import numpy as np

# Step 1: The toy document, already split into individual word tokens
words = ["the", "cat", "sat", "on", "the", "mat"]

# Step 2: Derive the vocabulary — drop duplicate tokens, then order the
# survivors alphabetically so each word has a stable position
unique_words = set(words)
vocab = sorted(unique_words)
print("Vocabulary:", vocab)

Vocabulary: ['cat', 'mat', 'on', 'sat', 'the']


In [2]:
# Step 3: Pair every vocabulary word with its position in `vocab`
# (word -> column index used by the one-hot matrix)
word_to_index = dict(zip(vocab, range(len(vocab))))
print("\nWord to Index Mapping:", word_to_index)


Word to Index Mapping: {'cat': 0, 'mat': 1, 'on': 2, 'sat': 3, 'the': 4}


In [3]:
# Step 4: Build the one-hot matrix — one row per word of the document,
# one column per vocabulary entry
num_words, vocab_size = len(words), len(vocab)
one_hot_matrix = np.zeros((num_words, vocab_size), dtype=int)

# Look up the vocabulary column of every word, then switch on exactly one
# cell per row with a single fancy-indexed assignment
columns = np.array([word_to_index[w] for w in words], dtype=int)
one_hot_matrix[np.arange(num_words), columns] = 1

In [4]:
# Step 5: Show the heading and the full matrix, each on its own line
print("\nOne-Hot Encoded Matrix:", one_hot_matrix, sep="\n")


One-Hot Encoded Matrix:
[[0 0 0 0 1]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 1 0 0]
 [0 0 0 0 1]
 [0 1 0 0 0]]


In [5]:
# Step 6: Print each word (right-aligned to width 5) next to its matrix row
for word, row in zip(words, one_hot_matrix):
    print(f"{word:>5} → {row}")

  the → [0 0 0 0 1]
  cat → [1 0 0 0 0]
  sat → [0 0 0 1 0]
   on → [0 0 1 0 0]
  the → [0 0 0 0 1]
  mat → [0 1 0 0 0]


**One-Hot Encoding of Characters**

In [6]:
import numpy as np

text = "hello"

# Character vocabulary: keep one copy of each character, in sorted order
chars = sorted(dict.fromkeys(text))
print("Characters:", chars)

Characters: ['e', 'h', 'l', 'o']


In [7]:
# Map every character to its position in the sorted `chars` list
char_to_index = dict((ch, pos) for pos, ch in enumerate(chars))
print("Character to Index Mapping:", char_to_index)

Character to Index Mapping: {'e': 0, 'h': 1, 'l': 2, 'o': 3}


In [10]:
# Build the one-hot matrix by picking rows out of an identity matrix:
# row k of the identity is exactly the one-hot vector for index k, so
# selecting the row for each character of `text` yields the encoding
positions = [char_to_index[ch] for ch in text]
one_hot_chars = np.eye(len(chars), dtype=int)[positions]

print("\nOne-Hot Encoded Chars:")
print(one_hot_chars)


One-Hot Encoded Chars:
[[0 1 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]]


In [11]:
# Heading, then each character of `text` alongside its one-hot row
print("\nOne-Hot Encoded Matrix:")
for ch, row in zip(text, one_hot_chars):
    print(f"{ch} → {row}")


One-Hot Encoded Matrix:
h → [0 1 0 0]
e → [1 0 0 0]
l → [0 0 1 0]
l → [0 0 1 0]
o → [0 0 0 1]
