# Word Embedding Techniques

In [None]:
! pip install scikit-learn

## One-Hot Encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Toy vocabulary to encode.
vocab = ['cat', 'dog', 'mouse']

# The encoder works on 2-D input (samples x features), so wrap every word
# in its own single-element row.
vocab_rows = [[word] for word in vocab]

# Learn the categories first, then produce a dense (non-sparse) array with
# one row per word and one column per learned category.
onehot_encoder = OneHotEncoder(sparse_output=False).fit(vocab_rows)
onehot_encoded = onehot_encoder.transform(vocab_rows)

# Show the vocabulary, then the encoded matrix on the following lines.
print(vocab, onehot_encoded, sep="\n")


['cat', 'dog', 'mouse']
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


## Bag of Words (BoW)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Two short documents that share most of their words.
sentences = ["The cat sat on the mat", "The dog lay on the mat"]

# Step 1: learn the vocabulary from the documents.
vectorizer = CountVectorizer().fit(sentences)

# Step 2: count how often each vocabulary term occurs in each document
# (one row per sentence, one column per term).
X = vectorizer.transform(sentences)

# Print the learned terms (the column order), then the dense count matrix.
# Tokens are lowercased by default, so "The" and "the" land in the same column.
print(vectorizer.get_feature_names_out(), X.toarray(), sep="\n")

['cat' 'dog' 'lay' 'mat' 'on' 'sat' 'the']
[[1 0 0 1 1 1 2]
 [0 1 1 1 1 0 2]]


## Term Frequency-Inverse Document Frequency (TF-IDF)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same two documents as in the Bag-of-Words example.
sentences = ["The cat sat on the mat", "The dog lay on the mat"]

# Step 1: learn the vocabulary and the inverse-document-frequency weights.
vectorizer = TfidfVectorizer().fit(sentences)

# Step 2: turn each sentence into a TF-IDF weighted row vector.
# Terms that appear in only one sentence get a larger weight per occurrence
# than terms shared by both.
X = vectorizer.transform(sentences)

# Print the learned terms (the column order), then the dense TF-IDF matrix.
print(vectorizer.get_feature_names_out(), X.toarray(), sep="\n")

['cat' 'dog' 'lay' 'mat' 'on' 'sat' 'the']
[[0.44554752 0.         0.         0.31701073 0.31701073 0.44554752
  0.63402146]
 [0.         0.44554752 0.44554752 0.31701073 0.31701073 0.
  0.63402146]]
