In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Toy corpus: three short sentences with overlapping words, so the
# vectorizers below have shared vocabulary to work with.
corpus = [
    "Data science is fun",
    "I love data and science",
    "Science and data are powerful",
]

# ---- 1. Bag of Words (BoW) ----
# Fit a default CountVectorizer on the corpus and keep the resulting
# document-term matrix (one row per sentence, one column per vocabulary word).
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(corpus)

# Show the learned vocabulary, then the matrix converted to a dense array.
# Each print emits its label and value on separate lines via sep="\n".
print(
    "Bag of Words Vocabulary:",
    bow_vectorizer.get_feature_names_out(),
    sep="\n",
)
print(
    "\nBoW Matrix:",
    bow_matrix.toarray(),
    sep="\n",
)

# ---- 2. TF-IDF ----
# Same corpus, but with a default TfidfVectorizer, so each cell holds a
# TF-IDF weight rather than a raw count.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Show the learned vocabulary, then the weights as a dense array.
print(
    "\nTF-IDF Vocabulary:",
    tfidf_vectorizer.get_feature_names_out(),
    sep="\n",
)
print(
    "\nTF-IDF Matrix:",
    tfidf_matrix.toarray(),
    sep="\n",
)

# ---- 3. N-grams (bigrams example) ----
# ngram_range=(2, 2) restricts the features to two-word sequences only
# (minimum and maximum n are both 2), so no single words are counted.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
ngram_matrix = ngram_vectorizer.fit_transform(corpus)

# Show the bigram vocabulary, then the per-sentence bigram counts.
print(
    "\nBigrams Vocabulary:",
    ngram_vectorizer.get_feature_names_out(),
    sep="\n",
)
print(
    "\nBigrams Matrix:",
    ngram_matrix.toarray(),
    sep="\n",
)


Bag of Words Vocabulary:
['and' 'are' 'data' 'fun' 'is' 'love' 'powerful' 'science']

BoW Matrix:
[[0 0 1 1 1 0 0 1]
 [1 0 1 0 0 1 0 1]
 [1 1 1 0 0 0 1 1]]

TF-IDF Vocabulary:
['and' 'are' 'data' 'fun' 'is' 'love' 'powerful' 'science']

TF-IDF Matrix:
[[0.         0.         0.35959372 0.6088451  0.6088451  0.
  0.         0.35959372]
 [0.50410689 0.         0.39148397 0.         0.         0.66283998
  0.         0.39148397]
 [0.42018292 0.55249005 0.32630952 0.         0.         0.
  0.55249005 0.32630952]]

Bigrams Vocabulary:
['and data' 'and science' 'are powerful' 'data and' 'data are'
 'data science' 'is fun' 'love data' 'science and' 'science is']

Bigrams Matrix:
[[0 0 0 0 0 1 1 0 0 1]
 [0 1 0 1 0 0 0 1 0 0]
 [1 0 1 0 1 0 0 0 1 0]]
