In [1]:
# -----------------------------
# Step 1: Import Libraries
# -----------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# -----------------------------
# Step 2: Example Dataset
# -----------------------------
# Toy sentiment corpus. Every review is written next to its label
# (1 = positive, 0 = negative) and the pairs are then split column-wise,
# which yields two parallel lists: the texts and their labels.
documents, labels = (
    list(column)
    for column in zip(
        ("I loved the movie, it was fantastic!", 1),
        ("The movie was terrible and boring.", 0),
        ("Amazing film, truly enjoyed it!", 1),
        ("Worst movie ever, waste of time.", 0),
        ("It was a great and enjoyable experience.", 1),
        ("Not good, I will not recommend it.", 0),
    )
)

# -----------------------------
# Step 3a: Bag-of-Words Representation
# -----------------------------
# Fit a CountVectorizer on the corpus: this learns the vocabulary and returns
# the per-document token counts as a sparse matrix.
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(documents)

# Fetch the vocabulary once; it is printed and reused as the column labels.
bow_vocabulary = bow_vectorizer.get_feature_names_out()

print("🔹 Bag of Words (BoW) Vocabulary:")
print(bow_vocabulary)

print("\n🔹 BoW Feature Matrix (as DataFrame):")
bow_df = pd.DataFrame(bow_features.toarray(), columns=bow_vocabulary)
# The vocabulary is wider than pandas' default column display limit, which
# makes a plain print collapse the middle columns into "...". Lift the limit
# for this print only so every feature column is visible.
with pd.option_context("display.max_columns", None):
    print(bow_df)

# -----------------------------
# Step 3b: TF-IDF Representation
# -----------------------------
# Fit a TfidfVectorizer on the same corpus: this learns the vocabulary and
# returns the TF-IDF weighted document-term matrix as a sparse matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(documents)

# Fetch the vocabulary once; it is printed and reused as the column labels.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()

print("\n🔹 TF-IDF Vocabulary:")
print(tfidf_vocabulary)

print("\n🔹 TF-IDF Feature Matrix (as DataFrame):")
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vocabulary)
# The vocabulary is wider than pandas' default column display limit, which
# makes a plain print collapse the middle columns into "...". Lift the limit
# for this print only so every feature column is visible.
with pd.option_context("display.max_columns", None):
    print(tfidf_df)


🔹 Bag of Words (BoW) Vocabulary:
['amazing' 'and' 'boring' 'enjoyable' 'enjoyed' 'ever' 'experience'
 'fantastic' 'film' 'good' 'great' 'it' 'loved' 'movie' 'not' 'of'
 'recommend' 'terrible' 'the' 'time' 'truly' 'was' 'waste' 'will' 'worst']

🔹 BoW Feature Matrix (as DataFrame):
   amazing  and  boring  enjoyable  enjoyed  ever  experience  fantastic  \
0        0    0       0          0        0     0           0          1   
1        0    1       1          0        0     0           0          0   
2        1    0       0          0        1     0           0          0   
3        0    0       0          0        0     1           0          0   
4        0    1       0          1        0     0           1          0   
5        0    0       0          0        0     0           0          0   

   film  good  ...  of  recommend  terrible  the  time  truly  was  waste  \
0     0     0  ...   0          0         0    1     0      0    1      0   
1     0     0  ...   0          