In [1]:
# CADL2: Feature Extraction with BoW & TF-IDF

# Install sklearn
!pip install scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Example dataset (movie reviews): four short sentences that form the corpus
# fed to both vectorizers further down.
docs = [
    "I loved the movie, it was fantastic!",
    "The film was terrible and boring",
    "Amazing story and brilliant acting",
    "I did not like the movie, very disappointing",
]

# Echo the corpus, numbering the reviews from 1.
print("Original Dataset:")
for number, review in enumerate(docs, start=1):
    # print() joins its arguments with a single space, giving "<n>. <review>".
    print(f"{number}.", review)

# --- 1. Bag of Words ---
# Learn the vocabulary from the reviews and count each term per document.
bow = CountVectorizer()
X_bow = bow.fit_transform(docs)

# Convert the count matrix to a dense array and label every column with the
# vocabulary term it counts, so the table is readable by name.
bow_df = pd.DataFrame(
    data=X_bow.toarray(),
    columns=bow.get_feature_names_out(),
)

# Heading (preceded by a blank line) followed by the count table.
print("\nBag of Words Representation:", bow_df, sep="\n")

# --- 2. TF-IDF ---
# Fit a TF-IDF vectorizer on the same reviews; each cell becomes a weight
# instead of a raw count.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(docs)

# Dense table with one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# Round to two decimals for display only; tfidf_df keeps the unrounded weights.
print("\nTF-IDF Representation:", tfidf_df.round(2), sep="\n")


Original Dataset:
1. I loved the movie, it was fantastic!
2. The film was terrible and boring
3. Amazing story and brilliant acting
4. I did not like the movie, very disappointing

Bag of Words Representation:
   acting  amazing  and  boring  brilliant  did  disappointing  fantastic  \
0       0        0    0       0          0    0              0          1   
1       0        0    1       1          0    0              0          0   
2       1        1    1       0          1    0              0          0   
3       0        0    0       0          0    1              1          0   

   film  it  like  loved  movie  not  story  terrible  the  very  was  
0     0   1     0      1      1    0      0         0    1     0    1  
1     1   0     0      0      0    0      0         1    1     0    1  
2     0   0     0      0      0    0      1         0    0     0    0  
3     0   0     1      0      1    1      0         0    1     1    0  

TF-IDF Representation:
   acting  amazing  