# S05 - Text Classification: Logistic Regression & Naive Bayes
## Exercises

### Exercise 1 (Easy)
Convert texts to Bag-of-Words representation using sklearn.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["I love this movie", "This movie is terrible", "Great film!", "Waste of time"]

# Create BoW representation
vec = CountVectorizer()
y = vec.fit_transform(texts)

print(vec.get_feature_names_out())
print(y.toarray())

['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
[[0 0 0 1 1 0 0 1 0 0]
 [0 0 1 0 1 0 1 1 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 1]]


### Exercise 2 (Easy)
Convert the same texts to TF-IDF representation.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF representation
tfidf = TfidfVectorizer()
y = tfidf.fit_transform(texts)

print(tfidf.get_feature_names_out())
print(y.toarray())

['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
[[0.         0.         0.         0.66767854 0.52640543 0.
  0.         0.52640543 0.         0.        ]
 [0.         0.         0.55528266 0.         0.43779123 0.
  0.55528266 0.43779123 0.         0.        ]
 [0.70710678 0.70710678 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.57735027]]


### Exercise 3 (Medium)
Train a Naive Bayes classifier for sentiment analysis.

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

texts = ["I love this movie", "Great film", "Excellent acting", "Best movie ever",
         "Terrible movie", "Waste of time", "Awful acting", "Worst film"]
labels = [1, 1, 1, 1, 0, 0, 0, 0]  # 1=positive, 0=negative

# Train Naive Bayes classifier
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42)

vectorizer = CountVectorizer()

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

y_pred_nb = nb.predict(X_test_vec)

for x, y, pred in zip(X_test, y_test, y_pred_nb):
    print(f"{x}: Real = {y} | Pred = {pred}")

Great film: Real = 1 | Pred = 0
Waste of time: Real = 0 | Pred = 0


### Exercise 4 (Medium)
Train a Logistic Regression classifier and compare with Naive Bayes.

In [22]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression and compare accuracy with Naive Bayes
lr = LogisticRegression()

lr.fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

for x, y, pred in zip(X_test, y_test, y_pred_lr):
    print(f"{x}: Real = {y} | Pred = {pred}")

Great film: Real = 1 | Pred = 0
Waste of time: Real = 0 | Pred = 0


### Exercise 5 (Hard - Research)
Implement Naive Bayes from scratch (without sklearn) for text classification.

*Hint: Use log probabilities to avoid underflow. Research: P(c|d) ∝ P(c) × Π P(w|c)*

In [23]:
import math
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.word_probs = defaultdict(dict)
    
    def fit(self, texts, labels):
        # Your implementation
        pass
    
    def predict(self, text):
        # Your implementation
        pass
