1. Model Multinomial Naive Bayes dengan CountVectorizer

In [3]:
# Import library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Load the dataset
# NOTE: this is an absolute, machine-specific path; point it at your local
# copy of spam.csv before running the notebook on another machine.
DATA_PATH = r'D:\File untuk kuliah\Semester-5\Pembelajaran Mesin\Praktek\MachineLearning_Smst5_2025\Week9_JS09\Assets\spam.csv'
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Keep only the label (v1) and message (v2) columns and give them readable
# names. rename() returns a new frame, so the assignment further down never
# writes into a slice of the frame that was read from disk.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode the labels as integers: ham -> 0, spam -> 1.
LABEL_MAP = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(LABEL_MAP)
# Series.map yields NaN for every value missing from the mapping; fail fast
# with the offending values instead of letting NaN labels reach the model.
if encoded_labels.isna().any():
    unexpected = sorted(df.loc[encoded_labels.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in column 'v1': {unexpected}")
df['label'] = encoded_labels

# 2. Split the data: 80% train / 20% test, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; adding stratify=df['label'] would
# keep the ham/spam ratio equal in both splits, but it changes the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# 3. Bag-of-words counts with English stop words removed
from sklearn.feature_extraction import text
# CountVectorizer expects stop_words to be a string ('english'), a list, or None.
# ENGLISH_STOP_WORDS is a frozenset, so convert it to a list; sorted() also gives
# the list a deterministic order. This list is reused by later cells.
stop_words = sorted(text.ENGLISH_STOP_WORDS)

vectorizer = CountVectorizer(stop_words=stop_words)
# The vocabulary is learned from the training split only; the test split is
# transformed with that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Multinomial Naive Bayes model
model_cv = MultinomialNB()
model_cv.fit(X_train_vec, y_train)
y_pred_cv = model_cv.predict(X_test_vec)

# 5. Evaluation on the held-out test split
print("=== CountVectorizer ===")
print("Accuracy:", accuracy_score(y_test, y_pred_cv))
print("\nClassification Report:\n", classification_report(y_test, y_pred_cv))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_cv))


=== CountVectorizer ===
Accuracy: 0.9838565022421525

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
 [[959   6]
 [ 12 138]]


2. Model Multinomial Naive Bayes dengan TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Feature extraction: TF-IDF weighting, reusing the stop-word list from the previous cell ---
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
# Fit on the training messages only, then apply the fitted vectorizer to the test messages.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- Classifier: fit() returns the estimator itself, so construction and fitting can be chained ---
model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# --- Evaluation on the test split ---
# The `!s` conversion and the single space after each label reproduce what
# print(label, value) emits: str(value) preceded by the default " " separator.
print("=== TF-IDF ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf)!s}")
print(f"\nClassification Report:\n {classification_report(y_test, y_pred_tfidf)!s}")
print(f"\nConfusion Matrix:\n {confusion_matrix(y_test, y_pred_tfidf)!s}")


=== TF-IDF ===
Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
 [[965   0]
 [ 37 113]]


3. Perbandingan Hasil

In [5]:
# Accuracy of each model, both measured against the same y_test labels.
acc_cv = accuracy_score(y_test, y_pred_cv)
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)

print(f"\nAkurasi CountVectorizer: {acc_cv:.4f}")
print(f"Akurasi TF-IDF: {acc_tfidf:.4f}")

# Report a winner only when one accuracy is strictly higher; an exact tie
# is reported as a tie rather than being credited to CountVectorizer.
if acc_tfidf > acc_cv:
    print("\n✅ TF-IDF memberikan hasil yang lebih baik.")
elif acc_cv > acc_tfidf:
    print("\n✅ CountVectorizer memberikan hasil yang lebih baik.")
else:
    print("\nKedua metode menghasilkan akurasi yang sama.")



Akurasi CountVectorizer: 0.9839
Akurasi TF-IDF: 0.9668

✅ CountVectorizer memberikan hasil yang lebih baik.


Kesimpulan:

Secara umum, fitur TF-IDF sering diharapkan memberikan akurasi lebih tinggi dibanding CountVectorizer karena memberi bobot lebih besar pada kata yang informatif dan mengurangi pengaruh kata yang sangat umum (seperti stopwords). Harapan ini perlu selalu diuji pada dataset yang digunakan.

Namun, pada dataset tertentu, CountVectorizer bisa lebih unggul jika data sangat sederhana atau berisi banyak kata berulang.

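Jadi, pada kasus spam.csv, hasil evaluasi menunjukkan akurasi CountVectorizer (0.9839) lebih tinggi daripada TF-IDF (0.9668), sehingga CountVectorizer adalah fitur yang lebih optimal untuk dataset ini. TF-IDF memang mencapai precision 1.00 untuk kelas spam (tidak ada pesan ham yang salah diklasifikasikan), tetapi recall-nya hanya 0.75 karena 37 dari 150 pesan spam tidak terdeteksi, sedangkan CountVectorizer mencapai recall 0.92 (12 pesan spam tidak terdeteksi).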