In [None]:
!pip -q install scikit-learn==1.5.2

import csv
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    roc_auc_score
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline

# ====== Initial configuration ======
RSEED = 42  # single seed reused by every stochastic step below
pd.set_option('display.max_colwidth', 120)

# ====== Load dataset ======
# The file is read as tab-separated "<label>\t<text>" records with no header row.
# Free-text message bodies can contain unbalanced double quotes; under pandas'
# default quoting a stray '"' opens a quoted field that swallows the following
# record(s), silently merging rows. QUOTE_NONE makes the parser treat quote
# characters as ordinary text, so every line becomes exactly one row.
fp = Path('SMSSpamCollection')
df = pd.read_csv(
    fp,
    sep='\t',
    header=None,
    names=['label', 'text'],
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
)

# ====== First look ======
print(df.shape)
print(df.head())

# ====== Prepare labels and features ======
# Binary target: ham -> 0, spam -> 1.
y = df['label'].map({'ham': 0, 'spam': 1})
X = df['text']

# .map() yields NaN for any label missing from the mapping; fail fast here
# instead of letting NaN targets surface later inside the split or the models.
unknown_labels = df.loc[y.isna(), 'label'].unique()
if len(unknown_labels):
    raise ValueError(f"Unexpected label value(s): {unknown_labels.tolist()}")

print(y.unique())

(5572, 2)
  label  \
0   ham   
1   ham   
2  spam   
3   ham   
4   ham   

                                                                                                                      text  
0          Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...  
1                                                                                            Ok lar... Joking wif u oni...  
2  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std t...  
3                                                                        U dun say so early hor... U c already then say...  
4                                                            Nah I don't think he goes to usf, he lives around here though  
[0 1]


In [None]:
# ====== EDA ======
# Only the last expression of a cell is auto-displayed, so each summary is
# printed explicitly; as bare expressions the two statistics below were
# computed and then silently discarded.
print(df.shape)
# Class balance, in percent.
print(df['label'].value_counts(normalize=True).mul(100).round(2))
# Message length in characters, including the median and upper percentiles.
print(df['text'].str.len().describe(percentiles=[.5, .9, .95]))

# ====== Train / test split ======
X = df['text']
y = df['label'].map({'ham': 0, 'spam': 1})
# Hold out 20% for testing; stratify on y so both parts keep the class ratio.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RSEED
)

(5572, 2)


In [None]:
# TF-IDF settings shared by every pipeline; the Naive Bayes models additionally
# use unigrams + bigrams.
tfidf_common = dict(lowercase=True, stop_words='english', min_df=2)
tfidf_bigram = dict(tfidf_common, ngram_range=(1, 2))

# Majority-class reference model: any useful classifier has to beat this.
base = Pipeline(steps=[
    ('vec', TfidfVectorizer(**tfidf_common)),
    ('clf', DummyClassifier(strategy='most_frequent', random_state=RSEED)),
])

# Multinomial Naive Bayes with alpha=1.0 smoothing.
mnb = Pipeline(steps=[
    ('vec', TfidfVectorizer(**tfidf_bigram)),
    ('clf', MultinomialNB(alpha=1.0)),
])

# Complement Naive Bayes with the same features and smoothing.
cnb = Pipeline(steps=[
    ('vec', TfidfVectorizer(**tfidf_bigram)),
    ('clf', ComplementNB(alpha=1.0)),
])

# Fit each candidate on the training split and report test-set metrics.
candidates = {
    'Baseline': base,
    'MultinomialNB': mnb,
    'ComplementNB': cnb,
}
for name, pipe in candidates.items():
    pipe.fit(X_tr, y_tr)
    yp = pipe.predict(X_te)
    acc = accuracy_score(y_te, yp)

    print(f"\n{name}")
    print(f"Accuracy : {acc:.4f}")
    print(confusion_matrix(y_te, yp))
    print(classification_report(y_te, yp, target_names=['ham', 'spam'], zero_division=0))

    # ROC-AUC needs scores; skip pipelines that cannot produce probabilities.
    if not hasattr(pipe, "predict_proba"):
        continue
    proba = pipe.predict_proba(X_te)[:, 1]  # column 1 = probability of class 1 (spam)
    auc = roc_auc_score(y_te, proba)
    print(f"ROC-AUC : {auc:.4f}")


Baseline
Accuracy : 0.8664
[[966   0]
 [149   0]]
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93       966
        spam       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

ROC-AUC : 0.5000

MultinomialNB
Accuracy : 0.9704
[[966   0]
 [ 33 116]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

ROC-AUC : 0.9876

ComplementNB
Accuracy : 0.9767
[[950  16]
 [ 10 139]]
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       966
        spam       0.90      0.93      0.

In [None]:
# ====== Hyper-parameter search for the MultinomialNB pipeline ======
# Ten smoothing strengths, log-spaced from 1e-2 to 1e1.
alphas = np.logspace(-2, 1, 10)
# Shuffled, seeded stratified folds so the search is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RSEED)

# GridSearchCV is imported with the other sklearn names at the top of the
# notebook instead of mid-notebook in this cell.
grid = GridSearchCV(
    estimator=Pipeline([
        ('vec', TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2), min_df=2)),
        ('clf', MultinomialNB())
    ]),
    # The vec__min_df values here override the min_df=2 set on the vectorizer above.
    param_grid={
        'clf__alpha': alphas,
        'vec__min_df': [1, 2, 3]
    },
    scoring='f1',  # F1 of the positive class (spam = 1)
    cv=cv,
    n_jobs=-1
)

# The search only ever sees the training split; the test split stays held out.
grid.fit(X_tr, y_tr)
print('Best params:', grid.best_params_)
print('Best CV f1 :', grid.best_score_.round(4))

# Evaluate the best pipeline found by the search on the held-out test split.
best = grid.best_estimator_
yp = best.predict(X_te)

# zero_division=0 matches the reporting used for the model comparison and
# avoids undefined-metric warnings if a class is never predicted.
print(classification_report(y_te, yp, target_names=['ham', 'spam'], zero_division=0))

Best params: {'clf__alpha': np.float64(0.046415888336127774), 'vec__min_df': 1}
Best CV f1 : 0.9566
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.98      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
# 1) Inspect misclassified test examples
pred = best.predict(X_te)
# Compare positionally against the raw label array; err_idx then holds
# positions (not index labels), which is what .iloc expects below.
err_idx = np.flatnonzero(pred != y_te.to_numpy())

miss = pd.DataFrame({
    'y_true': y_te.iloc[err_idx].values,
    'y_pred': pred[err_idx],
    'text': X_te.iloc[err_idx].values
})
# A bare expression in the middle of a cell is never rendered, so show this
# table explicitly (display is provided by the IPython kernel).
display(miss.head(10))

# 2) Most informative tokens (approximate)
vec = best.named_steps['vec']
clf = best.named_steps['clf']
feat = np.array(vec.get_feature_names_out())

# Per-class log probability of each feature (MultinomialNB); row 0 = ham, row 1 = spam
logp = clf.feature_log_prob_

# "Spamness" score ~ log P(token | spam) - log P(token | ham)
score = (logp[1] - logp[0])
top_n = 20  # number of highest-scoring tokens to list
idx = np.argsort(score)[-top_n:][::-1]  # largest score first

pd.DataFrame({
    'token': feat[idx],
    'score': score[idx].round(3)
})

Unnamed: 0,token,score
0,claim,6.588
1,prize,6.376
2,150p,6.086
3,tone,5.938
4,18,5.857
5,guaranteed,5.847
6,1000,5.785
7,cs,5.784
8,500,5.751
9,landline,5.661
