<a href="https://colab.research.google.com/drive/1sBdRzNt-fn61kF3I_I6yLY11I3HSQk8M?usp=drive_link" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tugas 2**
---
* Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan,
1. Menggunakan data `spam.csv`
2. Fitur `CountVectorizer` dengan mengaktifkan **stop_words**
3. Evaluasi hasilnya

* Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan,
1. Menggunakan data `spam.csv`
2. Fitur `TF-IDF` dengan mengaktifkan **stop_words**
3. Evaluasi hasilnya dan bandingkan dengan hasil pada Tugas no 2.
4. Berikan kesimpulan fitur mana yang terbaik pada kasus data `spam.csv`

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
from dataclasses import dataclass
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

@dataclass
class EvalResult:
    """Evaluation summary for one fitted text-classification pipeline.

    Instances are created by ``evaluate_model`` and displayed by ``print_eval``.
    """

    # Human-readable label of the evaluated model; printed as the section header.
    name: str
    # Overall accuracy on the evaluated split.
    accuracy: float
    # F1-score of the 'spam' class.
    f1_spam: float
    # Precision of the 'spam' class.
    precision_spam: float
    # Recall of the 'spam' class.
    recall_spam: float
    # Macro-averaged F1-score across the reported classes.
    f1_macro: float
    # Full classification report in dictionary form (output_dict=True).
    report: Dict
    # Confusion matrix computed with labels=['ham', 'spam'].
    confusion: np.ndarray


def load_spam_dataset(csv_path: str = "sample_data/spam.csv") -> Tuple[pd.Series, pd.Series]:
    """Load the SMS spam CSV and return its message texts and labels.

    The file is expected to expose a ``v1`` column (label, 'ham' or 'spam')
    and a ``v2`` column (message text). Some copies of the dataset carry
    extra empty columns; only the label and text columns are kept. When the
    ``v1``/``v2`` headers are absent, the first two columns are used by
    position, however many additional columns the file has.

    Args:
        csv_path: Path to the CSV file. It is read with latin-1 encoding.

    Returns:
        A ``(X, y)`` tuple: the whitespace-stripped message texts and the
        matching labels, restricted to rows labelled exactly 'ham' or 'spam'.

    Raises:
        SystemExit: If ``csv_path`` does not exist (a message is printed to
            stderr first).
        ValueError: If the file lacks the ``v1``/``v2`` headers and has fewer
            than two columns to fall back on.
    """
    if not os.path.exists(csv_path):
        print(f"File '{csv_path}' tidak ditemukan. Pastikan file ada di direktori kerja.", file=sys.stderr)
        sys.exit(1)

    # Many copies of this dataset are latin-1 encoded and carry empty columns.
    df = pd.read_csv(csv_path, encoding="latin-1")
    if not {"v1", "v2"}.issubset(df.columns):
        if df.shape[1] < 2:
            raise ValueError(
                f"File '{csv_path}' harus memiliki minimal dua kolom (label dan teks)."
            )
        # Fallback: take the first two columns by position. Renaming the frame
        # already in memory avoids a second read and does not depend on the
        # file having a particular number of columns.
        df = df.iloc[:, :2].set_axis(["v1", "v2"], axis=1)

    df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})
    # Drop rows with a missing label or message.
    df = df.dropna(subset=["label", "text"])

    # Strip surrounding whitespace; assign() builds a new frame instead of
    # writing into a filtered one.
    df = df.assign(
        text=df["text"].astype(str).str.strip(),
        label=df["label"].astype(str).str.strip(),
    )

    # Keep only rows with a valid label.
    df = df[df["label"].isin(["ham", "spam"])]

    return df["text"], df["label"]

def build_pipeline_count(stop_words: str = "english") -> Pipeline:
    """Build an unfitted bag-of-words spam classifier.

    Args:
        stop_words: Value forwarded to ``CountVectorizer(stop_words=...)``.

    Returns:
        A ``Pipeline`` whose "vec" step is a ``CountVectorizer`` and whose
        "clf" step is a ``MultinomialNB``.
    """
    vectorizer = CountVectorizer(stop_words=stop_words)
    classifier = MultinomialNB()
    return Pipeline(steps=[("vec", vectorizer), ("clf", classifier)])

def build_pipeline_tfidf(stop_words: str = "english") -> Pipeline:
    """Build an unfitted TF-IDF spam classifier.

    Args:
        stop_words: Value forwarded to ``TfidfVectorizer(stop_words=...)``.

    Returns:
        A ``Pipeline`` whose "vec" step is a ``TfidfVectorizer`` and whose
        "clf" step is a ``MultinomialNB``.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    classifier = MultinomialNB()
    return Pipeline(steps=[("vec", vectorizer), ("clf", classifier)])

def evaluate_model(name: str, pipe: Pipeline, X_test: pd.Series, y_test: pd.Series) -> EvalResult:
    """Score a fitted pipeline on held-out data and bundle the metrics.

    Args:
        name: Label stored on the result and later shown as its header.
        pipe: Fitted pipeline; only its ``predict`` method is used.
        X_test: Messages to classify.
        y_test: True labels for ``X_test``.

    Returns:
        An ``EvalResult`` holding accuracy, the spam-class precision, recall
        and F1, the macro F1, the full report dictionary, and the confusion
        matrix (labels ordered 'ham', 'spam').
    """
    class_labels = ["ham", "spam"]
    y_pred = pipe.predict(X_test)

    # The dictionary form gives direct access to the per-class and macro scores.
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_labels,
        output_dict=True,
        zero_division=0,
    )
    spam_scores = report_dict["spam"]

    return EvalResult(
        name=name,
        accuracy=accuracy_score(y_test, y_pred),
        f1_spam=spam_scores["f1-score"],
        precision_spam=spam_scores["precision"],
        recall_spam=spam_scores["recall"],
        f1_macro=report_dict["macro avg"]["f1-score"],
        report=report_dict,
        confusion=confusion_matrix(y_test, y_pred, labels=class_labels),
    )

def print_eval(result: EvalResult):
    """Print a readable summary of one evaluation result to stdout.

    Shows the headline metrics, the confusion matrix, and the classification
    report as a table rounded to three decimals.

    Args:
        result: The evaluation to display. Its ``report`` must be the
            dictionary form of a classification report.
    """
    print(f"\n=== {result.name} ===")
    print(f"Akurasi      : {result.accuracy:.4f}")
    print(f"F1 (spam)    : {result.f1_spam:.4f}")
    print(f"Precision(spam): {result.precision_spam:.4f}")
    print(f"Recall  (spam): {result.recall_spam:.4f}")
    print(f"F1 (macro)   : {result.f1_macro:.4f}")
    print("Confusion Matrix [rows=true, cols=pred] (labels=['ham','spam']):")
    print(result.confusion)

    print("\nClassification Report:")
    table = pd.DataFrame(result.report).transpose()
    # 'accuracy' is a single scalar in the report dictionary, so building the
    # frame repeats it in every column, including 'support'. Show the total
    # number of samples there instead of the accuracy value.
    has_rows = "accuracy" in table.index and "macro avg" in table.index
    if has_rows and "support" in table.columns:
        table.loc["accuracy", "support"] = table.loc["macro avg", "support"]
    print(table.round(3))

def main():
    """Train and compare the CountVectorizer and TF-IDF Naive Bayes models.

    Loads ``spam.csv``, makes a stratified 80/20 split, fits both pipelines,
    prints each evaluation, and finally prints which feature extractor has
    the higher spam-class F1 (ties go to CountVectorizer).
    """
    # 1) Load the data.
    texts, labels = load_spam_dataset("spam.csv")

    # 2) Stratified split so train and test keep the same ham/spam proportion;
    #    the fixed seed makes the split reproducible.
    split = train_test_split(texts, labels, test_size=0.2, stratify=labels, random_state=42)
    X_train, X_test, y_train, y_test = split

    # 3) Build both pipelines, then 4) fit them on the same training data.
    count_pipe = build_pipeline_count(stop_words="english")
    tfidf_pipe = build_pipeline_tfidf(stop_words="english")
    for pipeline in (count_pipe, tfidf_pipe):
        pipeline.fit(X_train, y_train)

    # 5) Evaluate on the held-out split.
    count_res = evaluate_model("MultinomialNB + CountVectorizer(stop_words='english')", count_pipe, X_test, y_test)
    tfidf_res = evaluate_model("MultinomialNB + TfidfVectorizer(stop_words='english')", tfidf_pipe, X_test, y_test)

    # 6) Show both results.
    for outcome in (count_res, tfidf_res):
        print_eval(outcome)

    # 7) Compare, using the spam-class F1 as the primary metric.
    if count_res.f1_spam >= tfidf_res.f1_spam:
        better = "CountVectorizer"
    else:
        better = "TF-IDF"
    print("\n=== Perbandingan & Kesimpulan ===")
    summary = (
        f"F1(spam) CountVectorizer: {count_res.f1_spam:.4f} | TF-IDF: {tfidf_res.f1_spam:.4f} | "
        f"Pilihan terbaik (berdasarkan F1 kelas spam): {better}"
    )
    print(summary)

    # Secondary metrics are shown only when the two models actually differ.
    if count_res.accuracy != tfidf_res.accuracy:
        accuracy_line = f"Akurasi     CountVectorizer: {count_res.accuracy:.4f} | TF-IDF: {tfidf_res.accuracy:.4f}"
        print(accuracy_line)
    if count_res.f1_macro != tfidf_res.f1_macro:
        macro_line = f"F1(macro)   CountVectorizer: {count_res.f1_macro:.4f} | TF-IDF: {tfidf_res.f1_macro:.4f}"
        print(macro_line)

    closing_note = (
        "\nCatatan:\n"
        "- Multinomial Naive Bayes secara teori lebih cocok untuk fitur berbasis frekuensi/count (CountVectorizer),\n"
        "  sehingga sering kali CountVectorizer memberi F1 spam sedikit lebih baik daripada TF-IDF.\n"
        "- Namun, hasil aktual bisa berbeda tergantung split data. Gunakan cross-validation untuk gambaran lebih stabil.\n"
    )
    print(closing_note)

# Run the comparison only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


=== MultinomialNB + CountVectorizer(stop_words='english') ===
Akurasi      : 0.9839
F1 (spam)    : 0.9384
Precision(spam): 0.9580
Recall  (spam): 0.9195
F1 (macro)   : 0.9645
Confusion Matrix [rows=true, cols=pred] (labels=['ham','spam']):
[[960   6]
 [ 12 137]]

Classification Report:
              precision  recall  f1-score   support
ham               0.988   0.994     0.991   966.000
spam              0.958   0.919     0.938   149.000
accuracy          0.984   0.984     0.984     0.984
macro avg         0.973   0.957     0.965  1115.000
weighted avg      0.984   0.984     0.984  1115.000

=== MultinomialNB + TfidfVectorizer(stop_words='english') ===
Akurasi      : 0.9686
F1 (spam)    : 0.8669
Precision(spam): 1.0000
Recall  (spam): 0.7651
F1 (macro)   : 0.9246
Confusion Matrix [rows=true, cols=pred] (labels=['ham','spam']):
[[966   0]
 [ 35 114]]

Classification Report:
              precision  recall  f1-score   support
ham               0.965   1.000     0.982   966.000
spam    