## Step 1 - Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

## Step 2 - Load The Data

In [5]:
# Load the raw SMS spam dataset from the working directory, decoding the file as latin-1.
df = pd.read_csv("spam.csv", encoding="latin-1")

## Step 3 - Explore The Data

In [6]:
# Preview the first rows to inspect the raw column names and a few sample messages.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Keep only the first two columns (class label and message text); the trailing
# "Unnamed" columns are almost entirely empty according to df.info().
# .copy() detaches the result from the raw frame, so later column assignments
# act on an independent DataFrame rather than on a slice of the original
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df.iloc[:, :2].copy()
df.columns = ["label", "message"]

In [9]:
# Count how many messages fall into each class and print the tally.
label_counts = df["label"].value_counts()
print(label_counts)

label
ham     4825
spam     747
Name: count, dtype: int64


## Step 4 - Preprocessing

In [10]:
print("\n" + "=" * 70)
print("PREPROCESSING DATA")
print("=" * 70)

# Encode the labels as binary integers (ham=0, spam=1).
# A plain .map(dict) turns every value that is not a dict key into NaN, so
# re-running this cell after the labels were already encoded would wipe the
# whole column. Passing unknown values through unchanged keeps the cell
# idempotent under repeated execution.
label_to_id = {"ham": 0, "spam": 1}
encoded_labels = df["label"].map(lambda value: label_to_id.get(value, value))

# Fail loudly on unexpected labels instead of silently carrying NaN/garbage
# into the stratified split below.
invalid_mask = ~encoded_labels.isin(list(label_to_id.values()))
if invalid_mask.any():
    unexpected = sorted(encoded_labels[invalid_mask].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")
df["label"] = encoded_labels.astype("int64")

# Separate the features (X) from the target (y).
X = df["message"]
y = df["label"]

# Split into training and test sets (80:20), stratified on the label so both
# sets keep the same ham/spam ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nJumlah data training: {len(X_train)}")
print(f"Jumlah data testing: {len(X_test)}")
print(f"\nDistribusi kelas di training set:")
print(pd.Series(y_train).value_counts())


PREPROCESSING DATA

Jumlah data training: 4457
Jumlah data testing: 1115

Distribusi kelas di training set:
label
0    3859
1     598
Name: count, dtype: int64


# **MODEL 1 : MULTINOMIAL NAIVE BAYES DENGAN COUNT VECTORIZER**

In [12]:
print("\n" + "=" * 70)
print("MODEL 1: MULTINOMIAL NAIVE BAYES + COUNT VECTORIZER")
print("=" * 70)

# CountVectorizer turns each message into a row of token counts;
# stop_words="english" drops common words such as 'the', 'is', 'and'.
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary on the training messages only, then reuse that same
# vocabulary (transform, no fit) for the test messages.
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

print(f"\nUkuran vocabulary: {len(count_vectorizer.vocabulary_)}")
print(f"Shape X_train setelah vectorization: {X_train_count.shape}")
print(f"Shape X_test setelah vectorization: {X_test_count.shape}")

# Train Multinomial Naive Bayes on the count features (fit returns the
# estimator itself) and predict the held-out messages.
mnb_count = MultinomialNB().fit(X_train_count, y_train)
y_pred_count = mnb_count.predict(X_test_count)

# Evaluation
print("\n" + "-" * 70)
print("EVALUASI MODEL 1 (COUNT VECTORIZER)")
print("-" * 70)

# Score the predictions with each metric in turn.
accuracy_count, precision_count, recall_count, f1_count = (
    metric(y_test, y_pred_count)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nAccuracy:  {accuracy_count:.4f}")
print(f"Precision: {precision_count:.4f}")
print(f"Recall:    {recall_count:.4f}")
print(f"F1-Score:  {f1_count:.4f}")


MODEL 1: MULTINOMIAL NAIVE BAYES + COUNT VECTORIZER

Ukuran vocabulary: 7440
Shape X_train setelah vectorization: (4457, 7440)
Shape X_test setelah vectorization: (1115, 7440)

----------------------------------------------------------------------
EVALUASI MODEL 1 (COUNT VECTORIZER)
----------------------------------------------------------------------

Accuracy:  0.9839
Precision: 0.9580
Recall:    0.9195
F1-Score:  0.9384


In [13]:
# Confusion matrix of the CountVectorizer model on the test set
# (rows: true class, columns: predicted class).
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_count), sep="\n")


Confusion Matrix:
[[960   6]
 [ 12 137]]


In [14]:
# Per-class precision/recall/F1 for the CountVectorizer model; label 0 is
# reported as "Ham" and label 1 as "Spam".
report_count = classification_report(y_test, y_pred_count, target_names=["Ham", "Spam"])
print("\nClassification Report:")
print(report_count)


Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       966
        Spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# **MODEL 2 : MULTINOMIAL NAIVE BAYES DENGAN TF-IDF**

In [16]:
print("\n" + "=" * 70)
print("MODEL 2: MULTINOMIAL NAIVE BAYES + TF-IDF VECTORIZER")
print("=" * 70)

# TF-IDF (Term Frequency-Inverse Document Frequency) down-weights terms that
# appear in many documents, so rarer terms carry relatively more weight.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Learn vocabulary and IDF weights on the training messages only, then apply
# the fitted vectorizer (transform, no fit) to the test messages.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nUkuran vocabulary: {len(tfidf_vectorizer.vocabulary_)}")
print(f"Shape X_train setelah vectorization: {X_train_tfidf.shape}")
print(f"Shape X_test setelah vectorization: {X_test_tfidf.shape}")

# Train Multinomial Naive Bayes on the TF-IDF features (fit returns the
# estimator itself) and predict the held-out messages.
mnb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = mnb_tfidf.predict(X_test_tfidf)

# Evaluation
print("\n" + "-" * 70)
print("EVALUASI MODEL 2 (TF-IDF VECTORIZER)")
print("-" * 70)

# Score the predictions with each metric in turn.
accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = (
    metric(y_test, y_pred_tfidf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nAccuracy:  {accuracy_tfidf:.4f}")
print(f"Precision: {precision_tfidf:.4f}")
print(f"Recall:    {recall_tfidf:.4f}")
print(f"F1-Score:  {f1_tfidf:.4f}")


MODEL 2: MULTINOMIAL NAIVE BAYES + TF-IDF VECTORIZER

Ukuran vocabulary: 7440
Shape X_train setelah vectorization: (4457, 7440)
Shape X_test setelah vectorization: (1115, 7440)

----------------------------------------------------------------------
EVALUASI MODEL 2 (TF-IDF VECTORIZER)
----------------------------------------------------------------------

Accuracy:  0.9686
Precision: 1.0000
Recall:    0.7651
F1-Score:  0.8669


In [17]:
# Confusion matrix of the TF-IDF model on the test set
# (rows: true class, columns: predicted class).
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_tfidf), sep="\n")


Confusion Matrix:
[[966   0]
 [ 35 114]]


In [18]:
# Per-class precision/recall/F1 for the TF-IDF model; label 0 is reported as
# "Ham" and label 1 as "Spam".
report_tfidf = classification_report(y_test, y_pred_tfidf, target_names=["Ham", "Spam"])
print("\nClassification Report:")
print(report_tfidf)


Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.98       966
        Spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



# Model Comparison

In [19]:
print("\n" + "=" * 70)
print("PERBANDINGAN MODEL")
print("=" * 70)

# Collect the four test-set metrics of both models side by side.
metric_names = ["Accuracy", "Precision", "Recall", "F1-Score"]
count_scores = [accuracy_count, precision_count, recall_count, f1_count]
tfidf_scores = [accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf]

comparison_df = pd.DataFrame(
    {
        "Metric": metric_names,
        "CountVectorizer": count_scores,
        "TF-IDF": tfidf_scores,
    }
)

print("\n", comparison_df.to_string(index=False))

# Differences are TF-IDF minus CountVectorizer: a negative value means the
# CountVectorizer model scored higher on that metric.
accuracy_delta = accuracy_tfidf - accuracy_count
precision_delta = precision_tfidf - precision_count
recall_delta = recall_tfidf - recall_count
f1_delta = f1_tfidf - f1_count

print("\nSelisih Performance (TF-IDF - CountVectorizer):")
print(f"Accuracy:  {accuracy_delta:.4f}")
print(f"Precision: {precision_delta:.4f}")
print(f"Recall:    {recall_delta:.4f}")
print(f"F1-Score:  {f1_delta:.4f}")



PERBANDINGAN MODEL

    Metric  CountVectorizer   TF-IDF
 Accuracy         0.983857 0.968610
Precision         0.958042 1.000000
   Recall         0.919463 0.765101
 F1-Score         0.938356 0.866920

Selisih Performance (TF-IDF - CountVectorizer):
Accuracy:  -0.0152
Precision: 0.0420
Recall:    -0.1544
F1-Score:  -0.0714


# Conclusion

In [None]:
print("\n" + "=" * 70)
print("KESIMPULAN")
print("=" * 70)

# Pick the winner by test-set accuracy. The explanation printed in section 3
# is chosen together with the winner: previously it was hard-coded to argue
# for TF-IDF, which contradicted the recommendation whenever Count Vectorizer
# scored higher.
if accuracy_tfidf > accuracy_count:
    best_model = "TF-IDF Vectorizer"
    accuracy_diff = accuracy_tfidf - accuracy_count
    performance_reasons = [
        'Spam biasanya mengandung kata-kata spesifik yang jarang (misal: "winner", "free", "claim")',
        "TF-IDF memberikan bobot lebih tinggi pada kata-kata unik ini",
        "Count Vectorizer memperlakukan semua kata sama, kurang sensitif terhadap kata kunci spam",
    ]
else:
    best_model = "Count Vectorizer"
    accuracy_diff = accuracy_count - accuracy_tfidf
    performance_reasons = [
        "Multinomial Naive Bayes memodelkan jumlah kemunculan kata, sehingga count mentah paling sesuai dengan asumsi model",
        "Bobot TF-IDF berupa pecahan yang dinormalisasi per dokumen, sehingga sinyal frekuensi kata pada pesan pendek dapat melemah",
        f"Recall spam: Count Vectorizer {recall_count:.4f} vs TF-IDF {recall_tfidf:.4f}",
    ]

# Render the reasons as an indented bullet list matching the report layout.
reasons_text = "\n".join(f"   - {reason}" for reason in performance_reasons)

print(f"""
Berdasarkan hasil evaluasi pada dataset spam.csv:

1. MODEL TERBAIK: {best_model}
   - Memiliki accuracy lebih tinggi sebesar {accuracy_diff:.4f} ({accuracy_diff * 100:.2f}%)

2. PENJELASAN:
   a) Count Vectorizer:
      - Menghitung frekuensi kemunculan kata
      - Sederhana dan cepat
      - Cocok untuk dataset kecil atau kata yang frekuensinya penting

   b) TF-IDF Vectorizer:
      - Memberikan bobot berdasarkan kepentingan kata
      - Mengurangi pengaruh kata yang terlalu umum
      - Lebih baik untuk menangkap kata-kata unik yang membedakan spam/ham

3. ALASAN PERBEDAAN PERFORMA:
{reasons_text}

4. REKOMENDASI:
   - Untuk deteksi spam, gunakan {best_model}
   - Pertimbangkan juga parameter tuning untuk optimasi lebih lanjut
   - Monitor false positive/negative sesuai kebutuhan bisnis
""")


KESIMPULAN

Berdasarkan hasil evaluasi pada dataset spam.csv:

1. MODEL TERBAIK: Count Vectorizer
   - Memiliki accuracy lebih tinggi sebesar 0.0152 (1.52%)

2. PENJELASAN:
   a) Count Vectorizer:
      - Menghitung frekuensi kemunculan kata
      - Sederhana dan cepat
      - Cocok untuk dataset kecil atau kata yang frekuensinya penting

   b) TF-IDF Vectorizer:
      - Memberikan bobot berdasarkan kepentingan kata
      - Mengurangi pengaruh kata yang terlalu umum
      - Lebih baik untuk menangkap kata-kata unik yang membedakan spam/ham

3. ALASAN PERBEDAAN PERFORMA:
   - Spam biasanya mengandung kata-kata spesifik yang jarang (misal: "winner", "free", "claim")
   - TF-IDF memberikan bobot lebih tinggi pada kata-kata unik ini
   - Count Vectorizer memperlakukan semua kata sama, kurang sensitif terhadap kata kunci spam

4. REKOMENDASI:
   - Untuk deteksi spam, gunakan Count Vectorizer
   - Pertimbangkan juga parameter tuning untuk optimasi lebih lanjut
   - Monitor false positive/ne