# Clustering Dokumen SMS Spam
Analisis singkat clustering pada `spam.csv` menggunakan TF-IDF + KMeans (k=2).


In [None]:
# Cell 1: Import Library & Fungsi Utilitas
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def load_spam_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Ambil kolom teks (nama kolom di file adalah 'Text')
    if 'Text' not in df.columns:
        # Coba deteksi kolom teks jika berbeda
        for col in df.columns:
            if str(col).strip().lower() in {'text', 'message', 'sms'}:
                df.rename(columns={col: 'Text'}, inplace=True)
                break
    df = df[['Text']].dropna()
    df['Text'] = df['Text'].astype(str).str.strip()
    df = df[df['Text'] != '']
    return df.reset_index(drop=True)


def simple_clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def vectorize_text(texts):
    vectorizer = TfidfVectorizer(
        preprocessor=simple_clean,
        stop_words='english',
        ngram_range=(1, 2),
        max_features=5000,
        min_df=2,
        max_df=0.95,
    )
    X = vectorizer.fit_transform(texts)
    return X, vectorizer


def kmeans_cluster(X, k=2, random_state=42):
    model = KMeans(n_clusters=k, n_init='auto', random_state=random_state)
    labels = model.fit_predict(X)
    return model, labels


def show_top_terms_per_cluster(model: KMeans, vectorizer: TfidfVectorizer, top_n: int = 10):
    feature_names = vectorizer.get_feature_names_out()
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    results = []
    for i in range(model.n_clusters):
        terms = [feature_names[ind] for ind in order_centroids[i, :top_n]]
        results.append((i, terms))
    return results


## 1) Load Data


In [2]:
# Cell 2: Load Data
spam_df = load_spam_csv('spam.csv')
print(f"Total pesan: {len(spam_df)}")
print(spam_df.head())


Total pesan: 5572
                                                Text
0  Go until jurong point, crazy.. Available only ...
1                      Ok lar... Joking wif u oni...
2  Free entry in 2 a wkly comp to win FA Cup fina...
3  U dun say so early hor... U c already then say...
4  Nah I don't think he goes to usf, he lives aro...


## 2) Vectorize Teks (TF-IDF)


In [3]:
# Cell 3: TF-IDF Vectorization
X, vectorizer = vectorize_text(spam_df['Text'].tolist())
print(f"Shape TF-IDF: {X.shape}")


Shape TF-IDF: (5572, 5000)


## 3) Clustering dengan KMeans (k=2)


In [4]:
# Cell 4: KMeans Clustering (k=2)
model, labels = kmeans_cluster(X, k=2)
spam_df['cluster'] = labels

sil = silhouette_score(X, labels, metric='cosine') if X.shape[0] > 2 else float('nan')
print(f"Silhouette score (cosine): {sil:.4f}")


Silhouette score (cosine): 0.0085


## 4) Top Terms per Cluster & Contoh Pesan


In [5]:
# Cell 5: Ringkas Hasil Cluster (Top Terms & Contoh)
print("\nTop terms per cluster:")
for i, terms in show_top_terms_per_cluster(model, vectorizer, top_n=12):
    print(f"- Cluster {i}: {', '.join(terms)}")

print("\nContoh pesan per cluster:")
for cid in sorted(spam_df['cluster'].unique()):
    sample = spam_df[spam_df['cluster'] == cid]['Text'].head(3).tolist()
    print(f"\nCluster {cid} (contoh 3 pesan):")
    for s in sample:
        print(f"- {s}")



Top terms per cluster:
- Cluster 0: ok, just, ll, ur, come, good, know, like, got, time, home, going
- Cluster 1: gt, lt, lt gt, gt min, lt decimal, decimal gt, decimal, like lt, like, ll, min, minutes

Contoh pesan per cluster:

Cluster 0 (contoh 3 pesan):
- Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
- Ok lar... Joking wif u oni...
- Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Cluster 1 (contoh 3 pesan):
- Great! I hope you like your man well endowed. I am  &lt;#&gt;  inches...
- A gram usually runs like  &lt;#&gt; , a half eighth is smarter though and gets you almost a whole second gram for  &lt;#&gt;
- Do you know what Mallika Sherawat did yesterday? Find out now @  &lt;URL&gt;


## 5) Simpan Hasil


In [6]:
# Cell 6: Simpan Hasil
spam_df[['Text', 'cluster']].to_csv('spam_clusters.csv', index=False)
print("Hasil disimpan ke spam_clusters.csv")


Hasil disimpan ke spam_clusters.csv
