In [15]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Load data
df = pd.read_csv('c:/kelompok5_22230006.csv')
tweets = df['Isi Tweet'].dropna().tolist()

# Preprocessing ringan + hapus stopword Indonesia
factory = StopWordRemoverFactory()
stop_words = set(factory.get_stop_words())

def preprocess(text):
    text = re.sub(r"http\S+", "", text) # hapus link
    text = re.sub(r"[^a-zA-Z\s]", "", text) # hapus simbol
    text = text.lower()
    tokens = text.split()
    return ' '.join([w for w in tokens if w not in stop_words])

tweets_cleaned = [preprocess(t) for t in tweets]

# Vectorisasi
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2)
X = vectorizer.fit_transform(tweets_cleaned)

# Clustering
k = 5
model = KMeans(n_clusters=k, random_state=42)
model.fit(X)

# Tampilkan hasil
for i in range(k):
    print(f"\n Cluster {i+1}")
    indices = [j for j, label in enumerate(model.labels_) if label == i]
    for idx in indices[:5]: # tampilkan 5 tweet pertama dari cluster ini
        print(f"- {tweets[idx]}")


 Cluster 1
- RT @sharpandshark: Gibran selalu sebut targetnya anak muda, kok anak muda malah buat VT seperti ini. https://t.co/xycIRmxBt6
- RT @AntoniusCDN: Jangan panggil aku anak kecil paman... _____ Gibran Senin Sore https://t.co/8oRVCR2nY1
- RT @ch_chotimah2: Semangat pagi-pagi🔥 Chusnul Chotimah: Jangan Terjebak Isu Hasto, Fokus Lengserkan Gibran. https://t.co/e55tGn30Up

 Cluster 2
- RT @susipudjiastuti: @CNNIndonesia @prabowo @Gerindra @gibran_tweet @jokowi
- @DivHumas_Polri @ListyoSigitP @gibran_tweet @prabowo Sampai GA BISA KOMEN 🤣🤣🤣🤣🤣🤣 https://t.co/cDCejmYsGB
- RT @mediaindonesia: PDIP menilai Presiden Prabowo Subianto perlu merespons serius soal usulan purnawirawan TNI yang meminta Wakil Presiden…

 Cluster 3
- RT @democrazymedia: Kritik Pidato Monolog Wapres, Pakar UGM: Gibran Tak Mengerti Masalah Hilirisasi, Dia Enggak Paham https://t.co/NsWSO53k…
- RT @democrazymedia: Dihardik Senior, Prabowo Harus Dorong Pencopotan Gibran atau Akui Cuma Boneka Solo! https://t.co/8sR4EZFc