In [3]:
import pandas as pd
import re
from collections import defaultdict
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

csv_path = 'C:/kelompok5_22230006.csv'
df = pd.read_csv(csv_path)

tweets = df['Isi Tweet'].dropna().tolist()

factory = StopWordRemoverFactory()
stop_words = set(factory.get_stop_words())

def preprocess(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)                  
    text = text.lower()
    tokens = text.split()
    return ' '.join([w for w in tokens if w not in stop_words])

tweets_cleaned = [preprocess(t) for t in tweets]

def extract_phrases(text, min_words=2, max_words=4):
    words = text.split()
    phrases = []
    for size in range(min_words, max_words + 1):
        for i in range(len(words) - size + 1):
            phrase = ' '.join(words[i:i + size])
            phrases.append(phrase)
    return phrases

phrase_to_docs = defaultdict(set)
for idx, tweet in enumerate(tweets_cleaned):
    phrases = extract_phrases(tweet)
    for phrase in phrases:
        phrase_to_docs[phrase].add(idx)

min_docs_per_cluster = 2
common_phrases = {
    phrase: docs for phrase, docs in phrase_to_docs.items()
    if len(docs) >= min_docs_per_cluster
}

clusters = defaultdict(set)
for phrase, docs in common_phrases.items():
    clusters[f"Cluster: '{phrase}'"] = docs

if not clusters:
    print("❗ Tidak ada cluster yang memenuhi ambang batas !!!")
else:
    for i, (cluster_name, doc_ids) in enumerate(clusters.items(), start=1):
        print(f"\n{i}. {cluster_name} (total: {len(doc_ids)} tweet)")
        for doc_id in doc_ids:
            print(f"- {tweets[doc_id]}")



1. Cluster: 'rt democrazymedia' (total: 2 tweet)
- RT @democrazymedia: Kritik Pidato Monolog Wapres, Pakar UGM: Gibran Tak Mengerti Masalah Hilirisasi, Dia Enggak Paham https://t.co/NsWSO53k…
- RT @democrazymedia: Dihardik Senior, Prabowo Harus Dorong Pencopotan Gibran atau Akui Cuma Boneka Solo! https://t.co/8sR4EZFcUS

2. Cluster: 'pencopotan gibran' (total: 2 tweet)
- RT @Piyusaja2: BOLA PANAS PENCOPOTAN GIBRAN MAKIN BERGULIR... Hendropriyono Kasih Sinyal Kuat https://t.co/IutaPoxN1E
- RT @democrazymedia: Dihardik Senior, Prabowo Harus Dorong Pencopotan Gibran atau Akui Cuma Boneka Solo! https://t.co/8sR4EZFcUS

3. Cluster: 'rt hiwigoooagain' (total: 2 tweet)
- RT @hiwigOooagain: gak semulus itu kok orang wapresnya gibran wkaowkakwoak
- RT @hiwigOooagain: gak semulus itu kok orang wapresnya gibran wkaowkakwoak

4. Cluster: 'hiwigoooagain gak' (total: 2 tweet)
- RT @hiwigOooagain: gak semulus itu kok orang wapresnya gibran wkaowkakwoak
- RT @hiwigOooagain: gak semulus itu kok orang 