In [1]:
import pandas as pd
import emoji
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# ========== 1️⃣ LOAD DATA ==========
df = pd.read_csv("../data/dataset.csv")
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

print("=== Dataset Loaded ===")
print(df.shape)
print(df.head(2))

# ========== 2️⃣ LOAD SLANG DICTIONARY ==========
try:
    slang_df = pd.read_csv(
        "../data/indo_slang_new.csv",
        encoding="latin1",
        names=["slang", "formal"]
    )
    slang_df = slang_df.dropna().drop_duplicates()
    slang_dict = dict(zip(slang_df["slang"].astype(str).str.lower(), slang_df["formal"].astype(str).str.lower()))

    # Filter hanya slang yang muncul di dataset
    text_all = " ".join(df["tweet"].astype(str)).lower()
    text_tokens = set(text_all.split())
    slang_dict = {k: v for k, v in slang_dict.items() if k in text_tokens}
    print(f"✅ Loaded slang dictionary ({len(slang_dict)} entries dipakai dari {len(slang_df)} total)")
except Exception as e:
    print(f"⚠️ Gagal load slang.csv, fallback ke default. Error: {e}")
    slang_dict = {
        "gk": "gak", "ga": "gak", "bgt": "banget", "tdk": "tidak", 
        "yg": "yang", "dgn": "dengan", "utk": "untuk", "nggak": "tidak",
        "wowo": "prabowo", "bowo": "prabowo", "jae": "jokowi", "pakde": "jokowi",
        "jkw": "jokowi", "uno": "sandiaga", "bang sandi": "sandiaga", "sandi": "sandiaga"
    }

# Jangan normalisasi kata negasi
negation_words = {"tidak", "bukan", "jangan", "belum", "tanpa", "kurang"}
slang_dict = {k: v for k, v in slang_dict.items() if k not in negation_words}

# ========== 3️⃣ CUSTOM STOPWORD REMOVER (NEGATION-AWARE) ==========
factory = StopWordRemoverFactory()
default_stopwords = set(factory.get_stop_words())
important_words = {"bukan", "tidak", "jangan", "belum", "tanpa", "kurang"}
custom_stopwords = list(default_stopwords - important_words)
stopword_remover = factory.create_stop_word_remover()
stopword_remover.stop_words = custom_stopwords

# ========== 4️⃣ PRECOMPILE SLANG REGEX ==========
if slang_dict:
    slang_pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in slang_dict.keys()) + r')\b')
else:
    slang_pattern = None

# ========== 5️⃣ EMOJI MAPPING ==========
EMOJI_MAPPING = {
    ":face_with_tears_of_joy:": "tertawa",
    ":loudly_crying_face:": "sedih",
    ":smiling_face_with_heart_eyes:": "cinta",
    ":folded_hands:": "berdoa",
    ":angry_face:": "marah",
    ":thumbs_up:": "setuju",
    ":thinking_face:": "berpikir",
    ":clapping_hands:": "dukung",
    ":red_heart:": "cinta",
    ":broken_heart:": "kecewa",
    ":fire:": "semangat",
    ":skull:": "gagal",
    ":money_bag:": "uang",
    ":flag_for_Indonesia:": "indonesia",
    ":prayer_beads:": "tasbih",
}

def demojize_to_text(text: str) -> str:
    text = emoji.demojize(text, delimiters=(" ", " "))
    for code, word in EMOJI_MAPPING.items():
        text = text.replace(code, f"[{word}]")
    return text

# ========== 6️⃣ PREPROCESS FUNCTION (OPTIMIZED) ==========
def preprocess(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Hapus URL
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # 3. Hapus mention @user, tapi ubah #hashtag jadi teks
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 4. Konversi emoji ke kata
    text = demojize_to_text(text)

    # 5. Normalisasi duplikasi karakter (opsional tapi disarankan)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # 6. Normalisasi slang
    if slang_pattern:
        text = slang_pattern.sub(lambda x: slang_dict.get(x.group(), x.group()), text)

    # 7. Hapus tanda baca & simbol non-huruf/angka/spasi
    text = re.sub(r"[^\w\s]", " ", text)

    # 8. Bersihkan spasi berlebih
    text = " ".join(text.split())

    # 9. Stopword removal (tanpa buang kata negasi)
    text = stopword_remover.remove(text)

    10. (Opsional) Stemming — aktifkan jika ingin
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    stemmer = StemmerFactory().create_stemmer()
    text = stemmer.stem(text)

    return text.strip()

# ========== 7️⃣ JALANKAN PIPELINE ==========
print("\n=== Mulai preprocessing (negation-aware)... ===")
df["clean_tweet"] = df["tweet"].astype(str).apply(preprocess)
print("✅ Preprocessing selesai!")

# ========== 8️⃣ SIMPAN HASIL ==========
output_path = "../data/dataset_clean.csv"
df.to_csv(output_path, index=False)
print(f"✅ Data bersih disimpan ke: {output_path}")

# ========== 9️⃣ CONTOH HASIL ==========
print("\n=== Contoh Sebelum & Sesudah ===")
for i in range(min(3, len(df))):
    print(f"\n[{i+1}] Tweet Asli: {df['tweet'].iloc[i]}")
    print(f"[{i+1}] Setelah Bersih: {df['clean_tweet'].iloc[i]}")

=== Dataset Loaded ===
(1815, 2)
  sentimen                                              tweet
0  negatif  Kata @prabowo Indonesia tidak dihargai bangsa ...
1   netral  Batuan Langka, Tasbih Jokowi Hadiah dari Habib...
✅ Loaded slang dictionary (1089 entries dipakai dari 15167 total)

=== Mulai preprocessing (negation-aware)... ===
✅ Preprocessing selesai!
✅ Data bersih disimpan ke: ../data/dataset_clean.csv

=== Contoh Sebelum & Sesudah ===

[1] Tweet Asli: Kata @prabowo Indonesia tidak dihargai bangsa asing!   Berita ini ðŸ‘‡ pasti hoax buatan penguasa, ya kan @rockygerung?ðŸ˜œ https://twitter.com/mediaindonesia/status/1117575436337160192?s=21Â â€¦
[1] Setelah Bersih: kata indonesia dihargai bangsa asing berita ðÿ hoaks buatan penguasa kan ðÿ œ â

[2] Tweet Asli: Batuan Langka, Tasbih Jokowi Hadiah dari Habib Luthfi Seharga Mercy?  http://dlvr.it/R2pvZVÂ 
[2] Setelah Bersih: batuan langka tasbih jokowi hadiah habib luthfi seharga mercy

[3] Tweet Asli: Di era Jokowi, ekonomi Indonesi