## READING DATA FROM LOCAL
Di sini hanya membaca data yang belum semuanya dilabeli, jadi setelah dibaca, baris yang nilainya null atau '-' pada kolom label langsung di-drop terlebih dahulu.

In [None]:
import pandas as pd
# Input CSV (not every row has been labelled yet) and the filtered output CSV.
file_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_1.csv'
output_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_DropNull.csv'

df = pd.read_csv(file_path)
print("Dataset Loaded Successfully")

# A row is valid only when both columns hold a real value: not missing and
# not the '-' placeholder.
corelation_ok = df['image_corelation'].notna() & (df['image_corelation'] != '-')
label_ok = df['Label'].notna() & (df['Label'] != '-')
mask = corelation_ok & label_ok

# The invalid rows are exactly the complement of the valid mask.
invalid_rows = df[~mask]
print("Jumlah baris invalid unik:", len(invalid_rows))

# Keep only the valid rows; copy so later edits do not touch `df`.
df_cleaned = df[mask].copy()

rows_before = len(df)
rows_after = len(df_cleaned)
print("Jumlah baris sebelum drop:", rows_before)
print("Jumlah baris sesudah drop:", rows_after)
print("Jumlah baris yang dihapus:", rows_before - rows_after)

# Persist the filtered dataset for the preprocessing step.
df_cleaned.to_csv(output_path, index=False)
print(f"Dataset cleaned berhasil disimpan ke {output_path}")

Dataset Loaded Successfully
Jumlah baris invalid unik: 1924
Jumlah baris sebelum drop: 18450
Jumlah baris sesudah drop: 16526
Jumlah baris yang dihapus: 1924
Dataset cleaned berhasil disimpan ke E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_DropNull.csv


## TF-IDF
Teks dibersihkan (cleaning) terlebih dahulu, baru kemudian dibobot menggunakan TF-IDF. Kolom yang dipakai adalah `full_text`.

### PREPRO FULL_TEXT
Enam langkah preprocessing dilakukan di sini sesuai teori pada proposal.

In [None]:
# ===== Preprocessing ===== #

import re
import pandas as pd
import json
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
from tqdm import tqdm

print("Library yang dibutuhkan berhasil di-import")

# Dataset locations
raw_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_DropNull.csv'  # input
cleaned_text_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_Text.csv'  # output

# Dictionary ("kamus") files
stopwords_path = r'E:\$7th\TA\Eksploring_TF-IDF\Kamus\combined_stop_words.txt'
slang_path = r'E:\$7th\TA\Eksploring_TF-IDF\Kamus\combined_slang_words.txt'
exceptions_path = r'E:\$7th\TA\Eksploring_TF-IDF\Kamus\combined_exceptions.txt'

# --- Load the main dataset ---
df_full = pd.read_csv(raw_path)

# Work on a copy of the full dataset.
# (For a quick test run, use df_full.head(100).copy() instead.)
df = df_full.copy()
print(f"Menggunakan Dataset: {raw_path}")
print(f"Total rows: {len(df)}")

# Pick the text column: the first column whose lowercased name contains
# 'full_text'. Abort when there is none.
text_col = next(
    (name for name in df.columns if 'full_text' in name.lower()),
    None,
)
if text_col is None:
    raise RuntimeError(f"Tidak ada kolom teks. Kolom tersedia: {df.columns.tolist()}")
print(f"Using text column: {text_col}")

# --- TOOL SETUP ---
print("\nMenyiapkan stemmer, custom stopword remover, kamus normalisasi, dan exception list...")

# Read every kamus file with an explicit encoding instead of the platform
# default, so the words are decoded the same way on every machine.
# 'utf-8-sig' also accepts a leading BOM. A file that is not valid UTF-8
# raises UnicodeDecodeError instead of being silently mis-decoded.
kamus_encoding = 'utf-8-sig'

# 1. Stemmer
stemmer = StemmerFactory().create_stemmer()

# 2. Custom stopword remover, built from the stopword kamus (words treated
#    as unimportant). Blank lines are skipped, as for the exception list,
#    so the reported count only includes real words.
try:
    with open(stopwords_path, 'r', encoding=kamus_encoding) as f:
        custom_stopword_list = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"PERINGATAN: File stopwords tidak ditemukan di '{stopwords_path}'.")
    # Fall back to an empty list so stopword removal becomes a no-op.
    custom_stopword_list = []
else:
    print(f"Berhasil memuat {len(custom_stopword_list)} kata dari kamus stopwords.")
dictionary = ArrayDictionary(custom_stopword_list)
stop_remover = StopWordRemover(dictionary)

# 3. Normalisation kamus (slang -> standard form), stored as JSON. New
#    entries can simply be added to the file.
try:
    with open(slang_path, 'r', encoding=kamus_encoding) as f:
        slang_dict = json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"PERINGATAN: Gagal memuat kamus slang dari '{slang_path}'. Error: {e}")
    slang_dict = {}
else:
    print(f"Berhasil memuat {len(slang_dict)} kata dari kamus slang.")

# 4. Exception list: words the stemmer handles badly. Words listed here
#    are left unstemmed by the preprocessing loop.
try:
    with open(exceptions_path, 'r', encoding=kamus_encoding) as f:
        exception_list = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"PERINGATAN: File exception list tidak ditemukan di '{exceptions_path}'.")
    exception_list = []
else:
    print(f"Berhasil memuat {len(exception_list)} kata dari exception list.")

# --- PREPROCESSING LOOP ---
texts_to_process = df[text_col].fillna("").astype(str).tolist()

# Patterns are compiled once here instead of being looked up per tweet.
url_re = re.compile(r'http\S+|www\S+')
mention_hashtag_re = re.compile(r'@\w+|#\w+')
number_range_re = re.compile(r'(\d+)\s*-\s*(\d+)')
single_letter_re = re.compile(r'\b[a-zA-Z]\b')
# Hyphen at the END of a word (e.g. "prabowo-" -> "prabowo").
# NOTE(review): the previous pattern r'-\b' matched a hyphen FOLLOWED by a
# word character, so it glued hyphenated words together
# ("anak-anak" -> "anakanak") and left trailing hyphens untouched. With
# this pattern an inner hyphen survives until the next step, which turns
# it into a space ("anak-anak" -> "anak anak").
trailing_hyphen_re = re.compile(r'(?<=\w)-(?!\w)')
non_alnum_re = re.compile(r'[^a-zA-Z0-9\s_]')
whitespace_re = re.compile(r'\s+')

# Set membership is O(1); the exception list is checked once per word.
exception_words = set(exception_list)
# Per-word memo so each distinct word is passed to the stemmer only once.
stem_cache = {}


def _stem_word(word):
    """Return the stem of `word`, or `word` itself if it is an exception."""
    if word in exception_words:
        return word  # skip stemming for words on the exception list
    stemmed = stem_cache.get(word)
    if stemmed is None:
        stemmed = stem_cache[word] = stemmer.stem(word)
    return stemmed


def preprocess_text(text):
    """Run the six preprocessing steps on one raw text and return the result.

    Steps: cleaning, case folding, slang normalisation, stopword removal,
    tokenisation, and stemming (skipped for exception-list words).
    """
    # 1. Cleaning
    text = url_re.sub('', text)                   # remove URLs
    text = mention_hashtag_re.sub('', text)       # remove mentions & hashtags
    text = number_range_re.sub(r'\1_\2', text)    # number range "1-2" -> "1_2"
    text = single_letter_re.sub('', text)         # remove one-letter words
    text = trailing_hyphen_re.sub('', text)       # remove hyphen at end of word
    text = non_alnum_re.sub(' ', text)            # keep letters, digits, space, _
    text = whitespace_re.sub(' ', text).strip()   # tidy whitespace

    # 2. Case folding
    text = text.lower()

    # 3. Slang normalisation (unknown words are kept as they are)
    text = " ".join(slang_dict.get(word, word) for word in text.split())

    # 4. Stopword removal
    text = stop_remover.remove(text)

    # 5 & 6. Tokenisation and stemming, honouring the exception list
    return " ".join(_stem_word(word) for word in text.split())


print("\nMemulai proses preprocessing untuk setiap teks...")
processed_texts = [
    preprocess_text(text)
    for text in tqdm(texts_to_process, desc="Preprocessing Texts")
]
    
# Attach the preprocessed text to the DataFrame as a new column.
df['clean_text'] = processed_texts

print("\nPreprocessing selesai!")

# To inspect the results side by side, print df[[text_col, 'clean_text']].

# Count documents that are empty after preprocessing.
empty_doc_count = df["clean_text"].str.strip().eq("").sum()
print("Dokumen kosong setelah preprocessing:", empty_doc_count)

# --- Save the final result to CSV ---
# Only the raw text, the cleaned text and the two label columns are kept.
columns_to_keep = [text_col, 'clean_text', 'image_corelation', 'Label']
final_df = df.loc[:, columns_to_keep]
final_df.to_csv(cleaned_text_path, index=False)

print(f"\nDataFrame final dengan kolom {columns_to_keep} berhasil disimpan di: {cleaned_text_path}")


Library yang dibutuhkan berhasil di-import
Menggunakan Dataset: E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_DropNull.csv
Total rows: 16526
Using text column: full_text

Menyiapkan stemmer, custom stopword remover, dan kamus normalisasi...
Berhasil memuat 675 kata dari kamus stopwords.
Berhasil memuat 1018 kata dari kamus slang.

Memulai proses preprocessing untuk setiap teks...


Preprocessing Texts: 100%|██████████| 16526/16526 [1:09:24<00:00,  3.97it/s]



Preprocessing selesai!
Dokumen kosong setelah preprocessing: 2

DataFrame final dengan kolom ['full_text', 'clean_text', 'image_corelation', 'Label'] berhasil disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_Text.csv


### SAVE HASIL TF-IDF ke PKL dan NPZ
File NPZ berisi bobot TF-IDF dalam bentuk matriks sparse, sehingga pada tahap fusion nanti cukup memuat file ini tanpa menghitung ulang.

In [16]:
# ===== TF-IDF (fit & transform) =====
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import pickle

import pandas as pd  # used below, but not among this cell's own imports

# Load the DataFrame produced by the preprocessing step.
cleaned_text_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_Text.csv'
df = pd.read_csv(cleaned_text_path)
texts = df["clean_text"].fillna("").astype(str)
# Documents that are empty after preprocessing are left out of the fit.
texts = texts[texts.str.strip() != ""]

# Dropping rows means matrix row i is no longer row i of the CSV. Keep the
# surviving row labels so labels / other features can be re-aligned later:
# matrix row i corresponds to df.loc[kept_rows[i]].
kept_rows = texts.index.to_numpy()
print(f"Dokumen kosong yang dilewati: {len(df) - len(texts)}")

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),   # unigram + bigram
    token_pattern=r"(?u)\b\w\w+\b"
)

print("Fitting TfidfVectorizer...")
X = vectorizer.fit_transform(texts)
print("TF-IDF matrix shape:", X.shape)

# Diagnostics: the 25 features with the highest IDF (i.e. the rarest ones).
feature_names = np.array(vectorizer.get_feature_names_out())
idf = vectorizer.idf_
top_idx = np.argsort(idf)[::-1][:25]
print("\nTop 25 (rare) features:")
for i in top_idx:
    print(f" {feature_names[i]} (idf={idf[i]:.3f})")

# Save artifacts next to the cleaned CSV.
out_dir = os.path.join(os.path.dirname(cleaned_text_path), 'tfidf_artifacts')
os.makedirs(out_dir, exist_ok=True)

with open(os.path.join(out_dir, "tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
sparse.save_npz(os.path.join(out_dir, "tfidf_matrix.npz"), X)
# Row labels of the CSV rows behind each matrix row (see kept_rows above).
np.save(os.path.join(out_dir, "tfidf_row_index.npy"), kept_rows)

print("\nSaved vectorizer & matrix in:", out_dir)


Fitting TfidfVectorizer...
TF-IDF matrix shape: (16525, 20000)

Top 25 (rare) features:
 1mdb (idf=10.020)
 mesin ekonomi (idf=10.020)
 mbz (idf=10.020)
 to ganjel (idf=10.020)
 sgro (idf=10.020)
 ganjel (idf=10.020)
 200 cat (idf=10.020)
 mie gaco (idf=10.020)
 taipei (idf=10.020)
 telur omega (idf=10.020)
 keramat (idf=10.020)
 venture capital (idf=10.020)
 keyk (idf=10.020)
 fn (idf=10.020)
 too high (idf=10.020)
 terusin (idf=10.020)
 crypto winter (idf=10.020)
 knight (idf=10.020)
 kut (idf=10.020)
 lembur (idf=10.020)
 kdslots (idf=10.020)
 wtb wtb (idf=10.020)
 tpid (idf=10.020)
 tus (idf=10.020)
 1002 (idf=10.020)

Saved vectorizer & matrix in: E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts
