# TF-IDF Pembobotan

## Import Library & Dataset Hasil Preprocessing

In [None]:
# ===== TF-IDF Experiment Setup =====
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import pickle

# Read the preprocessed dataframe from disk.
cleaned_text_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_Text.csv'
df = pd.read_csv(cleaned_text_path)

# Missing values become empty strings, then rows that are empty or
# whitespace-only are dropped.
# NOTE: only `texts` is filtered here; `df` keeps every row.
texts = df["clean_text"].fillna("").astype(str)
texts = texts.loc[texts.str.strip().ne("")]

# Artifacts are written to a 'tfidf_artifacts' folder next to the input CSV.
out_dir = os.path.join(os.path.dirname(cleaned_text_path), 'tfidf_artifacts')
os.makedirs(out_dir, exist_ok=True)

print(f"Dataset loaded: {texts.shape[0]} rows")


In [None]:


# Reload the preprocessed dataframe so this cell can be run on its own.
cleaned_text_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\Cleaned_Text.csv'
df = pd.read_csv(cleaned_text_path)
texts = df["clean_text"].fillna("").astype(str)
texts = texts.loc[texts.str.strip().ne("")]  # drop empty / whitespace-only rows

# Baseline TF-IDF vectorizer: unigrams + bigrams, max_features=20000,
# tokens are runs of two or more word characters.
vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 2),
    max_features=20000,
)

print("Fitting TfidfVectorizer...")
X = vectorizer.fit_transform(texts)
print("TF-IDF matrix shape:", X.shape)

# Diagnostics: show the 25 features with the highest IDF value.
feature_names = np.array(vectorizer.get_feature_names_out())
idf = vectorizer.idf_
top_idx = np.argsort(idf)[::-1][:25]  # ascending sort, reversed, first 25
print("\nTop 25 (rare) features:")
for term, weight in zip(feature_names[top_idx], idf[top_idx]):
    print(f" {term} (idf={weight:.3f})")

# Persist the fitted vectorizer and the TF-IDF matrix next to the input CSV.
out_dir = os.path.join(os.path.dirname(cleaned_text_path), 'tfidf_artifacts')
os.makedirs(out_dir, exist_ok=True)

vectorizer_file = os.path.join(out_dir, "tfidf_vectorizer.pkl")
matrix_file = os.path.join(out_dir, "tfidf_matrix.npz")
with open(vectorizer_file, "wb") as f:
    pickle.dump(vectorizer, f)
sparse.save_npz(matrix_file, X)

print("\nSaved vectorizer & matrix in:", out_dir)


## Unigram

In [None]:
print("=== UNIGRAM ===")
# TF-IDF restricted to single tokens: ngram_range=(1, 1).
vectorizer_uni = TfidfVectorizer(ngram_range=(1, 1))

X_uni = vectorizer_uni.fit_transform(texts)
print("Shape:", X_uni.shape)

# Save the fitted vectorizer. The context manager closes (and flushes) the
# file handle right away instead of leaving it to garbage collection.
with open(os.path.join(out_dir, "vectorizer_unigram.pkl"), "wb") as f:
    pickle.dump(vectorizer_uni, f)
# Save the TF-IDF matrix in .npz format.
sparse.save_npz(os.path.join(out_dir, "matrix_unigram.npz"), X_uni)


## Bigram

In [None]:
print("=== BIGRAM ===")
# TF-IDF restricted to two-token sequences: ngram_range=(2, 2).
vectorizer_bi = TfidfVectorizer(ngram_range=(2, 2))

X_bi = vectorizer_bi.fit_transform(texts)
print("Shape:", X_bi.shape)

# Save the fitted vectorizer. The context manager closes (and flushes) the
# file handle right away instead of leaving it to garbage collection.
with open(os.path.join(out_dir, "vectorizer_bigram.pkl"), "wb") as f:
    pickle.dump(vectorizer_bi, f)
# Save the TF-IDF matrix in .npz format.
sparse.save_npz(os.path.join(out_dir, "matrix_bigram.npz"), X_bi)


## Trigram

In [None]:
print("=== TRIGRAM ===")
# TF-IDF restricted to three-token sequences: ngram_range=(3, 3).
vectorizer_tri = TfidfVectorizer(ngram_range=(3, 3))

X_tri = vectorizer_tri.fit_transform(texts)
print("Shape:", X_tri.shape)

# Save the fitted vectorizer. The context manager closes (and flushes) the
# file handle right away instead of leaving it to garbage collection.
with open(os.path.join(out_dir, "vectorizer_trigram.pkl"), "wb") as f:
    pickle.dump(vectorizer_tri, f)
# Save the TF-IDF matrix in .npz format.
sparse.save_npz(os.path.join(out_dir, "matrix_trigram.npz"), X_tri)


## Unigram + Bigram

In [None]:
print("=== UNIGRAM + BIGRAM ===")
# TF-IDF over single tokens and two-token sequences: ngram_range=(1, 2).
vectorizer_unibi = TfidfVectorizer(ngram_range=(1, 2))

X_unibi = vectorizer_unibi.fit_transform(texts)
print("Shape:", X_unibi.shape)

# Save the fitted vectorizer. The context manager closes (and flushes) the
# file handle right away instead of leaving it to garbage collection.
with open(os.path.join(out_dir, "vectorizer_unibigram.pkl"), "wb") as f:
    pickle.dump(vectorizer_unibi, f)
# Save the TF-IDF matrix in .npz format.
sparse.save_npz(os.path.join(out_dir, "matrix_unibigram.npz"), X_unibi)


## Unigram + Bigram + Trigram

In [None]:
print("=== UNIGRAM + BIGRAM + TRIGRAM ===")
# TF-IDF over one-, two- and three-token sequences: ngram_range=(1, 3).
vectorizer_all = TfidfVectorizer(ngram_range=(1, 3))

X_all = vectorizer_all.fit_transform(texts)
print("Shape:", X_all.shape)

# Save the fitted vectorizer. The context manager closes (and flushes) the
# file handle right away instead of leaving it to garbage collection.
with open(os.path.join(out_dir, "vectorizer_unibi_tri.pkl"), "wb") as f:
    pickle.dump(vectorizer_all, f)
# Save the TF-IDF matrix in .npz format.
sparse.save_npz(os.path.join(out_dir, "matrix_unibi_tri.npz"), X_all)
