# Pembobotan TF-IDF

## Import Library & Dataset Hasil Preprocessing

In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import pickle
import re

In [11]:
def load_split_data(split_dir, train_filename, test_filename):
    """Load the train and test CSV files and detect the split ratio from the train file name.

    Args:
        split_dir: Directory that contains both CSV files.
        train_filename: File name of the training split (e.g. ``'train7030.csv'``).
        test_filename: File name of the test split.

    Returns:
        A tuple ``(train_df, test_df, split_ratio_str)``. ``split_ratio_str``
        is the first run of digits found in ``train_filename`` (e.g.
        ``'7030'``), or ``'unknown'`` when the name contains no digits.
        If either file does not exist, an error is printed and
        ``(None, None, None)`` is returned.
    """
    train_path = os.path.join(split_dir, train_filename)
    test_path = os.path.join(split_dir, test_filename)

    # Keep the try body limited to the calls that can actually raise
    # FileNotFoundError.
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
    except FileNotFoundError as e:
        print(f"Error: Salah satu file tidak ditemukan. {e}")
        return None, None, None

    print(f"Data latih berhasil dimuat: {train_df.shape[0]} baris.")
    print(f"Data uji berhasil dimuat: {test_df.shape[0]} baris.")

    # Extract the digits from the file name to get the ratio (e.g. '8020').
    # re.search returns None when there are no digits; calling .group() on
    # that would raise AttributeError even though the data loaded fine.
    ratio_match = re.search(r'(\d+)', train_filename)
    if ratio_match is None:
        split_ratio_str = "unknown"
        print(
            "Peringatan: Rasio split tidak terdeteksi dari nama file "
            f"'{train_filename}'."
        )
    else:
        split_ratio_str = ratio_match.group(1)
        print(f"Rasio split terdeteksi: {split_ratio_str}")

    return train_df, test_df, split_ratio_str

# --- Configuration and execution (CUSTOM) ---
# Folder holding the pre-split CSV files, and the train/test pair to load.
SPLIT_DATA_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\split_data'
TRAIN_FILENAME, TEST_FILENAME = 'train7030.csv', 'test7030.csv'

# split_ratio is reused by later cells to name the artifact folders.
train_df, test_df, split_ratio = load_split_data(
    split_dir=SPLIT_DATA_DIR,
    train_filename=TRAIN_FILENAME,
    test_filename=TEST_FILENAME,
)

Data latih berhasil dimuat: 11413 baris.
Data uji berhasil dimuat: 4892 baris.
Rasio split terdeteksi: 7030


## Unigram

In [16]:
def extract_tfidf_unigram(train_df, test_df, output_dir):
    """Extract unigram TF-IDF features and save the resulting artifacts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same feature columns.

    Args:
        train_df: DataFrame with a ``clean_text`` column (training split).
        test_df: DataFrame with a ``clean_text`` column (test split).
        output_dir: Directory the artifacts are written to; created if missing.

    Returns:
        None. Writes ``vectorizer_unigram.pkl``, ``X_train_unigram.npz`` and
        ``X_test_unigram.npz`` into ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Missing texts become empty strings so every row is a plain str.
    train_texts = train_df['clean_text'].fillna("").astype(str)
    test_texts = test_df['clean_text'].fillna("").astype(str)

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 1),
        max_features=20000,
        token_pattern=r"(?u)\b\w\w+\b"
    )

    print("Memulai ekstraksi fitur TF-IDF Unigram...")
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Ukuran matriks train: {X_train.shape}")
    print(f"Ukuran matriks test: {X_test.shape}")

    # Context manager closes (and flushes) the pickle file deterministically;
    # a bare open() passed to pickle.dump is never explicitly closed.
    vectorizer_path = os.path.join(output_dir, "vectorizer_unigram.pkl")
    with open(vectorizer_path, "wb") as fh:
        pickle.dump(vectorizer, fh)
    sparse.save_npz(os.path.join(output_dir, "X_train_unigram.npz"), X_train)
    sparse.save_npz(os.path.join(output_dir, "X_test_unigram.npz"), X_test)

    print(f"Hasil ekstraksi fitur Unigram berhasil disimpan di: {output_dir}")

# --- Configuration and execution ---
# Root folder under which the TF-IDF artifacts are stored.
TFIDF_ARTIFACTS_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts'

# The subfolder name is suffixed with the detected split ratio.
dynamic_folder_name = 'unigram_{}'.format(split_ratio)
UNIGRAM_OUTPUT_DIR = os.path.join(
    TFIDF_ARTIFACTS_DIR,
    dynamic_folder_name,
)

extract_tfidf_unigram(
    train_df=train_df,
    test_df=test_df,
    output_dir=UNIGRAM_OUTPUT_DIR,
)

Memulai ekstraksi fitur TF-IDF Unigram...
Ukuran matriks train: (11413, 16020)
Ukuran matriks test: (4892, 16020)
Hasil ekstraksi fitur Unigram berhasil disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts\unigram_7030


## Bigram

In [17]:
def extract_tfidf_bigram(train_df, test_df, output_dir):
    """Extract bigram TF-IDF features and save the resulting artifacts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same feature columns.

    Args:
        train_df: DataFrame with a ``clean_text`` column (training split).
        test_df: DataFrame with a ``clean_text`` column (test split).
        output_dir: Directory the artifacts are written to; created if missing.

    Returns:
        None. Writes ``vectorizer_bigram.pkl``, ``X_train_bigram.npz`` and
        ``X_test_bigram.npz`` into ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Missing texts become empty strings so every row is a plain str.
    train_texts = train_df['clean_text'].fillna("").astype(str)
    test_texts = test_df['clean_text'].fillna("").astype(str)

    vectorizer = TfidfVectorizer(
        ngram_range=(2, 2),
        max_features=20000,
        token_pattern=r"(?u)\b\w\w+\b"
    )

    print("Memulai ekstraksi fitur TF-IDF Bigram...")
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Ukuran matriks train: {X_train.shape}")
    print(f"Ukuran matriks test: {X_test.shape}")

    # Context manager closes (and flushes) the pickle file deterministically;
    # a bare open() passed to pickle.dump is never explicitly closed.
    vectorizer_path = os.path.join(output_dir, "vectorizer_bigram.pkl")
    with open(vectorizer_path, "wb") as fh:
        pickle.dump(vectorizer, fh)
    sparse.save_npz(os.path.join(output_dir, "X_train_bigram.npz"), X_train)
    sparse.save_npz(os.path.join(output_dir, "X_test_bigram.npz"), X_test)

    print(f"Hasil ekstraksi fitur Bigram berhasil disimpan di: {output_dir}")

# --- Configuration and execution ---
# Root folder under which the TF-IDF artifacts are stored.
TFIDF_ARTIFACTS_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts'

# Bigram artifacts go into a subfolder suffixed with the detected split ratio.
BIGRAM_OUTPUT_DIR = os.path.join(
    TFIDF_ARTIFACTS_DIR,
    'bigram_{}'.format(split_ratio),
)
extract_tfidf_bigram(
    train_df=train_df,
    test_df=test_df,
    output_dir=BIGRAM_OUTPUT_DIR,
)

Memulai ekstraksi fitur TF-IDF Bigram...
Ukuran matriks train: (11413, 20000)
Ukuran matriks test: (4892, 20000)
Hasil ekstraksi fitur Bigram berhasil disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts\bigram_7030


## Trigram

In [18]:
def extract_tfidf_trigram(train_df, test_df, output_dir):
    """Extract trigram TF-IDF features and save the resulting artifacts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same feature columns.

    Args:
        train_df: DataFrame with a ``clean_text`` column (training split).
        test_df: DataFrame with a ``clean_text`` column (test split).
        output_dir: Directory the artifacts are written to; created if missing.

    Returns:
        None. Writes ``vectorizer_trigram.pkl``, ``X_train_trigram.npz`` and
        ``X_test_trigram.npz`` into ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Missing texts become empty strings so every row is a plain str.
    train_texts = train_df['clean_text'].fillna("").astype(str)
    test_texts = test_df['clean_text'].fillna("").astype(str)

    vectorizer = TfidfVectorizer(
        ngram_range=(3, 3),
        max_features=20000,
        token_pattern=r"(?u)\b\w\w+\b"
    )

    print("Memulai ekstraksi fitur TF-IDF Trigram...")
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Ukuran matriks train: {X_train.shape}")
    print(f"Ukuran matriks test: {X_test.shape}")

    # Context manager closes (and flushes) the pickle file deterministically;
    # a bare open() passed to pickle.dump is never explicitly closed.
    vectorizer_path = os.path.join(output_dir, "vectorizer_trigram.pkl")
    with open(vectorizer_path, "wb") as fh:
        pickle.dump(vectorizer, fh)
    sparse.save_npz(os.path.join(output_dir, "X_train_trigram.npz"), X_train)
    sparse.save_npz(os.path.join(output_dir, "X_test_trigram.npz"), X_test)

    print(f"Hasil ekstraksi fitur Trigram berhasil disimpan di: {output_dir}")

# --- Configuration and execution ---
# Root folder under which the TF-IDF artifacts are stored.
TFIDF_ARTIFACTS_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts'

# Trigram artifacts go into a subfolder suffixed with the detected split ratio.
TRIGRAM_OUTPUT_DIR = os.path.join(
    TFIDF_ARTIFACTS_DIR,
    'trigram_{}'.format(split_ratio),
)
extract_tfidf_trigram(
    train_df=train_df,
    test_df=test_df,
    output_dir=TRIGRAM_OUTPUT_DIR,
)

Memulai ekstraksi fitur TF-IDF Trigram...
Ukuran matriks train: (11413, 20000)
Ukuran matriks test: (4892, 20000)
Hasil ekstraksi fitur Trigram berhasil disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts\trigram_7030


## Unigram + Bigram

In [19]:
def extract_tfidf_uni_bigram(train_df, test_df, output_dir):
    """Extract combined unigram + bigram TF-IDF features and save the artifacts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same feature columns.

    Args:
        train_df: DataFrame with a ``clean_text`` column (training split).
        test_df: DataFrame with a ``clean_text`` column (test split).
        output_dir: Directory the artifacts are written to; created if missing.

    Returns:
        None. Writes ``vectorizer_uni_bigram.pkl``, ``X_train_uni_bigram.npz``
        and ``X_test_uni_bigram.npz`` into ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Missing texts become empty strings so every row is a plain str.
    train_texts = train_df['clean_text'].fillna("").astype(str)
    test_texts = test_df['clean_text'].fillna("").astype(str)

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=20000,
        token_pattern=r"(?u)\b\w\w+\b"
    )

    print("Memulai ekstraksi fitur TF-IDF Unigram + Bigram...")
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Ukuran matriks train: {X_train.shape}")
    print(f"Ukuran matriks test: {X_test.shape}")

    # Context manager closes (and flushes) the pickle file deterministically;
    # a bare open() passed to pickle.dump is never explicitly closed.
    vectorizer_path = os.path.join(output_dir, "vectorizer_uni_bigram.pkl")
    with open(vectorizer_path, "wb") as fh:
        pickle.dump(vectorizer, fh)
    sparse.save_npz(os.path.join(output_dir, "X_train_uni_bigram.npz"), X_train)
    sparse.save_npz(os.path.join(output_dir, "X_test_uni_bigram.npz"), X_test)

    print(f"Hasil ekstraksi fitur Unigram + Bigram berhasil disimpan di: {output_dir}")

# --- Configuration and execution ---
# Root folder under which the TF-IDF artifacts are stored.
TFIDF_ARTIFACTS_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts'

# Unigram + bigram artifacts go into a subfolder suffixed with the detected
# split ratio.
UNI_BIGRAM_OUTPUT_DIR = os.path.join(
    TFIDF_ARTIFACTS_DIR,
    'uni_bigram_{}'.format(split_ratio),
)
extract_tfidf_uni_bigram(
    train_df=train_df,
    test_df=test_df,
    output_dir=UNI_BIGRAM_OUTPUT_DIR,
)

Memulai ekstraksi fitur TF-IDF Unigram + Bigram...
Ukuran matriks train: (11413, 20000)
Ukuran matriks test: (4892, 20000)
Hasil ekstraksi fitur Unigram + Bigram berhasil disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts\uni_bigram_7030


## Unigram + Bigram + Trigram

In [None]:
def extract_tfidf_uni_bi_trigram(train_df, test_df, output_dir):
    """Extract combined unigram + bigram + trigram TF-IDF features and save the artifacts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same feature columns.

    Args:
        train_df: DataFrame with a ``clean_text`` column (training split).
        test_df: DataFrame with a ``clean_text`` column (test split).
        output_dir: Directory the artifacts are written to; created if missing.

    Returns:
        None. Writes ``vectorizer_uni_bi_trigram.pkl``,
        ``X_train_uni_bi_trigram.npz`` and ``X_test_uni_bi_trigram.npz`` into
        ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Missing texts become empty strings so every row is a plain str.
    train_texts = train_df['clean_text'].fillna("").astype(str)
    test_texts = test_df['clean_text'].fillna("").astype(str)

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=20000,
        token_pattern=r"(?u)\b\w\w+\b"
    )

    print("Memulai ekstraksi fitur TF-IDF Unigram + Bigram + Trigram...")
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f"Ukuran matriks train: {X_train.shape}")
    print(f"Ukuran matriks test: {X_test.shape}")

    # Context manager closes (and flushes) the pickle file deterministically;
    # a bare open() passed to pickle.dump is never explicitly closed.
    vectorizer_path = os.path.join(output_dir, "vectorizer_uni_bi_trigram.pkl")
    with open(vectorizer_path, "wb") as fh:
        pickle.dump(vectorizer, fh)
    sparse.save_npz(os.path.join(output_dir, "X_train_uni_bi_trigram.npz"), X_train)
    sparse.save_npz(os.path.join(output_dir, "X_test_uni_bi_trigram.npz"), X_test)

    print(f"Hasil ekstraksi fitur Unigram + Bigram + Trigram berhasil disimpan di: {output_dir}")

# --- Configuration and execution ---
# Root folder under which the TF-IDF artifacts are stored.
TFIDF_ARTIFACTS_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\tfidf_artifacts'

# Unigram + bigram + trigram artifacts go into a subfolder suffixed with the
# detected split ratio.
UNI_BI_TRIGRAM_OUTPUT_DIR = os.path.join(
    TFIDF_ARTIFACTS_DIR,
    'uni_bi_trigram_{}'.format(split_ratio),
)
extract_tfidf_uni_bi_trigram(
    train_df=train_df,
    test_df=test_df,
    output_dir=UNI_BI_TRIGRAM_OUTPUT_DIR,
)

=== UNIGRAM + BIGRAM + TRIGRAM ===
Shape: (16519, 297556)
