## Split Data
Membagi data master (`DataFrame_Master.csv`) yang sudah memiliki `image_path` menjadi data latih (train) dan data uji (test).

### Skema Dinamis
Cukup ganti nilai `train_ratio` pada pemanggilan `split_data` untuk mengubah skema pembagian (misalnya `0.8` untuk 80/20).

In [2]:
# ===== SPLIT DATA TRAIN AND DATA TEST =====

import os
import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(master_df_path, output_dir, train_ratio):
    """
    Split the master DataFrame into stratified train/test sets and save both as CSV.

    The master CSV is read with its first column as the index, split with
    stratification on the 'Label' column (fixed ``random_state=42``), and the
    two parts are written to ``output_dir`` as ``train<TT><SS>.csv`` and
    ``test<TT><SS>.csv``, where TT/SS are the rounded train/test percentages
    (e.g. ``train8020.csv`` / ``test8020.csv`` for ``train_ratio=0.8``).

    Args:
        master_df_path (str): Path to the master CSV (DataFrame_Master.csv).
        output_dir (str): Directory for the split files; created if missing.
        train_ratio (float): Train proportion, strictly between 0 and 1
            (e.g. 0.8 for an 80/20 split).

    Returns:
        tuple: ``(train_df, test_df)``, or ``(None, None)`` when the master
        file does not exist.

    Raises:
        ValueError: If ``train_ratio`` is not a number strictly between 0 and 1.
        KeyError: If the master DataFrame has no 'Label' column.
    """
    # Reject unusable ratios before any I/O so the caller gets one clear error
    # instead of a late failure inside the splitter.
    try:
        ratio_is_valid = 0 < train_ratio < 1
    except TypeError:
        ratio_is_valid = False
    if not ratio_is_valid:
        raise ValueError(
            f"train_ratio harus berupa angka di antara 0 dan 1 (eksklusif), diterima: {train_ratio!r}"
        )

    print("\n--- Memulai Proses Split Data ---")

    # 1. Load the master DataFrame (first CSV column becomes the index).
    try:
        df = pd.read_csv(master_df_path, index_col=0)
        print(f"DataFrame Master berhasil dimuat. Total baris: {len(df)}")
    except FileNotFoundError:
        print(f"ERROR: File tidak ditemukan di '{master_df_path}'.")
        return None, None

    # 2. The target column drives stratification; the full frame (label included) is split.
    if 'Label' not in df.columns:
        raise KeyError(f"Kolom 'Label' tidak ditemukan di '{master_df_path}'.")
    y = df['Label']

    # 3. Derive the test share. Rounding strips float noise such as
    #    1 - 0.7 == 0.30000000000000004, which could otherwise push the computed
    #    test-set size up by one row when the splitter rounds it up.
    test_ratio = round(1 - train_ratio, 10)
    train_percent = round(train_ratio * 100)
    # Always the complement, so the two numbers in the file name sum to 100.
    test_percent = 100 - train_percent

    print(f"🔄 Melakukan split data ({train_percent}% train, {test_percent}% test)...")
    train_df, test_df = train_test_split(
        df, test_size=test_ratio, random_state=42, stratify=y
    )

    # 4. Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, f"train{train_percent}{test_percent}.csv")
    test_path  = os.path.join(output_dir, f"test{train_percent}{test_percent}.csv")

    # 5. Persist both parts; the index is written as the first CSV column.
    train_df.to_csv(train_path, index=True)
    test_df.to_csv(test_path, index=True)

    # 6. Summary: output locations, shapes and label proportions of each part.
    print("\n" + "="*45)
    print("      ✅ PROSES SPLIT DATA SELESAI")
    print("="*45)
    print(f"📂 Train data disimpan di: {train_path}")
    print(f"📂 Test data  disimpan di: {test_path}")
    print(f"\nUkuran Data Latih (Train): {train_df.shape}")
    print(f"Ukuran Data Uji (Test): {test_df.shape}")
    print("\nDistribusi Label pada Data Latih (proporsi):")
    print(train_df['Label'].value_counts(normalize=True))
    print("\nDistribusi Label pada Data Uji (proporsi):")
    print(test_df['Label'].value_counts(normalize=True))

    print("\n--- Proses Selesai ---")
    return train_df, test_df

# --- Run the split ---
master_file_path = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\DataFrame_Master.csv'
output_split_dir = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\split_data'

# Change train_ratio here to pick a different split scheme (0.8 -> 80/20).
train_df, test_df = split_data(
    master_df_path=master_file_path,
    output_dir=output_split_dir,
    train_ratio=0.8,
)



--- Memulai Proses Split Data ---
DataFrame Master berhasil dimuat. Total baris: 16305
🔄 Melakukan split data (80% train, 20% test)...

      ✅ PROSES SPLIT DATA SELESAI
📂 Train data disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\split_data\train8020.csv
📂 Test data  disimpan di: E:\$7th\TA\Eksploring_TF-IDF\DATA\split_data\test8020.csv

Ukuran Data Latih (Train): (13044, 20)
Ukuran Data Uji (Test): (3261, 20)

Distribusi Label pada Data Latih (proporsi):
Label
0    0.699632
1    0.300368
Name: proportion, dtype: float64

Distribusi Label pada Data Uji (proporsi):
Label
0    0.699479
1    0.300521
Name: proportion, dtype: float64

--- Proses Selesai ---


### Label Y_Test

#### Load Dataset

In [None]:
import re

def load_split_data(split_dir, train_filename, test_filename, index_col=None):
    """Load the train/test CSVs and derive the split-ratio tag from the train file name.

    Args:
        split_dir (str): Directory containing both CSV files.
        train_filename (str): Train file name; its first run of digits is
            used as the ratio tag (e.g. 'train8020.csv' -> '8020').
        test_filename (str): Test file name.
        index_col (int | str | None): Forwarded to ``pd.read_csv``. The default
            ``None`` keeps the previous behaviour, where the first CSV column is
            loaded as an ordinary data column. Pass ``0`` to use that first
            column as the index instead.

    Returns:
        tuple: ``(train_df, test_df, split_ratio_str)``, or
        ``(None, None, None)`` when either file is missing.

    Raises:
        ValueError: If ``train_filename`` contains no digits to use as the ratio tag.
    """
    # Validate the file name first: previously a digit-less name surfaced as an
    # opaque AttributeError on None, and only after both CSVs had been read.
    ratio_match = re.search(r'(\d+)', train_filename)
    if ratio_match is None:
        raise ValueError(
            f"Tidak dapat mendeteksi rasio split dari nama file '{train_filename}' "
            "(contoh nama yang valid: 'train8020.csv')."
        )
    split_ratio_str = ratio_match.group(1)

    train_path = os.path.join(split_dir, train_filename)
    test_path = os.path.join(split_dir, test_filename)

    try:
        train_df = pd.read_csv(train_path, index_col=index_col)
        test_df = pd.read_csv(test_path, index_col=index_col)
    except FileNotFoundError as e:
        print(f"Error: Salah satu file tidak ditemukan. {e}")
        return None, None, None

    print(f"Data latih berhasil dimuat: {train_df.shape[0]} baris.")
    print(f"Data uji berhasil dimuat: {test_df.shape[0]} baris.")
    print(f"Rasio split terdeteksi: {split_ratio_str}")

    return train_df, test_df, split_ratio_str

# --- Configuration and execution (edit these values to load another split) ---
SPLIT_DATA_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\split_data'
TRAIN_FILENAME = 'train8020.csv'
TEST_FILENAME = 'test8020.csv'

# split_ratio is the digit tag taken from TRAIN_FILENAME (here '8020').
train_df, test_df, split_ratio = load_split_data(
    split_dir=SPLIT_DATA_DIR,
    train_filename=TRAIN_FILENAME,
    test_filename=TEST_FILENAME,
)

NameError: name 're' is not defined

In [5]:
# ===== CELL 2: SIMPAN LABEL DATA UJI =====

import os

def save_test_label(test_dataframe, ratio_str, output_dir):
    """Write the 'Label' column of the test set to ``y_test_<ratio_str>.csv``.

    Args:
        test_dataframe (pd.DataFrame): Test data containing a 'Label' column.
        ratio_str (str): Split-ratio tag used in the file name (e.g. '8020').
        output_dir (str): Destination directory; created if missing.

    Returns:
        None. The file is written with a header row and without the index.

    Raises:
        TypeError: If ``test_dataframe`` is None.
        KeyError: If ``test_dataframe`` has no 'Label' column.
    """
    # Check the input before touching the filesystem, so a failed call does not
    # leave an empty output directory behind. None is called out explicitly
    # because otherwise it fails as an opaque "NoneType is not subscriptable".
    if test_dataframe is None:
        raise TypeError(
            "test_dataframe bernilai None; pastikan data uji berhasil dimuat "
            "sebelum menyimpan label."
        )

    # Take only the label column (raises KeyError if it is absent).
    y_test = test_dataframe['Label']

    os.makedirs(output_dir, exist_ok=True)

    # File name carries the ratio tag so different splits do not overwrite each other.
    save_path = os.path.join(output_dir, f'y_test_{ratio_str}.csv')
    y_test.to_csv(save_path, index=False, header=True)

    print(f"File label untuk data uji berhasil disimpan di: {save_path}")

# --- Execution ---
LABEL_OUTPUT_DIR = r'E:\$7th\TA\Eksploring_TF-IDF\DATA\_$label'

# Reuses test_df and split_ratio produced by the dataset-loading cell above.
save_test_label(
    test_dataframe=test_df,
    ratio_str=split_ratio,
    output_dir=LABEL_OUTPUT_DIR,
)

NameError: name 'split_ratio' is not defined