## Feature Engineering untuk concat feature

### Import Library

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
import re

### Load Dataset Split

In [None]:
def load_split_data(split_dir, train_filename, test_filename):
    """Load the train/test CSV splits and detect the split ratio from the file name.

    Args:
        split_dir: Directory that contains both CSV files.
        train_filename: File name of the training split, e.g. 'train8020.csv'.
        test_filename: File name of the test split.

    Returns:
        A tuple ``(train_df, test_df, split_ratio_str)``. ``split_ratio_str`` is
        the first run of digits found in ``train_filename`` (e.g. '8020').
        If either file does not exist, an error is printed and
        ``(None, None, None)`` is returned instead.

    Raises:
        ValueError: If ``train_filename`` contains no digits, so no split ratio
            can be derived from it.
    """
    train_path = os.path.join(split_dir, train_filename)
    test_path = os.path.join(split_dir, test_filename)

    # Only the file reads can raise FileNotFoundError, so only they are guarded.
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
    except FileNotFoundError as e:
        print(f"Error: Salah satu file tidak ditemukan. {e}")
        return None, None, None

    # Extract the digits from the file name to get the ratio (e.g. '8020').
    # The pattern has to be r'\d+' (a run of digits); r'/d+' looks for a literal
    # slash followed by the letter "d" and never matches a plain file name.
    ratio_match = re.search(r'(\d+)', train_filename)
    if ratio_match is None:
        raise ValueError(
            f"Nama file '{train_filename}' tidak mengandung angka rasio split "
            "(contoh: 'train8020.csv')."
        )
    split_ratio_str = ratio_match.group(1)

    print(f"Data latih berhasil dimuat: {train_df.shape[0]} baris.")
    print(f"Data uji berhasil dimuat: {test_df.shape[0]} baris.")
    print(f"Rasio split terdeteksi: {split_ratio_str}")

    return train_df, test_df, split_ratio_str

# --- Configuration and execution (CUSTOM) ---
SPLIT_DATA_DIR = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/split_data'
TRAIN_FILENAME, TEST_FILENAME = 'train8020.csv', 'test8020.csv'

# load_split_data derives split_ratio from the digits in TRAIN_FILENAME.
train_df, test_df, split_ratio = load_split_data(
    split_dir=SPLIT_DATA_DIR,
    train_filename=TRAIN_FILENAME,
    test_filename=TEST_FILENAME,
)

Data latih berhasil dimuat: 13044 baris.
Data uji berhasil dimuat: 3261 baris.
Rasio split terdeteksi: 8020


### Select Feature 

In [4]:
def select_metadata_features(df, feature_list):
    """Return only the requested metadata columns followed by the 'Label' column.

    Args:
        df: Source DataFrame.
        feature_list: List of feature column names to keep.

    Returns:
        ``df`` restricted to ``feature_list + ['Label']`` (in that order), or
        None when any of those columns is absent; in that case the absent
        columns are printed.
    """
    wanted_cols = feature_list + ['Label']

    # Guard clause: bail out as soon as any requested column is unavailable.
    has_gap = any(name not in df.columns for name in wanted_cols)
    if has_gap:
        missing = set(wanted_cols) - set(df.columns)
        print(f"Error: Kolom berikut tidak ditemukan: {missing}")
        return None

    return df[wanted_cols]

# --- Configuration and execution ---
# Metadata columns to keep; select_metadata_features appends 'Label' itself.
METADATA_FEATURES = [
    'followers_count',
    'verified_account',
    'age_account',
    'quote_count',
    'reply_count',
    'retweet_count',
    'favorite_count',
    'image_corelation'
]

# load_split_data returns None for both frames when a file is missing, so a
# plain "'train_df' in locals()" check would still pass and then crash on None.
# Require that both frames exist AND are real objects.
if globals().get('train_df') is not None and globals().get('test_df') is not None:
    train_metadata_df = select_metadata_features(train_df, METADATA_FEATURES)
    test_metadata_df = select_metadata_features(test_df, METADATA_FEATURES)

    # The message claims success for both splits, so verify both.
    if train_metadata_df is not None and test_metadata_df is not None:
        print("Pemilihan fitur metadata berhasil untuk data train dan test.")

Pemilihan fitur metadata berhasil untuk data train dan test.


### Scaling / Standardisasi (Numerik)

In [None]:
def scale_features(train_df, test_df):
    """Standard-scale the feature columns of the train and test frames.

    Every column except 'Label' is treated as a feature. The scaler is fitted
    on the training features only and then applied unchanged to the test
    features.

    Args:
        train_df: Training frame containing feature columns and 'Label'.
        test_df: Test frame with the same columns.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` where the
        two ``X`` values are the scaler outputs, the two ``y`` values are the
        'Label' columns, and ``scaler`` is the fitted StandardScaler.
    """
    def _split_xy(frame):
        # 'Label' is the target; all remaining columns are features.
        return frame.drop(columns='Label'), frame['Label']

    train_features, y_train = _split_xy(train_df)
    test_features, y_test = _split_xy(test_df)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_features)
    X_test_scaled = scaler.transform(test_features)

    print("Fitur numerik berhasil di-scale.")
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

# --- Execution ---
# select_metadata_features returns None when a required column is missing, so
# checking that the name exists is not enough: both frames must be real objects
# before they are handed to scale_features.
if (
    globals().get('train_metadata_df') is not None
    and globals().get('test_metadata_df') is not None
):
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = scale_features(
        train_metadata_df, test_metadata_df
    )

Fitur numerik berhasil di-scale.


### Save File

In [None]:
def save_metadata_artifacts(X_train_s, X_test_s, y_train_s, y_test_s, scaler_obj, output_dir):
    """Persist the processed feature arrays and the fitted scaler object.

    Writes three files into ``output_dir`` (created if it does not exist):
    'X_train_metadata.npy', 'X_test_metadata.npy' and 'metadata_scaler.pkl'.

    Args:
        X_train_s: Scaled training features, saved with ``np.save``.
        X_test_s: Scaled test features, saved with ``np.save``.
        y_train_s: Accepted but not written by this function.
        y_test_s: Accepted but not written by this function.
        scaler_obj: Fitted scaler, serialized with ``pickle``.
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Scaled feature matrices are stored as NumPy arrays (.npy).
    feature_arrays = (
        ('X_train_metadata.npy', X_train_s),
        ('X_test_metadata.npy', X_test_s),
    )
    for file_name, array in feature_arrays:
        np.save(os.path.join(output_dir, file_name), array)

    # The fitted scaler is stored as a pickle (.pkl).
    scaler_path = os.path.join(output_dir, 'metadata_scaler.pkl')
    with open(scaler_path, 'wb') as handle:
        pickle.dump(scaler_obj, handle)

    print(f"Semua artefak metadata berhasil disimpan di folder: {output_dir}")

# --- Configuration and execution ---
BASE_ARTIFACTS_DIR = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/_feature_artifacts'

# split_ratio is None when load_split_data could not find the files. A plain
# "'split_ratio' in locals()" check would still pass and write the artifacts to
# a 'metadata_None' folder, so require real values for both names.
if (
    globals().get('X_train_scaled') is not None
    and globals().get('split_ratio') is not None
):
    # The output folder is named after the detected split ratio, e.g. 'metadata_8020'.
    dynamic_folder_name = f'metadata_{split_ratio}'
    METADATA_OUTPUT_DIR = os.path.join(BASE_ARTIFACTS_DIR, dynamic_folder_name)

    save_metadata_artifacts(X_train_scaled, X_test_scaled, y_train, y_test, scaler, METADATA_OUTPUT_DIR)

Semua artefak metadata berhasil disimpan di folder: E:/$7th/TA/Eksploring_TF-IDF/DATA/_feature_artifacts/metadata_8020
