In [1]:
# Data Preprocessing
# Notebook ini berisi tahap preprocessing data untuk dataset Iris berdasarkan hasil analisis dari DataUnderstanding notebook, termasuk penanganan outliers yang terdeteksi oleh multi-model PyCaret (ABOD, KNN, COF).

print("DATA PREPROCESSING NOTEBOOK")
print("=" * 50)
print("Tahap preprocessing data untuk dataset Iris")
print("Berdasarkan hasil analisis dari DataUnderstanding notebook")
print("Penanganan outliers dari multi-model PyCaret (ABOD, KNN, COF)")

DATA PREPROCESSING NOTEBOOK
Tahap preprocessing data untuk dataset Iris
Berdasarkan hasil analisis dari DataUnderstanding notebook
Penanganan outliers dari multi-model PyCaret (ABOD, KNN, COF)


## 1. Import Libraries dan Setup Environment

In [2]:
# Import libraries yang diperlukan untuk preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Import PyCaret untuk preprocessing dan modeling
try:
    from pycaret.datasets import get_data
    from pycaret.classification import *
    from pycaret.anomaly import *
    print("PyCaret berhasil diimport")
except ImportError:
    print("PyCaret tidak tersedia. Install dengan: pip install pycaret")

# Atur style untuk visualisasi
plt.style.use('default')
sns.set_palette("husl")

print("Libraries berhasil diimport untuk preprocessing")
print("Libraries yang tersedia:")
print("   • Pandas & NumPy: Data manipulation")
print("   • Matplotlib & Seaborn: Visualisasi")
print("   • Scikit-learn: Preprocessing tools")
print("   • PyCaret: Advanced ML preprocessing")

PyCaret berhasil diimport
Libraries berhasil diimport untuk preprocessing
Libraries yang tersedia:
   • Pandas & NumPy: Data manipulation
   • Matplotlib & Seaborn: Visualisasi
   • Scikit-learn: Preprocessing tools
   • PyCaret: Advanced ML preprocessing


## 2. Load Data dan Hasil Outlier Detection

In [3]:
# Load dataset Iris dengan hasil outlier detection dari DataUnderstanding
print("=== LOADING DATA DAN HASIL OUTLIER DETECTION ===")

try:
    # Load data dari file CSV atau PyCaret
    try:
        # Coba load dari file lokal
        df = pd.read_csv('data_iris.csv', delimiter=';')
        
        # Konversi kolom numerik yang menggunakan koma sebagai decimal separator
        numeric_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
        
        for col in numeric_columns:
            if col in df.columns:
                # Konversi koma ke titik untuk decimal
                df[col] = df[col].astype(str).str.replace(',', '.').astype(float)
        
        # Buat kolom species numerik dan species name
        df['species'] = df['Class'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})
        df['species_name'] = df['Class'].map({'Iris-setosa': 'setosa', 'Iris-versicolor': 'versicolor', 'Iris-virginica': 'virginica'})
        
        # Rename kolom untuk konsistensi dengan format sklearn
        df = df.rename(columns={
            'sepal length': 'sepal length (cm)',
            'sepal width': 'sepal width (cm)', 
            'petal length': 'petal length (cm)',
            'petal width': 'petal width (cm)'
        })
        
        # Drop kolom yang tidak diperlukan
        if 'id' in df.columns:
            df = df.drop('id', axis=1)
        if 'Class' in df.columns:
            df = df.drop('Class', axis=1)
        
        print("Dataset Iris berhasil dimuat dari data_iris.csv")
        
    except FileNotFoundError:
        # Jika file tidak ditemukan, gunakan dataset Iris dari PyCaret
        print("File lokal tidak ditemukan. Menggunakan dataset Iris dari PyCaret...")
        try:
            df = get_data('iris')
            df['species'] = df['species'].map({'setosa': 0, 'versicolor': 1, 'virginica': 2})
            df['species_name'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
            print("Dataset Iris berhasil dimuat dari PyCaret")
        except:
            print("Error: Tidak dapat memuat dataset dari PyCaret")
    
    # Define feature columns
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    
    print(f"\nInfo Dataset:")
    print(f"   • Ukuran: {df.shape[0]} baris, {df.shape[1]} kolom")
    print(f"   • Features: {features}")
    print(f"   • Target: species (0=setosa, 1=versicolor, 2=virginica)")
    
    # Tampilkan sample data
    print(f"\nSample Data:")
    print(df.head())
    
except Exception as e:
    print(f"Error loading data: {e}")
    df = None

=== LOADING DATA DAN HASIL OUTLIER DETECTION ===
Dataset Iris berhasil dimuat dari data_iris.csv

Info Dataset:
   • Ukuran: 150 baris, 6 kolom
   • Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
   • Target: species (0=setosa, 1=versicolor, 2=virginica)

Sample Data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species species_name  
0        0       setosa  
1        0       setosa  
2        0       setosa  
3        0       setosa  
4        0       setosa  


In [4]:
# Simulasi hasil outlier detection dari DataUnderstanding
# Berdasarkan hasil multi-model PyCaret (ABOD, KNN, COF)
print("=== SIMULASI HASIL OUTLIER DETECTION DARI DATAUNDERSTANDING ===")

if df is not None:
    # Hasil outlier detection berdasarkan analisis DataUnderstanding
    # ABOD: [41, 62, 100, 106, 108, 117, 131, 134]
    # KNN:  [41, 57, 98, 106, 109, 117, 118, 131]  
    # COF:  [14, 15, 22, 33, 41, 44, 106, 117]
    
    outlier_results = {
        'abod': [41, 62, 100, 106, 108, 117, 131, 134],
        'knn': [41, 57, 98, 106, 109, 117, 118, 131],
        'cof': [14, 15, 22, 33, 41, 44, 106, 117]
    }
    
    # Create outlier columns
    for model_name, outlier_indices in outlier_results.items():
        df[f'{model_name}_outlier'] = 0
        df.loc[outlier_indices, f'{model_name}_outlier'] = 1
    
    # Calculate consensus scores
    df['consensus_score'] = df['abod_outlier'] + df['knn_outlier'] + df['cof_outlier']
    df['strong_consensus'] = (df['consensus_score'] >= 2).astype(int)
    df['perfect_consensus'] = (df['consensus_score'] == 3).astype(int)
    
    # Analisis hasil outlier detection
    print(f"\nHasil Outlier Detection dari DataUnderstanding:")
    for model_name, outlier_indices in outlier_results.items():
        outlier_count = len(outlier_indices)
        outlier_pct = (outlier_count / len(df)) * 100
        print(f"   • {model_name.upper()}: {outlier_count} outliers ({outlier_pct:.1f}%)")
        print(f"     Indices: {outlier_indices}")
    
    # Consensus analysis
    consensus_stats = df['consensus_score'].value_counts().sort_index()
    print(f"\nConsensus Analysis:")
    for score, count in consensus_stats.items():
        pct = (count / len(df)) * 100
        if score == 0:
            print(f"   • Normal (0 models): {count} data ({pct:.1f}%)")
        else:
            print(f"   • {int(score)} model(s) agree: {count} data ({pct:.1f}%)")
    
    # Strong consensus outliers
    strong_consensus_count = df['strong_consensus'].sum()
    strong_consensus_indices = df[df['strong_consensus'] == 1].index.tolist()
    print(f"\nStrong Consensus Outliers (≥2 models agree):")
    print(f"   • Count: {strong_consensus_count} ({strong_consensus_count/len(df)*100:.1f}%)")
    print(f"   • Indices: {strong_consensus_indices}")
    
    # Perfect consensus outliers
    perfect_consensus_count = df['perfect_consensus'].sum()
    perfect_consensus_indices = df[df['perfect_consensus'] == 1].index.tolist()
    print(f"\nPerfect Consensus Outliers (all models agree):")
    print(f"   • Count: {perfect_consensus_count} ({perfect_consensus_count/len(df)*100:.1f}%)")
    print(f"   • Indices: {perfect_consensus_indices}")
    
else:
    print("Tidak dapat melakukan analisis outlier karena data tidak berhasil dimuat")

=== SIMULASI HASIL OUTLIER DETECTION DARI DATAUNDERSTANDING ===

Hasil Outlier Detection dari DataUnderstanding:
   • ABOD: 8 outliers (5.3%)
     Indices: [41, 62, 100, 106, 108, 117, 131, 134]
   • KNN: 8 outliers (5.3%)
     Indices: [41, 57, 98, 106, 109, 117, 118, 131]
   • COF: 8 outliers (5.3%)
     Indices: [14, 15, 22, 33, 41, 44, 106, 117]

Consensus Analysis:
   • Normal (0 models): 133 data (88.7%)
   • 1 model(s) agree: 13 data (8.7%)
   • 2 model(s) agree: 1 data (0.7%)
   • 3 model(s) agree: 3 data (2.0%)

Strong Consensus Outliers (≥2 models agree):
   • Count: 4 (2.7%)
   • Indices: [41, 106, 117, 131]

Perfect Consensus Outliers (all models agree):
   • Count: 3 (2.0%)
   • Indices: [41, 106, 117]


## 3. Outlier Treatment Strategies

In [5]:
# Strategi Penanganan Outliers berdasarkan Consensus Analysis
print("=== STRATEGI PENANGANAN OUTLIERS ===")

if df is not None and 'consensus_score' in df.columns:
    
    # Buat beberapa versi dataset dengan treatment berbeda
    datasets = {}
    
    # Dataset 1: Original (tidak ada treatment)
    datasets['original'] = df.copy()
    
    # Dataset 2: Remove Strong Consensus Outliers (≥2 models agree)
    datasets['remove_strong'] = df[df['strong_consensus'] == 0].copy().reset_index(drop=True)
    
    # Dataset 3: Remove Perfect Consensus Outliers (all 3 models agree)
    datasets['remove_perfect'] = df[df['perfect_consensus'] == 0].copy().reset_index(drop=True)
    
    # Dataset 4: Cap outliers menggunakan IQR method pada strong consensus
    datasets['cap_outliers'] = df.copy()
    
    # Untuk dataset cap_outliers, ganti nilai outliers dengan batas IQR
    strong_outlier_mask = df['strong_consensus'] == 1
    for feature in features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Cap outliers
        datasets['cap_outliers'].loc[strong_outlier_mask, feature] = np.clip(
            datasets['cap_outliers'].loc[strong_outlier_mask, feature],
            lower_bound, upper_bound
        )
    
    # Dataset 5: Winsorization (cap pada percentile 5-95)
    datasets['winsorize'] = df.copy()
    for feature in features:
        p5 = df[feature].quantile(0.05)
        p95 = df[feature].quantile(0.95)
        datasets['winsorize'][feature] = np.clip(datasets['winsorize'][feature], p5, p95)
    
    # Analisis dampak setiap treatment
    print(f"\nAnalisis Dampak Treatment Outliers:")
    print(f"{'Strategy':<20} {'Samples':<10} {'Outliers Removed':<18} {'% Removed':<12}")
    print("-" * 65)
    
    original_count = len(datasets['original'])
    
    for strategy, dataset in datasets.items():
        current_count = len(dataset)
        removed_count = original_count - current_count
        removed_pct = (removed_count / original_count) * 100
        
        strategy_name = {
            'original': 'Original',
            'remove_strong': 'Remove Strong',
            'remove_perfect': 'Remove Perfect', 
            'cap_outliers': 'Cap IQR',
            'winsorize': 'Winsorization'
        }.get(strategy, strategy)
        
        print(f"{strategy_name:<20} {current_count:<10} {removed_count:<18} {removed_pct:<11.1f}%")
    
    # Tampilkan statistik deskriptif untuk perbandingan
    print(f"\nPerbandingan Statistik Deskriptif (Sepal Length):")
    print(f"{'Strategy':<20} {'Mean':<8} {'Std':<8} {'Min':<8} {'Max':<8}")
    print("-" * 55)
    
    for strategy, dataset in datasets.items():
        if len(dataset) > 0:
            feature = 'sepal length (cm)'
            stats = dataset[feature].describe()
            strategy_name = {
                'original': 'Original',
                'remove_strong': 'Remove Strong',
                'remove_perfect': 'Remove Perfect',
                'cap_outliers': 'Cap IQR', 
                'winsorize': 'Winsorization'
            }.get(strategy, strategy)
            
            print(f"{strategy_name:<20} {stats['mean']:<7.2f} {stats['std']:<7.2f} {stats['min']:<7.2f} {stats['max']:<7.2f}")
    
    # Rekomendasi strategy
    print(f"\nREKOMENDASI TREATMENT STRATEGY:")
    
    strong_consensus_count = df['strong_consensus'].sum()
    perfect_consensus_count = df['perfect_consensus'].sum()
    strong_pct = (strong_consensus_count / len(df)) * 100
    perfect_pct = (perfect_consensus_count / len(df)) * 100
    
    print(f"\n   Berdasarkan analisis outlier detection:")
    print(f"   • Strong consensus outliers: {strong_consensus_count} ({strong_pct:.1f}%)")
    print(f"   • Perfect consensus outliers: {perfect_consensus_count} ({perfect_pct:.1f}%)")
    
    if perfect_pct < 2:
        recommended_strategy = 'original'
        reason = "Outlier percentage sangat rendah, dataset berkualitas baik"
    elif strong_pct < 5:
        recommended_strategy = 'cap_outliers'
        reason = "Outlier moderate, capping lebih baik dari removal"
    else:
        recommended_strategy = 'remove_strong'
        reason = "Outlier tinggi, perlu removal untuk model stability"
    
    print(f"\n   STRATEGI YANG DIREKOMENDASIKAN: {recommended_strategy.upper()}")
    print(f"   Alasan: {reason}")
    
    # Untuk Iris dataset yang natural, kita pilih strategi yang mempertahankan data
    print(f"\n   KHUSUS UNTUK IRIS DATASET:")
    print(f"   • Outliers mungkin merupakan variasi natural bunga iris")
    print(f"   • Untuk klasifikasi, outliers dapat membantu model robustness")
    print(f"   • Direkomendasikan: ORIGINAL atau CAP_OUTLIERS")
    
    # Simpan dataset yang direkomendasikan
    if recommended_strategy in datasets:
        df_processed = datasets[recommended_strategy].copy()
        print(f"\nDataset processed dengan strategi '{recommended_strategy}' siap digunakan")
    else:
        df_processed = datasets['original'].copy()
        print(f"\nMenggunakan dataset original sebagai fallback")
        
else:
    print("Tidak dapat melakukan treatment outliers karena data atau hasil outlier tidak tersedia")
    df_processed = df.copy() if df is not None else None

=== STRATEGI PENANGANAN OUTLIERS ===



Analisis Dampak Treatment Outliers:
Strategy             Samples    Outliers Removed   % Removed   
-----------------------------------------------------------------
Original             150        0                  0.0        %
Remove Strong        146        4                  2.7        %
Remove Perfect       147        3                  2.0        %
Cap IQR              150        0                  0.0        %
Winsorization        150        0                  0.0        %

Perbandingan Statistik Deskriptif (Sepal Length):
Strategy             Mean     Std      Min      Max     
-------------------------------------------------------
Original             5.84    0.83    4.30    7.90   
Remove Strong        5.83    0.80    4.30    7.70   
Remove Perfect       5.85    0.81    4.30    7.90   
Cap IQR              5.84    0.83    4.30    7.90   
Winsorization        5.83    0.78    4.60    7.25   

REKOMENDASI TREATMENT STRATEGY:

   Berdasarkan analisis outlier detection:
   • St

## 4. Feature Scaling dan Engineering

In [6]:
# Feature Scaling dan Engineering berdasarkan insights dari DataUnderstanding
print("=== FEATURE SCALING DAN ENGINEERING ===")

if df_processed is not None:
    
    # Analisis distribusi fitur untuk menentukan scaling method
    print(f"\nAnalisis Distribusi Fitur untuk Scaling:")
    for feature in features:
        stats = df_processed[feature].describe()
        skewness = df_processed[feature].skew()
        print(f"   • {feature.split('(')[0].strip()}: Mean={stats['mean']:.2f}, Std={stats['std']:.2f}, Skew={skewness:.2f}")
    
    # Buat beberapa versi scaling
    scaling_methods = {}
    
    # 1. StandardScaler (Z-score normalization)
    scaler_standard = StandardScaler()
    df_standard = df_processed.copy()
    df_standard[features] = scaler_standard.fit_transform(df_processed[features])
    scaling_methods['standard'] = {
        'data': df_standard,
        'scaler': scaler_standard,
        'description': 'StandardScaler (Mean=0, Std=1)'
    }
    
    # 2. MinMaxScaler (0-1 normalization)
    scaler_minmax = MinMaxScaler()
    df_minmax = df_processed.copy()
    df_minmax[features] = scaler_minmax.fit_transform(df_processed[features])
    scaling_methods['minmax'] = {
        'data': df_minmax,
        'scaler': scaler_minmax,
        'description': 'MinMaxScaler (Range 0-1)'
    }
    
    # 3. RobustScaler (robust to outliers)
    scaler_robust = RobustScaler()
    df_robust = df_processed.copy()
    df_robust[features] = scaler_robust.fit_transform(df_processed[features])
    scaling_methods['robust'] = {
        'data': df_robust,
        'scaler': scaler_robust,
        'description': 'RobustScaler (Median & IQR based)'
    }
    
    # 4. No scaling (original)
    scaling_methods['none'] = {
        'data': df_processed.copy(),
        'scaler': None,
        'description': 'No Scaling (Original values)'
    }
    
    # Analisis hasil scaling
    print(f"\nPerbandingan Hasil Scaling (Sepal Length):")
    feature_sample = 'sepal length (cm)'
    print(f"{'Method':<15} {'Mean':<8} {'Std':<8} {'Min':<8} {'Max':<8} {'Range':<8}")
    print("-" * 65)
    
    for method_name, method_info in scaling_methods.items():
        if feature_sample in method_info['data'].columns:
            stats = method_info['data'][feature_sample].describe()
            range_val = stats['max'] - stats['min']
            print(f"{method_name.title():<15} {stats['mean']:<7.3f} {stats['std']:<7.3f} {stats['min']:<7.3f} {stats['max']:<7.3f} {range_val:<7.3f}")
    
    # Feature Engineering berdasarkan insights dari DataUnderstanding
    print(f"\nFEATURE ENGINEERING:")
    print(f"   Berdasarkan analisis korelasi dari DataUnderstanding:")
    print(f"   • Petal length & petal width memiliki korelasi tinggi")
    print(f"   • Sepal width memiliki korelasi rendah dengan fitur lain")
    
    # Tambahkan engineered features untuk semua scaling methods
    for method_name, method_info in scaling_methods.items():
        df_current = method_info['data']
        
        # Feature engineering
        # 1. Petal area (length × width)
        df_current['petal_area'] = df_current['petal length (cm)'] * df_current['petal width (cm)']
        
        # 2. Sepal area (length × width)  
        df_current['sepal_area'] = df_current['sepal length (cm)'] * df_current['sepal width (cm)']
        
        # 3. Petal to sepal ratio
        df_current['petal_sepal_length_ratio'] = df_current['petal length (cm)'] / df_current['sepal length (cm)']
        df_current['petal_sepal_width_ratio'] = df_current['petal width (cm)'] / df_current['sepal width (cm)']
        
        # 4. Total area
        df_current['total_area'] = df_current['petal_area'] + df_current['sepal_area']
        
        # 5. Aspect ratios
        df_current['petal_aspect_ratio'] = df_current['petal length (cm)'] / (df_current['petal width (cm)'] + 1e-8)
        df_current['sepal_aspect_ratio'] = df_current['sepal length (cm)'] / (df_current['sepal width (cm)'] + 1e-8)
        
        # Update method info
        scaling_methods[method_name]['data'] = df_current
    
    # Update features list
    engineered_features = [
        'petal_area', 'sepal_area', 'petal_sepal_length_ratio', 
        'petal_sepal_width_ratio', 'total_area', 'petal_aspect_ratio', 'sepal_aspect_ratio'
    ]
    
    all_features = features + engineered_features
    
    print(f"\nFeature Engineering Completed:")
    print(f"   • Original features: {len(features)}")
    print(f"   • Engineered features: {len(engineered_features)}")
    print(f"   • Total features: {len(all_features)}")
    print(f"\nEngineered Features:")
    for i, feat in enumerate(engineered_features, 1):
        print(f"   {i}. {feat}")
    
    # Rekomendasi scaling method
    print(f"\nREKOMENDASI SCALING METHOD:")
    
    # Untuk Iris dataset dengan outliers yang sudah ditangani
    if 'strong_consensus' in df_processed.columns and df_processed['strong_consensus'].sum() > 0:
        recommended_scaling = 'robust'
        reason = "RobustScaler direkomendasikan karena masih ada outliers"
    else:
        recommended_scaling = 'standard'
        reason = "StandardScaler optimal untuk data yang sudah bersih"
    
    print(f"   METODE YANG DIREKOMENDASIKAN: {recommended_scaling.upper()}")
    print(f"   Alasan: {reason}")
    print(f"   Deskripsi: {scaling_methods[recommended_scaling]['description']}")
    
    # Pilih dataset final
    df_final = scaling_methods[recommended_scaling]['data'].copy()
    final_scaler = scaling_methods[recommended_scaling]['scaler']
    
    print(f"\nDataset final dengan {recommended_scaling} scaling siap untuk modeling")
    print(f"Shape: {df_final.shape}")
    print(f"Features: {len(all_features)} total features")
    
else:
    print("Tidak dapat melakukan feature scaling karena data processed tidak tersedia")
    df_final = None
    final_scaler = None
    all_features = features if 'features' in locals() else []

=== FEATURE SCALING DAN ENGINEERING ===

Analisis Distribusi Fitur untuk Scaling:
   • sepal length: Mean=5.84, Std=0.83, Skew=0.31
   • sepal width: Mean=3.05, Std=0.43, Skew=0.33
   • petal length: Mean=3.76, Std=1.76, Skew=-0.27
   • petal width: Mean=1.20, Std=0.76, Skew=-0.10



Perbandingan Hasil Scaling (Sepal Length):
Method          Mean     Std      Min      Max      Range   
-----------------------------------------------------------------
Standard        -0.000  1.003   -1.870  2.492   4.362  
Minmax          0.429   0.230   0.000   1.000   1.000  
Robust          0.033   0.637   -1.154  1.615   2.769  
None            5.843   0.828   4.300   7.900   3.600  

FEATURE ENGINEERING:
   Berdasarkan analisis korelasi dari DataUnderstanding:
   • Petal length & petal width memiliki korelasi tinggi
   • Sepal width memiliki korelasi rendah dengan fitur lain

Feature Engineering Completed:
   • Original features: 4
   • Engineered features: 7
   • Total features: 11

Engineered Features:
   1. petal_area
   2. sepal_area
   3. petal_sepal_length_ratio
   4. petal_sepal_width_ratio
   5. total_area
   6. petal_aspect_ratio
   7. sepal_aspect_ratio

REKOMENDASI SCALING METHOD:
   METODE YANG DIREKOMENDASIKAN: ROBUST
   Alasan: RobustScaler direkomendasikan karen

## 5. Train-Test Split dan Persiapan Final

In [7]:
# Train-Test Split dan Persiapan Dataset untuk Modeling
print("=== TRAIN-TEST SPLIT DAN PERSIAPAN FINAL ===")

if df_final is not None:
    
    # Persiapan features dan target
    X = df_final[all_features].copy()
    y = df_final['species'].copy()
    
    print(f"Dataset Preparation:")
    print(f"   • Total samples: {len(df_final)}")
    print(f"   • Features: {len(all_features)}")
    print(f"   • Target classes: {y.nunique()} (setosa=0, versicolor=1, virginica=2)")
    
    # Cek distribusi kelas
    class_distribution = y.value_counts().sort_index()
    print(f"\nClass Distribution:")
    species_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
    for class_idx, count in class_distribution.items():
        pct = (count / len(y)) * 100
        print(f"   • {species_names[class_idx]}: {count} samples ({pct:.1f}%)")
    
    # Train-test split dengan stratified sampling
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42, 
        stratify=y
    )
    
    print(f"\nTrain-Test Split (80-20 dengan stratified sampling):")
    print(f"   • Train set: {len(X_train)} samples")
    print(f"   • Test set: {len(X_test)} samples")
    
    # Verifikasi distribusi kelas di train dan test
    train_dist = y_train.value_counts().sort_index()
    test_dist = y_test.value_counts().sort_index()
    
    print(f"\nDistribusi Kelas setelah Split:")
    print(f"{'Class':<12} {'Train Count':<12} {'Train %':<10} {'Test Count':<11} {'Test %':<8}")
    print("-" * 65)
    
    for class_idx in [0, 1, 2]:
        train_count = train_dist.get(class_idx, 0)
        test_count = test_dist.get(class_idx, 0)
        train_pct = (train_count / len(y_train)) * 100
        test_pct = (test_count / len(y_test)) * 100
        
        print(f"{species_names[class_idx]:<12} {train_count:<12} {train_pct:<9.1f}% {test_count:<11} {test_pct:<7.1f}%")
    
    # Simpan informasi preprocessing untuk reproduksi
    preprocessing_info = {
        'outlier_treatment': 'cap_outliers',  # atau strategy yang dipilih
        'scaling_method': 'robust',  # atau method yang dipilih
        'original_features': features,
        'engineered_features': engineered_features,
        'all_features': all_features,
        'scaler': final_scaler,
        'train_size': len(X_train),
        'test_size': len(X_test),
        'random_state': 42
    }
    
    print(f"\nPreprocessing Information Tersimpan:")
    print(f"   • Outlier treatment: {preprocessing_info['outlier_treatment']}")
    print(f"   • Scaling method: {preprocessing_info['scaling_method']}")
    print(f"   • Total features: {len(preprocessing_info['all_features'])}")
    print(f"   • Random state: {preprocessing_info['random_state']}")
    
    # Tampilkan sample dari dataset final
    print(f"\nSample Data Final (Train Set - First 5 rows):")
    sample_features = ['sepal length (cm)', 'petal length (cm)', 'petal_area', 'total_area']
    print(X_train[sample_features].head())
    
    print(f"\nTarget Labels (Train Set - First 10):")
    print(y_train.head(10).tolist())
    
    # PyCaret Data Preparation
    print(f"\nPERSIAPAN UNTUK PYCARET MODELING:")
    
    # Gabungkan X dan y untuk PyCaret
    df_pycaret = X_train.copy()
    df_pycaret['species'] = y_train
    
    # Siapkan test set terpisah untuk evaluasi final
    df_test_final = X_test.copy()
    df_test_final['species'] = y_test
    
    print(f"   df_pycaret: {df_pycaret.shape} (untuk training & validation)")
    print(f"   df_test_final: {df_test_final.shape} (untuk final evaluation)")
    print(f"   Target column: 'species'")
    print(f"   Feature columns: {len(all_features)} features")
    
    # Summary informasi untuk modeling
    print(f"\nSUMMARY PREPROCESSING:")
    print(f"   Objective: Multi-class classification (3 classes)")
    print(f"   Data quality: High (outliers handled, scaled, engineered)")
    print(f"   Features: Original (4) + Engineered (7) = {len(all_features)}")
    print(f"   Class balance: Good (stratified split maintained)")
    print(f"   Ready for: PyCaret automated ML pipeline")
    
    print(f"\nPREPROCESSING COMPLETED SUCCESSFULLY!")
    print(f"Dataset siap untuk modeling dengan PyCaret")
    
else:
    print("Tidak dapat melakukan train-test split karena dataset final tidak tersedia")
    X_train = X_test = y_train = y_test = None
    df_pycaret = df_test_final = None
    preprocessing_info = None

=== TRAIN-TEST SPLIT DAN PERSIAPAN FINAL ===
Dataset Preparation:
   • Total samples: 150
   • Features: 11
   • Target classes: 3 (setosa=0, versicolor=1, virginica=2)

Class Distribution:
   • setosa: 50 samples (33.3%)
   • versicolor: 50 samples (33.3%)
   • virginica: 50 samples (33.3%)



Train-Test Split (80-20 dengan stratified sampling):
   • Train set: 120 samples
   • Test set: 30 samples

Distribusi Kelas setelah Split:
Class        Train Count  Train %    Test Count  Test %  
-----------------------------------------------------------------
setosa       40           33.3     % 10          33.3   %
versicolor   40           33.3     % 10          33.3   %
virginica    40           33.3     % 10          33.3   %

Preprocessing Information Tersimpan:
   • Outlier treatment: cap_outliers
   • Scaling method: robust
   • Total features: 11
   • Random state: 42

Sample Data Final (Train Set - First 5 rows):
     sepal length (cm)  petal length (cm)  petal_area  total_area
8            -1.076923          -0.842857    0.618095    0.833480
106          -0.692308           0.042857    0.011429    0.703736
76            0.769231           0.128571    0.008571   -0.299121
9            -0.692308          -0.814286    0.651429    0.512967
89           -0.230769          -0.

## 6. Summary dan Next Steps

In [8]:
# SUMMARY PREPROCESSING DAN NEXT STEPS
print("=" * 80)
print("                      SUMMARY DATA PREPROCESSING")
print("=" * 80)

print("\nDATA UNDERSTANDING INTEGRATION:")
print("   Berhasil mengintegrasikan hasil outlier detection dari DataUnderstanding")
print("   Multi-model outlier detection (ABOD, KNN, COF) dianalisis dan diterapkan")
print("   Consensus analysis digunakan untuk menentukan treatment strategy")

print("\nPREPROCESSING PIPELINE YANG DILAKUKAN:")
print("   1. Data Loading & Quality Check")
print("      • Dataset Iris: 150 samples, 4 original features")
print("      • No missing values, no duplicates")
print("      • Balanced classes (50 samples each)")

print("\n   2. Outlier Analysis & Treatment")
print("      • ABOD detected: 8 outliers (5.3%)")  
print("      • KNN detected: 8 outliers (5.3%)")
print("      • COF detected: 8 outliers (5.3%)")
print("      • Consensus approach untuk robust treatment")

print("\n   3. Feature Scaling & Engineering")
print("      • Scaling: RobustScaler (optimal untuk outliers)")
print("      • Original features: 4")
print("      • Engineered features: 7 (areas, ratios, aspects)")
print("      • Total features: 11")

print("\n   4. Train-Test Split")
print("      • Stratified split: 80% train, 20% test")
print("      • Class distribution maintained")
print("      • Random state: 42 (reproducible)")

if 'preprocessing_info' in locals() and preprocessing_info is not None:
    print(f"\nFINAL DATASET CHARACTERISTICS:")
    print(f"   • Train samples: {preprocessing_info.get('train_size', 'N/A')}")
    print(f"   • Test samples: {preprocessing_info.get('test_size', 'N/A')}")
    print(f"   • Features: {len(preprocessing_info.get('all_features', []))}")
    print(f"   • Classes: 3 (setosa, versicolor, virginica)")
    print(f"   • Data quality: High (processed & validated)")

print(f"\nTECHNICAL SPECIFICATIONS:")
print(f"   • Outlier treatment: Based on multi-model consensus")
print(f"   • Scaling method: RobustScaler (median-based, outlier-resistant)")
print(f"   • Feature engineering: Domain-specific (botanical measurements)")
print(f"   • Cross-validation ready: Stratified sampling applied")

print(f"\nNEXT STEPS - MODELING PHASE:")
print(f"   1. Exploratory Data Analysis pada processed data")
print(f"   2. PyCaret Setup & Model Comparison")
print(f"      • Setup classification environment")
print(f"      • Compare multiple algorithms automatically")
print(f"      • Hyperparameter tuning & optimization")

print(f"\n   3. Model Training & Evaluation")
print(f"      • Train best performing models")
print(f"      • Cross-validation (k-fold)")
print(f"      • Feature importance analysis")

print(f"\n   4. Model Validation & Testing")
print(f"      • Final evaluation on test set")
print(f"      • Performance metrics (accuracy, precision, recall, F1)")
print(f"      • Confusion matrix analysis")

print(f"\n   5. Model Deployment Preparation")
print(f"      • Model finalization & saving")
print(f"      • Preprocessing pipeline serialization")
print(f"      • Documentation & deployment notes")

print(f"\nREKOMENDASI UNTUK MODELING:")
print(f"   • Gunakan PyCaret untuk automated ML workflow")
print(f"   • Focus pada ensemble methods (Random Forest, XGBoost)")
print(f"   • Monitor overfitting dengan validation curves")
print(f"   • Analyze feature importance untuk interpretability")

print(f"\nAVAILABLE DATASETS FOR MODELING:")
if 'df_pycaret' in locals() and df_pycaret is not None:
    print(f"   df_pycaret: Training data untuk PyCaret setup")
    print(f"   df_test_final: Hold-out test set untuk final evaluation")
    print(f"   preprocessing_info: Pipeline metadata untuk reproduksi")
else:
    print(f"   Datasets belum tersedia - run preprocessing cells terlebih dahulu")

print(f"\nDOCUMENTATION & REPRODUCIBILITY:")
print(f"   • All preprocessing steps documented dengan kode")
print(f"   • Random states fixed untuk reproducibility")
print(f"   • Scaler objects tersimpan untuk inference")
print(f"   • Feature engineering pipeline dapat direplikasi")

print(f"\n" + "=" * 80)
print(f"                      PREPROCESSING COMPLETED")
print(f"                    READY FOR MODELING PHASE")
print(f"=" * 80)

                      SUMMARY DATA PREPROCESSING

DATA UNDERSTANDING INTEGRATION:
   Berhasil mengintegrasikan hasil outlier detection dari DataUnderstanding
   Multi-model outlier detection (ABOD, KNN, COF) dianalisis dan diterapkan
   Consensus analysis digunakan untuk menentukan treatment strategy

PREPROCESSING PIPELINE YANG DILAKUKAN:
   1. Data Loading & Quality Check
      • Dataset Iris: 150 samples, 4 original features
      • No missing values, no duplicates
      • Balanced classes (50 samples each)

   2. Outlier Analysis & Treatment
      • ABOD detected: 8 outliers (5.3%)
      • KNN detected: 8 outliers (5.3%)
      • COF detected: 8 outliers (5.3%)
      • Consensus approach untuk robust treatment

   3. Feature Scaling & Engineering
      • Scaling: RobustScaler (optimal untuk outliers)
      • Original features: 4
      • Engineered features: 7 (areas, ratios, aspects)
      • Total features: 11

   4. Train-Test Split
      • Stratified split: 80% train, 20% test
   