In [16]:
# preprocessing_data.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

def load_and_clean_data(file_path):
    """Load the raw dataset and clean it (document Chapter 3.2.3).

    Args:
        file_path: Path of the Excel workbook, passed to ``pd.read_excel``.
            An already-loaded ``pandas.DataFrame`` is accepted as well; it is
            copied first, so the caller's frame is never modified.

    Returns:
        tuple: ``(data, num_cols, cat_cols)`` where

        * ``data`` is the cleaned DataFrame: every column in ``num_cols`` is
          ``float``, ``proteinuria`` is ``str`` and ``diagnosis_lanjutan`` is
          encoded as ``0`` ('Tidak') / ``1`` ('Preeklamsia'). Any other label
          becomes ``NaN`` and is reported on stdout.
        * ``num_cols`` / ``cat_cols`` are the lists of numerical and
          categorical feature names.

    Raises:
        KeyError: If a column this function needs is absent from the data.
        ValueError: If a numerical column holds a value that cannot be
            converted to ``float`` (raised by ``astype``).
    """
    print("🕒 Loading data...")
    if isinstance(file_path, pd.DataFrame):
        data_original = file_path.copy()
    else:
        data_original = pd.read_excel(file_path)

    # Decimal commas -> decimal points. With regex=True the substitution is
    # applied inside every string cell of the frame, not only numeric columns.
    data_original = data_original.replace({',': '.'}, regex=True)

    # Column definitions
    num_cols = ['usia', 'bmi', 'sistolik', 'diastolik', 'hb', 'berat_janin', 'cairan_ketuban']
    cat_cols = ['proteinuria', 'diabetes', 'riwayat_hipertensi', 'riwayat_keluarga',
                'primigravida', 'kehamilan_kembar']
    label_col = 'diagnosis_lanjutan'

    # Fail early, with a readable message, if a column touched below is absent.
    missing = [col for col in num_cols + ['proteinuria', label_col]
               if col not in data_original.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Convert numerical columns
    data_original[num_cols] = data_original[num_cols].astype(float)

    # Normalise the diagnosis label. strip() + capitalize() already folds
    # whitespace and case variants ('TIdak', 'tidak', 'preeklamsia', ...) into
    # their canonical form, so only the alternative spelling needs a mapping.
    labels = data_original[label_col].astype(str).str.strip().str.capitalize()
    labels = labels.replace({'Preeklampsia': 'Preeklamsia'})

    # Proteinuria stays a string so that it is one-hot encoded by level.
    data_original['proteinuria'] = data_original['proteinuria'].astype(str)

    # Label encoding
    label_mapping = {'Tidak': 0, 'Preeklamsia': 1}
    encoded = labels.map(label_mapping)

    # map() turns every label outside label_mapping into NaN; make that visible
    # instead of letting it pass silently.
    unmapped = encoded.isna()
    if unmapped.any():
        print(f"⚠️ Unrecognised diagnosis labels set to NaN: {sorted(labels[unmapped].unique())}")

    data_original[label_col] = encoded

    return data_original, num_cols, cat_cols


def create_preprocessor(num_cols, cat_cols):
    """Build the unfitted preprocessor of Chapter 3.3.1 (with the proteinuria fix).

    Args:
        num_cols: Names of the numerical columns; they are standardised.
        cat_cols: Names of the categorical columns. 'proteinuria' is encoded
            on its own, every other entry shares a second encoder.

    Returns:
        ColumnTransformer: three steps named 'num', 'proteinuria' and 'cat',
        in that order. Nothing is fitted here.
    """
    # Every categorical column except proteinuria goes through the shared encoder.
    remaining_cats = [name for name in cat_cols if name != 'proteinuria']

    steps = [
        ('num', StandardScaler(), num_cols),
        # drop=None keeps one indicator per proteinuria level, so '+' is not lost.
        ('proteinuria', OneHotEncoder(handle_unknown='ignore', drop=None), ['proteinuria']),
        # The remaining categoricals drop their first level.
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), remaining_cats),
    ]

    return ColumnTransformer(steps)


def save_cleaned_data(data, filename='data_cleaned.pkl'):
    """Pickle ``data`` to ``filename`` and print a confirmation line.

    Args:
        data: Any picklable object (``main`` passes a dict bundling the
            cleaned frame, the column lists and the preprocessor).
        filename: Destination path. The file is opened in ``'wb'`` mode, so an
            existing file is overwritten.
    """
    import pickle

    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    print(f"✅ Data cleaned saved as {filename}")

def main(file_path='data_collect_from_bidan_fix.xlsx', output_file='data_cleaned.pkl'):
    """Run the preprocessing pipeline: load, clean, bundle, save, summarise.

    Args:
        file_path: Excel workbook handed to ``load_and_clean_data``.
        output_file: Pickle file the bundle is written to.

    Returns:
        dict: The saved bundle with keys ``'data_cleaned'``, ``'num_cols'``,
        ``'cat_cols'`` and ``'preprocessor'``.
    """
    data_cleaned, num_cols, cat_cols = load_and_clean_data(file_path)
    # The preprocessor is stored as built; it is not fitted in this function.
    preprocessor = create_preprocessor(num_cols, cat_cols)

    data_to_save = {
        'data_cleaned': data_cleaned,
        'num_cols': num_cols,
        'cat_cols': cat_cols,
        'preprocessor': preprocessor,
    }

    save_cleaned_data(data_to_save, output_file)

    print("\n📊 DATA SUMMARY:")
    print("   • Shape:", data_cleaned.shape)
    print("   • Numerical features:", len(num_cols))
    print("   • Categorical features:", len(cat_cols))
    print("   • Class distribution:", data_cleaned['diagnosis_lanjutan'].value_counts().to_dict())
    # Total count of NaN cells over the whole cleaned frame.
    print("   • Missing values:", data_cleaned.isnull().sum().sum())

    return data_to_save

# Run the pipeline only when this code is executed directly (not on import)
# and keep the returned bundle in a module-level name.
if __name__ == "__main__":
    data_to_save = main()


🕒 Loading data...
✅ Data cleaned saved as data_cleaned.pkl

📊 DATA SUMMARY:
   • Shape: (457, 14)
   • Numerical features: 7
   • Categorical features: 6
   • Class distribution: {0: 275, 1: 182}
   • Missing values: 0


In [17]:
import pandas as pd
from IPython.display import display

# --- Load the data and build the preprocessor ---
data_cleaned, num_cols, cat_cols = load_and_clean_data('data_collect_from_bidan_fix.xlsx')
preprocessor = create_preprocessor(num_cols, cat_cols)

# Feature matrix: numerical columns first, then the categorical ones
X = data_cleaned[num_cols + cat_cols]

# Fit the preprocessor and transform the data
X_transformed = preprocessor.fit_transform(X)

# ColumnTransformer returns a scipy sparse matrix when the stacked output is
# sparse enough; pd.DataFrame needs a dense array to apply the column names.
if hasattr(X_transformed, 'toarray'):
    X_transformed = X_transformed.toarray()

# Names of the one-hot columns produced for proteinuria
proteinuria_columns = preprocessor.named_transformers_['proteinuria'].get_feature_names_out(['proteinuria'])

# Names of the one-hot columns produced for the other categorical features
other_cat_cols = [col for col in cat_cols if col != 'proteinuria']
other_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(other_cat_cols)

# Concatenate in the same order as the transformers: num, proteinuria, cat
all_columns = num_cols + list(proteinuria_columns) + list(other_columns)

# DataFrame holding the preprocessed features
X_df = pd.DataFrame(X_transformed, columns=all_columns)

# --- Show the first 10 rows as a tidy styled table ---
display(
    X_df.head(10).style.set_properties(**{
        'background-color': 'white',
        'border': '1px solid #ccc',
        'padding': '4px',
        'font-size': '12px'
    }).set_table_attributes('style="border-collapse: collapse; width: 100%;"')
)


🕒 Loading data...


Unnamed: 0,usia,bmi,sistolik,diastolik,hb,berat_janin,cairan_ketuban,proteinuria_+,proteinuria_++,proteinuria_+++,proteinuria_negatif,diabetes_tidak,riwayat_hipertensi_tidak,riwayat_keluarga_tidak,primigravida_tidak,kehamilan_kembar_tidak
0,0.077767,1.752558,0.04512,0.322788,-0.347023,-0.092542,-0.830545,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1,1.020102,-1.041668,1.283779,-1.585094,-1.291353,-1.25654,1.729637,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.750863,-0.359109,1.065192,0.322788,-1.073431,1.588788,0.835989,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
3,1.154721,0.83537,-0.027742,-0.284266,0.887871,-1.25654,0.860141,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
4,-0.999187,-0.231129,0.409432,2.490835,-1.436635,-0.092542,-0.806393,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
5,-0.999187,0.83537,0.846605,-1.671816,0.452026,-1.385873,0.763531,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
6,-0.056852,-0.657729,-0.027742,-1.671816,0.669949,1.459455,-0.468255,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
7,0.481625,-1.468268,-0.756364,-0.631153,-0.782868,1.071455,0.932599,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
8,-0.191472,0.04616,0.919467,-0.284266,-1.291353,-1.773873,-0.250881,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
9,-1.672283,1.645909,-0.246329,-1.064763,0.669949,1.071455,1.560569,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
