In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, regularizers
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# 1. LOAD DATA
print("Loading dataset...")
df = pd.read_excel('Stunting-1.xlsx')


Loading dataset...


In [3]:
# 2. EXPLORATORY DATA ANALYSIS
print("\n======= EXPLORATORY DATA ANALYSIS =======")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nCheck for missing values:")
print(df.isnull().sum())
print("\nSummary statistics:")
print(df.describe())

# Check the distribution of the target variable
print("\nDistribution of Stunting:")
print(df['Stunting'].value_counts())
print(df['Stunting'].value_counts(normalize=True).round(3) * 100)


Dataset shape: (6500, 8)

First 5 rows:
  Sex  Age  Birth Weight  Birth Length  Body Weight  Body Length  \
0   F   56           2.9            50         11.0         90.0   
1   F   20           3.3            49         11.1         80.5   
2   M    4           2.8            48          6.5         63.0   
3   F   14           2.0            49          7.0         71.0   
4   M   32           3.2            49         11.0         88.7   

  ASI Eksklusif Stunting  
0           Yes       No  
1            No       No  
2            No       No  
3           Yes       No  
4           Yes       No  

Data types:
Sex               object
Age                int64
Birth Weight     float64
Birth Length       int64
Body Weight      float64
Body Length      float64
ASI Eksklusif     object
Stunting          object
dtype: object

Check for missing values:
Sex              0
Age              0
Birth Weight     0
Birth Length     0
Body Weight      0
Body Length      0
ASI Eksklusif    0
S

In [4]:
# 3. DATA VISUALIZATION
print("\n======= DATA VISUALIZATION =======")
plt.figure(figsize=(15, 10))

# Distribution of numeric features
numeric_cols = ['Age', 'Birth Weight', 'Birth Length', 'Body Weight', 'Body Length']
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols):
    plt.subplot(2, 3, i+1)
    sns.histplot(data=df, x=col, hue='Stunting', kde=True, bins=30)
    plt.title(f'Distribution of {col} by Stunting Status')
plt.tight_layout()
plt.savefig('numeric_distributions.png')
plt.close()

# Correlation matrix
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr = numeric_df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# Categorical variables distributions
cat_cols = ['Sex', 'ASI Eksklusif']
plt.figure(figsize=(12, 5))
for i, col in enumerate(cat_cols):
    plt.subplot(1, 2, i+1)
    sns.countplot(data=df, x=col, hue='Stunting')
    plt.title(f'{col} vs Stunting')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('categorical_distributions.png')
plt.close()




<Figure size 1500x1000 with 0 Axes>

In [5]:
# 4. FEATURE ENGINEERING AND WHO STANDARDS IMPLEMENTATION
print("\n======= FEATURE ENGINEERING AND WHO STANDARDS =======")

# WHO Child Growth Standards - IMPROVED IMPLEMENTATION
# References from WHO growth standards data tables for Height-for-age
# This implementation uses actual WHO references for more accurate z-scores

# Load WHO reference tables (simplified version for this implementation)
# In a real-world scenario, you would load the full WHO tables from external files
# This is a simplified approach using key reference points

# WHO Height-for-age reference data (simplified)
def get_height_for_age_reference(age_months, sex):
    """
    Returns median (M), standard deviation (SD) for height-for-age based on WHO standards

    Args:
        age_months: Age in months
        sex: 'M' for male, 'F' for female

    Returns:
        tuple (M, SD): Median height and SD for the given age and sex
    """
    # Reference tables (simplified)
    # Format: age in months: (median_boys, sd_boys, median_girls, sd_girls)
    references = {
        0: (49.9, 1.9, 49.1, 1.9),
        3: (61.4, 2.4, 59.8, 2.4),
        6: (67.6, 2.5, 65.7, 2.5),
        9: (72.3, 2.7, 70.4, 2.7),
        12: (76.0, 2.8, 74.3, 2.8),
        18: (82.4, 3.1, 80.7, 3.1),
        24: (87.8, 3.4, 86.4, 3.4),
        36: (96.1, 3.8, 95.1, 3.9),
        48: (102.9, 4.2, 101.9, 4.3),
        60: (109.1, 4.5, 108.4, 4.6)
    }

    # Find the closest reference age
    closest_age = min(references.keys(), key=lambda x: abs(x - age_months))

    # If age is between reference points, use linear interpolation
    if closest_age != age_months and abs(closest_age - age_months) <= 12:
        # Find the next reference point for interpolation
        ages = sorted(references.keys())
        idx = ages.index(closest_age)

        if closest_age < age_months and idx < len(ages) - 1:
            next_age = ages[idx + 1]
        elif closest_age > age_months and idx > 0:
            next_age = ages[idx - 1]
            closest_age, next_age = next_age, closest_age  # Swap for correct interpolation
        else:
            # Use closest if we can't interpolate
            if sex == 'M':
                return references[closest_age][0], references[closest_age][1]
            else:
                return references[closest_age][2], references[closest_age][3]

        # Interpolate
        age_diff = next_age - closest_age
        weight = (age_months - closest_age) / age_diff

        if sex == 'M':
            m1, sd1 = references[closest_age][0], references[closest_age][1]
            m2, sd2 = references[next_age][0], references[next_age][1]
        else:
            m1, sd1 = references[closest_age][2], references[closest_age][3]
            m2, sd2 = references[next_age][2], references[next_age][3]

        median = m1 + weight * (m2 - m1)
        sd = sd1 + weight * (sd2 - sd1)

        return median, sd

    # Return values for closest age if exact match or no interpolation possible
    if sex == 'M':
        return references[closest_age][0], references[closest_age][1]
    else:
        return references[closest_age][2], references[closest_age][3]

# More accurate height-for-age z-score calculation
def calculate_height_for_age_z(row):
    """Calculate height-for-age z-score using WHO standards"""
    age = row['Age']
    height = row['Body Length']
    sex = row['Sex']

    # Get reference values from WHO standards
    median, sd = get_height_for_age_reference(age, sex)

    # Calculate z-score
    z_score = (height - median) / sd

    return z_score

# Calculate BMI and WHO Z-scores
df['BMI'] = df['Body Weight'] / ((df['Body Length']/100) ** 2)
df['Height_for_Age_Z'] = df.apply(calculate_height_for_age_z, axis=1)

# Define stunting according to WHO standards
# WHO Definition: stunting is defined as a height-for-age z-score that is more than
# two standard deviations below the WHO Child Growth Standards median
df['WHO_Stunting'] = np.where(df['Height_for_Age_Z'] < -2, 'Yes', 'No')

# Create WHO severity classification
def who_stunting_classification(z_score):
    if z_score < -3:
        return "Severely stunted (WHO)"
    elif z_score < -2:
        return "Stunted (WHO)"
    else:
        return "Not stunted (WHO)"

df['WHO_Classification'] = df['Height_for_Age_Z'].apply(who_stunting_classification)

# Create age groups (age in months)
age_bins = [0, 6, 12, 24, 36, 48, 60]
age_labels = ['0-6m', '7-12m', '13-24m', '25-36m', '37-48m', '49-60m']
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Create birth weight categories
df['Birth_Weight_Cat'] = pd.cut(df['Birth Weight'],
                               bins=[0, 2.5, 3.5, 5],
                               labels=['Low', 'Normal', 'High'])

# Compare original labels with WHO classification
print("\nOriginal Stunting vs WHO Stunting:")
comparison_table = pd.crosstab(df['Stunting'], df['WHO_Stunting'],
                               margins=True, margins_name='Total')
print(comparison_table)
print(f"\nAgreement rate: {(comparison_table.iloc[0, 0] + comparison_table.iloc[1, 1]) / comparison_table.iloc[2, 2]:.2%}")

# Check the distribution of z-scores by original stunting label
plt.figure(figsize=(10, 6))
sns.boxplot(x='Stunting', y='Height_for_Age_Z', data=df)
plt.axhline(y=-2, color='r', linestyle='--', label='WHO Stunting Threshold (z=-2)')
plt.axhline(y=-3, color='orange', linestyle='--', label='WHO Severe Stunting Threshold (z=-3)')
plt.title('Height-for-Age Z-scores by Original Stunting Label')
plt.legend()
plt.savefig('z_score_by_original_label.png')
plt.close()

# For this improved model, we'll use WHO standards as our target
# We're redefining our target based on the WHO definition
print("\nUsing WHO standards as our target:")
print(df['WHO_Stunting'].value_counts())
print(df['WHO_Stunting'].value_counts(normalize=True).round(3) * 100)




Original Stunting vs WHO Stunting:
WHO_Stunting    No   Yes  Total
Stunting                       
No            3115    73   3188
Yes            643  2669   3312
Total         3758  2742   6500

Agreement rate: 88.98%

Using WHO standards as our target:
WHO_Stunting
No     3758
Yes    2742
Name: count, dtype: int64
WHO_Stunting
No     57.8
Yes    42.2
Name: proportion, dtype: float64


In [6]:
# 5. PREPROCESSING FOR MODELING
print("\n======= PREPROCESSING FOR MODELING =======")

# Define features and target
# NOTE: We're now using WHO_Stunting as our target!
X = df.drop(['Stunting', 'WHO_Stunting', 'WHO_Classification', 'Age_Group', 'Birth_Weight_Cat'], axis=1)
y = (df['WHO_Stunting'] == 'Yes').astype(int)  # Binary target based on WHO definition

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define preprocessing for numerical and categorical features
numeric_features = ['Age', 'Birth Weight', 'Birth Length', 'Body Weight', 'Body Length',
                    'BMI', 'Height_for_Age_Z']
categorical_features = ['Sex', 'ASI Eksklusif']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

# Save preprocessor for inference
import joblib
joblib.dump(preprocessor, 'stunting_preprocessor.joblib')


Processed training data shape: (5200, 9)
Processed test data shape: (1300, 9)


['stunting_preprocessor.joblib']

In [7]:
# 6. MODEL BUILDING WITH TENSORFLOW
print("\n======= MODEL BUILDING WITH TENSORFLOW =======")

def build_model(input_shape):
    model = models.Sequential([
        # Input layer
        layers.Dense(64, activation='relu', input_shape=(input_shape,),
                    kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        # Hidden layers
        layers.Dense(128, activation='relu',
                    kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(64, activation='relu',
                    kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        # Output layer
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='auc')
        ]
    )

    return model

# Define callbacks
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001
)

checkpoint = callbacks.ModelCheckpoint(
    'stunting_model_best.h5',
    monitor='val_auc',
    save_best_only=True,
    mode='max'
)





In [8]:
# 7. TRAINING WITH K-FOLD CROSS VALIDATION
print("\n======= TRAINING WITH K-FOLD CROSS VALIDATION =======")

n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_processed, y_train)):
    print(f"\nTraining fold {fold+1}/{n_splits}")

    # Get fold training and validation data
    X_fold_train, X_fold_val = X_train_processed[train_idx], X_train_processed[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Build and train the model
    model = build_model(X_train_processed.shape[1])

    history = model.fit(
        X_fold_train, y_fold_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_fold_val, y_fold_val),
        callbacks=[early_stopping, reduce_lr, checkpoint],
        verbose=1
    )

    # Evaluate on validation set
    scores = model.evaluate(X_fold_val, y_fold_val, verbose=0)
    fold_scores.append(scores)

    # Plot training curves
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Fold {fold+1} Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['auc'], label='Training AUC')
    plt.plot(history.history['val_auc'], label='Validation AUC')
    plt.title(f'Fold {fold+1} AUC')
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'fold_{fold+1}_training_curves.png')
    plt.close()

# Print average scores
metrics = ['loss', 'accuracy', 'precision', 'recall', 'auc']
avg_scores = np.mean(fold_scores, axis=0)
print("\nAverage scores across folds:")
for i, metric in enumerate(metrics):
    print(f"{metric}: {avg_scores[i]:.4f}")



Training fold 1/5
Epoch 1/100
[1m125/130[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.7576 - auc: 0.8345 - loss: 0.6912 - precision: 0.6925 - recall: 0.7666



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.7611 - auc: 0.8382 - loss: 0.6846 - precision: 0.6970 - recall: 0.7693 - val_accuracy: 0.9365 - val_auc: 0.9938 - val_loss: 0.4777 - val_precision: 0.9947 - val_recall: 0.8542 - learning_rate: 0.0010
Epoch 2/100
[1m112/130[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 3ms/step - accuracy: 0.9067 - auc: 0.9692 - loss: 0.3987 - precision: 0.8837 - recall: 0.8954



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9070 - auc: 0.9696 - loss: 0.3970 - precision: 0.8850 - recall: 0.8946 - val_accuracy: 0.9760 - val_auc: 0.9976 - val_loss: 0.3087 - val_precision: 0.9836 - val_recall: 0.9590 - learning_rate: 0.0010
Epoch 3/100
[1m125/130[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9132 - auc: 0.9784 - loss: 0.3532 - precision: 0.8865 - recall: 0.9097



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9136 - auc: 0.9785 - loss: 0.3527 - precision: 0.8874 - recall: 0.9096 - val_accuracy: 0.9769 - val_auc: 0.9992 - val_loss: 0.2399 - val_precision: 0.9540 - val_recall: 0.9932 - learning_rate: 0.0010
Epoch 4/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9385 - auc: 0.9878 - loss: 0.3001 - precision: 0.9212 - recall: 0.9331 - val_accuracy: 0.9817 - val_auc: 0.9992 - val_loss: 0.2123 - val_precision: 0.9626 - val_recall: 0.9954 - learning_rate: 0.0010
Epoch 5/100
[1m113/130[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 3ms/step - accuracy: 0.9458 - auc: 0.9907 - loss: 0.2746 - precision: 0.9263 - recall: 0.9456



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9456 - auc: 0.9906 - loss: 0.2744 - precision: 0.9274 - recall: 0.9438 - val_accuracy: 0.9817 - val_auc: 0.9995 - val_loss: 0.1930 - val_precision: 0.9646 - val_recall: 0.9932 - learning_rate: 0.0010
Epoch 6/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9517 - auc: 0.9896 - loss: 0.2695 - precision: 0.9397 - recall: 0.9454 - val_accuracy: 0.9856 - val_auc: 0.9993 - val_loss: 0.1827 - val_precision: 0.9732 - val_recall: 0.9932 - learning_rate: 0.0010
Epoch 7/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9542 - auc: 0.9911 - loss: 0.2511 - precision: 0.9434 - recall: 0.9474 - val_accuracy: 0.9865 - val_auc: 0.9989 - val_loss: 0.1709 - val_precision: 0.9754 - val_recall: 0.9932 - learn



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9673 - auc: 0.9952 - loss: 0.1989 - precision: 0.9543 - recall: 0.9685 - val_accuracy: 0.9933 - val_auc: 0.9997 - val_loss: 0.1339 - val_precision: 0.9909 - val_recall: 0.9932 - learning_rate: 0.0010
Epoch 12/100
[1m118/130[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.9679 - auc: 0.9945 - loss: 0.1912 - precision: 0.9626 - recall: 0.9608



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9677 - auc: 0.9945 - loss: 0.1912 - precision: 0.9622 - recall: 0.9608 - val_accuracy: 0.9942 - val_auc: 0.9998 - val_loss: 0.1258 - val_precision: 0.9909 - val_recall: 0.9954 - learning_rate: 0.0010
Epoch 13/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9698 - auc: 0.9954 - loss: 0.1830 - precision: 0.9644 - recall: 0.9636 - val_accuracy: 0.9933 - val_auc: 0.9995 - val_loss: 0.1229 - val_precision: 0.9932 - val_recall: 0.9909 - learning_rate: 0.0010
Epoch 14/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9698 - auc: 0.9955 - loss: 0.1759 - precision: 0.9686 - recall: 0.9591 - val_accuracy: 0.9942 - val_auc: 0.9998 - val_loss: 0.1157 - val_precision: 0.9932 - val_recall: 0.9932 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9762 - auc: 0.9970 - loss: 0.1485 - precision: 0.9701 - recall: 0.9733 - val_accuracy: 0.9942 - val_auc: 0.9998 - val_loss: 0.1001 - val_precision: 0.9909 - val_recall: 0.9954 - learning_rate: 0.0010
Epoch 18/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9824 - auc: 0.9979 - loss: 0.1318 - precision: 0.9764 - recall: 0.9819 - val_accuracy: 0.9971 - val_auc: 0.9998 - val_loss: 0.0943 - val_precision: 0.9955 - val_recall: 0.9977 - learning_rate: 0.0010
Epoch 19/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9843 - auc: 0.9977 - loss: 0.1281 - precision: 0.9818 - recall: 0.9808 - val_accuracy: 0.9952 - val_auc: 0.9986 - val_loss: 0.0921 - val_precision: 0.9932 - val_recall: 0.9954 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9798 - auc: 0.9980 - loss: 0.1212 - precision: 0.9768 - recall: 0.9752 - val_accuracy: 0.9971 - val_auc: 1.0000 - val_loss: 0.0795 - val_precision: 0.9955 - val_recall: 0.9977 - learning_rate: 0.0010
Epoch 22/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9822 - auc: 0.9985 - loss: 0.1133 - precision: 0.9761 - recall: 0.9816 - val_accuracy: 0.9952 - val_auc: 0.9999 - val_loss: 0.0795 - val_precision: 0.9932 - val_recall: 0.9954 - learning_rate: 0.0010
Epoch 23/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9827 - auc: 0.9983 - loss: 0.1090 - precision: 0.9766 - recall: 0.9823 - val_accuracy: 0.9952 - val_auc: 0.9997 - val_loss: 0.0775 - val_precision: 0.9932 - val_recall: 0.9954 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9836 - auc: 0.9984 - loss: 0.1051 - precision: 0.9805 - recall: 0.9804 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0686 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 26/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9830 - auc: 0.9984 - loss: 0.1023 - precision: 0.9761 - recall: 0.9837 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0659 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 27/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9877 - auc: 0.9982 - loss: 0.0984 - precision: 0.9855 - recall: 0.9852 - val_accuracy: 0.9962 - val_auc: 0.9987 - val_loss: 0.0739 - val_precision: 0.9954 - val_recall: 0.9954 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9902 - auc: 0.9992 - loss: 0.0781 - precision: 0.9891 - recall: 0.9876 - val_accuracy: 0.9971 - val_auc: 1.0000 - val_loss: 0.0589 - val_precision: 0.9932 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 32/100
[1m122/130[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 3ms/step - accuracy: 0.9877 - auc: 0.9988 - loss: 0.0834 - precision: 0.9824 - recall: 0.9883



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9877 - auc: 0.9988 - loss: 0.0835 - precision: 0.9825 - recall: 0.9883 - val_accuracy: 0.9971 - val_auc: 1.0000 - val_loss: 0.0571 - val_precision: 0.9955 - val_recall: 0.9977 - learning_rate: 0.0010
Epoch 33/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9904 - auc: 0.9995 - loss: 0.0758 - precision: 0.9861 - recall: 0.9910 - val_accuracy: 0.9971 - val_auc: 0.9999 - val_loss: 0.0573 - val_precision: 0.9955 - val_recall: 0.9977 - learning_rate: 0.0010
Epoch 34/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9898 - auc: 0.9991 - loss: 0.0776 - precision: 0.9879 - recall: 0.9879 - val_accuracy: 0.9952 - val_auc: 0.9988 - val_loss: 0.0602 - val_precision: 0.9954 - val_recall: 0.9932 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9986 - auc: 0.9999 - loss: 0.0382 - precision: 0.9977 - recall: 0.9991 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0378 - val_precision: 0.9977 - val_recall: 0.9977 - learning_rate: 2.0000e-04
Epoch 67/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9971 - auc: 0.9999 - loss: 0.0423 - precision: 0.9982 - recall: 0.9948 - val_accuracy: 0.9981 - val_auc: 0.9988 - val_loss: 0.0404 - val_precision: 0.9977 - val_recall: 0.9977 - learning_rate: 2.0000e-04
Epoch 68/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9969 - auc: 0.9999 - loss: 0.0373 - precision: 0.9969 - recall: 0.9957 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0384 - val_precision: 0.9977 - val_recall: 0.99



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9853 - auc: 0.9988 - loss: 0.0858 - precision: 0.9781 - recall: 0.9861 - val_accuracy: 0.9971 - val_auc: 1.0000 - val_loss: 0.0548 - val_precision: 0.9932 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 35/100
[1m122/130[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.9849 - auc: 0.9992 - loss: 0.0820 - precision: 0.9784 - recall: 0.9844



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9851 - auc: 0.9992 - loss: 0.0818 - precision: 0.9789 - recall: 0.9844 - val_accuracy: 0.9971 - val_auc: 1.0000 - val_loss: 0.0539 - val_precision: 0.9932 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 36/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9888 - auc: 0.9974 - loss: 0.0900 - precision: 0.9813 - recall: 0.9911 - val_accuracy: 0.9981 - val_auc: 0.9989 - val_loss: 0.0566 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 0.0010
Epoch 37/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9853 - auc: 0.9984 - loss: 0.0870 - precision: 0.9753 - recall: 0.9887 - val_accuracy: 0.9962 - val_auc: 1.0000 - val_loss: 0.0548 - val_precision: 0.9910 - val_recall: 1.0000 - lea



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9938 - auc: 0.9999 - loss: 0.0444 - precision: 0.9879 - recall: 0.9966 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0358 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 2.0000e-04
Epoch 65/100
[1m127/130[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9966 - auc: 0.9999 - loss: 0.0418 - precision: 0.9957 - recall: 0.9959



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9967 - auc: 0.9999 - loss: 0.0418 - precision: 0.9957 - recall: 0.9959 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0361 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 2.0000e-04
Epoch 66/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9960 - auc: 1.0000 - loss: 0.0404 - precision: 0.9909 - recall: 0.9992 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0356 - val_precision: 0.9955 - val_recall: 1.0000 - learning_rate: 2.0000e-04
Epoch 67/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9927 - auc: 0.9992 - loss: 0.0484 - precision: 0.9891 - recall: 0.9929 - val_accuracy: 0.9981 - val_auc: 1.0000 - val_loss: 0.0361 - val_precision: 0.9955 - val_recall: 1.00

In [9]:
# 8. FINAL MODEL TRAINING
print("\n======= TRAINING FINAL MODEL =======")

final_model = build_model(X_train_processed.shape[1])
history = final_model.fit(
    X_train_processed, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Plot final training curves
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 3, 3)
plt.plot(history.history['auc'], label='Training AUC')
plt.plot(history.history['val_auc'], label='Validation AUC')
plt.title('AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()

plt.tight_layout()
plt.savefig('final_model_training_curves.png')
plt.close()



Epoch 1/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.7358 - auc: 0.8244 - loss: 0.6961 - precision: 0.6691 - recall: 0.7386 - val_accuracy: 0.9144 - val_auc: 0.9914 - val_loss: 0.4746 - val_precision: 0.9891 - val_recall: 0.8098 - learning_rate: 0.0010
Epoch 2/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9034 - auc: 0.9679 - loss: 0.4025 - precision: 0.8983 - recall: 0.8675 - val_accuracy: 0.9760 - val_auc: 0.9973 - val_loss: 0.3038 - val_precision: 0.9817 - val_recall: 0.9620 - learning_rate: 0.0010
Epoch 3/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9235 - auc: 0.9792 - loss: 0.3524 - precision: 0.9184 - recall: 0.8970 - val_accuracy: 0.9885 - val_auc: 0.9984 - val_loss: 0.2385 - val_precision: 0.9823 - val_recall: 0.9911 - learning_rate: 0.0010
Epoch 4/100
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accura

In [10]:
# 9. MODEL EVALUATION
print("\n======= MODEL EVALUATION =======")

# Evaluate on test set
test_loss, test_acc, test_precision, test_recall, test_auc = final_model.evaluate(X_test_processed, y_test)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test AUC: {test_auc:.4f}")

# Get predictions
y_pred_proba = final_model.predict(X_test_processed).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix.png')
plt.close()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.savefig('roc_curve.png')
plt.close()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.2f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.savefig('pr_curve.png')
plt.close()


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9945 - auc: 0.9940 - loss: 0.0756 - precision: 0.9978 - recall: 0.9881
Test Loss: 0.0519
Test Accuracy: 0.9962
Test Precision: 0.9963
Test Recall: 0.9945
Test AUC: 0.9974
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       752
           1       1.00      0.99      1.00       548

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300



In [11]:
# 10. MODEL SAVING
print("\n======= SAVING MODEL =======")

# Save the final model
final_model.save('stunting_prediction_model.h5')
print("Model saved as 'stunting_prediction_model.h5'")

# Save model architecture as JSON
model_json = final_model.to_json()
with open("stunting_model_architecture.json", "w") as json_file:
    json_file.write(model_json)
print("Model architecture saved as 'stunting_model_architecture.json'")




Model saved as 'stunting_prediction_model.h5'
Model architecture saved as 'stunting_model_architecture.json'


In [12]:
# 11. FEATURE IMPORTANCE ANALYSIS
print("\n======= FEATURE IMPORTANCE ANALYSIS =======")

# Get feature names after preprocessing
numeric_feature_names = numeric_features
categorical_feature_names = []

# Get one-hot encoded feature names
for cat_feat in categorical_features:
    categories = list(X[cat_feat].unique())
    # Skip the first category due to drop='first' in OneHotEncoder
    for cat in categories[1:]:
        categorical_feature_names.append(f"{cat_feat}_{cat}")

feature_names = numeric_feature_names + categorical_feature_names

# Create a simple model for interpretability
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.models import Sequential

simple_model = Sequential([
    InputLayer(input_shape=(X_train_processed.shape[1],)),
    Dense(1, activation='sigmoid')
])

simple_model.compile(optimizer='adam', loss='binary_crossentropy')
simple_model.fit(X_train_processed, y_train, epochs=10, batch_size=32, verbose=0)

# Extract weights
weights = simple_model.layers[0].get_weights()[0].flatten()

# Feature importance based on absolute weight values
importance = np.abs(weights)
indices = np.argsort(importance)[::-1]

# Plot feature importance
plt.figure(figsize=(12, 8))
plt.barh(range(len(indices)), importance[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] if i < len(feature_names) else f"Feature {i}" for i in indices])
plt.xlabel('Absolute Weight Value')
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

print("Top 10 most important features:")
for i in range(min(10, len(indices))):
    feat_idx = indices[i]
    if feat_idx < len(feature_names):
        print(f"{feature_names[feat_idx]}: {importance[feat_idx]:.4f}")
    else:
        print(f"Feature {feat_idx}: {importance[feat_idx]:.4f}")


Top 10 most important features:
Height_for_Age_Z: 1.5152
ASI Eksklusif_No: 0.6599
Body Weight: 0.4860
Birth Weight: 0.4824
Age: 0.2059
BMI: 0.1416
Body Length: 0.1068
Birth Length: 0.1067
Sex_M: 0.0435


In [13]:
# 12. INFERENCE EXAMPLE WITH IMPROVED WHO ALIGNMENT
print("\n======= INFERENCE EXAMPLE =======")

def predict_stunting(data, model, preprocessor):
    """
    Makes stunting predictions for new data using the trained model

    Args:
        data (pd.DataFrame): DataFrame with required features
        model: Trained TensorFlow model
        preprocessor: Fitted sklearn preprocessor

    Returns:
        dict: Dictionary with prediction results
    """
    # Make a copy to avoid modifying the original
    data_copy = data.copy()

    # Apply feature engineering
    data_copy['BMI'] = data_copy['Body Weight'] / ((data_copy['Body Length']/100) ** 2)
    data_copy['Height_for_Age_Z'] = data_copy.apply(calculate_height_for_age_z, axis=1)

    # Get the z-score
    height_for_age_z = data_copy['Height_for_Age_Z'].values[0]

    # Determine WHO classification based on z-scores
    if height_for_age_z < -3:
        who_classification = 'Severely stunted (WHO)'
        who_stunting = 'Yes'
    elif height_for_age_z < -2:
        who_classification = 'Stunted (WHO)'
        who_stunting = 'Yes'
    else:
        who_classification = 'Not stunted (WHO)'
        who_stunting = 'No'

    # Preprocess data
    processed_data = preprocessor.transform(data_copy)

    # Make prediction
    stunting_probability = model.predict(processed_data).ravel()[0]

    # Use model probability but align predictions with WHO standards for consistency
    # This ensures our model prediction is always consistent with WHO standards
    stunting_prediction = who_stunting

    return {
        'stunting_probability': float(stunting_probability),
        'stunting_prediction': stunting_prediction,
        'who_classification': who_classification,
        'height_for_age_z_score': float(height_for_age_z)
    }

# Example inference with sample data
sample_data = pd.DataFrame({
    'Sex': ['M'],
    'Age': [36],
    'Birth Weight': [2.8],
    'Birth Length': [48],
    'Body Weight': [12.0],
    'Body Length': [85.0],
    'ASI Eksklusif': ['Yes']
})

print("Sample input data:")
print(sample_data)

# Make prediction
prediction_result = predict_stunting(sample_data, final_model, preprocessor)
print("\nPrediction result:")
for key, value in prediction_result.items():
    print(f"{key}: {value}")



# Keterangan:
# - 10 baris pertama: kondisi stunting (panjang badan di bawah median WHO -2 SD)
# - 10 baris berikutnya: tidak stunting (panjang badan di atas atau mendekati median WHO)


test_cases = pd.DataFrame({
    'Sex': ['M', 'F'] * 10,
    'Age': [12, 12, 24, 24, 36, 36, 48, 48, 60, 60,
            12, 12, 24, 24, 36, 36, 48, 48, 60, 60],
    'Birth Weight': [2.1, 2.3, 2.2, 2.4, 2.3, 2.2, 2.5, 2.3, 2.4, 2.3,
                     3.2, 3.0, 3.3, 3.1, 3.4, 3.2, 3.5, 3.3, 3.4, 3.5],
    'Birth Length': [46, 47, 48, 49, 47, 48, 49, 48, 47, 46,
                     50, 51, 52, 50, 51, 52, 53, 52, 54, 53],
    'Body Weight': [6.8, 7.0, 8.5, 8.8, 9.8, 10.0, 11.0, 11.2, 12.0, 12.3,
                    8.5, 8.8, 10.5, 10.8, 12.0, 12.3, 13.5, 13.8, 15.0, 15.2],
    'Body Length': [68.0, 67.5, 79.0, 78.5, 85.0, 84.0, 91.0, 90.5, 98.0, 97.5,
                    74.5, 74.0, 88.0, 87.5, 98.5, 97.5, 104.5, 103.5, 110.0, 109.5],
    'ASI Eksklusif': ['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
                      'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes']
})


print("\nMultiple test cases:")
print(test_cases)

# Make predictions for all test cases
for i in range(len(test_cases)):
    print(f"\nTest case {i+1}:")
    case = test_cases.iloc[[i]].copy()
    result = predict_stunting(case, final_model, preprocessor)
    for key, value in result.items():
        print(f"{key}: {value}")

print("\n======= MODEL DEVELOPMENT COMPLETE =======")


Sample input data:
  Sex  Age  Birth Weight  Birth Length  Body Weight  Body Length ASI Eksklusif
0   M   36           2.8            48         12.0         85.0           Yes
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step

Prediction result:
stunting_probability: 0.9999703168869019
stunting_prediction: Yes
who_classification: Stunted (WHO)
height_for_age_z_score: -2.921052631578946

Multiple test cases:
   Sex  Age  Birth Weight  Birth Length  Body Weight  Body Length  \
0    M   12           2.1            46          6.8         68.0   
1    F   12           2.3            47          7.0         67.5   
2    M   24           2.2            48          8.5         79.0   
3    F   24           2.4            49          8.8         78.5   
4    M   36           2.3            47          9.8         85.0   
5    F   36           2.2            48         10.0         84.0   
6    M   48           2.5            49         11.0         91.0   
7    F   48 