In [1]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [2]:
# Load and process data
# Read the CSV and give the four source columns short English names.
df = pd.read_csv('data_tinggi_balita.csv').rename(
    columns={
        'Status Gizi': 'Status',
        'Jenis Kelamin': 'Gender',
        'Umur (bulan)': 'Age',
        'Tinggi Badan (cm)': 'Height',
    }
)

# Replace the raw category labels with English ones. Values that are not
# listed in a mapping are left unchanged.
df['Gender'] = df['Gender'].replace({'perempuan': 'female', 'laki-laki': 'male'})
df['Status'] = df['Status'].replace({'tinggi': 'tall'})
print(df)

        Age  Gender      Height            Status
0         0    male   44.591973           stunted
1         0    male   56.705203              tall
2         0    male   46.863358            normal
3         0    male   47.508026            normal
4         0    male   42.743494  severely stunted
...     ...     ...         ...               ...
120994   60  female  100.600000            normal
120995   60  female   98.300000           stunted
120996   60  female  121.300000            normal
120997   60  female  112.200000            normal
120998   60  female  109.800000            normal

[120999 rows x 4 columns]


In [3]:
# Data preprocessing
# Make sure both categorical columns hold plain strings before encoding.
for column in ('Gender', 'Status'):
    df[column] = df[column].astype(str)

# 'Status' is the target; every other column is a feature.
y = df['Status']
X = df.drop(columns='Status')

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the 'Gender' column.
# The encoder is fitted on the training split only and then reused on the
# test split; drop='first' omits the indicator for the first category.
encoder = OneHotEncoder(sparse_output=False, drop='first')
gender_train = encoder.fit_transform(X_train[['Gender']])
gender_test = encoder.transform(X_test[['Gender']])

# Work on copies so the un-encoded splits stay available.
X_train_encoded = X_train.copy()
X_train_encoded[['Gender']] = gender_train
X_test_encoded = X_test.copy()
X_test_encoded[['Gender']] = gender_test

# Label encode the target variable (classes are learned from the training labels).
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [15]:
# Implement eXtreme Gradient Boosting (XGBoost)
# fit() returns the fitted estimator, so construction and training are chained.
xgbc_model = XGBClassifier(random_state=42).fit(X_train_encoded, y_train_encoded)

# Predict on the held-out split, then map the integer classes back to the
# original string labels.
y_pred_encoded = xgbc_model.predict(X_test_encoded)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Overall accuracy, measured against the un-encoded test labels.
acc_xgbc = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {acc_xgbc*100:.2f}%\n")

# Classification Report (per-class precision, recall and F1)
print("Classification Report for XGBoost: ")
xgbc_report = classification_report(y_test, y_pred)
print(xgbc_report)


Accuracy Score: 99.10%

Classification Report for XGBoost: 
                  precision    recall  f1-score   support

          normal       1.00      0.99      0.99     13382
severely stunted       0.99      0.99      0.99      4130
         stunted       0.97      0.98      0.97      2790
            tall       0.99      0.99      0.99      3898

        accuracy                           0.99     24200
       macro avg       0.99      0.99      0.99     24200
    weighted avg       0.99      0.99      0.99     24200



In [5]:
# Cross validation for extreme gradient boosting
# 5-fold cross-validation on the training split only; the test split is not used here.
cv_scores = cross_val_score(
    estimator=xgbc_model,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=5,
)

# Per-fold scores, followed by their mean and spread as percentages.
print("Cross-Validation Scores: ", cv_scores)
print(f"Mean CV Accuracy: {cv_scores.mean()*100:.2f}%")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std()*100:.2f}%")


Cross-Validation Scores:  [0.99116736 0.98941116 0.99044421 0.99034091 0.99044372]
Mean CV Accuracy: 99.04%
Standard Deviation of CV Accuracy: 0.06%


In [8]:
# Test Extreme gradient boosting
# Four hand-made samples: same age (16) and gender (male), differing only in height.
# Column order matches the training features: Age, Gender, Height.
new_test_cases = pd.DataFrame(
    {
        'Age': [16, 16, 16, 16],
        'Gender': ['male', 'male', 'male', 'male'],
        'Height': [75, 30, 77.5, 90],
    }
)

# Encode 'Gender' with the encoder that was fitted on the training split.
new_test_cases_encoded = new_test_cases.copy()
new_test_cases_encoded[['Gender']] = encoder.transform(new_test_cases[['Gender']])

# Predict, then translate the integer classes back to their string labels.
new_test_cases_xgbc_pred_encoded = xgbc_model.predict(new_test_cases_encoded)
new_test_cases_xgbc_pred = label_encoder.inverse_transform(
    new_test_cases_xgbc_pred_encoded
)
print("XGBoost Predictions:", new_test_cases_xgbc_pred)


XGBoost Predictions: ['stunted' 'severely stunted' 'normal' 'tall']


In [9]:
# Implement SVC
# Initialize and train the SVM model (RBF kernel, fixed seed).
# NOTE(review): Age and Height are passed in their original units without
# scaling; RBF SVMs are generally sensitive to feature scale, so a
# StandardScaler step may be worth evaluating. TODO confirm.
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_encoded, y_train_encoded)

# Predict using the SVM model, then map the integer classes back to string labels
y_pred_svm_encoded = svm_model.predict(X_test_encoded)
y_pred_svm = label_encoder.inverse_transform(y_pred_svm_encoded)

# Evaluate the SVM model against the un-encoded test labels
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy Score (SVM): {acc_svm*100:.2f}%\n")

# Classification Report for SVM (per-class precision, recall and F1)
print("Classification Report for SVM: ")
print(classification_report(y_test, y_pred_svm))

Accuracy Score (SVM): 95.25%

Classification Report for SVM: 
                  precision    recall  f1-score   support

          normal       0.97      0.98      0.97     13382
severely stunted       0.93      0.96      0.94      4130
         stunted       0.87      0.81      0.84      2790
            tall       0.98      0.96      0.97      3898

        accuracy                           0.95     24200
       macro avg       0.94      0.93      0.93     24200
    weighted avg       0.95      0.95      0.95     24200



In [10]:
# Cross validate SVM
# 5-fold cross-validation on the training split only.
cv_scores_svm = cross_val_score(
    estimator=svm_model,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=5,
)

# Per-fold scores, followed by their mean and spread as percentages.
print("Cross-Validation Scores (SVM): ", cv_scores_svm)
print(f"Mean CV Accuracy (SVM): {cv_scores_svm.mean()*100:.2f}%")
print(f"Standard Deviation of CV Accuracy (SVM): {cv_scores_svm.std()*100:.2f}%")

Cross-Validation Scores (SVM):  [0.95139463 0.95123967 0.95165289 0.95356405 0.94963583]
Mean CV Accuracy (SVM): 95.15%
Standard Deviation of CV Accuracy (SVM): 0.13%


In [11]:
# Test SVM
# Score the already-encoded sample cases with the SVM, then translate the
# integer classes back to their string labels.
new_test_cases_svm_pred_encoded = svm_model.predict(
    new_test_cases_encoded
)
new_test_cases_svm_pred = label_encoder.inverse_transform(
    new_test_cases_svm_pred_encoded
)
print("SVM Predictions:", new_test_cases_svm_pred)


SVM Predictions: ['normal' 'severely stunted' 'normal' 'tall']


In [12]:
# Implement K-NN
# Initialize and train the KNN model (library-default hyperparameters).
# NOTE(review): the features are passed without scaling, so neighbor
# distances are computed on the raw Age/Height/Gender values. TODO confirm
# this is intended.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_encoded, y_train_encoded)

# Predict using the KNN model, then map the integer classes back to string labels
y_pred_knn_encoded = knn_model.predict(X_test_encoded)
y_pred_knn = label_encoder.inverse_transform(y_pred_knn_encoded)

# Evaluate the KNN model against the un-encoded test labels
acc_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy Score (KNN): {acc_knn*100:.2f}%\n")

# Classification Report for KNN (per-class precision, recall and F1)
print("Classification Report for KNN: ")
print(classification_report(y_test, y_pred_knn))

Accuracy Score (KNN): 99.66%

Classification Report for KNN: 
                  precision    recall  f1-score   support

          normal       1.00      1.00      1.00     13382
severely stunted       0.99      1.00      1.00      4130
         stunted       0.99      0.99      0.99      2790
            tall       1.00      1.00      1.00      3898

        accuracy                           1.00     24200
       macro avg       1.00      1.00      1.00     24200
    weighted avg       1.00      1.00      1.00     24200



In [13]:
# Cross validate K-NN
# 5-fold cross-validation on the training split only.
cv_scores_knn = cross_val_score(
    estimator=knn_model,
    X=X_train_encoded,
    y=y_train_encoded,
    cv=5,
)

# Per-fold scores, followed by their mean and spread as percentages.
print("Cross-Validation Scores (KNN): ", cv_scores_knn)
print(f"Mean CV Accuracy (KNN): {cv_scores_knn.mean()*100:.2f}%")
print(f"Standard Deviation of CV Accuracy (KNN): {cv_scores_knn.std()*100:.2f}%")


Cross-Validation Scores (KNN):  [0.99545455 0.99493802 0.99529959 0.99633264 0.99597087]
Mean CV Accuracy (KNN): 99.56%
Standard Deviation of CV Accuracy (KNN): 0.05%


In [14]:
# Test K-NN
# Score the already-encoded sample cases with the KNN model, then translate
# the integer classes back to their string labels.
new_test_cases_knn_pred_encoded = knn_model.predict(
    new_test_cases_encoded
)
new_test_cases_knn_pred = label_encoder.inverse_transform(
    new_test_cases_knn_pred_encoded
)
print("KNN Predictions:", new_test_cases_knn_pred)

KNN Predictions: ['stunted' 'severely stunted' 'normal' 'tall']
