# **Load Librarry**

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import GridSearchCV


# **Load Dataset**

In [2]:
file_path_main = 'https://raw.githubusercontent.com/BimaBayuUWUUU/DSAI_Batch6_Code/main/Finpro/data_FP/data_multilabel/Binary_Target_CFeaturesEngineering_Nm_NoDup.csv'

In [3]:
# Muat dataset Anda
df_main = pd.read_csv(file_path_main)

# **Data Splitting and Scaling**

## **Split Features and Target**

In [4]:
# Pisahkan fitur dan target
y = df_main.filter(like='nama_industri_encoded_')
X = df_main.drop(columns=y.columns)

## **Scaling Features**

In [5]:
from sklearn.preprocessing import StandardScaler
# Standardisasi fitur
scaler = StandardScaler()
X = scaler.fit_transform(X)

## **Split Data Train and Data Test**

In [6]:
from sklearn.model_selection import train_test_split
# Bagi data menjadi set latih dan set uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Inisialisasi jumlah kelas
num_classes = len(y_train.columns)

In [8]:
# Mengubah DataFrame menjadi array numpy
y_train = y_train.to_numpy()

In [9]:
# Mengubah DataFrame menjadi array numpy
y_test = y_test.to_numpy()

## **Check Label Distribution**

In [10]:
from collections import Counter

In [11]:
# Menghitung jumlah total sampel
total_samples = len(y_train)

# Menghitung jumlah sampel untuk setiap kelas
label_counts = np.sum(y, axis=0)

# Menampilkan jumlah sampel untuk setiap kelas
print("Label counts in training data:")
for i, count in enumerate(label_counts):
    class_name = f"Class {i+1}"
    percentage = (count / total_samples) * 100
    print(f"{class_name}: {count} samples ({percentage:.2f}%)")

Label counts in training data:
Class 1: 531 samples (2.02%)
Class 2: 531 samples (2.02%)
Class 3: 523 samples (1.99%)
Class 4: 531 samples (2.02%)
Class 5: 527 samples (2.00%)
Class 6: 530 samples (2.01%)
Class 7: 527 samples (2.00%)
Class 8: 531 samples (2.02%)
Class 9: 530 samples (2.01%)
Class 10: 510 samples (1.94%)
Class 11: 510 samples (1.94%)
Class 12: 531 samples (2.02%)
Class 13: 525 samples (1.99%)
Class 14: 527 samples (2.00%)
Class 15: 466 samples (1.77%)
Class 16: 526 samples (2.00%)
Class 17: 527 samples (2.00%)
Class 18: 531 samples (2.02%)
Class 19: 521 samples (1.98%)
Class 20: 523 samples (1.99%)
Class 21: 517 samples (1.96%)
Class 22: 440 samples (1.67%)
Class 23: 514 samples (1.95%)
Class 24: 523 samples (1.99%)
Class 25: 519 samples (1.97%)
Class 26: 521 samples (1.98%)
Class 27: 531 samples (2.02%)
Class 28: 529 samples (2.01%)
Class 29: 531 samples (2.02%)
Class 30: 525 samples (1.99%)
Class 31: 531 samples (2.02%)
Class 32: 531 samples (2.02%)
Class 33: 531 samp

# **Modelling With Hyperparameter Tuning**

## **Extreme Gradient Boosting Multi-Label Classifier**

In [30]:
# Inisialisasi model
xgboost = XGBClassifier(tree_method="hist", device="cuda")

# Mendapatkan parameter default
paramsXgb = xgboost.get_params()

# Mencetak parameter default
print("Parameter default XGBoostClassifier:")
for param, value in paramsXgb.items():
    print(f"{param}: {value}")

Parameter default XGBoostClassifier:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: None
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: None
feature_types: None
gamma: None
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: None
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: None
max_leaves: None
min_child_weight: None
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: None
n_jobs: None
num_parallel_tree: None
random_state: None
reg_alpha: None
reg_lambda: None
sampling_method: None
scale_pos_weight: None
subsample: None
tree_method: None
validate_parameters: None
verbosity: None


In [32]:
# Menginisialisasi XGBClassifier dengan MultiOutputClassifier
xgboost = XGBClassifier(tree_method="hist", device="cuda")

param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [5, 6, 7],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Menginisialisasi GridSearchCV
grid_search_Xgb = GridSearchCV(estimator=xgboost, param_grid=param_grid_xgb, cv=5, scoring='accuracy')

# Melatih model GridSearchCV
grid_search_Xgb.fit(X_train, y_train)

# Mendapatkan parameter terbaik dan skor terbaik
print("Parameter terbaik yang ditemukan: ", grid_search_Xgb.best_params_)
print("Skor akurasi terbaik: ", grid_search_Xgb.best_score_)

# Mengevaluasi model terbaik pada set pengujian
best_model_Xgb = grid_search_Xgb.best_estimator_
y_pred_Xgb = best_model_Xgb.predict(X_test)

# Menghitung metrik evaluasi
accuraacy_Xgb = accuracy_score(y_test, y_pred_Xgb)
precision_Xgb = precision_score(y_test, y_pred_Xgb, average='micro')
recall_Xgb = recall_score(y_test, y_pred_Xgb, average='micro')
f1_Xgb = f1_score(y_test, y_pred_Xgb, average='micro')

# Menampilkan metrik evaluasi
print(f"Akurasi set pengujian dengan parameter terbaik: {accuraacy_Xgb:.4f}")
print(f"Presisi: {precision_Xgb:.4f}")
print(f"Recall: {recall_Xgb:.4f}")
print(f"F1 Score: {f1_Xgb:.4f}")


Parameter terbaik yang ditemukan:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Skor akurasi terbaik:  0.8826232038610297
Akurasi set pengujian dengan parameter terbaik: 0.8840
Presisi: 0.9825
Recall: 0.8895
F1 Score: 0.9337


## **Adaptive Boosting Multi-Label Classifier**

In [28]:
# Inisialisasi model
ada = AdaBoostClassifier()

# Mendapatkan parameter default
paramsAda = ada.get_params()

# Mencetak parameter default
print("Parameter default AdaBoostClassifier:")
for param, value in paramsAda.items():
    print(f"{param}: {value}")

Parameter default AdaBoostClassifier:
algorithm: SAMME.R
estimator: None
learning_rate: 1.0
n_estimators: 50
random_state: None


In [33]:
# Inisialisasi model
adaboost = MultiOutputClassifier(AdaBoostClassifier(), n_jobs=-1)

# Mendefinisikan parameter grid untuk pencarian
param_grid_adb = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__learning_rate': [0.01, 0.1, 1.0],
    'estimator__estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2), None],
    'estimator__algorithm': ['SAMME', 'SAMME.R']
}

# Menginisialisasi GridSearchCV
grid_search_Ada = GridSearchCV(estimator=adaboost, param_grid=param_grid_adb, cv=5, scoring='accuracy')

# Melatih model GridSearchCV
grid_search_Ada.fit(X_train, y_train)

# Mendapatkan parameter terbaik dan skor terbaik
print("Parameter terbaik yang ditemukan: ", grid_search_Ada.best_params_)
print("Skor akurasi terbaik: ", grid_search_Ada.best_score_)

# Mengevaluasi model terbaik pada set pengujian
best_model_Ada = grid_search_Ada.best_estimator_
y_pred_Ada = best_model_Ada.predict(X_test)

# Menghitung metrik evaluasi
accuracy_Ada = accuracy_score(y_test, y_pred_Ada)
precision_Ada  = precision_score(y_test, y_pred_Ada, average='macro')
recall_Ada  = recall_score(y_test, y_pred_Ada, average='macro')
f1_Ada  = f1_score(y_test, y_pred_Ada, average='macro')

# Menampilkan metrik evaluasi
print(f"Akurasi set pengujian dengan parameter terbaik: {accuracy_Ada:.4f}")
print(f"Precision: {precision_Ada:.4f}")
print(f"Recall: {recall_Ada:.4f}")
print(f"F1 Score: {f1_Ada :.4f}")

Parameter terbaik yang ditemukan:  {'estimator__algorithm': 'SAMME.R', 'estimator__estimator': DecisionTreeClassifier(max_depth=2), 'estimator__learning_rate': 0.1, 'estimator__n_estimators': 200}
Skor akurasi terbaik:  0.8754461400412792
Akurasi set pengujian dengan parameter terbaik: 0.8748
Precision: 0.9898
Recall: 0.8756
F1 Score: 0.9260


# **Model Evaluation With Cross Validation**