# **Penting**
- Pastikan Anda melakukan Run All sebelum mengirimkan submission untuk memastikan seluruh cell berjalan dengan baik.
- Hapus simbol pagar (#) jika Anda menerapkan kriteria tambahan
- Biarkan simbol pagar (#) jika Anda tidak menerapkan kriteria tambahan

# **1. Import Library**
Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib

# **2. Memuat Dataset dari Hasil Clustering**
Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [2]:
# Load the clustering output; this CSV already contains the Target column.
url = (
    "https://raw.githubusercontent.com/107rasyid/"
    "membangun-proyek-machine-learning-pemula/"
    "refs/heads/main/data_clustering.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first five rows of the loaded DataFrame.
print(df.head(n=5))

   TransactionAmount      TransactionDate  TransactionType  Location  Channel  \
0          -0.970546  2023-04-11 16:29:14                1        36        0   
1           0.268963  2023-06-27 16:44:19                1        15        0   
2          -0.586526  2023-07-10 18:16:08                1        23        2   
3          -0.387294  2023-05-05 16:32:11                1        33        2   
4          -0.972736  2023-10-16 17:51:24                0         1        2   

   CustomerAge  CustomerOccupation  TransactionDuration  LoginAttempts  \
0     1.419862                   0            -0.548393      -0.204629   
1     1.307715                   0             0.307960      -0.204629   
2    -1.439874                   3            -0.905207      -0.204629   
3    -1.047361                   3            -1.347656      -0.204629   
4     0.018031                   3             1.121495      -0.204629   

   AccountBalance PreviousTransactionDate  TransactionMonth  Target 

# **3. Data Splitting**
Tahap Data Splitting bertujuan untuk memisahkan dataset menjadi dua bagian: data latih (training set) dan data uji (test set).

In [4]:
# Split the data into a training set and a test set with train_test_split().
# Features (X): every column except the label and the two timestamp columns.
X = df.drop(['Target', 'TransactionDate', 'PreviousTransactionDate'], axis=1)
# Label (y): the Target column.
y = df.loc[:, 'Target']

# 70% train / 30% test; stratify on y so both sets keep the same class
# proportions, and fix random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Report the shape of each split.
for split_name, split_X, split_y in (
    ("Train set:", X_train, y_train),
    ("Test set: ", X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

Train set: (1656, 10) (1656,)
Test set:  (710, 10) (710,)


# **4. Membangun Model Klasifikasi**
Setelah memilih algoritma klasifikasi yang sesuai, langkah selanjutnya adalah melatih model menggunakan data latih.

Berikut adalah rekomendasi tahapannya.
1. Menggunakan algoritma klasifikasi yaitu Decision Tree.
2. Latih model menggunakan data yang sudah dipisah.

In [5]:
# Build the baseline classifier: a Decision Tree with a fixed seed,
# trained on the training split (fit() returns the fitted estimator).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = dt_model.predict(X_test)

# Evaluate: overall accuracy, per-class report, and confusion matrix.
dt_accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", dt_accuracy)
print("\nClassification Report:\n", dt_report)
print("\nConfusion Matrix:\n", dt_confusion)

Accuracy: 0.9464788732394366

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       189
           1       0.95      0.98      0.96       214
           2       0.96      0.93      0.94       229
           3       0.92      0.92      0.92        78

    accuracy                           0.95       710
   macro avg       0.94      0.94      0.94       710
weighted avg       0.95      0.95      0.95       710


Confusion Matrix:
 [[178   4   6   1]
 [  2 210   2   0]
 [  5   7 212   5]
 [  5   1   0  72]]


In [6]:
# Persist the trained Decision Tree to disk.
# joblib is already imported in the import cell at the top of the notebook,
# so it is not imported again here.
# NOTE: the file is written by joblib (a pickle-based format) even though the
# required file name ends in ".h5"; reload it with joblib.load(), not with an
# HDF5 reader.
joblib.dump(dt_model, 'decision_tree_model.h5')

['decision_tree_model.h5']

# **5. Memenuhi Kriteria Skilled dan Advanced dalam Membangun Model Klasifikasi**



**Biarkan kosong jika tidak menerapkan kriteria skilled atau advanced**

In [7]:
# Train a second classifier that is not a Decision Tree: a Random Forest
# with 100 trees and a fixed seed, fitted on the training split
# (fit() returns the fitted estimator).
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred_rf = rf_model.predict(X_test)

In [8]:
# Report accuracy, precision, recall and F1-score for the Random Forest.
# Precision, recall and F1 are averaged over classes weighted by support.
print("=== Random Forest ===")
print("Accuracy :", accuracy_score(y_test, y_pred_rf))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf, average='weighted'))

=== Random Forest ===
Accuracy : 0.9605633802816902
Precision: 0.9607026508000275
Recall   : 0.9605633802816902
F1-score : 0.9604295341549589


In [9]:
# Persist the model(s) trained with an algorithm other than Decision Tree.
# More than one model may be saved here; the expected file name pattern is
# 'explore_<algorithm name>_classification.h5'.

# Save the Random Forest model.
joblib.dump(value=rf_model, filename='explore_RandomForest_classification.h5')

['explore_RandomForest_classification.h5']

## **Hyperparameter Tuning Model**

Pilih salah satu algoritma yang ingin Anda tuning

In [10]:
# Hyperparameter tuning: grid-search the Random Forest and retrain it.

# Search space: 3 * 3 * 2 * 2 = 36 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search with 3-fold cross-validation, ranking candidates by
# weighted F1; n_jobs=-1 runs the fits in parallel on all available cores.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_rf.fit(X_train, y_train)

# Keep the estimator refitted with the best parameter combination.
best_rf = grid_rf.best_estimator_
print("Best Parameters:", grid_rf.best_params_, "\n")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300} 



In [11]:
# Report accuracy, precision, recall and F1-score for the tuned Random Forest.
# NOTE: y_pred is reassigned here, replacing the Decision Tree predictions
# stored under the same name by the earlier Decision Tree cell.
y_pred = best_rf.predict(X_test)

print("=== Random Forest (GridSearch) ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
# Precision, recall and F1 are averaged over classes weighted by support.
weighted_metrics = (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
)
for metric_label, metric_fn in weighted_metrics:
    print(metric_label, metric_fn(y_test, y_pred, average='weighted'))

=== Random Forest (GridSearch) ===
Accuracy : 0.9633802816901409
Precision: 0.9633641971475776
Recall   : 0.9633802816901409
F1-score : 0.9632438961083428


In [12]:
# Persist the tuned model (the best estimator found by the grid search)
# under the expected file name 'tuning_classification.h5'.
joblib.dump(value=best_rf, filename='tuning_classification.h5')

['tuning_classification.h5']