In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tabulate import tabulate
# from google.colab import files

In [50]:
# Load dataset
from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# display(X)
print(tabulate(X, headers='keys', tablefmt='psql'))

+-----+---------------------+--------------------+---------------------+--------------------+
|     |   sepal length (cm) |   sepal width (cm) |   petal length (cm) |   petal width (cm) |
|-----+---------------------+--------------------+---------------------+--------------------|
|   0 |                 5.1 |                3.5 |                 1.4 |                0.2 |
|   1 |                 4.9 |                3   |                 1.4 |                0.2 |
|   2 |                 4.7 |                3.2 |                 1.3 |                0.2 |
|   3 |                 4.6 |                3.1 |                 1.5 |                0.2 |
|   4 |                 5   |                3.6 |                 1.4 |                0.2 |
|   5 |                 5.4 |                3.9 |                 1.7 |                0.4 |
|   6 |                 4.6 |                3.4 |                 1.4 |                0.3 |
|   7 |                 5   |                3.4 |          

In [92]:
from sklearn.cluster import KMeans
from tabulate import tabulate

# Salin data asli
X_discretized = X.copy()

# Atur jumlah klaster berbeda per fitur
cluster_map = {
    'sepal length (cm)': 4,
    'sepal width (cm)': 3,
    'petal length (cm)': 4,
    'petal width (cm)': 3
}

# Lakukan clustering per fitur
for col in X.columns:
    n_clusters = cluster_map[col]
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    X_discretized[col] = km.fit_predict(X[[col]])

# Tampilkan hasil
print(tabulate(X_discretized.head(10), headers='keys', tablefmt='psql'))


+----+---------------------+--------------------+---------------------+--------------------+
|    |   sepal length (cm) |   sepal width (cm) |   petal length (cm) |   petal width (cm) |
|----+---------------------+--------------------+---------------------+--------------------|
|  0 |                   2 |                  0 |                   1 |                  1 |
|  1 |                   2 |                  2 |                   1 |                  1 |
|  2 |                   2 |                  2 |                   1 |                  1 |
|  3 |                   2 |                  2 |                   1 |                  1 |
|  4 |                   2 |                  0 |                   1 |                  1 |
|  5 |                   1 |                  0 |                   1 |                  1 |
|  6 |                   2 |                  0 |                   1 |                  1 |
|  7 |                   2 |                  0 |                   1 

In [93]:
# Asli
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Diskritisasi
X_disc_train, X_disc_test, _, _ = train_test_split(X_discretized, y, test_size=0.3, random_state=42)


In [94]:
# Naive Bayes pada data asli
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

# Naive Bayes pada data diskrit
nb_disc = GaussianNB()
nb_disc.fit(X_disc_train, y_train)
y_pred_nb_disc = nb_disc.predict(X_disc_test)
acc_nb_disc = accuracy_score(y_test, y_pred_nb_disc)

# Visualisasi
def print_classification_table(y_true, y_pred, title=""):
    report = classification_report(y_true, y_pred, output_dict=True)
    table = []

    for label, metrics in report.items():
        if label not in ['accuracy', 'macro avg', 'weighted avg']:
            row = [label,
                   f"{metrics['precision']:.2f}",
                   f"{metrics['recall']:.2f}",
                   f"{metrics['f1-score']:.2f}",
                   f"{metrics['support']}"]
            table.append(row)

    headers = ["Class", "Precision", "Recall", "F1-Score", "Support"]
    print(tabulate(table, headers=headers, tablefmt="psql"))

def print_confusion_matrix(y_true, y_pred, model_name="Model", labels=None):
    cm = confusion_matrix(y_true, y_pred)
    if labels is None:
        labels = [str(i) for i in range(len(cm))]

    headers = ["Actual \\ Pred"] + list(labels)
    rows = []
    for i, row in enumerate(cm):
        rows.append([labels[i]] + list(row))

    print(f"\n📋 {model_name}")
    print(tabulate(rows, headers=headers, tablefmt="psql"))

label_names = iris.target_names.tolist()  # ['setosa', 'versicolor', 'virginica']

# Naive Bayes
print_confusion_matrix(y_test, y_pred_nb, model_name="Naive Bayes - Data Asli", labels=label_names)
print_classification_table(y_test, y_pred_nb)

print_confusion_matrix(y_test, y_pred_nb_disc, model_name="Naive Bayes - Data Diskrit", labels=label_names)
print_classification_table(y_test, y_pred_nb_disc, title="Naive Bayes - Data Diskrit")

print("\n Akurasi (Data Asli):", acc_nb)
print(" Akurasi (Data Diskritisasi):", acc_nb_disc)



📋 Naive Bayes - Data Asli
+-----------------+----------+--------------+-------------+
| Actual \ Pred   |   setosa |   versicolor |   virginica |
|-----------------+----------+--------------+-------------|
| setosa          |       19 |            0 |           0 |
| versicolor      |        0 |           12 |           1 |
| virginica       |        0 |            0 |          13 |
+-----------------+----------+--------------+-------------+
+---------+-------------+----------+------------+-----------+
|   Class |   Precision |   Recall |   F1-Score |   Support |
|---------+-------------+----------+------------+-----------|
|       0 |        1    |     1    |       1    |        19 |
|       1 |        1    |     0.92 |       0.96 |        13 |
|       2 |        0.93 |     1    |       0.96 |        13 |
+---------+-------------+----------+------------+-----------+

📋 Naive Bayes - Data Diskrit
+-----------------+----------+--------------+-------------+
| Actual \ Pred   |   setosa 

In [95]:
# Decision Tree pada data asli
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

# Decision Tree pada data diskrit
dt_disc = DecisionTreeClassifier(random_state=42)
dt_disc.fit(X_disc_train, y_train)
y_pred_dt_disc = dt_disc.predict(X_disc_test)
acc_dt_disc = accuracy_score(y_test, y_pred_dt_disc)

# Visualisasi
def print_confusion_matrix(y_true, y_pred, model_name="Model", labels=None):
    cm = confusion_matrix(y_true, y_pred)
    if labels is None:
        labels = [str(i) for i in range(len(cm))]
    headers = ["Actual \\ Pred"] + list(labels)
    rows = [[labels[i]] + list(row) for i, row in enumerate(cm)]
    print(f"\n📋 {model_name}")
    print(tabulate(rows, headers=headers, tablefmt="psql"))

def print_classification_report(y_true, y_pred, title=""):
    report = classification_report(y_true, y_pred, output_dict=True)
    table = []

    for label, metrics in report.items():
        if label not in ['accuracy', 'macro avg', 'weighted avg']:
            row = [label,
                   f"{metrics['precision']:.2f}",
                   f"{metrics['recall']:.2f}",
                   f"{metrics['f1-score']:.2f}",
                   f"{metrics['support']}"]
            table.append(row)

    headers = ["Class", "Precision", "Recall", "F1-Score", "Support"]
    print(tabulate(table, headers=headers, tablefmt="psql"))

# Cetak hasil
label_names = iris.target_names.tolist()

print_confusion_matrix(y_test, y_pred_dt, model_name="Decision Tree - Data Asli", labels=label_names)
print_classification_report(y_test, y_pred_dt)

print_confusion_matrix(y_test, y_pred_dt_disc, model_name="Decision Tree - Data Diskrit", labels=label_names)
print_classification_report(y_test, y_pred_dt_disc)

print("\n🌳 Decision Tree Accuracy")
print(f"Akurasi (Data Asli): {acc_dt:.2f}")
print(f"Akurasi (Data Diskritisasi): {acc_dt_disc:.2f}")



📋 Decision Tree - Data Asli
+-----------------+----------+--------------+-------------+
| Actual \ Pred   |   setosa |   versicolor |   virginica |
|-----------------+----------+--------------+-------------|
| setosa          |       19 |            0 |           0 |
| versicolor      |        0 |           13 |           0 |
| virginica       |        0 |            0 |          13 |
+-----------------+----------+--------------+-------------+
+---------+-------------+----------+------------+-----------+
|   Class |   Precision |   Recall |   F1-Score |   Support |
|---------+-------------+----------+------------+-----------|
|       0 |           1 |        1 |          1 |        19 |
|       1 |           1 |        1 |          1 |        13 |
|       2 |           1 |        1 |          1 |        13 |
+---------+-------------+----------+------------+-----------+

📋 Decision Tree - Data Diskrit
+-----------------+----------+--------------+-------------+
| Actual \ Pred   |   set