# **1. Import Library**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# **2. Memuat Dataset dari Hasil Clustering**

In [2]:
df = pd.read_csv("./df_classification.csv")
df

Unnamed: 0,Product Title,Merchant ID,Cluster Label,Category ID,Category Label,Cluster
0,2561,-1.022159,909,-1.709389,7,2
1,2559,-1.013629,909,-1.709389,7,2
2,2656,-1.005099,909,-1.709389,7,2
3,2563,-0.996569,909,-1.709389,7,2
4,2565,-0.988039,909,-1.709389,7,2
...,...,...,...,...,...,...
35132,27084,-0.527413,11152,1.343838,5,1
35133,27083,-0.527413,11154,1.343838,5,1
35134,27082,-0.527413,11153,1.343838,5,1
35135,6918,0.035574,2685,1.343838,5,2


# **3. Data Splitting**

In [3]:
X = df.drop(columns=["Cluster"])
y = df["Cluster"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **4. Membangun Model Klasifikasi**

## **a. Membangun Model Klasifikasi**

In [4]:
# Mencoba algoritma KNN dan Random Forest
knn = KNeighborsClassifier().fit(X_train, y_train)
rf = RandomForestClassifier().fit(X_train, y_train)

## **b. Evaluasi Model Klasifikasi**

In [5]:
# Membuat fungsi untuk evaluasi
def evaluate_model(model, X, Y):
    y_pred = model.predict(X)
    result = {
        "Accuracy": accuracy_score(Y, y_pred),
        "Precision": precision_score(Y, y_pred, average="macro"),
        "Recall": recall_score(Y, y_pred, average="macro"),
        "F1 Score": f1_score(Y, y_pred, average="macro")
    }
    return result

# Membuat fungsi untuk confusion matrix
def cm(model, X, Y):
    y_pred = model.predict(X)
    cm = confusion_matrix(Y, y_pred)
    return pd.DataFrame(cm)

In [6]:
# Evaluasi KNN
evaluasi_train_knn = pd.DataFrame(evaluate_model(knn, X_train, y_train), index=["train_knn"])
evaluasi_test_knn = pd.DataFrame(evaluate_model(knn, X_test, y_test), index=["test_knn"])
evaluasi_knn = pd.concat([evaluasi_train_knn, evaluasi_test_knn])

# Evaluasi Random Forest
evaluasi_train_rf = pd.DataFrame(evaluate_model(rf, X_train, y_train), index=["train_rf"])
evaluasi_test_rf = pd.DataFrame(evaluate_model(rf, X_test, y_test), index=["test_rf"])
evaluasi_rf = pd.concat([evaluasi_train_rf, evaluasi_test_rf])

# confusion matrix knn
cm_knn = cm(knn, X_test, y_test)

# confusion matrix Random Forest
cm_rf = cm(rf, X_test, y_test)

In [7]:
# Melihat evaluasi KNN
evaluasi_knn

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
train_knn,0.999538,0.999546,0.999534,0.99954
test_knn,0.999004,0.999024,0.998982,0.999003


In [8]:
# Melihat evaluasi Random Forest
evaluasi_rf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
train_rf,1.0,1.0,1.0,1.0
test_rf,0.999289,0.999304,0.999273,0.999289


In [9]:
# Melihat confusion matrix knn
cm_knn

Unnamed: 0,0,1,2
0,2310,0,4
1,3,2261,0
2,0,0,2450


In [10]:
# Melihat confusion matrix Random Forest
cm_rf

Unnamed: 0,0,1,2
0,2311,0,3
1,2,2262,0
2,0,0,2450


## **c. Analisis Hasil Evaluasi Model Klasifikasi**

##### **Memilih Random Forest: Karena Random Forest menghasilkan skor hampir sempurna pada data test, itu adalah model yang lebih kuat dan lebih baik dalam kasus ini. Meski ada sedikit perbedaan antara train dan test, skor di test set mendekati 1. Jika kinerja model sangat mendekati 1 pada data test, model tersebut mungkin tidak terlalu overfitting, atau tingkat overfittingnya minimal, karena performa di test set juga hampir sama dengan train set.**