In [None]:
# -----------------------------------------------
# 1. Instalasi Streamlit dan Ngrok
# -----------------------------------------------
!pip install streamlit pyngrok

# -----------------------------------------------
# 2. Tambahkan Authtoken Ngrok
# -----------------------------------------------
!ngrok config add-authtoken 2yGNfQjDh69wmZgxaX0U8FOoYKF_2gGrZkXHTzKcv72kRRYTS

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m500.5 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl 

In [None]:
%%writefile clustering_app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score
import plotly.graph_objects as go
import plotly.express as px
import io
import kagglehub
import os
from typing import Tuple, Optional, Union, List

# --- Helper Functions ---
def normalize_data(data: np.ndarray, method: str) -> Tuple[np.ndarray, Optional[object]]:
    """Scale ``data`` with the scaler selected by ``method``.

    ``"Min-Max Scaling"`` fits a ``MinMaxScaler`` and ``"Z-Score"`` fits a
    ``StandardScaler``. Any other value leaves the data untouched.

    Returns:
        A ``(scaled_data, fitted_scaler)`` pair, or ``(data, None)`` when
        ``method`` is not one of the two recognised names.
    """
    # Guard clause: unknown methods are a pass-through, not an error.
    if method not in ("Min-Max Scaling", "Z-Score"):
        return data, None

    fitted = MinMaxScaler() if method == "Min-Max Scaling" else StandardScaler()
    return fitted.fit_transform(data), fitted

def calculate_silhouette_score(data: np.ndarray, labels: np.ndarray) -> Optional[float]:
    """Return the mean silhouette score of a clustering.

    Returns ``None`` when fewer than two distinct labels are present, or when
    the computation raises (the error is reported through ``st.error``).
    """
    try:
        # The score is only requested when at least two distinct labels exist.
        if len(np.unique(labels)) <= 1:
            return None
        return silhouette_score(data, labels)
    except Exception as err:
        st.error(f"[Error] Gagal menghitung Silhouette Score: {str(err)}")
        return None

def calculate_davies_bouldin_score(data: np.ndarray, labels: np.ndarray) -> Optional[float]:
    """Return the Davies-Bouldin index of a clustering.

    Returns ``None`` when fewer than two distinct labels are present, or when
    the computation raises (the error is reported through ``st.error``).
    """
    try:
        # The index is only requested when at least two distinct labels exist.
        if len(np.unique(labels)) <= 1:
            return None
        return davies_bouldin_score(data, labels)
    except Exception as err:
        st.error(f"[Error] Gagal menghitung Davies-Bouldin Index: {str(err)}")
        return None

def analisis_kmeans(data: np.ndarray, labels: np.ndarray, model: KMeans):
    """Render cluster distribution, evaluation metrics and outlier tables.

    Outliers are the samples whose distance to the nearest centroid is
    strictly greater than the 95th percentile of all such distances.

    Args:
        data: Feature matrix that was passed to the fitted ``model``.
        labels: Cluster label of each row of ``data``.
        model: Fitted K-Means model (provides ``n_clusters``, ``inertia_``
            and ``transform``).

    Returns:
        A tuple ``(outlier_counts, cluster_counts, outliers, outlier_distances)``:
        per-cluster outlier counts, per-cluster sample counts, row positions
        of the outliers, and their distances to the nearest centroid.
    """
    n_samples = len(data)
    n_clusters = model.n_clusters

    # --- Cluster distribution ---
    cluster_counts = np.bincount(labels, minlength=n_clusters)
    st.write(f"\n*Distribusi Klaster dengan {n_samples} sampel:*")
    dist_data = [
        {
            "Klaster": f"Klaster {i}",
            "Jumlah Sampel": count,
            "Persentase (%)": (count / n_samples) * 100 if n_samples > 0 else 0,
        }
        for i, count in enumerate(cluster_counts)
    ]
    st.dataframe(pd.DataFrame(dist_data), height=300, use_container_width=True)

    # --- Evaluation metrics ---
    inertia = model.inertia_
    silhouette = calculate_silhouette_score(data, labels)
    db_index = calculate_davies_bouldin_score(data, labels)
    st.write("\n*Metrik Evaluasi Klaster:*")
    metrics_data = [
        {"Metrik": "Inertia (Within-Cluster Sum of Squares)", "Nilai": f"{inertia:.3f}"},
        {"Metrik": "Silhouette Score", "Nilai": f"{silhouette:.3f}" if silhouette is not None else "N/A"},
        {"Metrik": "Davies-Bouldin Index", "Nilai": f"{db_index:.3f}" if db_index is not None else "N/A"}
    ]
    st.dataframe(pd.DataFrame(metrics_data), height=150, use_container_width=True)

    # --- Outlier summary ---
    # model.transform gives the distance to every centroid; keep the nearest one.
    distances = np.min(model.transform(data), axis=1)
    threshold = np.percentile(distances, 95) if len(distances) > 0 else np.inf
    outliers = np.where(distances > threshold)[0]
    outlier_percentage = (len(outliers) / n_samples) * 100 if n_samples > 0 else 0
    st.write("\n*Outlier Analysis:*")
    outlier_summary = [
        {"Metrik": "Jumlah Outlier", "Nilai": f"{len(outliers)} sampel"},
        {"Metrik": "Persentase Outlier", "Nilai": f"{outlier_percentage:.2f}%"}
    ]
    st.dataframe(pd.DataFrame(outlier_summary), height=100, use_container_width=True)

    # --- Outliers per cluster ---
    if len(outliers) > 0:
        outlier_counts = np.bincount(labels[outliers], minlength=n_clusters)
    else:
        # Integer zeros so the counts have the same dtype as the bincount branch.
        outlier_counts = np.zeros(n_clusters, dtype=int)
    st.write("\n*Distribusi Outlier per Klaster:*")
    outlier_dist_data = [
        {
            "Klaster": f"Klaster {i}",
            "Jumlah Outlier": count,
            "Persentase dari Klaster (%)": (count / cluster_counts[i]) * 100 if cluster_counts[i] > 0 else 0,
        }
        for i, count in enumerate(outlier_counts)
    ]
    st.dataframe(pd.DataFrame(outlier_dist_data), height=200, use_container_width=True)

    return outlier_counts, cluster_counts, outliers, distances[outliers]

def perform_clustering(data: np.ndarray, k_value: int) -> Tuple[np.ndarray, Optional[object]]:
    """Fit K-Means with ``k_value`` clusters and label every row of ``data``.

    Returns:
        ``(labels, fitted_model)`` on success. If fitting raises, the error is
        reported through ``st.error`` and ``(empty array, None)`` is returned.
    """
    try:
        # Fixed seed and 10 initialisations, matching the rest of the app.
        fitted = KMeans(n_clusters=k_value, random_state=42, n_init=10)
        assignments = fitted.fit_predict(data)
    except Exception as err:
        st.error(f"[Error] Gagal melakukan clustering: {str(err)}")
        return np.array([]), None
    else:
        return assignments, fitted

def get_clustering_diagnostics(labels: np.ndarray) -> int:
    """Return the number of distinct cluster labels present in ``labels``."""
    return len(np.unique(labels))

def plot_silhouette_analysis(data: np.ndarray, labels: np.ndarray, normalization: str) -> Tuple[Optional[go.Figure], Optional[str]]:
    """Build a per-cluster silhouette plot.

    Each cluster is drawn as one trace of its sorted per-sample silhouette
    coefficients, stacked vertically with a gap of 10 between clusters. A
    dashed red line marks the average silhouette score.

    Args:
        data: Feature matrix used for clustering.
        labels: Cluster label of each row of ``data``.
        normalization: Name of the normalisation method (used in the title only).

    Returns:
        ``(figure, None)`` on success, or ``(None, message)`` when fewer than
        two clusters exist or the plot could not be built.
    """
    try:
        unique_labels = np.unique(labels)
        n_unique_labels = len(unique_labels)
        if n_unique_labels < 2:
            return None, f"Hanya ditemukan {n_unique_labels} cluster. Dibutuhkan minimal 2 cluster untuk analisis silhouette."

        silhouette_vals = silhouette_samples(data, labels)
        # The average score is the mean of the per-sample coefficients, so
        # reuse them instead of recomputing all pairwise distances.
        silhouette_avg = float(np.mean(silhouette_vals))

        y_lower = 10
        fig = go.Figure()
        colors = px.colors.qualitative.Plotly

        for i, cluster in enumerate(sorted(unique_labels)):
            # Boolean indexing returns a copy, so the in-place sort is safe.
            cluster_silhouette_vals = silhouette_vals[labels == cluster]
            cluster_silhouette_vals.sort()
            y_upper = y_lower + cluster_silhouette_vals.shape[0]
            fig.add_trace(go.Scatter(
                x=cluster_silhouette_vals,
                y=np.arange(y_lower, y_upper),
                mode='lines+markers',
                name=f'Cluster {cluster}',
                line=dict(color=colors[i % len(colors)], width=2),
                marker=dict(size=4)
            ))
            y_lower = y_upper + 10  # vertical gap before the next cluster

        fig.add_shape(
            type="line",
            x0=silhouette_avg,
            x1=silhouette_avg,
            y0=0,
            y1=y_lower,
            line=dict(color="red", dash="dash")
        )

        fig.add_annotation(
            x=silhouette_avg,
            y=y_lower,
            text=f"Average Silhouette Score: {silhouette_avg:.4f}",
            showarrow=True,
            arrowhead=1
        )

        # Silhouette coefficients can be as low as -1; widen the default
        # [-0.1, 1.1] window when needed so badly assigned samples stay visible.
        x_min = min(-0.1, float(silhouette_vals.min()) - 0.05)

        fig.update_layout(
            title=f"Silhouette Analysis (K-Means dengan {normalization})",
            xaxis_title="Silhouette Coefficient",
            yaxis_title="Sample Index",
            showlegend=True,
            xaxis=dict(range=[x_min, 1.1]),
            yaxis=dict(tickvals=[], ticktext=[]),
            height=600
        )

        return fig, None
    except Exception as e:
        return None, f"Error saat membuat plot silhouette: {str(e)}"

def plot_elbow_method(data: np.ndarray, normalization: str) -> go.Figure:
    """Plot the within-cluster sum of squares (WCSS) for K = 1..10.

    K is capped at the number of samples, because K-Means cannot be fitted
    with more clusters than rows. This keeps small datasets from raising.

    Args:
        data: Normalised feature matrix.
        normalization: Name of the normalisation method (used in the title only).

    Returns:
        A Plotly figure with one WCSS point per evaluated K.
    """
    max_k = min(10, len(data))
    k_values = list(range(1, max_k + 1))

    wcss = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=k_values,
        y=wcss,
        mode='lines+markers',
        name='WCSS',
        line=dict(color='blue', width=2),
        marker=dict(size=8)
    ))

    fig.update_layout(
        title=f"Metode Elbow (Normalisasi: {normalization})",
        xaxis_title="Jumlah Kluster",
        yaxis_title="WCSS",
        showlegend=True,
        xaxis=dict(tickmode='linear', dtick=1),
        yaxis=dict(gridcolor='lightgray'),
        plot_bgcolor='white',
        height=500
    )

    return fig

def evaluate_features(data: pd.DataFrame, n_clusters: int = 4, normalization: str = "Min-Max Scaling") -> Tuple[List[str], float, float]:
    """Select features by leave-one-out Silhouette / Davies-Bouldin evaluation.

    Every feature is dropped in turn and K-Means is re-fitted on the rest. A
    feature becomes a removal candidate when dropping it yields both a higher
    silhouette score and a lower Davies-Bouldin index than the all-feature
    baseline. At most ``len(columns) - 6`` candidates are removed, and never a
    negative number, so frames with six or fewer features keep all of them.

    Args:
        data: Cleaned numeric feature frame.
        n_clusters: Number of clusters for every K-Means fit.
        normalization: Normalisation method name passed to ``normalize_data``.

    Returns:
        ``(selected_features, silhouette, davies_bouldin)`` for the final
        feature set. On any error the failure is reported through
        ``st.error`` and ``(all columns, 0.0, inf)`` is returned.
    """
    try:
        data_normalized, _ = normalize_data(data.values, normalization)
        data_normalized = pd.DataFrame(data_normalized, columns=data.columns)

        # Baseline: clustering quality with every feature present.
        baseline_kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(data_normalized)
        baseline_sil = silhouette_score(data_normalized, baseline_kmeans.labels_)
        baseline_db = davies_bouldin_score(data_normalized, baseline_kmeans.labels_)

        # Leave-one-feature-out evaluation.
        result = []
        for fitur in data_normalized.columns:
            subset = data_normalized.drop(columns=[fitur])
            labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit_predict(subset)
            result.append({
                'fitur_dihapus': fitur,
                'silhouette': silhouette_score(subset, labels),
                'davies_bouldin': davies_bouldin_score(subset, labels)
            })
        df_result = pd.DataFrame(result).sort_values(by=['silhouette', 'davies_bouldin'], ascending=[False, True])
        st.dataframe(df_result, height=300, use_container_width=True)

        # Clamp at zero: a negative count would make head()/iloc slice from
        # the end and silently drop almost every feature.
        max_features_to_remove = max(len(data.columns) - 6, 0)
        improves_both = (
            (df_result['silhouette'] > baseline_sil) &
            (df_result['davies_bouldin'] < baseline_db)
        )
        fitur_dihapus = df_result[improves_both]['fitur_dihapus'].head(max_features_to_remove).tolist()

        fitur_final = [f for f in data.columns if f not in fitur_dihapus]

        if len(fitur_final) < 4:
            st.warning("Seleksi fitur menghasilkan kurang dari 4 fitur. Memilih kombinasi terbaik...")
            fitur_dihapus = df_result['fitur_dihapus'].iloc[:max_features_to_remove].tolist()
            fitur_final = [f for f in data.columns if f not in fitur_dihapus]

        st.write(f"*Fitur yang dihapus:* {fitur_dihapus}")
        st.write(f"*Fitur tersisa:* {fitur_final}")
        st.write(f"*Jumlah fitur tersisa:* {len(fitur_final)}")

        # Re-evaluate the clustering on the selected features only.
        data_final = data[fitur_final]
        data_final_normalized, _ = normalize_data(data_final.values, normalization)
        kmeans_final = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(data_final_normalized)
        labels_final = kmeans_final.labels_
        sil_final = silhouette_score(data_final_normalized, labels_final)
        db_final = davies_bouldin_score(data_final_normalized, labels_final)

        return fitur_final, sil_final, db_final
    except Exception as e:
        st.error(f"[Error] Gagal melakukan seleksi fitur: {str(e)}")
        return data.columns.tolist(), 0.0, float('inf')

def correlation_filtering_auto(data: pd.DataFrame, thresholds: Optional[List[float]] = None, n_clusters: int = 4, normalization: str = "Min-Max Scaling") -> Tuple[List[str], float, float]:
    """Select features by dropping highly correlated columns.

    For each correlation threshold, every column whose absolute correlation
    with an earlier column exceeds the threshold is dropped, and K-Means is
    evaluated on the remainder (subsets with fewer than 4 features are
    skipped). The threshold with the highest ``silhouette - davies_bouldin``
    wins. If no threshold produces a valid subset, all features are kept.

    Args:
        data: Cleaned numeric feature frame.
        thresholds: Correlation thresholds to try; defaults to 0.50..0.90 in
            steps of 0.05.
        n_clusters: Number of clusters for every K-Means fit.
        normalization: Normalisation method name passed to ``normalize_data``.

    Returns:
        ``(selected_features, silhouette, davies_bouldin)`` for the final
        feature set. The scores fall back to ``(0.0, inf)`` if the final
        evaluation fails.
    """
    if thresholds is None:
        thresholds = np.arange(0.5, 0.91, 0.05)

    data_normalized, _ = normalize_data(data.values, normalization)
    data_normalized = pd.DataFrame(data_normalized, columns=data.columns)

    # The correlation matrix does not depend on the threshold, so compute its
    # strict upper triangle once instead of on every loop iteration.
    corr_matrix = data_normalized.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    def _columns_over(limit: float) -> List[str]:
        """Columns correlated above ``limit`` with any earlier column."""
        return [column for column in upper.columns if any(upper[column] > limit)]

    hasil_evaluasi = []
    st.write("*Mengevaluasi berbagai threshold korelasi...*")
    for threshold in thresholds:
        try:
            to_drop = _columns_over(threshold)
            data_filtered = data.drop(columns=to_drop)

            if data_filtered.shape[1] < 4:
                st.write(f"*Threshold:* {threshold:.2f} | *Dropped:* {len(to_drop)} | *Fitur Kurang dari 4*")
                continue

            data_filtered_normalized, _ = normalize_data(data_filtered.values, normalization)
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(data_filtered_normalized)
            sil = silhouette_score(data_filtered_normalized, labels)
            db = davies_bouldin_score(data_filtered_normalized, labels)

            hasil_evaluasi.append({
                'threshold': threshold,
                'n_features': data_filtered.shape[1],
                'n_dropped': len(to_drop),
                'silhouette': sil,
                'davies_bouldin': db
            })

            st.write(f"*Threshold:* {threshold:.2f} | *Dropped:* {len(to_drop)} | *S:* {sil:.3f} | *DBI:* {db:.3f}")
        except Exception as e:
            st.write(f"*Threshold:* {threshold:.2f} | *Error:* {str(e)}")

    df_eval = pd.DataFrame(hasil_evaluasi)
    st.dataframe(df_eval, height=300, use_container_width=True)

    # Stays None when no threshold produced a usable feature subset, so the
    # final summary can still be rendered without a NameError.
    best_threshold = None
    if df_eval.empty:
        st.warning("Tidak ada threshold yang menghasilkan fitur yang valid. Menggunakan semua fitur...")
        fitur_final = data.columns.tolist()
        data_final = data.copy()
    else:
        df_eval['total_score'] = df_eval['silhouette'] - df_eval['davies_bouldin']
        best_threshold = df_eval.sort_values(by='total_score', ascending=False).iloc[0]['threshold']
        st.write(f"\n*Threshold terbaik berdasarkan metrik:* {best_threshold:.2f}")
        data_final = data.drop(columns=_columns_over(best_threshold))
        fitur_final = data_final.columns.tolist()

    try:
        data_final_normalized, _ = normalize_data(data_final.values, normalization)
        kmeans_final = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(data_final_normalized)
        labels_final = kmeans_final.labels_
        sil_final = silhouette_score(data_final_normalized, labels_final)
        db_final = davies_bouldin_score(data_final_normalized, labels_final)
    except Exception as e:
        st.error(f"[Error] Gagal menghitung metrik akhir: {str(e)}")
        sil_final, db_final = 0.0, float('inf')

    threshold_label = "N/A" if best_threshold is None else best_threshold
    st.write(f"\n*Evaluasi Akhir (Threshold = {threshold_label}):*")
    st.write(f"*Jumlah fitur tersisa:* {data_final.shape[1]}")
    st.write(f"*Silhouette Score:* {sil_final:.3f}")
    st.write(f"*Davies-Bouldin Index:* {db_final:.3f}")

    return fitur_final, sil_final, db_final

def generate_recommendations(df: pd.DataFrame, data_final: pd.DataFrame, labels: np.ndarray, silhouette: Optional[float],
                            db_index: Optional[float], k_value: int, outlier_counts: np.ndarray, cluster_counts: np.ndarray,
                            outliers: np.ndarray, outlier_distances: np.ndarray, cleaned_indices: np.ndarray) -> List[str]:
    """Build the ten textual recommendations shown after clustering.

    Args:
        df: Original dataset; its optional ``Description`` column names items.
        data_final: Feature frame used for clustering (not read here; kept
            for interface compatibility).
        labels: Cluster label of each clustered sample.
        silhouette: Mean silhouette score, or ``None`` if unavailable.
        db_index: Davies-Bouldin index, or ``None`` if unavailable.
        k_value: Number of clusters requested.
        outlier_counts: Number of outliers per cluster.
        cluster_counts: Number of samples per cluster.
        outliers: Positions (within the clustered data) of the outliers.
        outlier_distances: Distance to the nearest centroid, one per outlier.
        cleaned_indices: Maps a position in the clustered data to a row of
            ``df``; the value is used positionally with ``df.iloc``.

    Returns:
        A list of exactly ten recommendation strings.
    """
    recommendations = []
    n_samples = len(labels)
    has_outliers = len(outliers) > 0

    outlier_percentages = [
        outlier_counts[i] / cluster_counts[i] * 100 if cluster_counts[i] > 0 else 0
        for i in range(k_value)
    ]
    if outlier_percentages:
        max_outlier_cluster = np.argmax(outlier_percentages)
        max_outlier_percentage = max(outlier_percentages)
    else:
        max_outlier_cluster, max_outlier_percentage = -1, 0

    # Build the outlier table once, whenever outliers exist, so every later
    # recommendation can rely on it regardless of k_value.
    outlier_df = None
    if has_outliers:
        has_description = 'Description' in df.columns
        item_names = []
        for idx in outliers:
            source_row = cleaned_indices[idx]
            if has_description and source_row < len(df):
                # str(): a missing or non-text description must not break the
                # ", ".join below.
                item_names.append(str(df.iloc[source_row]['Description']))
            else:
                item_names.append(f"Item_{source_row}")
        outlier_df = pd.DataFrame({
            "Index": outliers,
            "Nama Item": item_names,
            "Klaster": labels[outliers],
            "Jarak ke Centroid": [f"{d:.3f}" for d in outlier_distances]
        })
    outlier_table_available = outlier_df is not None and not outlier_df.empty

    # 1. Cluster with the highest outlier share.
    if max_outlier_cluster >= 0 and max_outlier_percentage > 0:
        recommendations.append(
            f"*Rekomendasi 1:* Periksa Klaster {max_outlier_cluster} karena memiliki persentase outlier tertinggi ({max_outlier_percentage:.2f}%)."
        )
    else:
        recommendations.append("*Rekomendasi 1:* Tidak ada klaster dengan outlier signifikan.")

    # 2. Example outlier items (up to three) from that cluster.
    if outlier_table_available and max_outlier_cluster >= 0:
        cluster_outliers = outlier_df[outlier_df["Klaster"] == max_outlier_cluster].head(3)
        outlier_items = ", ".join(cluster_outliers["Nama Item"].values) if not cluster_outliers.empty else "Tidak ada item"
        recommendations.append(
            f"*Rekomendasi 2:* Evaluasi ulang item outlier di Klaster {max_outlier_cluster} (contoh: {outlier_items})."
        )
    else:
        recommendations.append("*Rekomendasi 2:* Tidak ada outlier signifikan untuk dievaluasi.")

    # 3. Silhouette quality.
    if silhouette is not None and silhouette < 0.4:
        recommendations.append(
            f"*Rekomendasi 3:* Silhouette Score ({silhouette:.4f}) di bawah 0.4, menunjukkan kualitas clustering suboptimal. "
            f"Pertimbangkan menyesuaikan K (saat ini {k_value}) dengan K lebih besar (misalnya, {k_value + 1})."
        )
    else:
        recommendations.append("*Rekomendasi 3:* Kualitas clustering memadai berdasarkan Silhouette Score.")

    # 4. Davies-Bouldin separation.
    if db_index is not None and db_index > 1.0:
        recommendations.append(
            f"*Rekomendasi 4:* Davies-Bouldin Index ({db_index:.4f}) di atas 1.0, menunjukkan pemisahan klaster kurang optimal."
        )
    else:
        recommendations.append("*Rekomendasi 4:* Pemisahan klaster cukup baik berdasarkan Davies-Bouldin Index.")

    # 5. Domain knowledge, illustrated with the first outlier.
    if outlier_table_available:
        example_outlier = outlier_df.iloc[0]
        item_name = example_outlier["Nama Item"]
        cluster = example_outlier["Klaster"]
        recommendations.append(
            f"*Rekomendasi 5:* Gunakan pengetahuan domain untuk menangani outlier seperti '{item_name}' di Klaster {cluster}."
        )
    else:
        recommendations.append("*Rekomendasi 5:* Tidak ada outlier untuk dianalisis dengan pengetahuan domain.")

    # 6. Data collection, illustrated with the outlier farthest from its centroid.
    if outlier_table_available:
        # idxmax returns an index label, so look it up with .loc.
        farthest_label = outlier_df["Jarak ke Centroid"].astype(float).idxmax()
        outlier_with_high_distance = outlier_df.loc[farthest_label]
        item_name = outlier_with_high_distance["Nama Item"]
        recommendations.append(
            f"*Rekomendasi 6:* Tingkatkan pengumpulan data untuk item seperti '{item_name}' (jarak ke centroid: {outlier_with_high_distance['Jarak ke Centroid']})."
        )
    else:
        recommendations.append("*Rekomendasi 6:* Tidak ada outlier signifikan untuk meningkatkan pengumpulan data.")

    # 7. Overall outlier share.
    total_outlier_percentage = len(outliers) / n_samples * 100 if n_samples > 0 else 0
    if total_outlier_percentage > 5:
        recommendations.append(
            f"*Rekomendasi 7:* Persentase outlier total ({total_outlier_percentage:.2f}%) lebih dari 5%. "
            "Pertimbangkan metode normalisasi lain seperti RobustScaler."
        )
    else:
        recommendations.append("*Rekomendasi 7:* Persentase outlier total cukup rendah.")

    # 8. Interpretability of the worst cluster.
    if max_outlier_percentage > 5:
        recommendations.append(
            f"*Rekomendasi 8:* Klaster {max_outlier_cluster} memiliki outlier tinggi ({max_outlier_percentage:.2f}%), "
            "yang dapat mengurangi interpretabilitas klaster."
        )
    else:
        recommendations.append("*Rekomendasi 8:* Interpretabilitas klaster baik berdasarkan distribusi outlier.")

    # 9. Manual review.
    if has_outliers:
        recommendations.append(
            "*Rekomendasi 9:* Outlier mungkin mewakili item unik atau data salah label. "
            "Tinjau outlier secara manual untuk memastikan akurasi data."
        )
    else:
        recommendations.append("*Rekomendasi 9:* Tidak ada outlier yang terdeteksi, tidak perlu tinjauan manual.")

    # 10. Whether to increase K.
    if max_outlier_percentage > 5:
        recommendations.append(
            f"*Rekomendasi 10:* Klaster {max_outlier_cluster} memiliki persentase outlier tinggi ({max_outlier_percentage:.2f}%). "
            f"Pertimbangkan untuk meningkatkan K menjadi {k_value + 1} untuk mengelompokkan outlier ke klaster baru."
        )
    else:
        recommendations.append(
            f"*Rekomendasi 10:* Persentase outlier di semua klaster di bawah 5%. Tidak perlu menambah jumlah klaster (K={k_value})."
        )

    return recommendations

# --- Main Application ---
def main():
    """Fungsi utama untuk menjalankan aplikasi clustering Streamlit."""
    st.title("Clustering dengan K-Means")

    # Sidebar untuk input
    with st.sidebar:
        st.header("Konfigurasi Clustering")
        st.markdown("*Pilih parameter untuk analisis clustering:*")
        normalization = st.selectbox(
            "Pilih Normalisasi",
            ["Min-Max Scaling", "Z-Score"],
            help="Min-Max Scaling: [0,1]. Z-Score: mean=0, std=1."
        )
        feature_method = st.selectbox(
            "Pilih Metode Seleksi Fitur",
            ["Baseline", "Silhouette + Davies-Bouldin", "Threshold Correlation"],
            help="Baseline: Fitur manual. Silhouette + DB: Berdasarkan metrik clustering. Threshold Correlation: Berdasarkan korelasi."
        )
        uploaded_file = st.file_uploader(
            "Unggah file CSV (opsional)",
            type=["csv"],
            help="Unggah CSV dengan kolom numerik. Kolom 'Description' opsional untuk interpretasi."
        )

    # Main content
    st.subheader("1. Memuat Dataset")
    try:
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            st.write("Dataset diganti dengan file yang diunggah.")
        else:
            st.write("Memuat dataset default food.csv dari Kaggle...")
            dataset_path = kagglehub.dataset_download("shrutisaxena/food-nutrition-dataset")
            file_path = os.path.join(dataset_path, "food.csv")
            df = pd.read_csv(file_path)
            st.write(f"Dataset default berhasil dimuat dari: {file_path}")

        if df.empty:
            st.error("Dataset kosong! Silakan unggah dataset yang valid atau periksa dataset default.")
            st.stop()

        # Tampilkan dataset yang dimuat
        st.subheader("2. Data yang Dimuat")
        st.write("*Data yang dimuat:*")
        st.dataframe(df, height=300, use_container_width=True)

        # Tampilkan statistik deskriptif
        st.write("*Statistik Deskriptif Data:*")
        st.dataframe(df.describe(), height=300, use_container_width=True)

        # -----------------------------------------------
        # 3. TARGET DATA
        st.subheader("3. Target Data")
        if uploaded_file is None:  # Dataset bawaan dari Kaggle
            kolom_fitur = [
                'Data.Carbohydrate', 'Data.Protein', 'Data.Fat.Total Lipid', 'Data.Kilocalories',
                'Data.Fiber', 'Data.Sugar Total', 'Data.Major Minerals.Calcium',
                'Data.Major Minerals.Iron', 'Data.Vitamins.Vitamin C',
                'Data.Vitamins.Vitamin E', 'Data.Major Minerals.Sodium',
                'Data.Cholesterol'
            ]
            if not all(col in df.columns for col in kolom_fitur):
                st.error("Beberapa kolom fitur tidak ditemukan dalam dataset Kaggle. Periksa nama kolom.")
                st.stop()
            data_pilih_fitur = df[kolom_fitur].copy()
            data_deskripsi = df[['Description']].copy() if 'Description' in df.columns else pd.DataFrame()
            st.write(f"Ukuran dataset setelah pemilihan fitur: {data_pilih_fitur.shape} (baris, kolom)")
            st.write("*Fitur yang digunakan untuk dataset Kaggle:*", kolom_fitur)
        else:  # Dataset yang diunggah
            numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
            if len(numeric_columns) < 2:
                st.error("Dataset harus memiliki setidaknya 2 kolom numerik untuk clustering!")
                st.stop()
            selected_features = st.multiselect(
                "Pilih fitur yang akan digunakan",
                numeric_columns,
                default=numeric_columns[:min(len(numeric_columns), 12)],
                help="Pilih minimal 2 fitur numerik."
            )
            if not selected_features or len(selected_features) < 2:
                st.error("Pilih setidaknya 2 fitur untuk clustering!")
                st.stop()
            data_pilih_fitur = df[selected_features].copy()
            data_deskripsi = df[['Description']].copy() if 'Description' in df.columns else pd.DataFrame()
            st.write(f"Ukuran dataset setelah pemilihan fitur: {data_pilih_fitur.shape} (baris, kolom)")
            st.write("*Fitur yang dipilih pengguna:*", selected_features)

        # -----------------------------------------------
        # 4. PEMBERSIHAN DATA
        st.subheader("4. Pembersihan Data")
        initial_rows = data_pilih_fitur.shape[0]
        st.write(f"*Jumlah baris awal:* {initial_rows}")

        # Periksa duplikat awal
        initial_duplicates = data_pilih_fitur.duplicated().sum()
        st.write(f"*Jumlah duplikat awal dalam fitur terpilih:* {initial_duplicates}")

        st.write("Mengkonversi kolom ke tipe numerik...")
        for kolom in data_pilih_fitur.columns:
            data_pilih_fitur[kolom] = pd.to_numeric(data_pilih_fitur[kolom], errors='coerce')

        data_bersih = data_pilih_fitur.dropna()
        nan_removed = initial_rows - data_bersih.shape[0]
        st.write(f"Jumlah baris yang dihapus karena NaN: {nan_removed}")

        data_bersih = data_bersih[(data_bersih >= 0).all(axis=1)]
        neg_removed = initial_rows - nan_removed - data_bersih.shape[0]
        st.write(f"Jumlah baris yang dihapus karena nilai negatif: {neg_removed}")

        data_bersih_before_dedup = data_bersih.copy()
        data_bersih = data_bersih.drop_duplicates()
        dup_removed = data_bersih_before_dedup.shape[0] - data_bersih.shape[0]
        st.write(f"Jumlah baris yang dihapus karena duplikat: {dup_removed}")

        st.write(f"*Jumlah baris setelah pembersihan:* {data_bersih.shape[0]}")
        st.dataframe(data_bersih, height=300, use_container_width=True)

        if data_bersih.empty:
            st.error("Dataset kosong setelah pembersihan.")
            st.stop()

        if data_bersih.shape[1] < 2:
            st.error("Dataset harus memiliki setidaknya 2 fitur numerik setelah pembersihan!")
            st.stop()

        cleaned_indices = data_bersih.index
        X = data_bersih.values
        X_normalized_elbow, _ = normalize_data(X, normalization)

        st.write("\n*Validasi Normalisasi:*")
        if normalization == "Min-Max Scaling":
            st.write("Rentang data setelah normalisasi (harus [0,1]):")
            st.write(f"Min: {X_normalized_elbow.min():.2f}, Max: {X_normalized_elbow.max():.2f}")
        else:
            st.write("Statistik data setelah normalisasi (harus mean~0, std~1):")
            st.write(f"Mean: {X_normalized_elbow.mean():.2e}, Std: {X_normalized_elbow.std():.2f}")

        # -----------------------------------------------
        # 5. VALIDASI JUMLAH KLASTER (ELBOW METHOD)
        st.subheader("5. Validasi Jumlah Klaster (Elbow Method)")
        st.write("*Menghitung WCSS untuk menentukan jumlah klaster optimal...*")
        elbow_fig = plot_elbow_method(X_normalized_elbow, normalization)
        st.plotly_chart(elbow_fig)
        st.write("*Instruksi:* Lihat plot di atas untuk menentukan 'elbow point'.")

        k_value = st.number_input("Masukkan jumlah K (minimal 2)", min_value=2, max_value=10, value=4, step=1)

        if not st.button("Lanjutkan Clustering dengan K yang Dipilih"):
            st.stop()

        # -----------------------------------------------
        # 6. SELEKSI FITUR
        st.subheader("6. Seleksi Fitur")
        data_final = data_bersih.copy()
        fitur_final = data_bersih.columns.tolist()

        if feature_method == "Baseline":
            data_normalized, _ = normalize_data(data_bersih.values, normalization)
            baseline_kmeans = KMeans(n_clusters=k_value, random_state=42, n_init=10).fit(data_normalized)
            baseline_sil = silhouette_score(data_normalized, baseline_kmeans.labels_)
            baseline_db = davies_bouldin_score(data_normalized, baseline_kmeans.labels_)
            st.write(f"*Baseline (Fitur Terpilih) -> Silhouette: {baseline_sil:.3f}, Davies-Bouldin: {baseline_db:.3f}*")

        if feature_method == "Baseline":
            st.write("*Metode: Baseline*")
            if st.button("Lakukan Seleksi Fitur Baseline"):
                fitur_sil_db, sil_sil_db, db_sil_db = evaluate_features(data_bersih, n_clusters=k_value, normalization=normalization)
                fitur_corr, sil_corr, db_corr = correlation_filtering_auto(data_bersih, n_clusters=k_value, normalization=normalization)
                if sil_sil_db > sil_corr or (sil_sil_db == sil_corr and db_sil_db < db_corr):
                    st.write("*Metode Terpilih: Silhouette + Davies-Bouldin*")
                    fitur_final = fitur_sil_db
                else:
                    st.write("*Metode Terpilih: Threshold Correlation*")
                    fitur_final = fitur_corr
        elif feature_method == "Silhouette + Davies-Bouldin":
            st.write("*Metode: Silhouette + Davies-Bouldin Index*")
            fitur_final, _, _ = evaluate_features(data_bersih, n_clusters=k_value, normalization=normalization)
        elif feature_method == "Threshold Correlation":
            st.write("*Metode: Threshold Correlation*")
            fitur_final, _, _ = correlation_filtering_auto(data_bersih, n_clusters=k_value, normalization=normalization)

        data_final = data_bersih[fitur_final]
        st.write(f"*Jumlah fitur yang digunakan:* {len(fitur_final)}")
        st.write(f"*Fitur yang digunakan:* {fitur_final}")

        # -----------------------------------------------
        # 7. TRANSFORMASI DATA
        st.subheader("7. Transformasi Data")
        st.write("Menangani outlier dengan pembatasi pada persentil ke-99...")
        for kolom in data_final.columns:
            batas_atas = data_final[kolom].quantile(0.99)
            data_final[kolom] = data_final[kolom].clip(upper=batas_atas)

        X = data_final.values
        X_normalized, scaler = normalize_data(X, normalization)
        labels, model = perform_clustering(X_normalized, k_value)
        if labels.size == 0:
            st.error("Clustering gagal menghasilkan label. Periksa data atau parameter.")
            st.stop()
        data_final['Cluster'] = labels

        n_clusters = get_clustering_diagnostics(labels)
        silhouette = calculate_silhouette_score(X_normalized, labels)
        db_index = calculate_davies_bouldin_score(X_normalized, labels)

        # -----------------------------------------------
        # 8. HASIL CLUSTERING
        st.subheader("8. Hasil Clustering")
        st.write(f"*Algoritma*: K-Means")
        st.write(f"*Normalisasi*: {normalization}")
        st.write(f"*Jumlah Klaster (K)*: {k_value}")
        st.write(f"*Jumlah Klaster Terbentuk*: {n_clusters}")
        outlier_counts, cluster_counts, outliers, outlier_distances = analisis_kmeans(X_normalized, labels, model)

        st.write("*Akurasi (Evaluasi):*")
        if silhouette is not None:
            st.write(f"- Silhouette Score: {silhouette:.4f}")
        else:
            st.write("- Silhouette Score: Tidak dapat dihitung")
        if db_index is not None:
            st.write(f"- Davies-Bouldin Index: {db_index:.4f}")
        else:
            st.write("- Davies-Bouldin Index: Tidak dapat dihitung")

        # -----------------------------------------------
        # 9. SILHOUETTE ANALYSIS
        st.subheader("9. Silhouette Analysis")
        fig, error_msg = plot_silhouette_analysis(X_normalized, labels, normalization)
        if fig is not None:
            st.plotly_chart(fig)
        else:
            st.warning(f"Silhouette Analysis tidak dapat ditampilkan: {error_msg}")

        # -----------------------------------------------
        # 10. INSTANCE ERROR (OUTLIER) ANALYSIS
        st.subheader("10. Instance Error (Outlier) Analysis")
        st.write("\n*Detail Instance Error (Outlier):*")
        distances = np.zeros(len(labels))
        cluster_centers = model.cluster_centers_
        for i in range(len(labels)):
            cluster = labels[i]
            distances[i] = np.linalg.norm(X_normalized[i] - cluster_centers[cluster])
        threshold = np.percentile(distances, 95) if len(distances) > 0 else np.inf
        outliers = np.where(distances > threshold)[0]

        if len(outliers) > 0:
            st.write("*Daftar item yang dianggap outlier:*")
            outlier_details = []
            unique_clusters = range(n_clusters)
            for idx in outliers:
                orig_idx = cleaned_indices[idx]
                dists = []
                for cluster in unique_clusters:
                    cluster_points = X_normalized[labels == cluster]
                    if len(cluster_points) > 0:
                        center = np.mean(cluster_points, axis=0)
                        dist = np.linalg.norm(X_normalized[idx] - center)
                        dists.append(dist)
                nearest_cluster = unique_clusters[np.argmin(dists)] if dists else -1
                distance = min(dists) if dists else float('nan')
                item_name = df.iloc[orig_idx]['Description'] if 'Description' in df.columns and orig_idx < len(df) else f"Item_{orig_idx}"
                outlier_details.append({
                    "Index": idx,
                    "Nama Item": item_name,
                    "Klaster": nearest_cluster,
                    "Jarak ke Centroid": f"{distance:.3f}"
                })

            outlier_df = pd.DataFrame(outlier_details)
            st.dataframe(outlier_df, height=300, use_container_width=True)

            outlier_output = io.StringIO()
            outlier_df.to_csv(outlier_output, index=False)
            st.download_button(
                label="Download seluruh data outlier sebagai CSV",
                data=outlier_output.getvalue(),
                file_name="outlier_data.csv",
                mime="text/csv"
            )
        else:
            st.write("*Tidak ada outlier yang ditemukan.*")

        # -----------------------------------------------
        # 11. INTERPRETASI KUALITATIF: TAMPILAN CONTOH MAKANAN
        st.subheader("11. Interpretasi Kualitatif: Tampilan Contoh Makanan per Klaster")
        kolom_fitur = data_final.columns.tolist()
        if 'Cluster' in kolom_fitur:
            kolom_fitur.remove('Cluster')

        st.write("\n*Contoh Makanan per Klaster:*")
        unique_clusters = range(k_value)

        for cluster in unique_clusters:
            st.write(f"\n*Cluster {cluster}:*")
            cluster_data = data_final[data_final['Cluster'] == cluster]
            display_columns = ['Description'] + kolom_fitur if 'Description' in df.columns else kolom_fitur
            if 'Description' in df.columns:
                cluster_data_with_desc = cluster_data.copy()
                cluster_data_with_desc['Description'] = df.loc[cluster_data.index, 'Description']
                st.dataframe(
                    cluster_data_with_desc[display_columns],
                    height=300,
                    use_container_width=True
                )
            else:
                st.dataframe(
                    cluster_data[display_columns],
                    height=300,
                    use_container_width=True
                )

            cluster_output = io.StringIO()
            cluster_data.to_csv(cluster_output, index=False)
            st.download_button(
                label=f"Download data Cluster {cluster} sebagai CSV",
                data=cluster_output.getvalue(),
                file_name=f"cluster_{cluster}_data.csv",
                mime="text/csv"
            )

        # -----------------------------------------------
        # 12. REKOMENDASI
        st.subheader("12. Rekomendasi Berdasarkan Hasil Clustering dan Outlier")
        recommendations = generate_recommendations(
            df, data_final, labels, silhouette, db_index, k_value, outlier_counts, cluster_counts, outliers, outlier_distances, cleaned_indices
        )
        for rec in recommendations:
            st.write(rec)

        # -----------------------------------------------
        # 13. DOWNLOAD HASIL CLUSTERING KESELURUHAN
        st.subheader("13. Download Hasil Clustering Keseluruhan")
        result_df = df.loc[cleaned_indices].copy()
        result_df['Cluster'] = labels
        st.dataframe(result_df, height=300, use_container_width=True)
        output = io.StringIO()
        result_df.to_csv(output, index=False)
        st.download_button(
            label="Download hasil clustering keseluruhan",
            data=output.getvalue(),
            file_name="clustering_result.csv",
            mime="text/csv"
        )
    except Exception as e:
        st.error(f"Terjadi kesalahan saat memproses data: {str(e)}")

# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing clustering_app.py


In [None]:
from pyngrok import ngrok
import threading
import time
import subprocess
import socket

# Wait until the Streamlit port accepts TCP connections.
def wait_for_port(host, port, timeout=60):
    """Poll ``host:port`` until a TCP connection succeeds or ``timeout`` elapses.

    Args:
        host: Host name or IP address to connect to.
        port: TCP port number to probe.
        timeout: Maximum number of seconds to keep trying.

    Returns:
        True as soon as one connection attempt succeeds, False if the
        deadline passes without a successful connection.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        try:
            # Never let a single attempt outlive the overall deadline.
            with socket.create_connection((host, port), timeout=min(2, remaining)):
                return True
        except OSError:
            # Pause between attempts, but never sleep past the deadline.
            time.sleep(max(0.0, min(1, deadline - time.monotonic())))

# Start the Streamlit server for the generated clustering app.
def start_streamlit():
    """Spawn ``streamlit run`` on the app file without waiting for it to exit."""
    command = ['streamlit', 'run', '/content/clustering_app.py']
    subprocess.Popen(command)

# Launch Streamlit from a separate thread so this cell keeps executing.
threading.Thread(target=start_streamlit).start()

# Wait until Streamlit accepts connections, then report the outcome.
streamlit_ready = wait_for_port('localhost', 8501)
print(
    "Streamlit sudah siap di port 8501"
    if streamlit_ready
    else "Timeout: Gagal menunggu Streamlit berjalan"
)

# Look for a tunnel that already forwards to port 8501 (None when absent).
existing_tunnel = next(
    (
        tunnel
        for tunnel in ngrok.get_tunnels()
        if "http://localhost:8501" in tunnel.config.get("addr", "")
    ),
    None,
)

# Reuse that tunnel when found; otherwise open a new one.
public_url = existing_tunnel if existing_tunnel else ngrok.connect(8501, bind_tls=True)

# Show the public URL.
print(f"\nAkses aplikasimu di: {public_url.public_url}")
print("Jika muncul halaman peringatan ngrok, klik 'Visit Site' untuk melanjutkan ke aplikasi Streamlit.")

Streamlit sudah siap di port 8501

Akses aplikasimu di: https://98b0-34-125-188-6.ngrok-free.app
Jika muncul halaman peringatan ngrok, klik 'Visit Site' untuk melanjutkan ke aplikasi Streamlit.
