# **SETUP DAN IMPORT LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                           f1_score, precision_score, recall_score)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

!pip install colorama -q

# Set style
# Use matplotlib's 'default' style sheet and seaborn's "husl" colour palette
# for any matplotlib/seaborn figures created later in the notebook.
plt.style.use('default')
sns.set_palette("husl")

# Banner messages confirming that the setup cell ran to completion.
print("✅ All libraries imported successfully!")
print("📊 Creating Interactive Visualization Dashboard...")


✅ All libraries imported successfully!
📊 Creating Interactive Visualization Dashboard...


# **LOAD AND PREPARE DATA**

In [None]:
# Data loading and preparation cell.
#
# Produces the notebook-level variables used by the later cells:
#   df_original, df, categorical_cols, numerical_cols, target_col,
#   categorical_features, X, y, le_target, target_classes
print("\n" + "="*60)
print("📁 DATA LOADING AND PREPARATION")
print("="*60)

# Upload dataset (Google Colab only)
from google.colab import files

print("📁 Please upload your AI Assistant Usage dataset:")
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded (e.g. the dialog was
# dismissed) instead of an opaque IndexError on the filename lookup below.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and select a CSV file.")

filename = next(iter(uploaded))
df_original = pd.read_csv(filename)

# Comprehensive data preparation (work on a copy; keep the raw upload intact)
df = df_original.copy()

# Handle missing values: text columns get an explicit 'Unknown' category,
# every other column is filled with its median.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('Unknown')
    else:
        df[col] = df[col].fillna(df[col].median())

df = df.drop_duplicates()

print(f"✅ Dataset prepared: {df.shape[0]} rows × {df.shape[1]} columns")

# Identify columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Identify target variable.
# First choice: a low-cardinality text column whose name looks like a label.
target_keywords = ('category', 'type', 'class', 'usage', 'purpose', 'label')
potential_targets = [
    col for col in categorical_cols
    if 2 <= df[col].nunique() <= 20
    and any(keyword in col.lower() for keyword in target_keywords)
]

# Second choice: the first text column with between 2 and 10 distinct values.
if not potential_targets:
    fallback_target = next(
        (col for col in categorical_cols if 2 <= df[col].nunique() <= 10),
        None,
    )
    if fallback_target is not None:
        potential_targets.append(fallback_target)

# Last resort: a synthetic random target so the rest of the notebook can run.
# Seeded so that re-running the cell yields the same labels.
if not potential_targets:
    rng = np.random.default_rng(42)
    df['usage_pattern'] = rng.choice(['Academic', 'Personal', 'Creative'], size=len(df))
    potential_targets = ['usage_pattern']

target_col = potential_targets[0]
print(f"🎯 Target variable: {target_col}")

# Feature engineering for visualization
feature_dfs = []

# Text columns with one distinct value per row are pure identifiers (e.g. a
# session id). Label-encoding them yields an arbitrary ordinal number that a
# tree model can memorise, so they are left out of the feature set.
n_rows = len(df)
id_like_cols = [
    col for col in categorical_cols
    if col != target_col and n_rows > 1 and df[col].nunique() == n_rows
]
if id_like_cols:
    print(f"ℹ️ Skipping identifier-like columns: {id_like_cols}")

categorical_features = [
    col for col in categorical_cols
    if col != target_col and col not in id_like_cols
]

for col in categorical_features:
    if df[col].nunique() <= 10:
        # Low cardinality: one-hot encode (first level dropped).
        encoded = pd.get_dummies(df[col], prefix=col, drop_first=True)
        feature_dfs.append(encoded)
    else:
        # High cardinality: fall back to a single integer-coded column.
        le = LabelEncoder()
        encoded_series = pd.Series(le.fit_transform(df[col].astype(str)),
                                 name=f"{col}_encoded", index=df.index)
        feature_dfs.append(pd.DataFrame(encoded_series))

if numerical_cols:
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[numerical_cols])
    scaled_df = pd.DataFrame(scaled_data,
                           columns=[f"{col}_scaled" for col in numerical_cols],
                           index=df.index)
    feature_dfs.append(scaled_df)

# Assemble the feature matrix; guard against NaN/inf left over from scaling.
X = pd.concat(feature_dfs, axis=1) if feature_dfs else pd.DataFrame(index=df.index)
X = X.fillna(0).replace([np.inf, -np.inf], 0)

# Prepare target: integer codes in y, original labels in target_classes.
le_target = LabelEncoder()
y = le_target.fit_transform(df[target_col])
target_classes = le_target.classes_

print(f"📊 Visualization ready: {X.shape[0]} samples × {X.shape[1]} features")



📁 DATA LOADING AND PREPARATION
📁 Please upload your AI Assistant Usage dataset:


Saving ai_assistant_usage_student_life.csv to ai_assistant_usage_student_life (1).csv
✅ Dataset prepared: 10000 rows × 11 columns
🎯 Target variable: TaskType
📊 Visualization ready: 10000 samples × 17 features


# **1. DATASET OVERVIEW DASHBOARD**

Kode ini membuat **dashboard visualisasi interaktif** untuk memberikan gambaran lengkap tentang dataset Anda. Berikut ringkasannya:

1. **Tata letak dashboard**:
   Terdiri dari 3 baris dan 3 kolom subplot, dengan berbagai jenis grafik seperti indikator angka, bar chart, pie chart, box plot, dan heatmap.

2. **Isi dashboard**:

   * **Total Sampel & Fitur** (indikator angka dengan gauge)
   * **Distribusi Target** (bar chart jumlah tiap kelas target)
   * **Tipe Fitur** (pie chart kategori fitur: numerik vs kategorikal)
   * **Nilai Hilang (Missing Values)** (bar chart kolom yang memiliki nilai kosong)
   * **Tipe Data** (bar chart jumlah tipe data di kolom dataset)
   * **Analisis Keseimbangan Kelas** (bar chart dengan rasio balance kelas target)
   * **Statistik Fitur Numerik** (box plot untuk 3 fitur numerik pertama)
   * **Fitur Kategorikal** (bar chart top 10 kategori pada salah satu fitur kategorikal)
   * **Korelasi Fitur Numerik** (heatmap korelasi antar fitur numerik jika ada lebih dari satu)

3. **Tujuan**:
   Menyajikan visualisasi interaktif yang memudahkan eksplorasi data awal, deteksi masalah (seperti missing values, ketidakseimbangan kelas), dan insight tentang struktur data.

4. **Output**:
   Dashboard berukuran besar (tinggi 1200 px) yang bisa langsung ditampilkan (biasanya di notebook Jupyter atau aplikasi web).



In [None]:
# Dataset overview dashboard: a 3x3 grid of Plotly subplots summarising the
# prepared dataframe `df` (size, target distribution, dtypes, class balance,
# numeric/categorical feature snapshots and numeric correlations).
print("\n" + "="*60)
print("📊 1. DATASET OVERVIEW DASHBOARD")
print("="*60)

# Create comprehensive overview
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=(
        'Dataset Summary', 'Target Distribution', 'Feature Types',
        'Missing Values', 'Data Types', 'Class Balance',
        'Numerical Features Stats', 'Categorical Features', 'Feature Correlation'
    ),
    specs=[[{'type': 'indicator'}, {'type': 'bar'}, {'type': 'pie'}],
           [{'type': 'bar'}, {'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'box'}, {'type': 'bar'}, {'type': 'heatmap'}]]
)

# 1.1 Dataset Summary
fig.add_trace(go.Indicator(
    mode="number+gauge",
    value=len(df),
    title={'text': f"Total Samples<br>{df.shape[1]} Features"},
    gauge={'shape': "bullet", 'axis': {'range': [None, len(df)*1.2]}},
), row=1, col=1)

# 1.2 Target Distribution
target_counts = df[target_col].value_counts()
fig.add_trace(go.Bar(
    x=target_counts.index,
    y=target_counts.values,
    name='Target Distribution',
    marker_color='lightblue'
), row=1, col=2)

# 1.3 Feature Types
feature_type_counts = pd.Series({
    'Categorical': len(categorical_cols),
    'Numerical': len(numerical_cols)
})
fig.add_trace(go.Pie(
    labels=feature_type_counts.index,
    values=feature_type_counts.values,
    name='Feature Types'
), row=1, col=3)

# 1.4 Missing Values
# Count on the raw upload: `df` has already been imputed during preparation,
# so counting there would always give zero and this panel could never show
# which columns originally had gaps.
missing_counts = df_original.isnull().sum()
missing_cols = missing_counts[missing_counts > 0]
if len(missing_cols) > 0:
    fig.add_trace(go.Bar(
        x=missing_cols.index,
        y=missing_cols.values,
        name='Missing Values',
        marker_color='red'
    ), row=2, col=1)

# 1.5 Data Types
dtype_counts = df.dtypes.value_counts()
fig.add_trace(go.Bar(
    x=[str(dt) for dt in dtype_counts.index],
    y=dtype_counts.values,
    name='Data Types',
    marker_color='green'
), row=2, col=2)

# 1.6 Class Balance Analysis
# Same per-class counts as panel 1.2; the min/max ratio goes into the trace name.
class_balance = target_counts
balance_ratio = class_balance.min() / class_balance.max()
fig.add_trace(go.Bar(
    x=class_balance.index,
    y=class_balance.values,
    name=f'Balance Ratio: {balance_ratio:.2f}',
    marker_color='orange'
), row=2, col=3)

# 1.7 Numerical Features Box Plot
if numerical_cols:
    for col in numerical_cols[:3]:  # Show first 3 numerical columns
        fig.add_trace(go.Box(
            y=df[col],
            name=col,
            showlegend=False
        ), row=3, col=1)

# 1.8 Categorical Features (ten most frequent values of the first one)
if categorical_features:
    cat_col = categorical_features[0]
    cat_counts = df[cat_col].value_counts().head(10)
    fig.add_trace(go.Bar(
        x=cat_counts.values,
        y=cat_counts.index,
        orientation='h',
        name=f'Top {cat_col}',
        marker_color='purple'
    ), row=3, col=2)

# 1.9 Feature Correlation (needs at least two numerical features)
if len(numerical_cols) > 1:
    corr_matrix = df[numerical_cols].corr()
    fig.add_trace(go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='RdBu',
        showscale=False
    ), row=3, col=3)

fig.update_layout(height=1200, title_text="📊 Dataset Overview Dashboard", showlegend=False)
fig.show()



📊 1. DATASET OVERVIEW DASHBOARD


# **2. MODEL PERFORMANCE DASHBOARD**

Kode ini membuat **dashboard performa model ML** lengkap dengan visualisasi dan evaluasi. Ringkasannya:

1. **Model yang dilatih:**

   * Random Forest
   * Gradient Boosting
   * Logistic Regression
   * SVM (dengan probabilitas)

2. **Data dibagi** menjadi data latih (train) dan data uji (test) dengan rasio 80%-20% secara stratified.

3. **Evaluasi tiap model** dihitung metrik: accuracy, F1-score, precision, recall.

4. **Dashboard visual** terdiri dari 6 plot:

   * Bar chart akurasi tiap model
   * Bar chart F1-score tiap model
   * Scatter plot precision vs recall tiap model dengan warna sesuai F1-score
   * Heatmap confusion matrix model terbaik
   * Bar chart feature importance (jika model mendukung, misal Random Forest)
   * Histogram distribusi confidence prediksi model terbaik

5. **Output di console:**

   * Nama model terbaik berdasar F1-score
   * Nilai F1-score dan akurasi model terbaik

In [None]:
# Model performance dashboard.
#
# Trains four classifiers on an 80/20 stratified split of (X, y), scores them
# with weighted metrics and plots a 2x3 comparison grid. Leaves behind the
# notebook-level variables used by later cells: model_results,
# best_model_name, best_model_results, best_model, prediction_confidence and
# (only when the best model exposes importances) feature_importance.
print("\n" + "="*60)
print("🤖 2. MODEL PERFORMANCE DASHBOARD")
print("="*60)

# Train models for comparison
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train and evaluate models. A failing model is reported and skipped so that
# the remaining ones can still be compared.
model_results = {}
for name, model in models.items():
    print(f"Training {name}...")
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        model_results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred, average='weighted'),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba,
            'model': model
        }
    except Exception as e:
        print(f"Error training {name}: {e}")

# Without at least one trained model nothing below can be computed; stop with
# an explicit message rather than "max() arg is an empty sequence".
if not model_results:
    raise RuntimeError("No model could be trained - see the errors printed above.")

# Create performance comparison dashboard
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        'Model Accuracy Comparison', 'F1-Score Comparison', 'Precision vs Recall',
        'Confusion Matrix (Best Model)', 'Feature Importance', 'Prediction Confidence'
    ),
    specs=[[{'type': 'bar'}, {'type': 'bar'}, {'type': 'scatter'}],
           [{'type': 'heatmap'}, {'type': 'bar'}, {'type': 'histogram'}]]
)

# 2.1 Model Accuracy
model_names = list(model_results.keys())
accuracies = [model_results[name]['accuracy'] for name in model_names]
fig.add_trace(go.Bar(
    x=model_names,
    y=accuracies,
    name='Accuracy',
    marker_color='lightgreen',
    text=[f'{acc:.3f}' for acc in accuracies],
    textposition='auto'
), row=1, col=1)

# 2.2 F1-Score Comparison
f1_scores = [model_results[name]['f1_score'] for name in model_names]
fig.add_trace(go.Bar(
    x=model_names,
    y=f1_scores,
    name='F1-Score',
    marker_color='lightcoral',
    text=[f'{f1:.3f}' for f1 in f1_scores],
    textposition='auto'
), row=1, col=2)

# 2.3 Precision vs Recall (marker colour encodes the F1-score)
precisions = [model_results[name]['precision'] for name in model_names]
recalls = [model_results[name]['recall'] for name in model_names]
fig.add_trace(go.Scatter(
    x=precisions,
    y=recalls,
    mode='markers+text',
    text=model_names,
    textposition='top center',
    marker=dict(size=10, color=f1_scores, colorscale='viridis', showscale=True),
    name='Precision vs Recall'
), row=1, col=3)

# Find best model (highest weighted F1-score)
best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['f1_score'])
best_model_results = model_results[best_model_name]

# 2.4 Confusion Matrix for best model
# Pass the full label set explicitly so the matrix always has one row/column
# per entry of target_classes, even if a class is absent from y_test and the
# predictions.
class_labels = np.arange(len(target_classes))
cm = confusion_matrix(y_test, best_model_results['y_pred'], labels=class_labels)
fig.add_trace(go.Heatmap(
    z=cm,
    x=target_classes,
    y=target_classes,
    colorscale='Blues',
    text=cm,
    texttemplate="%{text}",
    showscale=False
), row=2, col=1)
# Rows of cm are actual classes and columns are predicted classes. Reverse the
# y axis so the first class is drawn at the top, as in a printed matrix.
fig.update_xaxes(title_text="Predicted", row=2, col=1)
fig.update_yaxes(title_text="Actual", autorange="reversed", row=2, col=1)

# 2.5 Feature Importance (only for models exposing feature_importances_)
best_model = best_model_results['model']
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)

    fig.add_trace(go.Bar(
        x=feature_importance['importance'],
        y=feature_importance['feature'],
        orientation='h',
        name='Feature Importance',
        marker_color='gold'
    ), row=2, col=2)

# 2.6 Prediction Confidence Distribution
# Confidence of a prediction = highest class probability for that sample.
prediction_confidence = np.max(best_model_results['y_pred_proba'], axis=1)
fig.add_trace(go.Histogram(
    x=prediction_confidence,
    name='Confidence Distribution',
    marker_color='mediumpurple',
    nbinsx=20
), row=2, col=3)

fig.update_layout(height=1000, title_text=f"🤖 Model Performance Dashboard (Best: {best_model_name})", showlegend=False)
fig.show()

print(f"🏆 Best Model: {best_model_name}")
print(f"📊 Best F1-Score: {best_model_results['f1_score']:.4f}")
print(f"🎯 Best Accuracy: {best_model_results['accuracy']:.4f}")


🤖 2. MODEL PERFORMANCE DASHBOARD
Training Random Forest...
Training Gradient Boosting...
Training Logistic Regression...
Training SVM...


🏆 Best Model: Random Forest
📊 Best F1-Score: 0.2363
🎯 Best Accuracy: 0.2695


# **3. DETAILED CLASSIFICATION ANALYSIS**

Kode ini membuat dashboard analisis klasifikasi detail untuk model terbaik Anda, dengan fitur berikut:

* **Grafik bar**: precision, recall, dan f1-score per kelas, plus jumlah support (data aktual per kelas) sebagai garis dan titik merah.
* **Heatmap** metrik precision, recall, f1-score per kelas.
* **Grafik bar** perbandingan jumlah prediksi vs aktual per kelas.
* **Boxplot** distribusi confidence model untuk prediksi tiap kelas.



In [None]:
# Detailed classification analysis for the best model found in the model
# performance cell: per-class metrics, a metrics heatmap, actual-vs-predicted
# class counts and per-class confidence box plots.
print("\n" + "="*60)
print("🔍 3. DETAILED CLASSIFICATION ANALYSIS")
print("="*60)

# Create detailed classification report visualization.
# The label set is passed explicitly so that target_names always lines up
# with the encoded classes, even if one of them is missing from y_test and
# the predictions.
classification_rep = classification_report(y_test, best_model_results['y_pred'],
                                         labels=np.arange(len(target_classes)),
                                         target_names=target_classes, output_dict=True)

# Extract metrics for each class
classes = target_classes
precision_scores = [classification_rep[cls]['precision'] for cls in classes]
recall_scores = [classification_rep[cls]['recall'] for cls in classes]
f1_scores_class = [classification_rep[cls]['f1-score'] for cls in classes]
support_scores = [classification_rep[cls]['support'] for cls in classes]

# Create detailed analysis dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Per-Class Performance Metrics', 'Classification Report Heatmap',
        'Prediction Distribution by Class', 'Model Confidence by Class'
    ),
    specs=[[{'secondary_y': True}, {'type': 'heatmap'}],
           [{'type': 'bar'}, {'type': 'box'}]]
)

# 3.1 Per-Class Performance Metrics
# Axis assignment is done through add_trace(..., secondary_y=...): a `yaxis`
# set on the trace itself is overwritten when row/col are given, which would
# put the support counts on the 0-1 metric axis and flatten the bars.
fig.add_trace(go.Bar(
    x=classes,
    y=precision_scores,
    name='Precision',
    marker_color='lightblue'
), row=1, col=1, secondary_y=False)

fig.add_trace(go.Bar(
    x=classes,
    y=recall_scores,
    name='Recall',
    marker_color='lightcoral'
), row=1, col=1, secondary_y=False)

fig.add_trace(go.Bar(
    x=classes,
    y=f1_scores_class,
    name='F1-Score',
    marker_color='lightgreen'
), row=1, col=1, secondary_y=False)

fig.add_trace(go.Scatter(
    x=classes,
    y=support_scores,
    mode='markers+lines',
    name='Support',
    marker=dict(size=10, color='red')
), row=1, col=1, secondary_y=True)

# 3.2 Classification Report as Heatmap (rows: metric, columns: class)
metrics_matrix = np.array([precision_scores, recall_scores, f1_scores_class])
fig.add_trace(go.Heatmap(
    z=metrics_matrix,
    x=classes,
    y=['Precision', 'Recall', 'F1-Score'],
    colorscale='RdYlGn',
    text=np.round(metrics_matrix, 3),
    texttemplate="%{text}",
    showscale=True
), row=1, col=2)

# 3.3 Prediction Distribution by Class
# value_counts() is indexed by the encoded class id; map back to class names.
pred_counts = pd.Series(best_model_results['y_pred']).value_counts()
actual_counts = pd.Series(y_test).value_counts()

fig.add_trace(go.Bar(
    x=[target_classes[i] for i in actual_counts.index],
    y=actual_counts.values,
    name='Actual',
    marker_color='skyblue',
    opacity=0.7
), row=2, col=1)

fig.add_trace(go.Bar(
    x=[target_classes[i] for i in pred_counts.index],
    y=pred_counts.values,
    name='Predicted',
    marker_color='orange',
    opacity=0.7
), row=2, col=1)

# 3.4 Model Confidence by Class
# Group the per-sample confidence by the class the model predicted; classes
# that were never predicted get no box.
confidence_by_class = {}
for i, class_name in enumerate(target_classes):
    class_mask = best_model_results['y_pred'] == i
    if np.any(class_mask):
        confidence_by_class[class_name] = prediction_confidence[class_mask]

for class_name, confidences in confidence_by_class.items():
    fig.add_trace(go.Box(
        y=confidences,
        name=class_name,
        showlegend=False
    ), row=2, col=2)

# Update layout with secondary y-axis
fig.update_layout(height=1000, title_text="🔍 Detailed Classification Analysis")
fig.update_yaxes(title_text="Metrics Score", secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Support Count", secondary_y=True, row=1, col=1)
fig.show()


🔍 3. DETAILED CLASSIFICATION ANALYSIS


# **4. DATA EXPLORATION INTERACTIVE DASHBOARD**

Kode ini membangun dashboard eksplorasi data interaktif dengan fitur utama:

* **PCA 2D scatter plot** untuk visualisasi data fitur berukuran tinggi dalam dua dimensi, diberi warna menurut kelas target.
* **t-SNE 2D scatter plot** untuk dataset kecil (<=1000), visualisasi nonlinear yang membantu menemukan cluster natural.
* **Distribusi target pada fitur numerik utama** sebagai scatter per kelas.
* **Heatmap korelasi fitur numerik (10 fitur pertama)** untuk analisis hubungan antar fitur.
* **Statistik ringkas** (count, mean, std, dll) dari satu fitur numerik utama dalam bar chart.
* **Boxplot outlier detection** untuk 3 fitur numerik pertama (jika ada).



In [None]:
# Interactive data exploration dashboard: 2-D projections of the feature
# matrix X (PCA always, t-SNE for small datasets) plus a few views of the
# raw numerical columns of df.
print("\n" + "="*60)
print("🔍 4. DATA EXPLORATION INTERACTIVE DASHBOARD")
print("="*60)

# Dimensionality reduction for visualization.
# X_tsne defaults to None so the layout code below can always test it.
X_tsne = None
if X.shape[1] > 2:
    print("📊 Performing dimensionality reduction for visualization...")

    # PCA
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    # t-SNE (for smaller datasets)
    if len(X) <= 1000:
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(X)-1))
        X_tsne = tsne.fit_transform(X[:1000])  # Limit for performance

    print(f"✅ PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
else:
    # Two features or fewer: there is nothing to reduce. Plot the raw features
    # directly, zero-padding to two columns, so the scatter below always has
    # data instead of failing on an undefined X_pca.
    X_pca = np.zeros((len(X), 2))
    X_pca[:, :X.shape[1]] = X.to_numpy(dtype=float)

# Create exploration dashboard: two columns when t-SNE is available, else one.
if X_tsne is not None:
    exploration_cols = 2
    exploration_titles = (
        'PCA Visualization', 't-SNE Visualization',
        'Target Distribution by Features', 'Feature Correlation Analysis',
        'Statistical Summary', 'Outlier Detection'
    )
    exploration_specs = [[{'type': 'scatter'}, {'type': 'scatter'}],
                         [{'type': 'scatter'}, {'type': 'heatmap'}],
                         [{'type': 'bar'}, {'type': 'box'}]]
else:
    exploration_cols = 1
    exploration_titles = (
        'PCA Visualization',
        'Target Distribution by Features',
        'Statistical Summary'
    )
    exploration_specs = [[{'type': 'scatter'}],
                         [{'type': 'scatter'}],
                         [{'type': 'bar'}]]

fig = make_subplots(
    rows=3, cols=exploration_cols,
    subplot_titles=exploration_titles,
    specs=exploration_specs
)

# 4.1 PCA Visualization (marker colour = encoded class, ticks show class names)
fig.add_trace(go.Scatter(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    mode='markers',
    marker=dict(
        color=y,
        colorscale='viridis',
        showscale=True,
        colorbar=dict(title="Classes", tickvals=list(range(len(target_classes))),
                     ticktext=target_classes)
    ),
    text=[f"Class: {target_classes[yi]}" for yi in y],
    name='PCA'
), row=1, col=1)

# 4.2 t-SNE Visualization (if available)
if X_tsne is not None:
    fig.add_trace(go.Scatter(
        x=X_tsne[:, 0],
        y=X_tsne[:, 1],
        mode='markers',
        marker=dict(
            color=y[:1000],
            colorscale='plasma',
            showscale=False
        ),
        text=[f"Class: {target_classes[yi]}" for yi in y[:1000]],
        name='t-SNE'
    ), row=1, col=2)

# 4.3 Target Distribution Analysis (first numerical column, one strip per class)
if numerical_cols:
    num_col = numerical_cols[0]
    for i, class_name in enumerate(target_classes):
        class_data = df[df[target_col] == class_name][num_col]
        fig.add_trace(go.Scatter(
            x=class_data,
            y=[class_name] * len(class_data),
            mode='markers',
            name=class_name,
            marker=dict(size=8, opacity=0.6),
            showlegend=False
        ), row=2, col=1)

# 4.4 Feature Correlation Analysis (only in the two-column layout)
if exploration_cols > 1 and len(numerical_cols) > 1:
    corr_matrix = df[numerical_cols[:10]].corr()  # Limit to 10 features
    fig.add_trace(go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='RdBu',
        zmid=0
    ), row=2, col=2)

# 4.5 Statistical Summary (describe() of the first numerical column)
if numerical_cols:
    stats_col = numerical_cols[0]
    stats = df[stats_col].describe()
    fig.add_trace(go.Bar(
        x=stats.index,
        y=stats.values,
        name='Statistics',
        marker_color='lightsteelblue'
    ), row=3, col=1)

# 4.6 Outlier Detection (only in the two-column layout)
if exploration_cols > 1 and numerical_cols:
    for col in numerical_cols[:3]:
        fig.add_trace(go.Box(
            y=df[col],
            name=col,
            showlegend=False
        ), row=3, col=2)

fig.update_layout(
    height=1200,
    title_text="🔍 Interactive Data Exploration Dashboard",
    showlegend=True
)
fig.show()



🔍 4. DATA EXPLORATION INTERACTIVE DASHBOARD
📊 Performing dimensionality reduction for visualization...
✅ PCA explained variance ratio: 1.000


# **5. INSIGHTS AND RECOMMENDATIONS DASHBOARD**

Kode ini membuat dashboard *Insights and Recommendations* yang menyajikan ringkasan komprehensif sekaligus rekomendasi utama dari analisis data dan performa model. Berikut poin pentingnya:

* **Dashboard visual:**

  * Indikator skor kualitas data (berdasarkan missing values)
  * Indikator skor performa model terbaik (F1-score)
  * Bar chart distribusi jenis fitur (kategorikal, numerik, engineered)
  * Pie chart distribusi kategori performa semua model (Excellent, Good, Fair)

* **Ringkasan teks lengkap** untuk:

  * Kualitas dataset (jumlah data, missing, duplikat)
  * Performa model (model terbaik, akurasi, F1-score, level performa)
  * Analisis fitur (jumlah fitur, fitur paling penting, diversitas, dimensi)
  * Hasil klasifikasi (jumlah kelas, keseimbangan kelas, confidence rata-rata, prediksi confidence rendah)

* **Rekomendasi otomatis berdasarkan hasil analisis:**

  * Perbaikan kualitas data (jika ada missing)
  * Saran tuning atau deployment model berdasarkan F1-score
  * Fokus pada fitur paling penting (jika tersedia)
  * Penanganan ketidakseimbangan kelas (jika perlu)
  * Tindak lanjut pada prediksi dengan confidence rendah (jika >20%)


In [None]:


# Insights and recommendations dashboard: condenses the results of the
# previous cells into two gauges, two charts, a text summary and a list of
# rule-based recommendations.
print("\n" + "="*60)
print("💡 5. INSIGHTS AND RECOMMENDATIONS DASHBOARD")
print("="*60)

# Values reused several times below, computed once.
missing_total = df.isnull().sum().sum()
best_f1 = best_model_results['f1_score']
class_counts = pd.Series(y).value_counts()
class_balance_ratio = class_counts.min() / class_counts.max()
low_conf_ratio = (prediction_confidence < 0.6).mean()

# Only trust `feature_importance` when the current best model actually
# exposes importances; a bare name check could pick up a stale table left
# over from an earlier run with a different best model.
has_importances = hasattr(best_model, 'feature_importances_')

# Generate key insights
# NOTE(review): 'Class Balance' below uses a 0.7 cut-off while the
# recommendation further down uses 0.5, so a ratio between the two is
# reported as 'Imbalanced' here and "well balanced" there - confirm intent.
insights = {
    'Dataset Quality': {
        'Total Samples': len(df),
        'Missing Values': missing_total,
        'Duplicate Rows': len(df_original) - len(df),
        'Data Quality Score': 'High' if missing_total == 0 else 'Medium'
    },
    'Model Performance': {
        'Best Model': best_model_name,
        'Best Accuracy': f"{best_model_results['accuracy']:.4f}",
        'Best F1-Score': f"{best_f1:.4f}",
        'Performance Level': 'Excellent' if best_f1 > 0.9 else 'Good' if best_f1 > 0.7 else 'Fair'
    },
    'Feature Analysis': {
        'Total Features': X.shape[1],
        'Most Important Feature': feature_importance.iloc[0]['feature'] if has_importances else 'N/A',
        'Feature Diversity': len(categorical_cols) + len(numerical_cols),
        'Dimensionality': 'Optimal' if X.shape[1] < 100 else 'High'
    },
    'Classification Results': {
        'Number of Classes': len(target_classes),
        'Class Balance': 'Balanced' if class_balance_ratio > 0.7 else 'Imbalanced',
        'Average Confidence': f"{prediction_confidence.mean():.3f}",
        'Low Confidence Predictions': f"{low_conf_ratio*100:.1f}%"
    }
}

# Create insights dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Dataset Quality Metrics', 'Model Performance Summary',
                   'Feature Analysis Results', 'Classification Insights'),
    specs=[[{'type': 'indicator'}, {'type': 'indicator'}],
           [{'type': 'bar'}, {'type': 'pie'}]]
)

# 5.1 Dataset Quality Score
# Percentage of non-missing cells. Dividing by the number of cells (not rows)
# keeps the score inside 0-100 even when several columns have gaps.
quality_score = 100 - (missing_total / df.size * 100) if df.size else 0
fig.add_trace(go.Indicator(
    mode="gauge+number",
    value=quality_score,
    title={'text': "Data Quality Score"},
    gauge={
        'axis': {'range': [None, 100]},
        'bar': {'color': "darkgreen"},
        'steps': [{'range': [0, 50], 'color': "lightgray"},
                 {'range': [50, 80], 'color': "yellow"},
                 {'range': [80, 100], 'color': "lightgreen"}],
        'threshold': {'line': {'color': "red", 'width': 4},
                     'thickness': 0.75, 'value': 90}
    }
), row=1, col=1)

# 5.2 Model Performance Score
performance_score = best_f1 * 100
fig.add_trace(go.Indicator(
    mode="gauge+number",
    value=performance_score,
    title={'text': "Model Performance (F1-Score %)"},
    gauge={
        'axis': {'range': [None, 100]},
        'bar': {'color': "darkblue"},
        'steps': [{'range': [0, 60], 'color': "lightgray"},
                 {'range': [60, 80], 'color': "yellow"},
                 {'range': [80, 100], 'color': "lightblue"}],
        'threshold': {'line': {'color': "red", 'width': 4},
                     'thickness': 0.75, 'value': 85}
    }
), row=1, col=2)

# 5.3 Feature Distribution
feature_types = ['Categorical', 'Numerical', 'Engineered']
feature_counts = [len(categorical_cols), len(numerical_cols), X.shape[1] - len(categorical_cols) - len(numerical_cols)]
fig.add_trace(go.Bar(
    x=feature_types,
    y=feature_counts,
    name='Feature Types',
    marker_color=['lightcoral', 'lightblue', 'lightgreen']
), row=2, col=1)

# 5.4 Performance Distribution
# One fixed slot per category so every model is counted in the bucket that
# matches its F1-score (index 0: excellent, 1: good, 2: fair).
perf_categories = ['Excellent (>90%)', 'Good (70-90%)', 'Fair (<70%)']
perf_counts = [0, 0, 0]

for results in model_results.values():
    f1 = results['f1_score']
    if f1 > 0.9:
        perf_counts[0] += 1
    elif f1 > 0.7:
        perf_counts[1] += 1
    else:
        perf_counts[2] += 1

fig.add_trace(go.Pie(
    labels=perf_categories,
    values=perf_counts,
    name='Performance Distribution'
), row=2, col=2)

fig.update_layout(height=800, title_text="💡 Insights and Recommendations Dashboard")
fig.show()

# Print comprehensive insights
print("\n📊 COMPREHENSIVE INSIGHTS SUMMARY")
print("=" * 60)

for category, metrics in insights.items():
    print(f"\n🔸 {category}:")
    for metric, value in metrics.items():
        print(f"   • {metric}: {value}")

# Generate recommendations
print(f"\n🚀 KEY RECOMMENDATIONS:")
print("=" * 40)

recommendations = []

# Data quality recommendations
if missing_total > 0:
    recommendations.append("📊 Address missing values to improve data quality")
else:
    recommendations.append("✅ Data quality is excellent - no missing values")

# Model performance recommendations
if best_f1 > 0.9:
    recommendations.append("🏆 Model performance is excellent - ready for deployment")
elif best_f1 > 0.7:
    recommendations.append("📈 Good model performance - consider hyperparameter tuning")
else:
    recommendations.append("⚠️ Model performance needs improvement - try feature engineering")

# Feature recommendations
if has_importances:
    top_feature = feature_importance.iloc[0]['feature']
    recommendations.append(f"🎯 Focus on top feature: {top_feature}")

# Class balance recommendations
if class_balance_ratio < 0.5:
    recommendations.append("⚖️ Address class imbalance using resampling techniques")
else:
    recommendations.append("✅ Classes are well balanced")

# Confidence recommendations (flag when over 20% of predictions are below 0.6)
if low_conf_ratio > 0.2:
    recommendations.append(f"🎯 {low_conf_ratio*100:.1f}% predictions have low confidence - investigate further")

# Print recommendations
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

print(f"\n✅ VISUALIZATION DASHBOARD COMPLETED!")
print("=" * 60)
print("🎉 All visualizations and analyses are ready!")
print("📊 Dashboard includes:")
print("   • Dataset Overview Dashboard")
print("   • Model Performance Dashboard")
print("   • Detailed Classification Analysis")
print("   • Interactive Data Exploration")
print("   • Insights and Recommendations")

print(f"\n🚀 Ready for presentation and final report!")


💡 5. INSIGHTS AND RECOMMENDATIONS DASHBOARD



📊 COMPREHENSIVE INSIGHTS SUMMARY

🔸 Dataset Quality:
   • Total Samples: 10000
   • Missing Values: 0
   • Duplicate Rows: 0
   • Data Quality Score: High

🔸 Model Performance:
   • Best Model: Random Forest
   • Best Accuracy: 0.2695
   • Best F1-Score: 0.2363
   • Performance Level: Fair

🔸 Feature Analysis:
   • Total Features: 17
   • Most Important Feature: SessionID_encoded
   • Feature Diversity: 10
   • Dimensionality: Optimal

🔸 Classification Results:
   • Number of Classes: 6
   • Class Balance: Imbalanced
   • Average Confidence: 0.365
   • Low Confidence Predictions: 98.6%

🚀 KEY RECOMMENDATIONS:
1. ✅ Data quality is excellent - no missing values
2. ⚠️ Model performance needs improvement - try feature engineering
3. 🎯 Focus on top feature: SessionID_encoded
4. ⚖️ Address class imbalance using resampling techniques
5. 🎯 98.6% predictions have low confidence - investigate further

✅ VISUALIZATION DASHBOARD COMPLETED!
🎉 All visualizations and analyses are ready!
📊 Dashboard 