# Data Mining Project


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FarnoodTavasoli/datamining_project/blob/main/data_mining_project.ipynb)



## Setup for Google Colab

In [None]:
# Detect whether the notebook is running inside Google Colab: the
# `google.colab` package can only be imported there.
try:
    from google.colab import drive
except ImportError:
    # Only a failed import means "not in Colab"; any other error should surface.
    IN_COLAB = False
else:
    IN_COLAB = True

if IN_COLAB:
    # Mount Google Drive so files stored there are reachable under /content/drive.
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully!")
else:
    print("Running locally")

In [None]:
# all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score,
                             f1_score, confusion_matrix)
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from itertools import permutations

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# random seed for reproducibility
# (this seeds NumPy's global random generator)
np.random.seed(42)

# plotting style
sns.set_style("whitegrid")
# default figure size (width, height) for figures that do not pass figsize
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")

## Data Loading and Exploration

Loading the Ionosphere dataset and performing initial exploration.

In [None]:
# Load the Ionosphere dataset
# The file location depends on the environment: Google Drive when running in
# Colab, a path relative to the working directory otherwise.
data_path = (
    '/content/drive/MyDrive/datamining_project/ionosphere.data'
    if IN_COLAB
    else 'files/ionosphere_5/ionosphere.data'
)

# The file is read without a header row, so the names are supplied here:
# feature_1 .. feature_34 followed by the target column 'class'.
column_names = [*(f'feature_{idx}' for idx in range(1, 35)), 'class']

df = pd.read_csv(data_path, names=column_names, header=None)

print("Dataset loaded successfully!")
df.head()

In [None]:
# Basic dataset info
# Banner, then size of the table, total count of missing cells and the
# number of rows per class.
print("=" * 80, "DATASET INFORMATION", "=" * 80, sep="\n")
print(f"Rows: {len(df)}")
print(f"Features: {len(df.columns) - 1}")  # every column except 'class'
print("\nMissing values:")
print(df.isna().sum().sum())
print("\nClass distribution:")
print(df['class'].value_counts())

In [None]:
# Visualize class distribution
# Two views of the same per-class counts: absolute (bar) and relative (pie).
class_counts = df['class'].value_counts()
palette = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Bar plot
class_counts.plot(kind='bar', ax=bar_ax, color=palette)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
# Re-apply the existing tick labels with no rotation so they read horizontally
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)

# Pie chart
class_counts.plot(
    kind='pie',
    ax=pie_ax,
    colors=palette,
    autopct='%1.1f%%',
    startangle=90
)
pie_ax.set_title('Class Proportion', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')  # drop the automatic y-axis label

plt.tight_layout()
plt.show()

In [None]:
# Visualize feature distributions
# One histogram per feature on a grid that is 6 panels wide.
num_features = df.shape[1] - 1
n_cols = 6
n_rows = -(-num_features // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
axes = axes.ravel()

for panel_index, panel in enumerate(axes):
    if panel_index >= num_features:
        # remove empty subplots
        panel.axis('off')
        continue

    feature_name = f'feature_{panel_index + 1}'
    panel.hist(
        df[feature_name],
        bins=30,
        color='steelblue',
        edgecolor='black',
        alpha=0.7
    )
    panel.set_title(f'{feature_name}', fontsize=9)
    panel.set_xlabel('Value', fontsize=8)
    panel.set_ylabel('Frequency', fontsize=8)

plt.suptitle('Distribution of Features', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations of the first 34 columns of df.
correlation_matrix = df.iloc[:, :34].corr()

plt.figure(figsize=(18, 14))
heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.3,
    cbar_kws={'label': 'Correlation'},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## Data Preprocessing

Preparing the data for machine learning models.

In [None]:
# Separate features and target
y = df['class']
X = df.drop(columns='class')

# Encode target variable (g=good, b=bad)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Features shape: {X.shape}")
print("\nClass encoding:")
# classes_ is indexed by the integer code assigned to each original label
for code, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name} -> {code}")

In [None]:
# Remove constant features (no variety)

# A column with a single distinct value is considered constant.
constant_features = [column for column in X.columns if X[column].nunique() == 1]

if not constant_features:
    print("No constant features found. All features have variety.")
else:
    print(f"Found {len(constant_features)} constant feature(s) with no variety:")
    for feature in constant_features:
        print(f"  - {feature}: {X[feature].unique()[0]}")

    X = X.drop(columns=constant_features)
    print("\nConstant features dropped!")
    print(f"Features shape after removing constant features: {X.shape}")

print(f"\nRemaining features shape: {X.shape}")


In [None]:
# Split data for train and test (80-20 split)
# Stratifying on the encoded target keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
for partition_name, targets in (("Training", y_train), ("Test", y_test)):
    print(f"\n{partition_name} set class distribution:")
    print(pd.Series(targets).value_counts())

In [None]:
# standardization
# The scaler is fitted on the training data only and then applied to both
# partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
print(
    f"\nScaled training data shape: {X_train_scaled.shape}",
    f"Scaled test data shape: {X_test_scaled.shape}",
    sep="\n",
)

## Preprocessing Results Visualization

Visualizing how the data changed after preprocessing.

In [None]:
# Compare original vs scaled features
# Rows are labelled from the actual column names ("feature_7" -> "F7").
# Numbering by position would mislabel every feature that follows a column
# removed earlier (e.g. a dropped constant feature).
feature_labels = [str(column).replace('feature_', 'F') for column in X_train.columns]

comparison_stats = pd.DataFrame({
    'Feature': feature_labels,
    'Original Mean': X_train.mean().values,
    # ddof=0 (population std) so this column is computed the same way as
    # 'Scaled Std' below, which uses NumPy's default ddof=0.
    'Original Std': X_train.std(ddof=0).values,
    'Scaled Mean': X_train_scaled.mean(axis=0),
    'Scaled Std': X_train_scaled.std(axis=0)
})

print("="*80)
print("DATA STATISTICS: ORIGINAL vs SCALED (All Features)")
print("="*80)
print(comparison_stats.round(4).to_string(index=False))
print("\n")

# Statistics of scaled data (computed over all cells of the scaled matrix)
print("="*80)
print("SCALED TRAINING DATA SUMMARY")
print("="*80)
print(f"Overall Mean (should be ~0): {X_train_scaled.mean():.6f}")
print(f"Overall Std Dev (should be ~1): {X_train_scaled.std():.6f}")
print(f"Min value: {X_train_scaled.min():.4f}")
print(f"Max value: {X_train_scaled.max():.4f}")


In [None]:
# Statistical summary table
print("="*80)
print("DATA PREPROCESSING SUMMARY")
print("="*80)

# (metric, value) pairs; the per-class counts are taken from the training labels.
summary_rows = [
    ('Total Samples', f"{len(df)}"),
    ('Training Samples', f"{len(X_train)}"),
    ('Test Samples', f"{len(X_test)}"),
    ('Number of Features', f"{X_train.shape[1]}"),
    ('Class 0 (Bad)', f"{(y_train == 0).sum()}"),
    ('Class 1 (Good)', f"{(y_train == 1).sum()}"),
    ('Feature Scaling', "StandardScaler"),
    ('Scaled Data Mean', f"{X_train_scaled.mean():.6f}"),
    ('Scaled Data Std Dev', f"{X_train_scaled.std():.6f}"),
]

summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print("\n")


In [None]:
# Train/Test split visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
size_ax, class_ax = axes

# Split proportions (bar plot)
split_labels = ['Train Set', 'Test Set']
split_sizes = [len(X_train), len(X_test)]
colors_split = ['#3498db', '#e74c3c']

size_ax.bar(split_labels, split_sizes, color=colors_split, edgecolor='black', linewidth=2)
size_ax.set_title('Train/Test Split Distribution', fontsize=12, fontweight='bold')
size_ax.set_ylabel('Number of Samples', fontsize=11)
# Write each count slightly above its bar
for position, count in enumerate(split_sizes):
    size_ax.text(position, count + 5, str(count), ha='center', fontweight='bold', fontsize=11)

# Class distribution in train and test sets
train_class_dist = pd.Series(y_train).value_counts().sort_index()
test_class_dist = pd.Series(y_test).value_counts().sort_index()
class_names = ['Bad (0)', 'Good (1)']

x = np.arange(len(class_names))
width = 0.35

# Train bars sit half a bar-width left of each class position, test bars right.
grouped_bars = (
    (x - width/2, train_class_dist, 'Train Set', '#3498db'),
    (x + width/2, test_class_dist, 'Test Set', '#e74c3c'),
)
for positions, distribution, legend_label, bar_color in grouped_bars:
    class_ax.bar(
        positions,
        distribution.values,
        width,
        label=legend_label,
        color=bar_color,
        edgecolor='black'
    )

class_ax.set_title('Class Distribution in Train/Test Sets', fontsize=12, fontweight='bold')
class_ax.set_ylabel('Number of Samples', fontsize=11)
class_ax.set_xticks(x)
class_ax.set_xticklabels(class_names)
class_ax.legend()
class_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Visualize boxplots comparing original vs scaled data
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

num_features = X_train.shape[1]
# Tick labels come from the real column names ("feature_7" -> "F7"); numbering
# by position would mislabel features that follow a removed column.
feature_labels = [str(column).replace('feature_', 'F') for column in X_train.columns]

# (axes, data as a 2-D array, word used in the title) for each panel
panels = [
    (axes[0], np.asarray(X_train), 'Original'),
    (axes[1], np.asarray(X_train_scaled), 'Scaled'),
]

for ax, values, kind in panels:
    ax.boxplot([values[:, i] for i in range(num_features)])
    # Label the ticks after drawing instead of passing `labels=` to boxplot,
    # which newer matplotlib releases deprecate.
    ax.set_xticklabels(feature_labels)
    ax.set_title(f'{kind} Features Boxplot (All {num_features} Features)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Value')
    ax.set_xlabel('Features')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Visualize ALL feature distributions
# One panel per feature; the original and the scaled histogram share the x-axis
# and use separate y-axes (left: original, right: scaled).
num_features = X_train.shape[1]
num_cols = 4
num_rows = (num_features + num_cols - 1) // num_cols  # ceiling division

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 4*num_rows))
axes = axes.ravel()

for i, column in enumerate(X_train.columns):
    # Plot original
    axes[i].hist(
        X_train.iloc[:, i],
        bins=30,
        alpha=0.6,
        label='Original',
        color='steelblue',
        edgecolor='black'
    )
    # Plot scaled
    ax2 = axes[i].twinx()
    ax2.hist(
        X_train_scaled[:, i],
        bins=30,
        alpha=0.6,
        label='Scaled',
        color='orange',
        edgecolor='black'
    )

    # Title from the real column name ("feature_7" -> "Feature 7"); the panel
    # index differs from the feature number once a column has been removed.
    axes[i].set_title(str(column).replace('feature_', 'Feature '), fontsize=10, fontweight='bold')
    axes[i].set_xlabel('Value', fontsize=9)
    axes[i].set_ylabel('Frequency (Original)', color='steelblue', fontsize=9)
    ax2.set_ylabel('Frequency (Scaled)', color='orange', fontsize=9)
    axes[i].tick_params(axis='y', labelcolor='steelblue')
    ax2.tick_params(axis='y', labelcolor='orange')
    axes[i].grid(True, alpha=0.3)

# delete empty subplots
for i in range(num_features, len(axes)):
    axes[i].axis('off')

plt.suptitle(f'All {num_features} Features: Original vs Scaled Distributions', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## Effect of Cross-Validation on Decision Trees

Compare four Decision Tree training strategies and evaluate them on the test set.

In [None]:
# TPR = Recall, FPR = FP / (FP + TN)

def evaluate_on_test(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels, encoded as 0 (negative) and 1 (positive).

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'TPR (Recall)', 'FPR'
        and 'F1'.
    """
    y_pred = model.predict(X_test)
    # Pin the label order so the matrix is always 2x2; without it, data that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    # Guard the ratios against an empty denominator (no positives / no negatives).
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        # zero_division=0: report 0 when no positive prediction was made
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'TPR (Recall)': tpr,
        'FPR': fpr,
        'F1': f1_score(y_test, y_pred, zero_division=0)
    }

print("="*80)
print("DECISION TREE: 4 TRAINING STRATEGIES")
print("="*80)

# Every estimator and splitter below receives an explicit random_state so the
# comparison is repeatable and does not depend on the state of NumPy's global
# random generator (i.e. on which cells ran before this one).

# 1) Normal training
base_tree = DecisionTreeClassifier(random_state=42)
base_tree.fit(X_train, y_train)

# 2) Training with 10-fold CV
# A fixed random_state makes the shuffled folds identical on every use of `cv`,
# so all candidates are scored on the same splits.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
cv_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)
cv_search.fit(X_train, y_train)
cv_tree = cv_search.best_estimator_
cv_scores = cv_search.cv_results_['mean_test_score']

# 3) Normal training with CCP post-pruning
# Candidate alphas come from the pruning path of a tree grown on a sub-split of
# the training data; the held-out part of that split picks the best alpha.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_train_sub, y_train_sub)
# ccp_alpha must be non-negative; clip in case round-off leaves a marginally
# negative value in the path.
ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)

val_scores = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    clf.fit(X_train_sub, y_train_sub)
    val_scores.append(accuracy_score(y_val, clf.predict(X_val)))

# argmax returns the first maximum, i.e. the smallest alpha among ties
best_alpha_val = ccp_alphas[int(np.argmax(val_scores))]
ccp_tree = DecisionTreeClassifier(ccp_alpha=best_alpha_val, random_state=42)
ccp_tree.fit(X_train, y_train)

# 4) 10-fold CV with CCP
# Same candidate alphas, scored by mean 10-fold accuracy on the full training set.
cv_alpha_scores = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    cv_alpha_scores.append(scores.mean())

best_alpha_cv = ccp_alphas[int(np.argmax(cv_alpha_scores))]
ccp_cv_tree = DecisionTreeClassifier(ccp_alpha=best_alpha_cv, random_state=42)
ccp_cv_tree.fit(X_train, y_train)

# Compare all models
# One row per strategy; columns that do not apply to a strategy are left as None.
results = []

results.append({
    'Model': 'Normal Training',
    'CV Accuracy (train)': None,
    'CCP Alpha': None,
    **evaluate_on_test(base_tree, X_test, y_test)
})

results.append({
    'Model': '10-Fold CV',
    'CV Accuracy (train)': cv_search.best_score_,
    'CCP Alpha': None,
    **evaluate_on_test(cv_tree, X_test, y_test)
})

results.append({
    'Model': 'Normal + CCP',
    'CV Accuracy (train)': None,
    'CCP Alpha': best_alpha_val,
    **evaluate_on_test(ccp_tree, X_test, y_test)
})

results.append({
    'Model': '10-Fold CV + CCP',
    'CV Accuracy (train)': max(cv_alpha_scores),
    'CCP Alpha': best_alpha_cv,
    **evaluate_on_test(ccp_cv_tree, X_test, y_test)
})

results_df = pd.DataFrame(results)

# Round for display; missing entries stay missing.
results_df['CV Accuracy (train)'] = results_df['CV Accuracy (train)'].round(6)
results_df['CCP Alpha'] = results_df['CCP Alpha'].round(8)
for col in ['Accuracy', 'Precision', 'TPR (Recall)', 'FPR','F1']:
    results_df[col] = results_df[col].round(6)

print(results_df.to_string(index=False))

# Highlight best model by test accuracy
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest model by test accuracy:")
print(best_model.to_string())

In [None]:
# Comparison Visualization
# Grouped bar chart: one group per metric, one bar per model.

metrics_to_plot = ['Accuracy', 'Precision', 'TPR (Recall)', 'FPR', 'F1']
models = results_df['Model'].tolist()

x = np.arange(len(metrics_to_plot))
width = 0.18

fig, ax = plt.subplots(figsize=(14, 6))
scores_by_model = results_df.set_index('Model')[metrics_to_plot]
for i, model in enumerate(models):
    scores = scores_by_model.loc[model].to_numpy()
    # shift every model's bars by one bar-width so they sit side by side
    ax.bar(x + i * width, scores, width, label=model)

ax.set_title('Model Comparison on Test Set Metrics', fontsize=13, fontweight='bold')
ax.set_xlabel('Metric')
ax.set_ylabel('Score')
# centre the tick under each group of bars
ax.set_xticks(x + (width * (len(models) - 1) / 2))
ax.set_xticklabels(metrics_to_plot, rotation=0)
ax.set_ylim(0, 1)
ax.legend(ncol=2)
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## Ensemble Methods: AdaBoost & Random Forest

Comparing ensemble strategies using AdaBoost and Random Forest, each with and without CCP pruning.


In [None]:

print("="*80)
print("ENSEMBLE METHODS: 4 TRAINING STRATEGIES")
print("="*80)

# Every estimator and split below receives an explicit random_state so the
# comparison is repeatable regardless of which cells ran before this one.

# Get CCP alphas from a single tree
# The tree is grown on a sub-split of the training data; the held-out part of
# that split is used to choose alpha for each ensemble.
X_ens_train, X_ens_val, y_ens_train, y_ens_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_ens_train, y_ens_train)
# ccp_alpha must be non-negative; clip in case round-off leaves a marginally
# negative value in the path.
ccp_alphas_ens = np.clip(path.ccp_alphas, 0.0, None)


# Finding alpha for Ada
ada_val_scores = []
for alpha in ccp_alphas_ens:
    base_est = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    ada_clf = AdaBoostClassifier(estimator=base_est, n_estimators=50, algorithm='SAMME', random_state=42)
    ada_clf.fit(X_ens_train, y_ens_train)
    ada_val_scores.append(accuracy_score(y_ens_val, ada_clf.predict(X_ens_val)))

# argmax returns the first maximum, i.e. the smallest alpha among ties
best_alpha_ada = ccp_alphas_ens[int(np.argmax(ada_val_scores))]

# Finding alpha for Random Forest
# The forest is pruned through its own ccp_alpha parameter, so the tuned value
# really is the cost-complexity alpha reported in the 'CCP Alpha' column.
rf_val_scores = []
for alpha in ccp_alphas_ens:
    rf_clf = RandomForestClassifier(n_estimators=100, ccp_alpha=alpha, random_state=42)
    rf_clf.fit(X_ens_train, y_ens_train)
    rf_val_scores.append(accuracy_score(y_ens_val, rf_clf.predict(X_ens_val)))

best_alpha_rf = ccp_alphas_ens[int(np.argmax(rf_val_scores))]

# 1) AdaBoost Normal
ada_base = AdaBoostClassifier(n_estimators=50, algorithm='SAMME', random_state=42)
ada_base.fit(X_train, y_train)

# 2) AdaBoost Pruned

ada_pruned = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(ccp_alpha=best_alpha_ada, random_state=42),
    n_estimators=50,
    algorithm='SAMME',
    random_state=42
)
ada_pruned.fit(X_train, y_train)

# 3) Random Forest Normal
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
rf_base.fit(X_train, y_train)

# 4) Random Forest Pruned
rf_pruned = RandomForestClassifier(n_estimators=100, ccp_alpha=best_alpha_rf, random_state=42)
rf_pruned.fit(X_train, y_train)

# One row per ensemble; 'CCP Alpha' is None for the unpruned variants.
ens_results = []

ens_results.append({
    'Model': 'AdaBoost (default)',
    'CCP Alpha': None,
    **evaluate_on_test(ada_base, X_test, y_test)
})

ens_results.append({
    'Model': 'AdaBoost (Pruned)',
    'CCP Alpha': best_alpha_ada,
    **evaluate_on_test(ada_pruned, X_test, y_test)
})

ens_results.append({
    'Model': 'Random Forest (default)',
    'CCP Alpha': None,
    **evaluate_on_test(rf_base, X_test, y_test)
})

ens_results.append({
    'Model': 'Random Forest (Pruned)',
    'CCP Alpha': best_alpha_rf,
    **evaluate_on_test(rf_pruned, X_test, y_test)
})

ens_results_df = pd.DataFrame(ens_results)

# Results (rounded for display; missing alphas stay missing)
ens_results_df['CCP Alpha'] = ens_results_df['CCP Alpha'].round(8)
for col in ['Accuracy', 'Precision', 'TPR (Recall)', 'FPR',  'F1']:
    ens_results_df[col] = ens_results_df[col].round(6)

print(ens_results_df.to_string(index=False))

# Highlight best model
best_ens = ens_results_df.loc[ens_results_df['Accuracy'].idxmax()]
print("Best ensemble model by test accuracy:")
print(best_ens.to_string())



In [None]:
#  Ensemble model comparison visualization
# Grouped bar chart: one group per metric, one bar per ensemble.
ens_metrics = ['Accuracy', 'Precision', 'TPR (Recall)', 'FPR', 'F1']
ens_models = ens_results_df['Model'].tolist()

x = np.arange(len(ens_metrics))
width = 0.18

fig, ax = plt.subplots(figsize=(14, 6))
ens_scores_by_model = ens_results_df.set_index('Model')[ens_metrics]
for i, model in enumerate(ens_models):
    scores = ens_scores_by_model.loc[model].to_numpy()
    # shift every model's bars by one bar-width so they sit side by side
    ax.bar(x + i * width, scores, width, label=model)

ax.set_title('Ensemble Methods: Model Comparison on Test Set', fontsize=13, fontweight='bold')
ax.set_xlabel('Metric')
ax.set_ylabel('Score')
# centre the tick under each group of bars
ax.set_xticks(x + (width * (len(ens_models) - 1) / 2))
ax.set_xticklabels(ens_metrics, rotation=0)
ax.set_ylim(0, 1)
ax.legend(ncol=2)
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## 3. Clustering (K-Means vs K-Medoids)
Comparing the cluster assignments of both methods against the true class labels using the misclassification percentage.


In [None]:
# Clustering uses the whole dataset: drop the target and every single-valued
# column, then standardize the remaining features.
single_valued_columns = df.columns[df.nunique() == 1]
all_features = df.drop('class', axis=1).drop(columns=single_valued_columns)
X_all_scaled = StandardScaler().fit_transform(all_features)
y_true = LabelEncoder().fit_transform(df['class'])  # b=0, g=1

# Numbers of clusters to evaluate
k_values = [2, 3, 4, 5]

def compute_misclassification(y_true, labels, k):
    """Misclassification percent of a clustering measured against true labels.

    When ``k`` equals the number of distinct true labels, every one-to-one
    assignment of clusters to labels is tried and the one with the fewest
    errors is kept. Otherwise each cluster is mapped to the majority true
    label among its members.

    Args:
        y_true: True labels (array-like of non-negative integers).
        labels: Cluster labels (array-like, same length as ``y_true``).
        k: Number of clusters.

    Returns:
        Misclassification percentage (0-100).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Coerce to arrays: with plain lists `labels == c` is a single bool rather
    # than an element-wise mask, which would silently corrupt the mapping.
    y_true = np.asarray(y_true)
    labels = np.asarray(labels)

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y_true must contain at least one sample")
    if len(labels) != n_samples:
        raise ValueError("y_true and labels must have the same length")

    unique_true = np.unique(y_true)
    unique_clusters = np.unique(labels)

    if k == len(unique_true):
        # Try all permutations of label assignment
        best_error = n_samples
        for perm in permutations(unique_true):
            mapping = dict(zip(unique_clusters, perm))
            # Clusters without a partner in the mapping get -1
            mapped = np.array([mapping.get(c, -1) for c in labels])
            errors = np.sum(mapped != y_true)
            if errors < best_error:
                best_error = errors
    else:
        # Majority vote inside every cluster
        mapped = np.zeros_like(labels)
        for c in unique_clusters:
            mask = labels == c
            mapped[mask] = np.bincount(y_true[mask]).argmax()
        best_error = np.sum(mapped != y_true)

    return (best_error / n_samples) * 100


def k_medoids(X, k, max_iter=100, random_state=42):
    """K-Medoids clustering by alternating assignment and medoid update.

    Each iteration assigns every point to its nearest medoid (Euclidean
    distance) and then replaces each medoid with the cluster member whose
    summed distance to the other members is smallest. Iteration stops when
    neither the assignment nor the medoids change, or after ``max_iter``
    rounds.

    Args:
        X: Feature matrix (array-like, one row per sample).
        k: Number of clusters.
        max_iter: Maximum iterations.
        random_state: Random seed.

    Returns:
        tuple: (labels, medoid_indices); ``labels[i]`` is the position in
        ``medoid_indices`` of the medoid nearest to sample ``i``.
    """
    X = np.asarray(X, dtype=float)  # also accepts lists / DataFrames
    rng = np.random.default_rng(random_state)
    n = X.shape[0]

    # Full pairwise distance matrix, built once through broadcasting
    dist = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=2)

    medoid_indices = rng.choice(n, size=k, replace=False)
    labels = None

    for _ in range(max_iter):
        # Assign points to nearest medoid
        new_labels = np.argmin(dist[:, medoid_indices], axis=1)

        # Update medoids
        new_medoids = medoid_indices.copy()
        for j in range(k):
            cluster_idx = np.flatnonzero(new_labels == j)
            if cluster_idx.size == 0:
                # Empty cluster: re-seed its medoid with a random sample
                new_medoids[j] = rng.choice(n)
                continue
            within_cluster = dist[np.ix_(cluster_idx, cluster_idx)].sum(axis=1)
            new_medoids[j] = cluster_idx[np.argmin(within_cluster)]

        converged = (
            labels is not None
            and np.array_equal(new_labels, labels)
            and np.array_equal(new_medoids, medoid_indices)
        )
        labels = new_labels
        medoid_indices = new_medoids
        if converged:
            break

    # Final assignment against the medoids actually returned. After convergence
    # this reproduces `labels`; it matters when max_iter ran out (labels would
    # otherwise refer to the previous medoids) or when max_iter is 0.
    labels = np.argmin(dist[:, medoid_indices], axis=1)

    return labels, medoid_indices


# Run clustering for each k
# For every k both algorithms cluster the same scaled data; the label arrays
# are kept per k and the misclassification percentages go into one table row.
results = []
kmeans_models = {}
kmedoids_models = {}
error_columns = ['K-Means Error (%)', 'K-Medoids Error (%)']

for k in k_values:
    # K-Means
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km_labels = km.fit_predict(X_all_scaled)
    kmeans_models[k] = km_labels
    km_error = compute_misclassification(y_true, km_labels, k)

    # K-Medoids
    kmed_labels, _ = k_medoids(X_all_scaled, k, max_iter=100, random_state=42)
    kmedoids_models[k] = kmed_labels
    kmed_error = compute_misclassification(y_true, kmed_labels, k)

    row_values = [k, round(km_error, 2), round(kmed_error, 2)]
    results.append(dict(zip(['k', *error_columns], row_values)))

cluster_df = pd.DataFrame(results)
print("Clustering Misclassification Rates:")
print(cluster_df.to_string(index=False),"\n")

# Styled copy of the table with the smallest value of each error column highlighted
styled_table = cluster_df.style.set_caption("Misclassification % by Clustering Method and k")
styled_table = styled_table.set_properties(**{'text-align': 'center'})
styled_table = styled_table.highlight_min(subset=error_columns, color='lightgreen')
display(styled_table)

In [None]:
# Cluster visualization
# Reduce to 2D
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_all_scaled)

# One colour per cluster index (enough for the largest k used)
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00']

# (method name, labels per k, error column in cluster_df) for the two panels
panels = [
    ('K-Means', kmeans_models, 'K-Means Error (%)'),
    ('K-Medoids', kmedoids_models, 'K-Medoids Error (%)'),
]

for k in k_values:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle(f'Clustering Comparison — k = {k}', fontsize=14, fontweight='bold')

    for ax, (method, labels_by_k, error_column) in zip(axes, panels):
        cluster_labels = labels_by_k[k]
        # Draw every cluster separately so each gets its own colour and legend entry
        for c in range(k):
            mask = cluster_labels == c
            ax.scatter(
                X_pca[mask, 0],
                X_pca[mask, 1],
                c=colors[c],
                label=f'Cluster {c}',
                alpha=0.6,
                edgecolors='w',
                linewidth=0.3,
                s=30
            )
        error = cluster_df.loc[cluster_df["k"] == k, error_column].values[0]
        ax.set_title(f'{method} (k={k})\nError: {error}%')
        ax.set_xlabel('PCA Component 1')
        ax.set_ylabel('PCA Component 2')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()