In [None]:
pip install openml

In [27]:
# Define centralized paths
# Path is imported here so this cell runs on a fresh kernel: the notebook's
# main import cell is located after this one.
from pathlib import Path

BASE_DIR = Path("C:/Users/Salom/Documents/ibm_data_science_foundations")
CODE_DIR = BASE_DIR / "code"
DATA_DIR = BASE_DIR / "data"
IMAGES_DIR = BASE_DIR / "images"

# Example usage
BANKNOTE_CSV_PATH = DATA_DIR / "banknote_authentication.csv"
STATS_IMG_PATH = IMAGES_DIR / "stats" / "mean_std_table.png"
BOXPLOT_IMG_PATH = IMAGES_DIR / "boxplots" / "banknote_boxplot.png"

In [28]:
"""
Banknote Authentication Clustering Pipeline
Author: Salome Scherer
Description:
- Loads OpenML banknote dataset
- Performs EDA and K-Means clustering
- Saves visualizations to structured folders
"""

# 📦 Imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import openml

# 📁 Project folder layout: every location is derived from a single root
BASE_DIR = Path("C:/Users/Salom/Documents/ibm_data_science_foundations")
CODE_DIR, DATA_DIR, IMAGES_DIR = (
    BASE_DIR.joinpath(folder) for folder in ("code", "data", "images")
)

# Destination file for the cleaned dataset export
BANKNOTE_CSV_PATH = DATA_DIR.joinpath("banknote_authentication.csv")

# 📂 Ensure image subdirectories exist
def ensure_directories():
    """Create the per-category image output folders under ``IMAGES_DIR``.

    Missing parent folders are created as well, and folders that already
    exist are left as they are.
    """
    image_categories = ("stats", "boxplots", "distributions", "clustering")
    for category in image_categories:
        target = IMAGES_DIR / category
        target.mkdir(parents=True, exist_ok=True)

# 📥 Load and Clean Data
def load_and_clean_banknote_data(save_csv_path=None):
    """Fetch OpenML dataset 1462 and drop every row with a missing value.

    Parameters
    ----------
    save_csv_path : str or os.PathLike, optional
        When given (and truthy), the cleaned frame is also written to this
        CSV file without the index column. Missing parent folders are
        created first.

    Returns
    -------
    pandas.DataFrame
        The dataset with all rows containing a missing value removed.
    """
    dataset_id = 1462
    dataset = openml.datasets.get_dataset(dataset_id)
    df, _, _, _ = dataset.get_data()
    # Build a new frame instead of mutating the object returned by get_data().
    df = df.dropna()

    if save_csv_path:
        # Path.parent is "." for a bare file name, so mkdir succeeds there;
        # os.makedirs(os.path.dirname("file.csv")) would raise on the empty string.
        Path(save_csv_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(save_csv_path, index=False)
        print(f"✅ CSV saved at: {save_csv_path}")

    return df

# 📊 Visualization Functions
def plot_mean_std_table(df, save_path):
    """Render a mean / standard-deviation table for the numeric columns.

    The values are rounded to two decimals and drawn as a table on an axis
    with its frame and ticks switched off. The figure is written to
    ``save_path``; a failure while saving is reported on stdout rather than
    raised. The figure is closed afterwards.
    """
    numeric_cols = df.select_dtypes(include="number")
    summary = pd.DataFrame({
        "Mean": numeric_cols.mean(),
        "Std Dev": numeric_cols.std()
    })

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set_title("Mean and Standard Deviation", fontsize=14, fontweight="bold", pad=15)
    ax.axis("off")

    table = ax.table(
        cellText=summary.round(2).values,
        colLabels=summary.columns,
        rowLabels=summary.index,
        cellLoc="center",
        loc="center",
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)
    # Same border colour and height for every cell of the table.
    for cell in table.get_celld().values():
        cell.set_edgecolor("black")
        cell.set_height(0.1)

    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)

def plot_boxplot(df, features, save_path):
    """Draw one filled box per feature on a shared axis and save the figure.

    ``features`` names the columns of ``df`` to plot and doubles as the tick
    labels. A failure while saving is reported on stdout rather than raised.
    The figure is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    columns = [df[name] for name in features]
    ax.boxplot(columns, tick_labels=features, patch_artist=True)
    ax.set_title("BankNotes Analysis", fontsize=14, fontweight="bold")
    ax.set_xlabel("Features", fontsize=12)
    ax.set_ylabel("Value Distribution", fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.6)

    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)

def plot_feature_distributions(df, features, save_path):
    """Plot a 30-bin histogram for each feature, side by side, and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``features``.
    features : sequence of str
        Column names to plot, one panel each. Must not be empty.
    save_path : str or os.PathLike
        Destination image file. A failure while saving is reported on
        stdout rather than raised.

    Raises
    ------
    ValueError
        If ``features`` is empty.
    """
    if len(features) == 0:
        # Fail before a figure is created, with a message naming the cause.
        raise ValueError("features must contain at least one column name")

    # squeeze=False always yields a 2-D array of axes, so a single feature
    # works too; by default a 1x1 grid returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(1, len(features), figsize=(12, 4), squeeze=False)
    for ax, feature in zip(axes[0], features):
        ax.hist(df[feature], bins=30, color="skyblue", edgecolor="black")
        ax.set_title(f"{feature} Distribution")
        ax.set_xlabel("Value")
        ax.set_ylabel("Frequency")
        ax.grid(True, linestyle="--", alpha=0.5)
    fig.suptitle("Feature Distributions", fontsize=14, fontweight="bold")
    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as e:
        print(f"❌ Failed to save {save_path}: {e}")
    plt.close(fig)

# 🔍 Clustering Functions
def run_kmeans_clustering(df, features, save_path, n_clusters=2, random_state=42):
    """Standardize ``features``, fit K-Means and save a scatter of the result.

    The predicted labels are stored in a ``"Cluster"`` column written onto
    the ``df`` that was passed in (the caller's frame is modified). The
    scatter shows the first two standardized features, coloured by label.
    A failure while saving is reported on stdout rather than raised.

    Returns
    -------
    tuple
        ``(df, X_scaled)``: the same frame with its ``"Cluster"`` column and
        the standardized feature array.
    """
    standardized = StandardScaler().fit_transform(df[features])
    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    df["Cluster"] = model.fit_predict(standardized)

    fig, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(
        standardized[:, 0],
        standardized[:, 1],
        c=df["Cluster"],
        cmap="coolwarm",
        edgecolors="black",
    )
    ax.set_xlabel(f"{features[0]} (Standardized)", fontsize=12)
    ax.set_ylabel(f"{features[1]} (Standardized)", fontsize=12)
    ax.set_title("K-Means Clustering - Banknote Authentication", fontsize=14, fontweight="bold")
    fig.colorbar(points, ax=ax, label="Cluster Label")
    ax.grid(True, linestyle="--", alpha=0.6)

    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)
    return df, standardized

def plot_kmeans_with_centroids(df, features, save_path, n_clusters=2, random_state=42):
    """Cluster the standardized features and plot points plus centroids.

    A new frame of standardized values is built from ``df[features]``; the
    K-Means labels are stored in its ``"Cluster"`` column. The first two
    features are scattered, with the cluster centres drawn as black ``X``
    markers. A failure while saving is reported on stdout rather than raised.

    Returns
    -------
    tuple
        ``(df_norm, centers)``: the standardized frame with labels and the
        fitted cluster centres.
    """
    scaled_values = StandardScaler().fit_transform(df[features])
    df_norm = pd.DataFrame(scaled_values, columns=features)

    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    # The right-hand side is evaluated first, so the fit sees only the feature columns.
    df_norm["Cluster"] = model.fit_predict(df_norm)
    centers = model.cluster_centers_

    x_name, y_name = features[0], features[1]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_norm[x_name], df_norm[y_name], c=df_norm["Cluster"], cmap="coolwarm", alpha=0.5)
    ax.scatter(centers[:, 0], centers[:, 1], s=200, marker="X", color="black", label="Centroids")
    ax.set_title("Genuine and Fake Notes Clusters", fontsize=14, fontweight="bold")
    ax.set_xlabel(f"{x_name}")
    ax.set_ylabel(f"{y_name}")
    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.6)

    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)
    return df_norm, centers

def plot_kmeans_with_ellipsoids(df, features, save_path, n_clusters=2, random_state=42, ellipse_size=0.6):
    """Cluster the standardized features and highlight each cluster centre.

    A new frame of standardized values is built from ``df[features]``; the
    K-Means labels are stored in its ``"Cluster"`` column. The first two
    features are scattered and every cluster centre is marked with a
    translucent ellipse whose width and height both equal ``ellipse_size``.
    A failure while saving is reported on stdout rather than raised.

    Returns
    -------
    tuple
        ``(df_norm, centers)``: the standardized frame with labels and the
        fitted cluster centres.
    """
    scaled_values = StandardScaler().fit_transform(df[features])
    df_norm = pd.DataFrame(scaled_values, columns=features)

    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    # The right-hand side is evaluated first, so the fit sees only the feature columns.
    df_norm["Cluster"] = model.fit_predict(df_norm)
    centers = model.cluster_centers_

    x_name, y_name = features[0], features[1]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_norm[x_name], df_norm[y_name], c=df_norm["Cluster"], cmap="coolwarm", alpha=0.5)
    ax.set_title("Genuine and Fake Notes Clusters", fontsize=14, fontweight="bold")
    ax.set_xlabel(f"{x_name}")
    ax.set_ylabel(f"{y_name}")
    ax.grid(True, linestyle="--", alpha=0.6)

    for center in centers:
        # Only the first two coordinates are used: they match the plotted axes.
        marker = Ellipse(
            xy=center[:2],
            width=ellipse_size,
            height=ellipse_size,
            edgecolor="gold",
            facecolor="yellow",
            alpha=0.3,
        )
        ax.add_patch(marker)

    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)
    return df_norm, centers

def plot_kmeans_iterations(df, features, save_path):
    """Plot ten independent 2-cluster K-Means fits on a 2x5 grid of panels.

    Each panel is a separate fit on the standardized features, using
    ``random_state`` 0 through 9. Points labelled 0 are drawn in green and
    points labelled 1 in blue, over the first two features; every cluster
    centre is marked with a translucent ellipse. A failure while saving is
    reported on stdout rather than raised.
    """
    scaled_values = StandardScaler().fit_transform(df[features])
    df_norm = pd.DataFrame(scaled_values, columns=features)

    fig, grid = plt.subplots(2, 5, figsize=(15, 5))
    for seed, panel in enumerate(grid.ravel()):
        model = KMeans(n_clusters=2, random_state=seed, n_init=10)
        labels = model.fit_predict(df_norm)

        for label, colour in ((0, "g"), (1, "b")):
            members = labels == label
            panel.scatter(
                df_norm[features[0]][members],
                df_norm[features[1]][members],
                c=colour,
                alpha=0.5,
            )

        for center in model.cluster_centers_:
            panel.add_patch(
                Ellipse(xy=center[:2], width=0.6, height=0.6,
                        edgecolor="gold", facecolor="yellow", alpha=0.3)
            )

        panel.set_title(f"Iteration {seed+1}")
        panel.grid(True, linestyle="--", alpha=0.6)

    fig.tight_layout()
    try:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved: {save_path}")
    except Exception as err:
        print(f"❌ Failed to save {save_path}: {err}")
    plt.close(fig)

In [29]:
if __name__ == "__main__":
    ensure_directories()
    features = ["V1", "V2", "V3", "V4"]
    # All clustering figures work on the first two features only.
    cluster_features = features[:2]
    clustering_dir = IMAGES_DIR / "clustering"

    # Fetch the dataset, drop incomplete rows and export a CSV copy
    df = load_and_clean_banknote_data(save_csv_path=BANKNOTE_CSV_PATH)

    # Exploratory figures
    plot_mean_std_table(df, save_path=IMAGES_DIR / "stats" / "mean_std_table.png")
    plot_boxplot(df, features, save_path=IMAGES_DIR / "boxplots" / "banknote_boxplot.png")
    plot_feature_distributions(
        df, features, save_path=IMAGES_DIR / "distributions" / "feature_histograms.png"
    )

    # Clustering figures (run_kmeans_clustering also writes df["Cluster"])
    clustered_df, _ = run_kmeans_clustering(
        df, cluster_features, save_path=clustering_dir / "kmeans_basic.png"
    )
    _, _ = plot_kmeans_with_centroids(
        df, cluster_features, save_path=clustering_dir / "kmeans_centroids.png"
    )
    _, _ = plot_kmeans_with_ellipsoids(
        df, cluster_features, save_path=clustering_dir / "kmeans_ellipsoids.png"
    )
    plot_kmeans_iterations(
        df, cluster_features, save_path=clustering_dir / "kmeans_iterations.png"
    )

✅ CSV saved at: C:\Users\Salom\Documents\ibm_data_science_foundations\data\banknote_authentication.csv
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\stats\mean_std_table.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\boxplots\banknote_boxplot.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\distributions\feature_histograms.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\clustering\kmeans_basic.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\clustering\kmeans_centroids.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\clustering\kmeans_ellipsoids.png
✅ Saved: C:\Users\Salom\Documents\ibm_data_science_foundations\images\clustering\kmeans_iterations.png
