In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

SCALERS = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
    "MaxAbsScaler": MaxAbsScaler()
}

def run_scaler_experiment_classification(X, y, title):
    """
    Applies 4 scalers to X, trains 3 models (NB, DT, ANN) and prints accuracy for each.
    """
    print(f"\n\n==============================")
    print(f"DATASET: {title}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    print(f"==============================")

    # Split once (fair comparison)
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Models used in this chat
    models = {
        "NaiveBayes(GaussianNB)": GaussianNB(),
        "DecisionTree(entropy)": DecisionTreeClassifier(criterion="entropy", random_state=42),
        "ANN(MLP)": MLPClassifier(hidden_layer_sizes=(64, 32), activation="relu",
                                  max_iter=500, random_state=42)
    }

    for scaler_name, scaler in SCALERS.items():
        print(f"\n--- Scaler: {scaler_name} ---")

        # Scale X (fit only on train, transform test)
        X_tr_s = scaler.fit_transform(X_tr)
        X_te_s = scaler.transform(X_te)

        for model_name, model in models.items():
            model.fit(X_tr_s, y_tr)
            pred = model.predict(X_te_s)
            acc = accuracy_score(y_te, pred)
            print(f"{model_name:22s} Accuracy = {acc:.4f}")


In [2]:
# -------- Load Diabetes dataset (Pima Indians) --------
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = ["Pregnancies","Glucose","BloodPressure","SkinThickness",
        "Insulin","BMI","DiabetesPedigreeFunction","Age","Outcome"]
df = pd.read_csv(url, names=cols)

X = df.drop(columns=["Outcome"])
y = df["Outcome"].astype(int)

# Medical rule: 0 values in these columns are not realistic -> treat as missing
zero_cols = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
X[zero_cols] = X[zero_cols].replace(0, np.nan)

# Impute missing values
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

run_scaler_experiment_classification(X, y, "Diabetes (Pima Indians)")




DATASET: Diabetes (Pima Indians)
X shape: (768, 8), y shape: (768,)

--- Scaler: StandardScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7240
DecisionTree(entropy)  Accuracy = 0.7031




ANN(MLP)               Accuracy = 0.7552

--- Scaler: MinMaxScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7240
DecisionTree(entropy)  Accuracy = 0.7135




ANN(MLP)               Accuracy = 0.7083

--- Scaler: RobustScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7240
DecisionTree(entropy)  Accuracy = 0.7188




ANN(MLP)               Accuracy = 0.7396

--- Scaler: MaxAbsScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7240
DecisionTree(entropy)  Accuracy = 0.7083
ANN(MLP)               Accuracy = 0.7240




In [3]:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target  # 0=malignant, 1=benign

# (Usually no missing values, but safe to keep imputer)
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

run_scaler_experiment_classification(X, y, "Breast Cancer (Wisconsin)")




DATASET: Breast Cancer (Wisconsin)
X shape: (569, 30), y shape: (569,)

--- Scaler: StandardScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.9371
DecisionTree(entropy)  Accuracy = 0.9301
ANN(MLP)               Accuracy = 0.9720

--- Scaler: MinMaxScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.9371
DecisionTree(entropy)  Accuracy = 0.9301
ANN(MLP)               Accuracy = 0.9790

--- Scaler: RobustScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.9371
DecisionTree(entropy)  Accuracy = 0.9301
ANN(MLP)               Accuracy = 0.9720

--- Scaler: MaxAbsScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.9371
DecisionTree(entropy)  Accuracy = 0.9301
ANN(MLP)               Accuracy = 0.9860


In [4]:
from sklearn.datasets import fetch_openml

X_raw, y_raw = fetch_openml(name="car", version=1, as_frame=True, return_X_y=True)

df = X_raw.copy()
df["class"] = y_raw

# Encode features
df["buying"]   = df["buying"].map({"low":0, "med":1, "high":2, "vhigh":3})
df["maint"]    = df["maint"].map({"low":0, "med":1, "high":2, "vhigh":3})
df["doors"]    = df["doors"].map({"2":2, "3":3, "4":4, "5more":5})
df["persons"]  = df["persons"].map({"2":2, "4":4, "more":5})
df["lug_boot"] = df["lug_boot"].map({"small":0, "med":1, "big":2})
df["safety"]   = df["safety"].map({"low":0, "med":1, "high":2})

# Encode target
df["target"] = df["class"].map({"unacc":0, "acc":1, "good":2, "vgood":3})

X = df.drop(columns=["class","target"]).values
y = df["target"].astype(int).values

# Safe impute (if any)
imputer = SimpleImputer(strategy="most_frequent")
X = imputer.fit_transform(X)

run_scaler_experiment_classification(X, y, "Car Evaluation")


  warn(




DATASET: Car Evaluation
X shape: (1728, 6), y shape: (1728,)

--- Scaler: StandardScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7523
DecisionTree(entropy)  Accuracy = 0.9676
ANN(MLP)               Accuracy = 0.9931

--- Scaler: MinMaxScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7523
DecisionTree(entropy)  Accuracy = 0.9676




ANN(MLP)               Accuracy = 0.9815

--- Scaler: RobustScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7546
DecisionTree(entropy)  Accuracy = 0.9653
ANN(MLP)               Accuracy = 0.9931

--- Scaler: MaxAbsScaler ---
NaiveBayes(GaussianNB) Accuracy = 0.7523
DecisionTree(entropy)  Accuracy = 0.9653
ANN(MLP)               Accuracy = 0.9769




In [7]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

# --------------------------------------------------
# Generate synthetic Customer Segmentation dataset
# --------------------------------------------------
# Features: Age, Annual Income, Spending Score
X, _ = make_blobs(
    n_samples=300,
    centers=5,
    n_features=3,
    cluster_std=2.5,
    random_state=42
)

df = pd.DataFrame(X, columns=["Age", "Annual_Income", "Spending_Score"])

print("Customer dataset shape:", df.shape)

# --------------------------------------------------
# Prepare data
# --------------------------------------------------
X = df.copy()

imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

# --------------------------------------------------
# Define scalers
# --------------------------------------------------
SCALERS = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
    "MaxAbsScaler": MaxAbsScaler()
}

# --------------------------------------------------
# Apply KMeans with each scaler
# --------------------------------------------------
print("\nScaler Comparison (KMeans, k=5):\n")

for scaler_name, scaler in SCALERS.items():
    X_scaled = scaler.fit_transform(X)

    kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    sil = silhouette_score(X_scaled, labels)
    inertia = kmeans.inertia_

    print(f"{scaler_name:15s}  Silhouette = {sil:.4f}   Inertia = {inertia:.2f}")


Customer dataset shape: (300, 3)

Scaler Comparison (KMeans, k=5):

StandardScaler   Silhouette = 0.4095   Inertia = 112.31
MinMaxScaler     Silhouette = 0.3998   Inertia = 5.96
RobustScaler     Silhouette = 0.4277   Inertia = 40.06
MaxAbsScaler     Silhouette = 0.3936   Inertia = 21.47
